In [2]:
%pip install pandas numpy matplotlib seaborn scikit-learn


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# 1. Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

In [4]:
# 3. Load the dataset into a pandas DataFrame
# The iris data is obtained through seaborn's `load_dataset` helper.
iris = sns.load_dataset("iris")
# Quick look at the first five rows to confirm the load worked
print(iris.head(n=5))

   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa


In [5]:
# 4. Display the initial statistics
# Overview of the raw frame: preview, shape, schema, summary statistics,
# and how many samples belong to each species.
print("First 5 rows of the dataset:")
print(iris.head())
print("\nDataset shape:")
print(iris.shape)
print("\nDataset information:")
# DataFrame.info() writes its report directly to stdout and returns None,
# so it must not be wrapped in print() -- doing so emits a stray "None".
iris.info()
print("\nDescriptive statistics:")
print(iris.describe())
print("\nClass distribution:")
print(iris['species'].value_counts())

First 5 rows of the dataset:
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa

Dataset shape:
(150, 5)

Dataset information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
None

Descriptive statistics:
       sepal_length  sepal_width  petal_length  petal

In [6]:
# 5. Scan all variables for missing values and inconsistencies
# Null count per column
null_counts = iris.isnull().sum()
print("\nMissing values in each column:")
print(null_counts)

# Rows that are exact repeats of an earlier row
duplicate_count = iris.duplicated().sum()
print("\nNumber of duplicate rows:")
print(duplicate_count)



Missing values in each column:
sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

Number of duplicate rows:
1


In [8]:
# 6. Identify outliers using the IQR method
# Restrict the check to the numeric columns of the frame.
numeric_columns = iris.select_dtypes(include=[np.number]).columns
numeric_values = iris[numeric_columns]

# Per-column first/third quartiles and the interquartile range between them
Q1, Q3 = numeric_values.quantile(0.25), numeric_values.quantile(0.75)
IQR = Q3 - Q1

# A value is an outlier when it lies more than 1.5 * IQR below Q1 or above Q3;
# a row is flagged as soon as any one of its numeric columns holds such a value.
below_lower_fence = numeric_values.lt(Q1 - 1.5 * IQR)
above_upper_fence = numeric_values.gt(Q3 + 1.5 * IQR)
outliers = (below_lower_fence | above_upper_fence).any(axis=1)

print("\nNumber of rows with outliers:", outliers.sum())
print("\nOutlier rows:")
print(iris[outliers])


Number of rows with outliers: 4

Outlier rows:
    sepal_length  sepal_width  petal_length  petal_width     species
15           5.7          4.4           1.5          0.4      setosa
32           5.2          4.1           1.5          0.1      setosa
33           5.5          4.2           1.4          0.2      setosa
60           5.0          2.0           3.5          1.0  versicolor


In [11]:
# 7. Apply data transformations - Min-Max scaling
# Fit the scaler on the numeric columns and store the rescaled values in a
# copy, so the original `iris` frame keeps its raw measurements.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(iris[numeric_columns])

iris_scaled = iris.copy()
iris_scaled[numeric_columns] = scaled_values

# Summary statistics of the rescaled numeric columns
print("\nAfter Min-Max scaling - descriptive statistics:")
print(iris_scaled[numeric_columns].describe())


After Min-Max scaling - descriptive statistics:
       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       0.428704     0.440556      0.467458     0.458056
std        0.230018     0.181611      0.299203     0.317599
min        0.000000     0.000000      0.000000     0.000000
25%        0.222222     0.333333      0.101695     0.083333
50%        0.416667     0.416667      0.567797     0.500000
75%        0.583333     0.541667      0.694915     0.708333
max        1.000000     1.000000      1.000000     1.000000


In [12]:
# Label Encoding
# Turn each species name into an integer code and attach the codes as a new
# `species_encoded` column on a separate frame (`iris` itself is not modified).
label_encoder = LabelEncoder()
species_codes = label_encoder.fit_transform(iris['species'])
iris_encoded = iris.assign(species_encoded=species_codes)

# Show the original label next to its code for the first ten rows
print("\nAfter Label Encoding:")
print(iris_encoded[['species', 'species_encoded']].head(10))



After Label Encoding:
  species  species_encoded
0  setosa                0
1  setosa                0
2  setosa                0
3  setosa                0
4  setosa                0
5  setosa                0
6  setosa                0
7  setosa                0
8  setosa                0
9  setosa                0
