In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the Iris CSV into the working DataFrame `df`.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists — consider a configurable data dir.
iris_csv_path = r"D:\data sets\IRIS.csv"
df = pd.read_csv(iris_csv_path)

In [5]:
# Schema overview first: info() prints its report directly to stdout.
df.info()

# The row preview has to be the final expression of the cell. A notebook only
# renders the value of the last expression, so when head() came before info()
# its result was silently discarded and no rows were shown.
df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [6]:
# Per-column count of missing values (isna is the pandas alias of isnull).
print(df.isna().sum())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [7]:
# Impute missing values in float64/int64 columns with that column's median.
num_cols = df.select_dtypes(include=['float64', 'int64']).columns

# Medians are computed once up front; filling one column never changes the
# median of another, so this is equivalent to computing them inside the loop.
column_medians = df[num_cols].median()
for col in num_cols:
    df[col] = df[col].fillna(column_medians[col])


In [8]:
# Impute missing values in object-dtype (categorical/text) columns with the
# column's most frequent value.
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    modes = df[col].mode()
    # mode() ignores nulls and returns an empty Series when the column holds
    # no non-null values; indexing it with [0] would raise in that case.
    if modes.empty:
        continue  # nothing to impute from; leave the column untouched
    # When several values tie, mode() returns them sorted; take the first.
    df[col] = df[col].fillna(modes.iloc[0])


In [9]:
# Re-check the per-column missing-value counts after the imputation cells.
missing_after_fill = df.isnull().sum()
print(missing_after_fill)

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [10]:
# Count fully duplicated rows, then keep only the first occurrence of each.
# NOTE(review): every column takes part in the comparison, including `Id`.
# If `Id` is a unique row key, no two rows can ever match — confirm whether
# it should be excluded from the duplicate check.
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Reassign instead of mutating in place; the resulting frame is the same.
df = df.drop_duplicates()
print(f"Data shape after removing duplicates: {df.shape}")

Number of duplicate rows: 0
Data shape after removing duplicates: (150, 6)


In [11]:
# Report missing-value counts, drop every row that still contains a null,
# then report the resulting shape and the counts again.
nulls_before_drop = df.isnull().sum()
print("Null values before dropping:\n", nulls_before_drop)

# Reassign instead of mutating in place; the resulting frame is the same.
df = df.dropna()
print(f"Shape after dropping nulls: {df.shape}")

nulls_after_drop = df.isnull().sum()
print("Null values after dropping:\n", nulls_after_drop)

Null values before dropping:
 Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64
Shape after dropping nulls: (150, 6)
Null values after dropping:
 Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [12]:
# Outlier handling for Iris dataset using IQR fences.
# A row is kept only if, for every column in `numeric_cols`, its value lies
# within [Q1 - k*IQR, Q3 + k*IQR] (bounds inclusive).
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
IQR_MULTIPLIER = 1.5  # fence width, in units of IQR

# All fences are computed from the same pre-filter snapshot. Previously `df`
# was filtered inside the loop, so each later column's quartiles came from
# rows already trimmed by earlier columns and the result depended on the
# order of `numeric_cols`.
df_before_outliers = df
within_bounds = pd.Series(True, index=df_before_outliers.index)

for col in numeric_cols:
    Q1 = df_before_outliers[col].quantile(0.25)
    Q3 = df_before_outliers[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - IQR_MULTIPLIER * IQR
    upper_bound = Q3 + IQR_MULTIPLIER * IQR

    # Accumulate the row mask; between() is inclusive on both ends, matching
    # the original `>= lower_bound` and `<= upper_bound` comparison.
    within_bounds &= df_before_outliers[col].between(lower_bound, upper_bound)

    print(f"{col}:")
    print(f"  Lower bound = {lower_bound}, Upper bound = {upper_bound}")
    # Shape that remains once this column's fence is combined with the
    # fences of the columns processed so far.
    print(f"  Data shape after removing outliers: {df_before_outliers[within_bounds].shape}\n")

# Apply the combined mask once. `df` is still rebound here, so re-running this
# cell recomputes the fences from the already-filtered frame.
df = df_before_outliers[within_bounds].copy()


SepalLengthCm:
  Lower bound = 3.1499999999999986, Upper bound = 8.350000000000001
  Data shape after removing outliers: (150, 6)

SepalWidthCm:
  Lower bound = 2.05, Upper bound = 4.05
  Data shape after removing outliers: (146, 6)

PetalLengthCm:
  Lower bound = -3.649999999999999, Upper bound = 10.349999999999998
  Data shape after removing outliers: (146, 6)

PetalWidthCm:
  Lower bound = -1.95, Upper bound = 4.05
  Data shape after removing outliers: (146, 6)

