# ✅ Day 5 Goals

1️⃣ Missing Values — detect and handle NaN values

2️⃣ Duplicates — detect and remove duplicate rows

3️⃣ Fill / Drop Methods — dropna(), fillna(), isnull()

4️⃣ Practical Data Cleaning — apply the methods to a real or mock DataFrame

# 🔹 1. Detect Missing Values

       NaN ("Not a Number") is the marker pandas uses for missing data.

       Use isnull() to detect missing values: it returns True wherever a value is missing.

In [1]:
import pandas as pd
import numpy as np

# Build a small example table; np.nan marks the cells that are missing.
names = ['Ali', 'Sara', 'Ahmed', 'Sana']
ages = [25, np.nan, 28, 24]
cities = ['Lahore', 'Karachi', np.nan, 'Islamabad']
data = {'Name': names, 'Age': ages, 'City': cities}

df = pd.DataFrame(data)
print(df)

# isnull() gives a same-shaped boolean table: True marks a missing cell.
missing_mask = df.isnull()
print(missing_mask)

# Summing the mask column by column counts the missing cells in each column.
print(missing_mask.sum())


    Name   Age       City
0    Ali  25.0     Lahore
1   Sara   NaN    Karachi
2  Ahmed  28.0        NaN
3   Sana  24.0  Islamabad
    Name    Age   City
0  False  False  False
1  False   True  False
2  False  False   True
3  False  False  False
Name    0
Age     1
City    1
dtype: int64


# 🔹 2. Remove Missing Data


In [2]:
# Keep only the rows in which every column has a value.
# axis=0 works on rows and how='any' drops a row as soon as one cell is missing;
# both are the dropna() defaults, written out here for clarity.
# df itself is not modified; the result is a new DataFrame.
df_dropped = df.dropna(axis=0, how='any')
print(df_dropped)


   Name   Age       City
0   Ali  25.0     Lahore
3  Sana  24.0  Islamabad


# 🔹 3. Fill Missing Values

  fillna() replaces missing values with a value you choose, such as a fixed placeholder or the column mean

  This is a common step in data preparation

In [3]:
# Replace every missing cell with the placeholder text 'Unknown'.
# fillna() returns a new DataFrame, so df keeps its NaN values here.
df_filled = df.fillna(value='Unknown')
print(df_filled)

# Fill the gaps in the numeric Age column with the column mean
# (mean() ignores NaN), then store the filled column back into df.
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
print(df)


    Name      Age       City
0    Ali     25.0     Lahore
1   Sara  Unknown    Karachi
2  Ahmed     28.0    Unknown
3   Sana     24.0  Islamabad
    Name        Age       City
0    Ali  25.000000     Lahore
1   Sara  25.666667    Karachi
2  Ahmed  28.000000        NaN
3   Sana  24.000000  Islamabad


# 🔹 4. Handling Duplicates



In [4]:
# Append a row under index label 4 that repeats the values of row 0.
duplicate_row = ['Ali', 25, 'Lahore']
df.loc[4] = duplicate_row

# duplicated() marks a row True when an earlier row holds the same values
# (keep='first' is the default: the first occurrence is not flagged).
duplicate_mask = df.duplicated(keep='first')
print(duplicate_mask)

# drop_duplicates() returns a new DataFrame without the flagged rows.
df_unique = df.drop_duplicates(keep='first')
print(df_unique)


0    False
1    False
2    False
3    False
4     True
dtype: bool
    Name        Age       City
0    Ali  25.000000     Lahore
1   Sara  25.666667    Karachi
2  Ahmed  28.000000        NaN
3   Sana  24.000000  Islamabad


# 🎯 Mini Task

In [5]:
# Mock product table: Price has one gap, Stock has two, and the first and
# last rows describe the same product.
df_task = pd.DataFrame(
    {
        'Product': ['A', 'B', 'C', 'A'],
        'Price': [10, np.nan, 15, 10],
        'Stock': [np.nan, 20, 30, np.nan],
    }
)

# Fill the gaps: the column mean for Price, 0 for Stock.
mean_price = df_task['Price'].mean()
df_task = df_task.assign(
    Price=df_task['Price'].fillna(mean_price),
    Stock=df_task['Stock'].fillna(0),
)

# Once filled, the first and last rows are identical, so only the first is kept.
df_task = df_task.drop_duplicates()

print(df_task)


  Product      Price  Stock
0       A  10.000000    0.0
1       B  11.666667   20.0
2       C  15.000000   30.0
