# Handling Missing Values

🟦 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np

🟦 2. Create Sample Messy Dataset

In [2]:
# Small, deliberately messy dataset: every column contains at least one gap,
# and "Salary" mixes None with np.nan (pandas reports both as missing).
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", np.nan],
    Age=[25, np.nan, 30, 28, 40],
    City=["Toronto", "New York", np.nan, "Vancouver", "Chicago"],
    Salary=[70000, 80000, None, np.nan, 90000],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,,New York,80000.0
2,Charlie,30.0,,
3,David,28.0,Vancouver,
4,,40.0,Chicago,90000.0


🟦 3. Detect Missing Values

3.1 Check if values are missing

In [4]:
# Element-wise boolean mask: True marks a missing cell, False a present one.
missing_mask = df.isna()
missing_mask

Unnamed: 0,Name,Age,City,Salary
0,False,False,False,False
1,False,True,False,False
2,False,False,True,True
3,False,False,False,True
4,True,False,False,False


3.2 Count missing values per column

In [5]:
# Summing the boolean mask column-wise counts the True (missing) entries.
missing_per_column = df.isna().sum()
missing_per_column

Name      1
Age       1
City      1
Salary    2
dtype: int64

3.3 Check rows containing missing values

In [6]:
# Flag each row that has at least one missing cell, then keep only those rows.
row_has_gap = df.isna().any(axis="columns")
df[row_has_gap]

Unnamed: 0,Name,Age,City,Salary
1,Bob,,New York,80000.0
2,Charlie,30.0,,
3,David,28.0,Vancouver,
4,,40.0,Chicago,90000.0


🟦 4. Removing Missing Values

4.1 Remove rows with any missing values

In [7]:
# Keep only fully populated rows; `df` itself is left unchanged because
# dropna() returns a new DataFrame.
df_drop_rows = df.dropna(axis="index")
df_drop_rows

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0


4.2 Remove columns with missing values

In [8]:
# Drop every column that contains a missing value. In this dataset each
# column has at least one gap, so only the row index survives.
df.dropna(axis="columns")


0
1
2
3
4


4.3 Drop rows where Name is missing

In [9]:
# Only gaps in the listed columns cause a row to be dropped; missing values
# in other columns are tolerated.
required_columns = ["Name"]
df.dropna(subset=required_columns)


Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,,New York,80000.0
2,Charlie,30.0,,
3,David,28.0,Vancouver,


🟦 5. Filling Missing Values (fillna())

5.1 Fill numeric columns with mean

In [11]:
# fillna() returns a new Series and leaves the original column untouched,
# so the result must be assigned back for `df` to actually change.
# mean() skips missing values, so each gap receives the average of the
# observed entries in that column.
df["Age"] = df["Age"].fillna(df["Age"].mean())
df["Salary"] = df["Salary"].fillna(df["Salary"].mean())
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,30.75,New York,80000.0
2,Charlie,30.0,,80000.0
3,David,28.0,Vancouver,80000.0
4,,40.0,Chicago,90000.0


5.2 Fill categorical columns with a placeholder

In [13]:
# fillna() returns a new Series rather than modifying the column in place,
# so assign the result back; otherwise the placeholders are thrown away.
df["City"] = df["City"].fillna("Unknown")
df["Name"] = df["Name"].fillna("No Name")
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,30.75,New York,80000.0
2,Charlie,30.0,Unknown,80000.0
3,David,28.0,Vancouver,80000.0
4,No Name,40.0,Chicago,90000.0


🟦 6. Forward Fill & Backward Fill

In [22]:
# Rebuild the original messy dataset so the fill examples below start from
# data that still contains its missing values.
data = dict(
    Name=["Alice", "Bob", "Charlie", "David", np.nan],
    Age=[25, np.nan, 30, 28, 40],
    City=["Toronto", "New York", np.nan, "Vancouver", "Chicago"],
    Salary=[70000, 80000, None, np.nan, 90000],
)

df = pd.DataFrame(data)

df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,,New York,80000.0
2,Charlie,30.0,,
3,David,28.0,Vancouver,
4,,40.0,Chicago,90000.0


In [23]:
# Forward fill: each gap is replaced by the last valid value above it in the
# same column. The result is a new DataFrame; `df` keeps its gaps.
df_ffill = df.ffill(axis="index")
df_ffill

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,25.0,New York,80000.0
2,Charlie,30.0,New York,80000.0
3,David,28.0,Vancouver,80000.0
4,David,40.0,Chicago,90000.0


In [None]:


# Start again from the raw dataset, then backward fill: each gap is replaced
# by the next valid value below it in the same column. The missing Name in
# the last row stays missing because there is no later row to copy from.
df = pd.DataFrame(data)
df_bfill = df.bfill(axis="index")

df_bfill

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,30.0,New York,80000.0
2,Charlie,30.0,Vancouver,90000.0
3,David,28.0,Vancouver,90000.0
4,,40.0,Chicago,90000.0


🟦 7. Filling Using Group-Based Logic

In [26]:
# Start from the raw dataset and attach a grouping column.
df = pd.DataFrame(data)
df["Department"] = ["HR", "Tech", "Tech", "HR", "Tech"]

# transform("mean") broadcasts each department's average age back onto its
# rows, so fillna() can patch every gap with the mean of its own group.
department_mean_age = df.groupby("Department")["Age"].transform("mean")
df["Age"] = df["Age"].fillna(department_mean_age)
df



Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25.0,Toronto,70000.0,HR
1,Bob,35.0,New York,80000.0,Tech
2,Charlie,30.0,,,Tech
3,David,28.0,Vancouver,,HR
4,,40.0,Chicago,90000.0,Tech


🟦 8. Using interpolate() for Numeric Data

Great for time-series or ordered numeric data.

In [27]:
# Linear interpolation (the default method): consecutive gaps are filled
# with evenly spaced values between the nearest known salaries.
salary_interpolated = df["Salary"].interpolate(method="linear")
df["Salary"] = salary_interpolated
df

Unnamed: 0,Name,Age,City,Salary,Department
0,Alice,25.0,Toronto,70000.0,HR
1,Bob,35.0,New York,80000.0,Tech
2,Charlie,30.0,,83333.333333,Tech
3,David,28.0,Vancouver,86666.666667,HR
4,,40.0,Chicago,90000.0,Tech


🟦 9. Replace Missing Values Conditionally

In [28]:
# Start from the raw dataset again.
df = pd.DataFrame(data)

# Only rows that lack a salary AND have a known age above 25 get the default;
# a missing Age compares as False, so such rows are never selected.
salary_is_missing = df["Salary"].isna()
older_than_25 = df["Age"] > 25
df.loc[salary_is_missing & older_than_25, "Salary"] = 85000
df

Unnamed: 0,Name,Age,City,Salary
0,Alice,25.0,Toronto,70000.0
1,Bob,,New York,80000.0
2,Charlie,30.0,,85000.0
3,David,28.0,Vancouver,85000.0
4,,40.0,Chicago,90000.0


# 🟦 Summary


In this subsection, you learned how to:

- Detect missing values using `isna()` and `isna().sum()`
- Remove missing rows and columns using `dropna()`
- Fill missing values using `fillna()` with mean or constants
- Forward fill and backward fill missing values
- Use `groupby()` with `transform()` to fill missing values with group-level means
- Interpolate missing numeric values
- Apply conditional replacement using `.loc`
