# Different Techniques to Handle Missing Data

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Toy dataset: four numeric features plus one categorical column, each with
# a few np.nan entries so the imputation techniques have gaps to fill
data = dict(
    Feature1=[1, 2, 3, np.nan, 5, 6, 7, 8, np.nan, 10],
    Feature2=[10, 20, np.nan, 40, 50, 60, 70, np.nan, 90, 100],
    Feature3=[5, np.nan, 15, 25, 35, 45, np.nan, 65, 75, np.nan],
    Feature4=[5, 15, 25, 35, 45, np.nan, 65, 75, np.nan, 95],
    Category=['X', 'Y', 'X', 'X', 'Y', 'Z', 'Z', 'Y', np.nan, 'Z'],
)

# One DataFrame column per dictionary key, rows on the default 0..9 index
df = pd.DataFrame(data)

In [2]:
# Print a plain-text caption ahead of the table of the original, unimputed data
print("Original Dataset:")

# Leave the frame as the cell's final expression so the notebook renders it
df

Original Dataset:


Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category
0,1.0,10.0,5.0,5.0,X
1,2.0,20.0,,15.0,Y
2,3.0,,15.0,25.0,X
3,,40.0,25.0,35.0,X
4,5.0,50.0,35.0,45.0,Y
5,6.0,60.0,45.0,,Z
6,7.0,70.0,,65.0,Z
7,8.0,,65.0,75.0,Y
8,,90.0,75.0,,
9,10.0,100.0,,95.0,Z


# Imputation techniques

In [3]:
# Technique 1: Mean imputation. The column mean is computed over the
# non-missing entries of 'Feature1' and written into every gap; the result
# goes to a new column so the original stays available for comparison.
feature1_mean = df['Feature1'].mean()
df['Feature1_mean'] = df['Feature1'].fillna(feature1_mean)

# Render the updated frame
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean
0,1.0,10.0,5.0,5.0,X,1.0
1,2.0,20.0,,15.0,Y,2.0
2,3.0,,15.0,25.0,X,3.0
3,,40.0,25.0,35.0,X,5.25
4,5.0,50.0,35.0,45.0,Y,5.0
5,6.0,60.0,45.0,,Z,6.0
6,7.0,70.0,,65.0,Z,7.0
7,8.0,,65.0,75.0,Y,8.0
8,,90.0,75.0,,,5.25
9,10.0,100.0,,95.0,Z,10.0


In [4]:
# Technique 2: Forward filling missing values in 'Feature2'.
# Each gap takes the most recent valid value above it (a gap in the very first
# row would stay NaN, since there is nothing to carry forward).
# Series.ffill() is used instead of fillna(method='ffill'): the `method`
# argument of fillna is deprecated in recent pandas and emits a FutureWarning.
df['Feature2_forward'] = df['Feature2'].ffill()

# Show dataframe
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean,Feature2_forward
0,1.0,10.0,5.0,5.0,X,1.0,10.0
1,2.0,20.0,,15.0,Y,2.0,20.0
2,3.0,,15.0,25.0,X,3.0,20.0
3,,40.0,25.0,35.0,X,5.25,40.0
4,5.0,50.0,35.0,45.0,Y,5.0,50.0
5,6.0,60.0,45.0,,Z,6.0,60.0
6,7.0,70.0,,65.0,Z,7.0,70.0
7,8.0,,65.0,75.0,Y,8.0,70.0
8,,90.0,75.0,,,5.25,90.0
9,10.0,100.0,,95.0,Z,10.0,100.0


In [5]:
# Technique 3: Backward filling missing values in 'Feature2'.
# Each gap takes the next valid value below it (a gap in the very last row
# would stay NaN, since there is nothing to pull back).
# Series.bfill() is used instead of fillna(method='bfill'): the `method`
# argument of fillna is deprecated in recent pandas and emits a FutureWarning.
df['Feature2_backward'] = df['Feature2'].bfill()

# Show dataframe
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean,Feature2_forward,Feature2_backward
0,1.0,10.0,5.0,5.0,X,1.0,10.0,10.0
1,2.0,20.0,,15.0,Y,2.0,20.0,20.0
2,3.0,,15.0,25.0,X,3.0,20.0,40.0
3,,40.0,25.0,35.0,X,5.25,40.0,40.0
4,5.0,50.0,35.0,45.0,Y,5.0,50.0,50.0
5,6.0,60.0,45.0,,Z,6.0,60.0,60.0
6,7.0,70.0,,65.0,Z,7.0,70.0,70.0
7,8.0,,65.0,75.0,Y,8.0,70.0,90.0
8,,90.0,75.0,,,5.25,90.0,90.0
9,10.0,100.0,,95.0,Z,10.0,100.0,100.0


In [6]:
# Technique 4: Linear interpolation. Each gap in 'Feature2' is filled from
# the valid values on either side of it, treating the rows as equally spaced.
feature2 = df['Feature2']
df['Feature2_interpolated'] = feature2.interpolate(method='linear')

# Render the updated frame
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean,Feature2_forward,Feature2_backward,Feature2_interpolated
0,1.0,10.0,5.0,5.0,X,1.0,10.0,10.0,10.0
1,2.0,20.0,,15.0,Y,2.0,20.0,20.0,20.0
2,3.0,,15.0,25.0,X,3.0,20.0,40.0,30.0
3,,40.0,25.0,35.0,X,5.25,40.0,40.0,40.0
4,5.0,50.0,35.0,45.0,Y,5.0,50.0,50.0,50.0
5,6.0,60.0,45.0,,Z,6.0,60.0,60.0,60.0
6,7.0,70.0,,65.0,Z,7.0,70.0,70.0,70.0
7,8.0,,65.0,75.0,Y,8.0,70.0,90.0,80.0
8,,90.0,75.0,,,5.25,90.0,90.0,90.0
9,10.0,100.0,,95.0,Z,10.0,100.0,100.0,100.0


In [7]:
# Technique 5: Median imputation. The median of the non-missing 'Feature3'
# values replaces every gap in that column.
feature3_median = df['Feature3'].median()
df['Feature3_median'] = df['Feature3'].fillna(feature3_median)

# Render the updated frame
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean,Feature2_forward,Feature2_backward,Feature2_interpolated,Feature3_median
0,1.0,10.0,5.0,5.0,X,1.0,10.0,10.0,10.0,5.0
1,2.0,20.0,,15.0,Y,2.0,20.0,20.0,20.0,35.0
2,3.0,,15.0,25.0,X,3.0,20.0,40.0,30.0,15.0
3,,40.0,25.0,35.0,X,5.25,40.0,40.0,40.0,25.0
4,5.0,50.0,35.0,45.0,Y,5.0,50.0,50.0,50.0,35.0
5,6.0,60.0,45.0,,Z,6.0,60.0,60.0,60.0,45.0
6,7.0,70.0,,65.0,Z,7.0,70.0,70.0,70.0,35.0
7,8.0,,65.0,75.0,Y,8.0,70.0,90.0,80.0,65.0
8,,90.0,75.0,,,5.25,90.0,90.0,90.0,75.0
9,10.0,100.0,,95.0,Z,10.0,100.0,100.0,100.0,35.0


In [8]:
# Technique 6: Constant imputation. Every gap in 'Feature4' receives the same
# fixed placeholder (50 here, chosen purely as an example value).
FEATURE4_FILL_VALUE = 50
df['Feature4_constant'] = df['Feature4'].fillna(FEATURE4_FILL_VALUE)

# Render the updated frame
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean,Feature2_forward,Feature2_backward,Feature2_interpolated,Feature3_median,Feature4_constant
0,1.0,10.0,5.0,5.0,X,1.0,10.0,10.0,10.0,5.0,5.0
1,2.0,20.0,,15.0,Y,2.0,20.0,20.0,20.0,35.0,15.0
2,3.0,,15.0,25.0,X,3.0,20.0,40.0,30.0,15.0,25.0
3,,40.0,25.0,35.0,X,5.25,40.0,40.0,40.0,25.0,35.0
4,5.0,50.0,35.0,45.0,Y,5.0,50.0,50.0,50.0,35.0,45.0
5,6.0,60.0,45.0,,Z,6.0,60.0,60.0,60.0,45.0,50.0
6,7.0,70.0,,65.0,Z,7.0,70.0,70.0,70.0,35.0,65.0
7,8.0,,65.0,75.0,Y,8.0,70.0,90.0,80.0,65.0,75.0
8,,90.0,75.0,,,5.25,90.0,90.0,90.0,75.0,50.0
9,10.0,100.0,,95.0,Z,10.0,100.0,100.0,100.0,35.0,95.0


In [9]:
# Technique 7: Mode imputation for the categorical column. mode() returns
# every most-frequent value in sorted order; the first one is used as the fill.
# In this dataset 'X', 'Y' and 'Z' each occur three times, so the tie is
# resolved to 'X'.
most_frequent_category = df['Category'].mode().iloc[0]
df['Category_mode'] = df['Category'].fillna(most_frequent_category)

# Render the updated frame
df

Unnamed: 0,Feature1,Feature2,Feature3,Feature4,Category,Feature1_mean,Feature2_forward,Feature2_backward,Feature2_interpolated,Feature3_median,Feature4_constant,Category_mode
0,1.0,10.0,5.0,5.0,X,1.0,10.0,10.0,10.0,5.0,5.0,X
1,2.0,20.0,,15.0,Y,2.0,20.0,20.0,20.0,35.0,15.0,Y
2,3.0,,15.0,25.0,X,3.0,20.0,40.0,30.0,15.0,25.0,X
3,,40.0,25.0,35.0,X,5.25,40.0,40.0,40.0,25.0,35.0,X
4,5.0,50.0,35.0,45.0,Y,5.0,50.0,50.0,50.0,35.0,45.0,Y
5,6.0,60.0,45.0,,Z,6.0,60.0,60.0,60.0,45.0,50.0,Z
6,7.0,70.0,,65.0,Z,7.0,70.0,70.0,70.0,35.0,65.0,Z
7,8.0,,65.0,75.0,Y,8.0,70.0,90.0,80.0,65.0,75.0,Y
8,,90.0,75.0,,,5.25,90.0,90.0,90.0,75.0,50.0,X
9,10.0,100.0,,95.0,Z,10.0,100.0,100.0,100.0,35.0,95.0,Z
