In [11]:
import pandas as pd
import numpy as np

# Toy dataset for the imputation examples: five people, with at least one
# missing entry (NaN) in each of the numeric columns Age, Score and Salary.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Age=[25, np.nan, 35, 28, np.nan],
    Score=[85, 88, np.nan, 85, 82],
    Salary=[50000, np.nan, 62000, np.nan, 45000],
)

# One column per dict key, in insertion order.
df = pd.DataFrame(data=data)

print(df)

      Name   Age  Score   Salary
0    Alice  25.0   85.0  50000.0
1      Bob   NaN   88.0      NaN
2  Charlie  35.0    NaN  62000.0
3    David  28.0   85.0      NaN
4      Eve   NaN   82.0  45000.0


In [2]:
# Mean imputation: fill each missing Age with the average of the observed ages.
age_mean = df['Age'].mean()  # NaN entries are skipped when averaging
df['Age_mean_imputed'] = df['Age'].fillna(age_mean)
df

Unnamed: 0,Name,Age,Score,Salary,Age_mean_imputed
0,Alice,25.0,85.0,50000.0,25.0
1,Bob,,88.0,,29.333333
2,Charlie,35.0,,62000.0,35.0
3,David,28.0,92.0,,28.0
4,Eve,,77.0,45000.0,29.333333


df['Age_mean_imputed'] creates a new column with the name 'Age_mean_imputed'
df['Age'] selects the 'Age' column in the dataframe
.fillna() function is used to fill any NaN (missing) values with whatever argument is given in the parentheses
df['Age'].mean() calculates the mean of the values in the 'Age' column, excluding any missing values

In [6]:
# Median imputation: fill each missing Salary with the median of the observed salaries.
salary_median = df['Salary'].median()  # NaN entries are skipped when taking the median
df['Salary_median_imputed'] = df['Salary'].fillna(salary_median)
df

Unnamed: 0,Name,Age,Score,Salary,Salary_median_imputed
0,Alice,25.0,85.0,50000.0,50000.0
1,Bob,,88.0,,50000.0
2,Charlie,35.0,,62000.0,62000.0
3,David,28.0,92.0,,50000.0
4,Eve,,77.0,45000.0,45000.0


In [8]:
# Mode imputation: fill each missing Score with the most frequent observed score.
# Series.mode() ignores NaN and returns a Series because several values can tie
# for most frequent; the modes come back sorted, so .iloc[0] picks the smallest
# one deterministically.
score_mode = df['Score'].mode().iloc[0]
df['Score_mode_imputed'] = df['Score'].fillna(score_mode)
df

Unnamed: 0,Name,Age,Score,Salary,Score_mode_imputed
0,Alice,25.0,85.0,50000.0,85.0
1,Bob,,88.0,,88.0
2,Charlie,35.0,,62000.0,85.0
3,David,28.0,85.0,,85.0
4,Eve,,82.0,45000.0,82.0


In [10]:
# Row removal: keep only the rows that have no missing value in any column.
# The result is a new DataFrame; df itself is not modified.
df_dropna = df.dropna(axis=0, how='any')
df_dropna

Unnamed: 0,Name,Age,Score,Salary,Score_mode_imputed
0,Alice,25.0,85.0,50000.0,85.0


The df.dropna() function is used to remove rows (the default) or columns that contain missing values
df_dropna - new dataframe that only contains the rows with no missing values

In [12]:
# Interpolation: estimate each missing Salary from the known values around it
# by drawing a straight line between its nearest known neighbours.
salary_interpolated = df['Salary'].interpolate(method='linear')
df['Salary_interpolated'] = salary_interpolated
df

Unnamed: 0,Name,Age,Score,Salary,Salary_interpolated
0,Alice,25.0,85.0,50000.0,50000.0
1,Bob,,88.0,,56000.0
2,Charlie,35.0,,62000.0,62000.0
3,David,28.0,85.0,,53500.0
4,Eve,,82.0,45000.0,45000.0


Interpolation is a method of estimating missing values within a sequence of data points by using the known values surrounding them. It assumes that the missing data follows the general trend of the existing data and fills in the gaps accordingly.

The interpolate() function fills in missing values by estimating them based on surrounding data points.
The method='linear' argument specifies that linear interpolation should be used. Linear interpolation fills in missing values by calculating a straight-line trend between the known data points before and after the missing value(s).