Handling Missing Values

In [1]:
import numpy as np
import pandas as pd

In [4]:
# Sample data: each key is a column name and each list holds that column's values.
# np.nan marks a missing entry (one per column, each in a different row).
# NOTE: the mapping is deliberately not named `dict`, which would shadow the
# built-in `dict` type for every cell that runs afterwards.
marks_data = {
    'Marks': [100, 90, np.nan, 95],
    'Rank': [1, 2, 4, np.nan],
    'Avg Marks': [np.nan, 40, 80, 98],
}

# Build the DataFrame used by all the following cells.
df = pd.DataFrame(marks_data)

df

Unnamed: 0,Marks,Rank,Avg Marks
0,100.0,1.0,
1,90.0,2.0,40.0
2,,4.0,80.0
3,95.0,,98.0


In [None]:
# Build a boolean mask with the same shape as df: True where a value is missing (NaN).

df.isna()

Unnamed: 0,Marks,Rank,Avg Marks
0,False,False,True
1,False,False,False
2,True,False,False
3,False,True,False


In [9]:
# Replace every missing value with the constant 0 (returns a new DataFrame; df itself is unchanged).

df.fillna(value=0)

Unnamed: 0,Marks,Rank,Avg Marks
0,100.0,1.0,0.0
1,90.0,2.0,40.0
2,0.0,4.0,80.0
3,95.0,0.0,98.0


In [None]:
# Backward fill: each NaN is replaced by the next valid value below it in the same column.
# A NaN in the last row has nothing below it and therefore stays NaN.
# DataFrame.bfill() is used instead of fillna(method='bfill'), whose `method`
# argument is deprecated and raises a FutureWarning in recent pandas versions.

df.bfill()

  df.fillna(method='bfill')


Unnamed: 0,Marks,Rank,Avg Marks
0,100.0,1.0,40.0
1,90.0,2.0,40.0
2,95.0,4.0,80.0
3,95.0,,98.0


In [None]:
# Forward fill: each NaN is replaced by the previous valid value above it in the same column.
# A NaN in the first row has nothing above it and therefore stays NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill'), whose `method`
# argument is deprecated and raises a FutureWarning in recent pandas versions.
# `pad` is an older name for the same operation; it is deprecated as well, so prefer ffill.

df.ffill()

  df.fillna(method='ffill')


Unnamed: 0,Marks,Rank,Avg Marks
0,100.0,1.0,
1,90.0,2.0,40.0
2,90.0,4.0,80.0
3,95.0,4.0,98.0


In [14]:
# Linear interpolation treats the rows as equally spaced and places each NaN that lies
# between two valid values on the straight line joining them (Marks: 90, NaN, 95 -> 92.5).
# limit_direction ('forward', 'backward' or 'both') sets the direction in which NaNs are filled:
#   'forward'  - only NaNs that come after a valid value are filled; a trailing NaN
#                repeats the last valid value (Rank -> 4.0), while a leading NaN has
#                no valid value before it and is left as NaN (Avg Marks, row 0)
#   'backward' - the mirror image: only NaNs that come before a valid value are filled,
#                so a leading NaN takes the next valid value and a trailing NaN stays NaN
#   'both'     - fills in both directions

df.interpolate(method='linear', limit_direction='forward')

Unnamed: 0,Marks,Rank,Avg Marks
0,100.0,1.0,
1,90.0,2.0,40.0
2,92.5,4.0,80.0
3,95.0,4.0,98.0


In [None]:
# Keep only the rows that have no missing value in any column.
# Use with care: a row is discarded as soon as it contains a single NaN,
# so this can throw away a large part of the data.

df.dropna(axis=0, how='any')

Unnamed: 0,Marks,Rank,Avg Marks
1,90.0,2.0,40.0
