# Handling Missing Data - dropna(), fillna() & interpolate()

In [1]:
import pandas as pd

In [2]:
# Load the world-population CSV.
# The path is written as a raw string (r'...') so that the backslashes of the
# Windows path are taken literally. In a normal string literal, sequences such
# as "\Y", "\P", "\e" and "\W" are invalid escapes that newer Python versions
# warn about.
# NOTE(review): this is an absolute, machine-specific path - adjust it to where
# the CSV lives on your system.
df = pd.read_csv(r'G:\Youtube\Pandas\example_files\WorldPopulation2023.csv')

# Display floats without decimals; this suppresses scientific notation for
# large numbers such as the Population2023 column.
pd.options.display.float_format = '{:.0f}'.format

# Work on the first 20 rows only to keep the examples small.
population_df = df.head(20)
population_df

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663.0,0.81%,Increasing
1,2,China,1425671352.0,-0.02%,Decreasing
2,3,United States,339996563.0,0.50%,Increasing
3,4,Indonesia,277534122.0,,
4,5,Pakistan,240485658.0,1.98%,Increasing
5,6,Nigeria,,,Increasing
6,7,Brazil,216422446.0,0.52%,Increasing
7,8,Bangladesh,172954319.0,1.03%,Increasing
8,9,Russia,144444359.0,-0.19%,Decreasing
9,10,Mexico,128455567.0,0.75%,Increasing


### dropna() Example:

Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html

In [3]:
# Drop every row (axis=0) that contains at least one NaN value (how="any").
# These are the defaults of dropna(), spelled out here for clarity.
population_df.dropna(axis=0, how="any")

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663,0.81%,Increasing
1,2,China,1425671352,-0.02%,Decreasing
2,3,United States,339996563,0.50%,Increasing
4,5,Pakistan,240485658,1.98%,Increasing
6,7,Brazil,216422446,0.52%,Increasing
7,8,Bangladesh,172954319,1.03%,Increasing
8,9,Russia,144444359,-0.19%,Decreasing
9,10,Mexico,128455567,0.75%,Increasing
10,11,Ethiopia,126527060,2.55%,Increasing
11,12,Japan,123294513,-0.53%,Decreasing


In [4]:
# thresh=3 keeps only the rows that have at least 3 non-NaN values; rows with
# fewer than 3 non-NaN values are dropped. (It does NOT mean "drop rows that
# have 3 NaN values".) The rows shown in the output below all still have at
# least 3 non-NaN values, so they are kept.
population_df = population_df.dropna(thresh=3)
population_df

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663.0,0.81%,Increasing
1,2,China,1425671352.0,-0.02%,Decreasing
2,3,United States,339996563.0,0.50%,Increasing
3,4,Indonesia,277534122.0,,
4,5,Pakistan,240485658.0,1.98%,Increasing
5,6,Nigeria,,,Increasing
6,7,Brazil,216422446.0,0.52%,Increasing
7,8,Bangladesh,172954319.0,1.03%,Increasing
8,9,Russia,144444359.0,-0.19%,Decreasing
9,10,Mexico,128455567.0,0.75%,Increasing


### fillna() Example:

Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.fillna.html

In [5]:
# Replace every NaN in the frame, whatever the column, with the scalar 0.
population_df.fillna(value=0)

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663,0.81%,Increasing
1,2,China,1425671352,-0.02%,Decreasing
2,3,United States,339996563,0.50%,Increasing
3,4,Indonesia,277534122,0,0
4,5,Pakistan,240485658,1.98%,Increasing
5,6,Nigeria,0,0,Increasing
6,7,Brazil,216422446,0.52%,Increasing
7,8,Bangladesh,172954319,1.03%,Increasing
8,9,Russia,144444359,-0.19%,Decreasing
9,10,Mexico,128455567,0.75%,Increasing


In [6]:
# fillna() also accepts a dict: each key is a column name and each value is the
# replacement used for the NaN entries of that column. Columns that are not
# listed in the dict are left untouched.
population_df = population_df.fillna(
    value={"Trend": "Not available", "YearlyChange": "0%"}
)

population_df

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663.0,0.81%,Increasing
1,2,China,1425671352.0,-0.02%,Decreasing
2,3,United States,339996563.0,0.50%,Increasing
3,4,Indonesia,277534122.0,0%,Not available
4,5,Pakistan,240485658.0,1.98%,Increasing
5,6,Nigeria,,0%,Increasing
6,7,Brazil,216422446.0,0.52%,Increasing
7,8,Bangladesh,172954319.0,1.03%,Increasing
8,9,Russia,144444359.0,-0.19%,Decreasing
9,10,Mexico,128455567.0,0.75%,Increasing


In [7]:
# Forward fill: each NaN takes the value from the previous row of the same
# column (filling along the index is the default direction).
population_df.ffill(axis="index")

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663,0.81%,Increasing
1,2,China,1425671352,-0.02%,Decreasing
2,3,United States,339996563,0.50%,Increasing
3,4,Indonesia,277534122,0%,Not available
4,5,Pakistan,240485658,1.98%,Increasing
5,6,Nigeria,240485658,0%,Increasing
6,7,Brazil,216422446,0.52%,Increasing
7,8,Bangladesh,172954319,1.03%,Increasing
8,9,Russia,144444359,-0.19%,Decreasing
9,10,Mexico,128455567,0.75%,Increasing


In [8]:
# Backward fill: each NaN takes the value from the next row of the same
# column (filling along the index is the default direction).
population_df.bfill(axis="index")

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663,0.81%,Increasing
1,2,China,1425671352,-0.02%,Decreasing
2,3,United States,339996563,0.50%,Increasing
3,4,Indonesia,277534122,0%,Not available
4,5,Pakistan,240485658,1.98%,Increasing
5,6,Nigeria,216422446,0%,Increasing
6,7,Brazil,216422446,0.52%,Increasing
7,8,Bangladesh,172954319,1.03%,Increasing
8,9,Russia,144444359,-0.19%,Decreasing
9,10,Mexico,128455567,0.75%,Increasing


### interpolate() Example:

Reference: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html

In [9]:
# interpolate() uses linear interpolation by default: a NaN is replaced by a
# value lying on the straight line between its neighbouring non-NaN values.
# Interpolation only makes sense for numeric data, so it is applied to the
# numeric Population2023 column explicitly. Calling interpolate() on the whole
# frame would also pass the text columns (Country, YearlyChange, Trend), which
# recent pandas releases deprecate for object-dtype data.
# assign() returns a new frame, so population_df itself is not modified.
population_df.assign(
    Population2023=population_df["Population2023"].interpolate()
)

Unnamed: 0,Rank,Country,Population2023,YearlyChange,Trend
0,1,India,1428627663,0.81%,Increasing
1,2,China,1425671352,-0.02%,Decreasing
2,3,United States,339996563,0.50%,Increasing
3,4,Indonesia,277534122,0%,Not available
4,5,Pakistan,240485658,1.98%,Increasing
5,6,Nigeria,228454052,0%,Increasing
6,7,Brazil,216422446,0.52%,Increasing
7,8,Bangladesh,172954319,1.03%,Increasing
8,9,Russia,144444359,-0.19%,Decreasing
9,10,Mexico,128455567,0.75%,Increasing
