## <font color="maroon"><h4 align="center">Handling Missing Data - fillna, interpolate, dropna</font>

In [29]:
import pandas as pd

df = pd.read_csv("weather_data.csv", parse_dates=["day"])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.1,8.1,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [22]:
df.columns

Index(['day', 'temperature', 'windspeed', 'event'], dtype='object')

In [17]:
print(df.index)

RangeIndex(start=0, stop=9, step=1)


In [23]:
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

In [30]:
df.set_index('day',inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [34]:
# df.fillna(0, inplace=True)
# by using fillna method with inplace we can return a new data frame but without inplace we can't return new dataframe
# and without using inplace method the fillna also not work

df.fillna(0)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [32]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


### we can use fillna method in dictionary for particular column 

In [35]:
df.fillna({
    'temperature': df.temperature.mean(),
    'windspeed': df.windspeed.mean(),
    'event': "Not Event"
})

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.22,9.0,Sunny
2017-01-05,28.0,8.42,Snow
2017-01-06,33.22,7.0,Not Event
2017-01-07,32.0,8.42,Rain
2017-01-08,33.22,8.42,Sunny
2017-01-09,33.22,8.42,Not Event
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# By using ffill == forward fill we can fill empty values by upward values and same with backward 


In [36]:
df.fillna(method='ffill')

  df.fillna(method='ffill')


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [38]:
df.fillna(method = 'bfill', axis = "columns") 
# by doing axis = columns you can follow the pattern in a horizontal way 

  df.fillna(method = 'bfill', axis = "columns")
  df.fillna(method = 'bfill', axis = "columns")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,9.0,9.0,Sunny
2017-01-05,28.0,Snow,Snow
2017-01-06,7.0,7.0,
2017-01-07,32.0,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [40]:
new_df = df.fillna(method = "ffill", limit=1)
new_df

# by using limit method we can limit the pattern works only for 2 consecutive not for 3

  new_df = df.fillna(method = "ffill", limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


## interpolate

In [41]:
# if we want to take avg of up and down values then predict middle value
new_df = df.interpolate()
new_df

  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.275,Rain
2017-01-08,32.7,7.55,Sunny
2017-01-09,33.4,7.825,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


## dropna

In [42]:
new_df = df.dropna()
new_df

# by dropna method we can elimate rows who have even 1 Nan values

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [44]:
df_new = df.dropna(how = "all")
df_new

# if the row have all Nan values then it will drop that row

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [45]:
new_df = df.dropna(thresh=2)
new_df

# it will drop all those row who have atleast 2 Nan values

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny
