**Handling Blank/NA Values**

In [1]:
import pandas as pd

df = pd.read_csv("weather_data.csv", parse_dates=["day"])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.1,8.1,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [2]:
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

In [3]:
#convert the string day values to integer
df.set_index('day', inplace=True)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [4]:
#fill the blank/na values with a specific value like 0
df.fillna(0)
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [7]:
#However, a better approach for this issue is to use a mean or median to replace the NA values
#Since the value varies based on column, we create a dictionary to help filter where we want what
df.fillna({
    'temperature': df.temperature.mean(),
    'windspeed': df.windspeed.mean(),
    'event': 'Normal'
})

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.22,9.0,Sunny
2017-01-05,28.0,8.42,Snow
2017-01-06,33.22,7.0,Normal
2017-01-07,32.0,8.42,Rain
2017-01-08,33.22,8.42,Sunny
2017-01-09,33.22,8.42,Normal
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [8]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [11]:
#you can also ffill which will take the previous index's value and place it here
df.ffill()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [12]:
#backward fill also present
df.bfill()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.1,Rain
2017-01-08,34.1,8.1,Sunny
2017-01-09,34.1,8.1,Cloudy
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [16]:
#You can also have the fill happen between columns
df.fillna(method='bfill', axis='columns')

  df.fillna(method='bfill', axis='columns')


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,9.0,9.0,Sunny
2017-01-05,28.0,Snow,Snow
2017-01-06,7.0,7.0,
2017-01-07,32.0,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [18]:
#to have limitations when filling forward and you don't want it to fill for entire column
df.ffill(limit=1)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [19]:
#To have it look at the values near it and decide what is the best, you can do interpolation
df.interpolate()

  df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.275,Rain
2017-01-08,32.7,7.55,Sunny
2017-01-09,33.4,7.825,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [20]:
#to delete any rows that has na in it
df.dropna()

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [23]:
#To specify how many NAs there have to be to drop it, you can use the how method
#For example, all would make it so that all of the options need to be NA for it to be dropped
df.dropna(how='all')

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [24]:
#To specify how many NA values there needs to be dropped, you can use the thresh method
df.dropna(thresh=2)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [26]:
df.fillna(-99999)

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,-99999.0,9.0,Sunny
2017-01-05,28.0,-99999.0,Snow
2017-01-06,-99999.0,7.0,-99999
2017-01-07,32.0,-99999.0,Rain
2017-01-08,-99999.0,-99999.0,Sunny
2017-01-09,-99999.0,-99999.0,-99999
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny
