In [None]:
import pandas as pd

In [None]:
# Load the weather observations from the CSV file into a DataFrame.
df = pd.read_csv("weather_data.csv")

In [None]:
# Display the frame as loaded so the raw values, including the missing ones, can be inspected.
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.0,8.0,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [None]:
# Turn 'day' into a real date column using read_csv's parse_dates argument.
# By default the 'day' values are read in as plain strings (str); listing the column
# in parse_dates makes pandas convert it to Timestamp values while loading, so no
# manual conversion step is needed.
df = pd.read_csv("weather_data.csv", parse_dates=['day'])
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,9.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,,,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [None]:
# Check the type of the first 'day' entry to confirm it was parsed into a pandas Timestamp.
type(df.day[0])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
# Make 'day' the index so that every row is labelled by its date.
# set_index returns a new frame; rebinding df (rather than using inplace=True) keeps
# the step explicit and chainable.
# NOTE: this cell expects 'day' to still be a column, so running it a second time
# without reloading the CSV raises a KeyError.
df = df.set_index('day')
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# The frame contains NaN (missing) values that have to be cleaned carefully so the
# data does not get distorted.
# The first tool for this is fillna: it replaces every NaN with the value passed in.
# The result is stored in a new frame, so the original df stays untouched.
new_df = df.fillna(value=0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Zero is often not the best guess -- for the 'event' column, what would 0 even mean?
# Instead of one replacement for every NaN, fillna also accepts a DICTIONARY that maps
# a column name to the value that should replace the NaNs in that column, so each
# column can get a replacement that makes sense for it.
new_df = df.fillna(value={'temperature': 0, 'windspeed': 0, 'event': 'no event'})
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Filling temperature with 0 is still misleading: it reads as if the temperature fell
# from 32 on 1 Jan to 0 on the next recorded day (4 Jan).
# A better estimate is to carry the previous row's value forward ("forward fill").
# DataFrame.ffill() is the direct method for this; the older spelling
# fillna(method='ffill') is deprecated in recent pandas releases.
# The 'event' column is forward-filled as well, so every gap takes the last known value.
new_df = df.ffill()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Backward fill: instead of copying the previous row's value, copy the next row's value.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
new_df = df.bfill()
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.0,Rain
2017-01-08,34.0,8.0,Sunny
2017-01-09,34.0,8.0,Cloudy
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# See the pandas documentation for the other arguments that fillna accepts:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.fillna.html

In [None]:
# Filling also takes an AXIS argument.
# The previous backward fill copied values vertically (from the row below);
# with axis='columns' the value is copied horizontally, from the column to the right.
# Which direction makes sense depends on the data: here it pulls windspeed numbers and
# event strings into neighbouring columns, so it is shown only to illustrate the option.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
new_df = df.bfill(axis='columns')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,9.0,9.0,Sunny
2017-01-05,28.0,Snow,Snow
2017-01-06,7.0,7.0,
2017-01-07,32.0,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Filling also takes a LIMIT argument: the maximum number of consecutive NaNs to fill.
# With limit=1 a known value is carried forward only once per gap.
# Example: temperature was 32 on 7 Jan; it is copied to 8 Jan but not to 9 Jan,
# which stays NaN. The same rule applies to windspeed and event.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
new_df = df.ffill(limit=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Same forward fill, but allow each known value to be carried forward up to two rows.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
new_df = df.ffill(limit=2)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Carrying values forward or backward is still a rough guess: if the temperature was 32
# on 1 Jan and 28 on 5 Jan, the value on 4 Jan was most likely somewhere in between.
# That is not guaranteed, but it is usually a better estimate.
#
# The "interpolate" method does this. Linear interpolation draws a straight line between
# the two neighbouring known values and reads the missing value off that line. The
# 'linear' method treats the rows as equally spaced, so 4 Jan gets 30, the midpoint of
# 32 and 28.
#
# Only the numeric columns are interpolated: text columns such as 'event' cannot be
# interpolated, and recent pandas versions warn when interpolate is called on a frame
# that contains object-dtype columns. The 'event' column is copied over unchanged.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate(method='linear')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# You can also look at the interpolate documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html

In [None]:
# interpolate defaults to method='linear' when no method is given; the documentation
# lists many others (quadratic, cubic, ...).
#
# Linear interpolation put 4 Jan at 30, the midpoint of 32 (1 Jan) and 28 (5 Jan).
# But 4 Jan is not halfway between those dates: 2 Jan and 3 Jan are missing from the
# data, so 4 Jan is much closer to 5 Jan and its temperature should be closer to 28.
#
# method='time' takes the dates in the index into account and weights the estimate by
# the elapsed time, which gives 29 for 4 Jan -- a better guess.
#
# Only the numeric columns are interpolated: text columns such as 'event' cannot be
# interpolated, and recent pandas versions warn when interpolate is called on a frame
# that contains object-dtype columns. The 'event' column is copied over unchanged.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate(method='time')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.25,Rain
2017-01-08,32.666667,7.5,Sunny
2017-01-09,33.333333,7.75,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# Sometimes, depending on the situation, you just want to drop the rows that contain NaN values.
# In that case you can use the "dropna" method.
# how='any' is dropna's default behaviour: a single missing value is enough to discard
# the row, so only the fully populated rows of the DataFrame remain.
new_df = df.dropna(how='any')
new_df


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# dropna can also be made less strict. With how='all' a row is dropped only when
# every one of its values is NaN; rows that still hold at least one valid value are kept.
# In this data only 9 Jan is completely empty, so it is the only row removed.
new_df = df.dropna(axis='index', how='all')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# thresh sets the minimum number of non-NaN values a row needs in order to be kept.
# thresh=1 keeps every row that has at least one valid value and drops the rest;
# raise the threshold to demand more valid values per row.
new_df = df.dropna(axis='index', thresh=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# How do you insert the dates that are missing from the data, such as 2 Jan and 3 Jan?
# 1. Build the complete daily date range that the data should cover.
# 2. Turn that range into a DatetimeIndex.
# 3. Reindex the frame with it: dates that were not present get a new row of NaN values,
#    which can then be filled with any of the methods shown earlier.
# Note that this overwrites df with the reindexed frame.
dt = pd.date_range(start='2017-01-01', end='2017-01-11')
idx = pd.DatetimeIndex(dt)
df = df.reindex(index=idx)
df

Unnamed: 0,temperature,windspeed,event
2017-01-01,32.0,6.0,Rain
2017-01-02,,,
2017-01-03,,,
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy


In [None]:
# Made by @Shahzeb-A