## <font color="maroon"><h4 align="center">Handling Missing Data - fillna, interpolate, dropna</h4></font>

In [13]:
import pandas as pd
# Read the weather CSV with default parsing; the 'day' column is not converted to dates here.
df = pd.read_csv(filepath_or_buffer="weather_data.csv")
df

Unnamed: 0,day,temperature,windspeed,event
0,1/1/2017,32.0,6.0,Rain
1,1/4/2017,,9.0,Sunny
2,1/5/2017,28.0,,Snow
3,1/6/2017,,7.0,
4,1/7/2017,32.0,,Rain
5,1/8/2017,,,Sunny
6,1/9/2017,,,
7,1/10/2017,34.1,8.1,Cloudy
8,1/11/2017,40.0,12.0,Sunny


In [None]:
type(df["day"])  # The column object itself is a pandas Series, regardless of the dtype of the values it holds.

pandas.core.series.Series

In [None]:
type(df.loc[0, "day"])  # Type of the first value in 'day'; without date parsing it is a plain Python str.

str

We'll convert this column to a datetime type.

In [20]:
# parse_dates=["day"] asks read_csv to convert the 'day' column from strings to datetime values.
df = pd.read_csv(
    "weather_data.csv",
    parse_dates=["day"],
)


In [None]:
# After parse_dates, each value in 'day' is a pandas Timestamp rather than a str.
type(df["day"][0])

pandas._libs.tslibs.timestamps.Timestamp

In [None]:
# Make 'day' the index so that every row is labelled by its date.
# set_index returns a new DataFrame; rebinding the name gives the same result as
# inplace=True without mutating the existing object, and keeps the call chainable.
df = df.set_index("day")
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


## <font color="blue">fillna</font>

<font color="purple">**Fill all NaN with one specific value**</font>

In [23]:
# fillna(0) replaces every NaN in the frame with 0 and returns a new DataFrame; df is left untouched.
# (Passing inplace=True would modify df itself instead of returning a new frame.)
# Because one value is used for all columns, missing 'event' entries also become 0.
new_df = df.fillna(value=0)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,9.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,0.0,0.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**Fill NaN per column using a dict of column names**</font>

In [25]:
df["temperature"].mean()  # Mean of the non-missing temperatures; NaN entries are skipped by default.

np.float64(33.22)

In [None]:
# A dict passed to fillna maps column name -> replacement value, so each column gets its own fill:
#   - 'temperature' and 'windspeed' use their own column mean (which leaves each column's mean unchanged),
#   - 'event' uses the placeholder string 'No Event'.
new_df = df.fillna(
    value={
        "temperature": df["temperature"].mean(),
        "windspeed": df["windspeed"].mean(),
        "event": "No Event",
    }
)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,33.22,9.0,Sunny
2017-01-05,28.0,8.42,Snow
2017-01-06,33.22,7.0,No Event
2017-01-07,32.0,8.42,Rain
2017-01-08,33.22,8.42,Sunny
2017-01-09,33.22,8.42,No Event
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**Use method to determine how to fill na values**</font>

In [26]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is the replacement for fillna(method="ffill"), which is deprecated
# and emits a FutureWarning.
new_df = df.ffill()
new_df

  new_df = df.fillna(method="ffill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,7.0,Sunny
2017-01-09,32.0,7.0,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [27]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() is the replacement for fillna(method="bfill"), which is deprecated
# and emits a FutureWarning.
new_df = df.bfill()
new_df

  new_df = df.fillna(method="bfill")


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,28.0,9.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,32.0,7.0,Rain
2017-01-07,32.0,8.1,Rain
2017-01-08,34.1,8.1,Sunny
2017-01-09,34.1,8.1,Cloudy
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**Use of axis**</font>

In [None]:
# Backward fill along each row: a NaN takes the next valid value to its right in the same row.
# axis is either "index" (fill within a column) or "columns" (fill across a row).
# Filling across columns mixes types here, e.g. an event name can land in 'windspeed'.
# DataFrame.bfill() is the replacement for the deprecated fillna(method="bfill").
new_df = df.bfill(axis="columns")
new_df

  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"
  new_df = df.fillna(method="bfill", axis="columns") # axis is either "index" or "columns"


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,9.0,9.0,Sunny
2017-01-05,28.0,Snow,Snow
2017-01-06,7.0,7.0,
2017-01-07,32.0,Rain,Rain
2017-01-08,Sunny,Sunny,Sunny
2017-01-09,,,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


<font color="purple">**limit parameter**</font>

In [29]:
# limit=1 caps forward filling at one consecutive NaN per gap: the first NaN after a valid
# value is filled, and any further NaNs in the same run stay NaN.
# DataFrame.ffill() is the replacement for the deprecated fillna(method="ffill").
new_df = df.ffill(limit=1)
new_df

  new_df = df.fillna(method="ffill",limit=1)


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,9.0,Sunny
2017-01-05,28.0,9.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,32.0,,Sunny
2017-01-09,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


### <font color="blue">interpolate</font>

In [None]:
# Linear interpolation: each NaN is placed on the straight line between the nearest valid
# values before and after it (for a single gap that is the midpoint, not a median).
# The default "linear" method treats rows as equally spaced and ignores the dates in the index;
# interpolate(method="time") would weight by the actual gaps between dates instead.
# Only numeric columns are interpolated: calling interpolate() on the object-dtype 'event'
# column is deprecated, and text cannot be interpolated anyway, so 'event' is left as-is.
numeric_filled = df.select_dtypes(include="number").interpolate()
new_df = df.assign(**numeric_filled)
new_df

  new_df = df.interpolate()


Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,9.0,Sunny
2017-01-05,28.0,8.0,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,7.275,Rain
2017-01-08,32.7,7.55,Sunny
2017-01-09,33.4,7.825,
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


### <font color="blue">dropna</font>

In [None]:
# Keep only the complete rows: a row is dropped as soon as any of its columns is NaN.
# Simple, but it can discard a lot of data when missing values are common.
new_df = df.dropna(how="any")
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [None]:
# how="all" removes a row only when every column in it is NaN; partially filled rows are kept.
new_df = df.dropna(axis="index", how="all")
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [32]:
# thresh=2 keeps a row only if it holds at least 2 non-NaN values; sparser rows are dropped.
new_df = df.dropna(axis="index", thresh=2)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-07,32.0,,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [34]:
# thresh=1 keeps every row with at least one non-NaN value, so only completely empty rows
# are dropped (the same rows that how="all" removes).
new_df = df.dropna(axis="index", thresh=1)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,9.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,,,Sunny
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [33]:
# subset limits the NaN check to the listed columns: a row is dropped if 'temperature' or
# 'windspeed' is missing, while a missing 'event' alone does not remove it.
new_df = df.dropna(how="any", subset=["temperature", "windspeed"])
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny


In [35]:
# thresh counts non-NaN values only within the subset columns. With two columns listed and
# thresh=2, a row survives only when both 'temperature' and 'windspeed' are present.
new_df = df.dropna(subset=["temperature", "windspeed"], thresh=2)
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-10,34.1,8.1,Cloudy
2017-01-11,40.0,12.0,Sunny
