In [30]:
import numpy as np
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt

In [49]:
# Read the raw weather observations from disk and show the first rows
# as a quick sanity check of the columns and value formats.
data = pd.read_csv(filepath_or_buffer='dataset/dataset.csv')
data.head(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,12/1/2008,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,12/2/2008,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,12/3/2008,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,12/4/2008,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,12/5/2008,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [46]:
# Column dtypes and non-null counts of the raw frame; used to spot which
# columns need imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  object 
 2   MinTemp        141556 non-null  float64
 3   MaxTemp        141871 non-null  float64
 4   Rainfall       140787 non-null  float64
 5   Evaporation    81350 non-null   float64
 6   Sunshine       74377 non-null   float64
 7   WindGustDir    132863 non-null  object 
 8   WindGustSpeed  132923 non-null  float64
 9   WindDir9am     132180 non-null  object 
 10  WindDir3pm     138415 non-null  object 
 11  WindSpeed9am   140845 non-null  float64
 12  WindSpeed3pm   139563 non-null  float64
 13  Humidity9am    140419 non-null  float64
 14  Humidity3pm    138583 non-null  float64
 15  Pressure9am    128179 non-null  float64
 16  Pressure3pm    128212 non-null  float64
 17  Cloud9am       88536 non-null

In [32]:
# (rows, columns) of the raw frame.
data.shape

(142193, 24)

In [50]:
# Check null value %
# One row per column: the number of missing values and that number as a
# percentage of all rows, sorted so the sparsest columns come first.
df_null = data.isnull().sum().sort_values(ascending=False).reset_index()
df_null.columns = ['Variable','NullCount']
# Use the actual row count rather than a hard-coded 142193 so the percentages
# stay correct if the dataset is refreshed or subsampled.
df_null['null%'] = (df_null['NullCount']/len(data))*100
df_null

Unnamed: 0,Variable,NullCount,null%
0,Sunshine,67816,47.692924
1,Evaporation,60843,42.789026
2,Cloud3pm,57094,40.152469
3,Cloud9am,53657,37.735332
4,Pressure9am,14014,9.855619
5,Pressure3pm,13981,9.832411
6,WindDir9am,10013,7.041838
7,WindGustDir,9330,6.561504
8,WindGustSpeed,9270,6.519308
9,WindDir3pm,3778,2.656952


In [51]:
# Impute categorical variables
# Every categorical column gets its gaps filled with that column's most
# frequent value (the first entry returned by .mode()).
for cat_col in ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']:
    most_common = data[cat_col].mode()[0]
    data[cat_col] = data[cat_col].fillna(most_common)

In [52]:
# Encode categorical variables

from sklearn.preprocessing import LabelEncoder

# Keep every fitted encoder, keyed by column name, so the integer codes can be
# translated back to the original labels later (inverse_transform / classes_).
lencoders = {}
for col in ['Location','WindGustDir','WindDir9am','WindDir3pm']:
    lencoders[col] = LabelEncoder()
    data[col] = lencoders[col].fit_transform(data[col])

# Encode the Yes/No flags as the integers 1/0 rather than the strings '1'/'0',
# so the columns are numeric instead of object dtype. The CSV written later
# looks the same either way.
# NOTE: .map() yields NaN for any value missing from the mapping, so re-running
# this cell without reloading `data` would blank out both columns.
YES_NO_CODES = {'Yes': 1, 'No': 0}
data['RainToday'] = data['RainToday'].map(YES_NO_CODES)
data['RainTomorrow'] = data['RainTomorrow'].map(YES_NO_CODES)

data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,12/1/2008,2,13.4,22.9,0.6,,,13,44.0,13,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,0,0.0,0
1,12/2/2008,2,7.4,25.1,0.0,,,14,44.0,6,...,25.0,1010.6,1007.8,,,17.2,24.3,0,0.0,0
2,12/3/2008,2,12.9,25.7,0.0,,,15,46.0,13,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,0,0.0,0
3,12/4/2008,2,9.2,28.0,0.0,,,4,24.0,9,...,16.0,1017.6,1012.8,,,18.1,26.5,0,1.0,0
4,12/5/2008,2,17.5,32.3,1.0,,,13,41.0,1,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0.2,0


In [60]:
# Impute continuous variables

# Numeric measurement columns whose gaps are filled with mean values.
CONTINUOUS_COLS = [
    'Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am',
    'Pressure9am', 'Pressure3pm', 'WindGustSpeed', 'Humidity3pm',
    'Temp3pm', 'WindSpeed3pm', 'Humidity9am', 'Rainfall',
    'WindSpeed9am', 'Temp9am', 'MinTemp', 'MaxTemp',
]


def impute_continuous(frame, columns, group_col='Location'):
    """Return a copy of ``frame`` with NaNs in ``columns`` replaced by means.

    Imputation happens in two stages:

    1. Each missing value is replaced by the mean of the same column within
       the row's ``group_col`` group (here: the same location).
    2. Values that are still missing, because a group has no observation of
       that column at all, are replaced by the column mean computed over the
       stage-1 result.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    columns : list of str
        Numeric columns to impute.
    group_col : str, default 'Location'
        Column whose values define the groups used in stage 1.

    Returns
    -------
    pandas.DataFrame
        Imputed copy of ``frame``, rows grouped by ascending ``group_col``
        with the original relative order (and index labels) kept inside each
        group.
    """
    # Work on a copy and assign whole columns back. Calling
    # fillna(inplace=True) on a column of a filtered slice is chained
    # assignment: it emits SettingWithCopyWarning and has no effect once
    # pandas Copy-on-Write is enabled.
    imputed = frame.copy()

    # Stage 1: impute by location. transform('mean') broadcasts each group's
    # mean back onto every row of that group, aligned on the original index.
    group_means = imputed.groupby(group_col)[columns].transform('mean')
    imputed[columns] = imputed[columns].fillna(group_means)

    # Stage 2: some locations have not recorded some variables at all, so fall
    # back to the overall mean of the (already location-imputed) column.
    # Columns without remaining gaps are left untouched by this fill.
    imputed[columns] = imputed[columns].fillna(imputed[columns].mean())

    # Explicit stable sort so rows come out grouped by location in a
    # deterministic order instead of relying on set iteration order.
    return imputed.sort_values(group_col, kind='mergesort')


final_df = impute_continuous(data, CONTINUOUS_COLS)

final_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 142193 entries, 93856 to 105967
Data columns (total 24 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           142193 non-null  object 
 1   Location       142193 non-null  int32  
 2   MinTemp        142193 non-null  float64
 3   MaxTemp        142193 non-null  float64
 4   Rainfall       142193 non-null  float64
 5   Evaporation    142193 non-null  float64
 6   Sunshine       142193 non-null  float64
 7   WindGustDir    142193 non-null  int32  
 8   WindGustSpeed  142193 non-null  float64
 9   WindDir9am     142193 non-null  int32  
 10  WindDir3pm     142193 non-null  int32  
 11  WindSpeed9am   142193 non-null  float64
 12  WindSpeed3pm   142193 non-null  float64
 13  Humidity9am    142193 non-null  float64
 14  Humidity3pm    142193 non-null  float64
 15  Pressure9am    142193 non-null  float64
 16  Pressure3pm    142193 non-null  float64
 17  Cloud9am       142193 non

In [61]:
# Count the missing values left in each column of the imputed frame.
df_null = final_df.isna().sum()
df_null

Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RISK_MM          0
RainTomorrow     0
dtype: int64

In [64]:
# Write the processed frame to disk without the index column, then show
# the first rows of what was saved.
final_df.to_csv(path_or_buf='dataset/dataset_processed.csv', index=False)
final_df.head(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
93856,7/1/2008,0,8.8,15.7,5.0,1.6,2.6,7,48.0,12,...,67.0,1017.4,1017.7,4.587158,4.596907,13.5,14.9,1,0.8,0
93857,7/2/2008,0,12.7,15.8,0.8,1.4,7.8,12,35.0,11,...,52.0,1022.4,1022.6,4.587158,4.596907,13.7,15.5,0,0.0,0
93858,7/3/2008,0,6.2,15.1,0.0,1.8,2.1,13,20.0,5,...,56.0,1027.8,1026.5,4.587158,4.596907,9.3,13.9,0,0.0,0
93859,7/4/2008,0,5.3,15.9,0.0,1.4,8.0,5,30.0,5,...,46.0,1028.7,1025.6,4.587158,4.596907,10.2,15.3,0,0.0,0
93860,7/6/2008,0,11.3,15.7,1.572185,5.824924,1.5,6,52.0,5,...,62.0,1019.5,1016.2,4.587158,4.596907,13.0,14.4,0,16.2,1
