In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from plotnine import *

In [2]:
# Read the weather table and discard the 'Unnamed: 0' column in one chained step.
# NOTE(review): 'Unnamed: 0' is presumably a row index saved by an earlier to_csv - confirm.
df = pd.read_csv("2.weatherAUS_impute_merge.csv").drop(columns=['Unnamed: 0'])
df

Unnamed: 0,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,...,Longitude,Date,Location,WindGustDir,WindDir9am,WindDir3pm,RainToday,RainTomorrow,AveragePressure,AverageTemp
0,0.6,9.6,13.0,44.0,20.0,24.0,71.0,22.0,8.0,6.0,...,146.909485,2008-12-01,Albury,W,W,WNW,No,No,1007.40,18.750
1,0.0,13.0,13.2,44.0,4.0,22.0,44.0,25.0,1.0,4.0,...,146.909485,2008-12-02,Albury,WNW,NNW,WSW,No,No,1009.20,18.500
2,0.0,10.2,13.2,46.0,19.0,26.0,38.0,30.0,2.0,2.0,...,146.909485,2008-12-03,Albury,WSW,W,WSW,No,No,1008.15,20.700
3,0.0,11.0,10.8,24.0,11.0,9.0,45.0,16.0,5.0,7.0,...,146.909485,2008-12-04,Albury,NE,SE,E,No,No,1015.20,20.450
4,1.0,6.6,8.1,41.0,7.0,20.0,82.0,33.0,7.0,8.0,...,146.909485,2008-12-05,Albury,W,ENE,NW,No,No,1008.40,24.325
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123705,0.0,2.2,0.0,31.0,15.0,13.0,59.0,27.0,0.0,1.0,...,131.036882,2017-06-20,Uluru,E,ESE,E,No,No,1022.95,13.900
123706,0.0,2.0,9.1,31.0,13.0,11.0,51.0,24.0,8.0,0.0,...,131.036882,2017-06-21,Uluru,E,SE,ENE,No,No,1022.45,14.675
123707,0.0,1.4,8.7,22.0,13.0,9.0,56.0,21.0,8.0,7.0,...,131.036882,2017-06-22,Uluru,NNW,SE,N,No,No,1021.30,16.075
123708,0.0,4.0,5.9,37.0,9.0,9.0,53.0,24.0,7.0,3.0,...,131.036882,2017-06-23,Uluru,N,SE,WNW,No,No,1018.90,17.725


In [3]:
# Count the missing values in each column.
df.isna().sum()

Rainfall           0
Evaporation        0
Sunshine           0
WindGustSpeed      0
WindSpeed9am       0
WindSpeed3pm       0
Humidity9am        0
Humidity3pm        0
Cloud9am           0
Cloud3pm           0
Latitude           0
Longitude          0
Date               0
Location           0
WindGustDir        0
WindDir9am         0
WindDir3pm         0
RainToday          0
RainTomorrow       0
AveragePressure    0
AverageTemp        0
dtype: int64

In [4]:
# Derive one 0/1 indicator column per season from the calendar month of 'Date'.
# The month groups follow the southern-hemisphere convention (Dec-Feb is summer).
SEASON_MONTHS = {
    'Spring': [9, 10, 11],
    'Summer': [12, 1, 2],
    'Autumn': [3, 4, 5],
    'Winter': [6, 7, 8],
}

# Parse the dates once into a temporary Series rather than adding (and later
# dropping) a helper 'Month' column on the frame.
month = pd.to_datetime(df['Date']).dt.month

# isin() is vectorised, so no per-row Python lambda is needed; int64 is the
# dtype the apply(lambda ...) form produced. Dict order fixes the column order.
for season, months in SEASON_MONTHS.items():
    df[season] = month.isin(months).astype('int64')

# The raw date is not kept once the season flags exist.
df = df.drop(columns=['Date'])

In [5]:
# Encode the target as 0/1: "Yes" -> 1, every other value -> 0.
# Vectorised comparison instead of a per-row lambda; int64 is the dtype the
# apply() form produced.
# Gotcha: run this cell once per kernel. On a second run the column already
# holds ints, none of which equal "Yes", so the whole target would become 0.
df["RainTomorrow"] = (df["RainTomorrow"] == "Yes").astype("int64")

# RainToday is removed from the feature set.
# NOTE(review): the reason is not recorded in this notebook - confirm it is intentional.
df = df.drop(columns=['RainToday'])
df

Unnamed: 0,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,...,WindGustDir,WindDir9am,WindDir3pm,RainTomorrow,AveragePressure,AverageTemp,Spring,Summer,Autumn,Winter
0,0.6,9.6,13.0,44.0,20.0,24.0,71.0,22.0,8.0,6.0,...,W,W,WNW,0,1007.40,18.750,0,1,0,0
1,0.0,13.0,13.2,44.0,4.0,22.0,44.0,25.0,1.0,4.0,...,WNW,NNW,WSW,0,1009.20,18.500,0,1,0,0
2,0.0,10.2,13.2,46.0,19.0,26.0,38.0,30.0,2.0,2.0,...,WSW,W,WSW,0,1008.15,20.700,0,1,0,0
3,0.0,11.0,10.8,24.0,11.0,9.0,45.0,16.0,5.0,7.0,...,NE,SE,E,0,1015.20,20.450,0,1,0,0
4,1.0,6.6,8.1,41.0,7.0,20.0,82.0,33.0,7.0,8.0,...,W,ENE,NW,0,1008.40,24.325,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123705,0.0,2.2,0.0,31.0,15.0,13.0,59.0,27.0,0.0,1.0,...,E,ESE,E,0,1022.95,13.900,0,0,0,1
123706,0.0,2.0,9.1,31.0,13.0,11.0,51.0,24.0,8.0,0.0,...,E,SE,ENE,0,1022.45,14.675,0,0,0,1
123707,0.0,1.4,8.7,22.0,13.0,9.0,56.0,21.0,8.0,7.0,...,NNW,SE,N,0,1021.30,16.075,0,0,0,1
123708,0.0,4.0,5.9,37.0,9.0,9.0,53.0,24.0,7.0,3.0,...,N,SE,WNW,0,1018.90,17.725,0,0,0,1


In [6]:
# get categorical columns
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
cat_cols

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']

In [7]:
# drop categorical columns
df.drop(columns=['Location','WindGustDir', 'WindDir9am', 'WindDir3pm'], inplace=True)
df

Unnamed: 0,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Cloud9am,Cloud3pm,Latitude,Longitude,RainTomorrow,AveragePressure,AverageTemp,Spring,Summer,Autumn,Winter
0,0.6,9.6,13.0,44.0,20.0,24.0,71.0,22.0,8.0,6.0,-36.075119,146.909485,0,1007.40,18.750,0,1,0,0
1,0.0,13.0,13.2,44.0,4.0,22.0,44.0,25.0,1.0,4.0,-36.075119,146.909485,0,1009.20,18.500,0,1,0,0
2,0.0,10.2,13.2,46.0,19.0,26.0,38.0,30.0,2.0,2.0,-36.075119,146.909485,0,1008.15,20.700,0,1,0,0
3,0.0,11.0,10.8,24.0,11.0,9.0,45.0,16.0,5.0,7.0,-36.075119,146.909485,0,1015.20,20.450,0,1,0,0
4,1.0,6.6,8.1,41.0,7.0,20.0,82.0,33.0,7.0,8.0,-36.075119,146.909485,0,1008.40,24.325,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123705,0.0,2.2,0.0,31.0,15.0,13.0,59.0,27.0,0.0,1.0,-25.344428,131.036882,0,1022.95,13.900,0,0,0,1
123706,0.0,2.0,9.1,31.0,13.0,11.0,51.0,24.0,8.0,0.0,-25.344428,131.036882,0,1022.45,14.675,0,0,0,1
123707,0.0,1.4,8.7,22.0,13.0,9.0,56.0,21.0,8.0,7.0,-25.344428,131.036882,0,1021.30,16.075,0,0,0,1
123708,0.0,4.0,5.9,37.0,9.0,9.0,53.0,24.0,7.0,3.0,-25.344428,131.036882,0,1018.90,17.725,0,0,0,1


In [8]:
# Re-count missing values per column after the feature engineering above.
df.isna().sum()

Rainfall           0
Evaporation        0
Sunshine           0
WindGustSpeed      0
WindSpeed9am       0
WindSpeed3pm       0
Humidity9am        0
Humidity3pm        0
Cloud9am           0
Cloud3pm           0
Latitude           0
Longitude          0
RainTomorrow       0
AveragePressure    0
AverageTemp        0
Spring             0
Summer             0
Autumn             0
Winter             0
dtype: int64

In [9]:
# List the remaining column names (features plus the RainTomorrow target).
df.columns

Index(['Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
       'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Cloud9am', 'Cloud3pm',
       'Latitude', 'Longitude', 'RainTomorrow', 'AveragePressure',
       'AverageTemp', 'Spring', 'Summer', 'Autumn', 'Winter'],
      dtype='object')

In [10]:
# Show the dtype of every remaining column to confirm they are all numeric.
df.dtypes

Rainfall           float64
Evaporation        float64
Sunshine           float64
WindGustSpeed      float64
WindSpeed9am       float64
WindSpeed3pm       float64
Humidity9am        float64
Humidity3pm        float64
Cloud9am           float64
Cloud3pm           float64
Latitude           float64
Longitude          float64
RainTomorrow         int64
AveragePressure    float64
AverageTemp        float64
Spring               int64
Summer               int64
Autumn               int64
Winter               int64
dtype: object

In [16]:
# Stratified 80/20 train/test split of the prepared table.
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and the CSV files written from it, are identical on
# every run; without it train_test_split shuffles differently each time.
SPLIT_SEED = 42

X = df.drop(columns=['RainTomorrow'])
y = df['RainTomorrow']

# stratify=y keeps the RainTomorrow class proportions the same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED
)

# Re-attach the target as the last column of each split.
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)

In [18]:
# Write both splits to disk. to_csv is left at its default index=True, so the
# row index is written as an unnamed first column of each file.
for path, frame in (("train.csv", train), ("test.csv", test)):
    frame.to_csv(path)