In [27]:
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

In [21]:
# Load the imputed dataset in one chain:
#   * drop the 'Unnamed: 0' column (presumably an index saved by an earlier
#     to_csv call -- confirm against whatever produced ImputedData.csv),
#   * cast the binary target 'RainTomorrow' to an integer dtype.
data = (
    pd.read_csv('ImputedData.csv')
    .drop(columns='Unnamed: 0')
    .astype({'RainTomorrow': 'int'})
)
data

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,396.0,2.0,13.4,22.9,0.6,6.038831,7.270427,13.0,44.0,13.0,...,22.0,1007.7,1007.1,8.000000,5.037725,16.9,21.8,0.0,0.0,0
1,397.0,2.0,7.4,25.1,0.0,5.810055,11.109215,14.0,44.0,6.0,...,25.0,1010.6,1007.8,1.863332,2.652044,17.2,24.3,0.0,0.0,0
2,398.0,2.0,12.9,25.7,0.0,8.127255,11.995755,15.0,46.0,13.0,...,30.0,1007.6,1008.7,1.836479,2.000000,21.0,23.2,0.0,0.0,0
3,399.0,2.0,9.2,28.0,0.0,6.057760,11.662218,4.0,24.0,9.0,...,16.0,1017.6,1012.8,1.066184,1.957860,18.1,26.5,0.0,1.0,0
4,400.0,2.0,17.5,32.3,1.0,6.972119,6.006508,13.0,41.0,1.0,...,33.0,1010.8,1006.0,7.000000,8.000000,17.8,29.7,0.0,0.2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193287,4528.0,41.0,5.4,26.4,0.0,7.443548,11.226670,6.0,41.0,2.0,...,12.0,1018.9,1013.9,1.773233,2.029222,13.5,25.5,0.0,0.0,0
193288,4529.0,41.0,4.6,29.4,0.0,9.053744,12.684745,6.0,67.0,7.0,...,16.0,1012.8,1008.5,0.819580,2.055492,16.6,28.7,0.0,0.0,0
193289,4530.0,41.0,15.1,26.5,0.0,8.530583,5.898311,3.0,35.0,11.0,...,29.0,1014.7,1012.9,8.000000,5.163966,16.9,26.0,0.0,0.0,0
193290,4531.0,41.0,8.3,27.5,0.0,7.269085,9.855526,13.0,46.0,8.0,...,25.0,1016.4,1012.0,3.194620,2.917990,13.3,26.7,0.0,0.0,0


In [22]:
# Class balance of the full dataset: count the rows labelled 1 (rain) and
# 0 (no rain), then report the minority/majority ratio.
target = data['RainTomorrow']
rain, not_rain = (target[target == label].count() for label in (1, 0))
print('Total number of days it rained is: ', rain)
print('Total number of days it didnt rain is: ', not_rain)
print('The ratio is: ', rain / not_rain)

Total number of days it rained is:  42042
Total number of days it didnt rain is:  151250
The ratio is:  0.27796363636363636


In [23]:
# Target vector: the binary RainTomorrow label.
y = data['RainTomorrow']
# Feature matrix: every remaining column except RISK_MM.
# NOTE(review): RISK_MM is presumably excluded because it leaks the target -- confirm.
X = data.drop(columns=['RISK_MM', 'RainTomorrow'])

In [28]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified on y -- confirm that is intended
# for this imbalanced target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [36]:
# Class balance of the training labels before any oversampling.
rained_before, dry_before = (y_train[y_train == label].count() for label in (1, 0))
print('Before applying SMOTE, number of days rained in training set is: ', rained_before)
print('Before applying SMOTE, number of days it didnt rain in training set is: ', dry_before)
print('The ratio is: ', rained_before / dry_before)

Before applying SMOTE, number of days rained in training set is:  31521
Before applying SMOTE, number of days it didnt rain in training set is:  113448
The ratio is:  0.27784535646287284


In [37]:
# Oversample the training split only; X_test / y_test are not passed in and
# therefore stay untouched. The fixed seed makes the synthetic rows reproducible.
# NOTE(review): several features look like integer category codes
# (e.g. Location, WindGustDir); plain SMOTE interpolates them as if they were
# continuous -- confirm this is acceptable.
smo = SMOTE(random_state=42)
X_smo, y_smo = smo.fit_resample(X=X_train, y=y_train)

In [38]:
# Class balance of the training labels after oversampling.
rained_after, dry_after = (y_smo[y_smo == label].count() for label in (1, 0))
print('After applying SMOTE, number of days rained in training set is: ', rained_after)
print('After applying SMOTE, number of days it didnt rain in training set is: ', dry_after)
print('The ratio is: ', rained_after / dry_after)

After applying SMOTE, number of days rained in training set is:  113448
After applying SMOTE, number of days it didnt rain in training set is:  113448
The ratio is:  1.0


In [42]:
# Re-attach the labels to the resampled features (column-wise) and persist
# the balanced training set.
# The DataFrame index is written as well (to_csv default), so it will come
# back as an extra unnamed first column when the file is read again.
train_data = pd.concat(objs=[X_smo, y_smo], axis='columns')
train_data.to_csv(path_or_buf='TrainData.csv')

In [43]:
# Glue the held-out features and labels back together (column-wise) and
# persist the test set. It is saved as split, without any resampling.
# The index written by to_csv here carries the row labels the split
# inherited from `data`.
test_data = pd.concat(objs=[X_test, y_test], axis='columns')
test_data.to_csv(path_or_buf='TestData.csv')