In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("weatherperth.csv")

In [3]:
exclude = ["Date","Location", "RISK_MM"]
for attributes in exclude:
    del df[attributes]

In [5]:
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2.7,18.8,0.0,0.8,9.1,ENE,20.0,,E,0,...,97.0,53.0,1027.6,1024.5,2.0,3.0,8.5,18.1,No,No
1,6.4,20.7,0.0,1.8,7.0,NE,22.0,ESE,ENE,6,...,80.0,39.0,1024.1,1019.0,0.0,6.0,11.1,19.7,No,No
2,6.5,19.9,0.4,2.2,7.3,NE,31.0,,WNW,0,...,84.0,71.0,1016.8,1015.6,1.0,3.0,12.1,17.7,No,Yes
3,9.5,19.2,1.8,1.2,4.7,W,26.0,NNE,NNW,11,...,93.0,73.0,1019.3,1018.4,6.0,6.0,13.2,17.7,Yes,Yes
4,9.5,16.4,1.8,1.4,4.9,WSW,44.0,W,SW,13,...,69.0,57.0,1020.4,1022.1,7.0,5.0,15.9,16.0,Yes,Yes


In [6]:
print(df.iloc[214,:])

MinTemp            23.3
MaxTemp            36.0
Rainfall            0.0
Evaporation         5.6
Sunshine            NaN
WindGustDir          SW
WindGustSpeed      31.0
WindDir9am            E
WindDir3pm           SE
WindSpeed9am         15
WindSpeed3pm        6.0
Humidity9am        63.0
Humidity3pm        42.0
Pressure9am      1008.6
Pressure3pm      1005.9
Cloud9am            3.0
Cloud3pm            5.0
Temp9am            26.6
Temp3pm            34.8
RainToday            No
RainTomorrow         No
Name: 214, dtype: object


In [4]:
df = df.dropna()

In [14]:
print(df.iloc[214,:])

MinTemp            16.3
MaxTemp            30.2
Rainfall            0.0
Evaporation         9.0
Sunshine           12.5
WindGustDir           S
WindGustSpeed      44.0
WindDir9am           SE
WindDir3pm          SSW
WindSpeed9am         13
WindSpeed3pm       11.0
Humidity9am        61.0
Humidity3pm        35.0
Pressure9am      1013.0
Pressure3pm      1009.8
Cloud9am            1.0
Cloud3pm            1.0
Temp9am            19.7
Temp3pm            30.1
RainToday            No
RainTomorrow         No
Name: 231, dtype: object


In [5]:
Mapping_array = ["RainToday", "RainTomorrow"]
for i in Mapping_array:
    if i in df.columns:
        df[i] = df[i].map({
        "Yes" : 1,
        "No" : 0,
    })
    else: print("Column {i} doesn't exist")
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
1,6.4,20.7,0.0,1.8,7.0,NE,22.0,ESE,ENE,6,...,80.0,39.0,1024.1,1019.0,0.0,6.0,11.1,19.7,0,0
3,9.5,19.2,1.8,1.2,4.7,W,26.0,NNE,NNW,11,...,93.0,73.0,1019.3,1018.4,6.0,6.0,13.2,17.7,1,1
4,9.5,16.4,1.8,1.4,4.9,WSW,44.0,W,SW,13,...,69.0,57.0,1020.4,1022.1,7.0,5.0,15.9,16.0,1,1
5,0.7,15.9,6.8,2.4,9.3,NNE,24.0,ENE,NE,4,...,86.0,41.0,1032.0,1029.6,0.0,1.0,6.9,15.5,1,0
6,0.7,18.3,0.0,0.8,9.3,N,37.0,NE,NNE,15,...,72.0,36.0,1028.9,1024.2,1.0,5.0,8.7,17.9,0,0


In [6]:
directions = ['N','NNE','NE','ENE','E','ESE','SE','SSE','S','SSW','SW','WSW','W','WNW','NW','NNW']
angles = np.arange(0, 2*np.pi,2*np.pi/16)
dir_ang = dict(zip(directions, angles))
print(dir_ang)

{'N': 0.0, 'NNE': 0.39269908169872414, 'NE': 0.7853981633974483, 'ENE': 1.1780972450961724, 'E': 1.5707963267948966, 'ESE': 1.9634954084936207, 'SE': 2.356194490192345, 'SSE': 2.748893571891069, 'S': 3.141592653589793, 'SSW': 3.5342917352885173, 'SW': 3.9269908169872414, 'WSW': 4.319689898685965, 'W': 4.71238898038469, 'WNW': 5.105088062083414, 'NW': 5.497787143782138, 'NNW': 5.890486225480862}


In [7]:
mappable_attributes = ['WindGustDir','WindDir9am', 'WindDir3pm']
for i in mappable_attributes:
    df[i] = df[i].map(dir_ang)
    df[i+ '_cos'] = np.cos(df[i])
    df[i + '_sin'] = np.sin(df[i])
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,...,Temp9am,Temp3pm,RainToday,RainTomorrow,WindGustDir_cos,WindGustDir_sin,WindDir9am_cos,WindDir9am_sin,WindDir3pm_cos,WindDir3pm_sin
1,6.4,20.7,0.0,1.8,7.0,0.785398,22.0,1.963495,1.178097,6,...,11.1,19.7,0,0,0.7071068,0.707107,-0.3826834,0.92388,0.382683,0.92388
3,9.5,19.2,1.8,1.2,4.7,4.712389,26.0,0.392699,5.890486,11,...,13.2,17.7,1,1,-1.83697e-16,-1.0,0.9238795,0.382683,0.92388,-0.382683
4,9.5,16.4,1.8,1.4,4.9,4.31969,44.0,4.712389,3.926991,13,...,15.9,16.0,1,1,-0.3826834,-0.92388,-1.83697e-16,-1.0,-0.707107,-0.707107
5,0.7,15.9,6.8,2.4,9.3,0.392699,24.0,1.178097,0.785398,4,...,6.9,15.5,1,0,0.9238795,0.382683,0.3826834,0.92388,0.707107,0.707107
6,0.7,18.3,0.0,0.8,9.3,0.0,37.0,0.785398,0.392699,15,...,8.7,17.9,0,0,1.0,0.0,0.7071068,0.707107,0.92388,0.382683


In [8]:
df = df.drop(columns = mappable_attributes)
df.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,Temp9am,Temp3pm,RainToday,RainTomorrow,WindGustDir_cos,WindGustDir_sin,WindDir9am_cos,WindDir9am_sin,WindDir3pm_cos,WindDir3pm_sin
1,6.4,20.7,0.0,1.8,7.0,22.0,6,9.0,80.0,39.0,...,11.1,19.7,0,0,0.7071068,0.707107,-0.3826834,0.92388,0.382683,0.92388
3,9.5,19.2,1.8,1.2,4.7,26.0,11,6.0,93.0,73.0,...,13.2,17.7,1,1,-1.83697e-16,-1.0,0.9238795,0.382683,0.92388,-0.382683
4,9.5,16.4,1.8,1.4,4.9,44.0,13,17.0,69.0,57.0,...,15.9,16.0,1,1,-0.3826834,-0.92388,-1.83697e-16,-1.0,-0.707107,-0.707107
5,0.7,15.9,6.8,2.4,9.3,24.0,4,7.0,86.0,41.0,...,6.9,15.5,1,0,0.9238795,0.382683,0.3826834,0.92388,0.707107,0.707107
6,0.7,18.3,0.0,0.8,9.3,37.0,15,13.0,72.0,36.0,...,8.7,17.9,0,0,1.0,0.0,0.7071068,0.707107,0.92388,0.382683


In [9]:
y = df['RainTomorrow']
print(y)

1       0
3       1
4       1
5       0
6       0
       ..
3188    1
3189    0
3190    0
3191    0
3192    0
Name: RainTomorrow, Length: 3025, dtype: int64


In [10]:
X = df.drop(columns = 'RainTomorrow')
print(X)

      MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  WindGustSpeed  \
1         6.4     20.7       0.0          1.8       7.0           22.0   
3         9.5     19.2       1.8          1.2       4.7           26.0   
4         9.5     16.4       1.8          1.4       4.9           44.0   
5         0.7     15.9       6.8          2.4       9.3           24.0   
6         0.7     18.3       0.0          0.8       9.3           37.0   
...       ...      ...       ...          ...       ...            ...   
3188     10.3     19.9       0.2          1.8       7.5           37.0   
3189     13.0     16.8      61.2          3.6       0.0           46.0   
3190     13.3     18.9       0.4          1.8       6.5           37.0   
3191     11.5     18.2       0.0          3.8       9.3           30.0   
3192      6.3     17.0       0.0          1.6       7.9           26.0   

      WindSpeed9am  WindSpeed3pm  Humidity9am  Humidity3pm  ...  Cloud3pm  \
1                6           9.0  

In [11]:
# Training data and Test data split, function from sklearn library
#What does random_state do? So, test size deals with how much data of the total data set we're allocating to testing. Without random_state, 
# the data proportion will remain the same but not the specific data itself. Each time you run the code, the data for testing and training will differ
# this affects reproducibility. Hence, setting random_state = 0 makes sure the data for testing and training ends the same every
# run of the code, even while the percentage allocated remains the same.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.33,
    random_state = 0
    )
print('X_test:', X_test.shape)
print('y_test:', y_test.shape)
print('X_train:', X_train.shape)
print('y_train:', y_train.shape)

X_test: (999, 23)
y_test: (999,)
X_train: (2026, 23)
y_train: (2026,)


In [14]:
# feature scaling, we fit the training data and apply transformation on both traing and testing data.
# Why are we not fitting test data? Because this could lead to data leakage. The model should be unaware of the test data during training phae
# If it does use fitting on test data, then the performance statistics of the model will be overly optimistic i.e; accuracy will be higher than actual
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
# Now, we train the data
nn = MLPClassifier(
    hidden_layer_sizes = (50,50),
    random_state = 0,
    max_iter = 500
)
nn.fit(X_train,y_train)

In [26]:
#Now we find the accuracy of the model we just trained. Note: testing data is seperate from training data and is
# only used after training the NN. Once trained, we take the testing data and input it into the trained model. It produces an array, y_pred
# that contains predicted values based on X_test
y_pred = nn.predict(X_test)
score = accuracy_score(y_pred,y_test)
print("The accuracy of the model is:",round((score*100),2), "%")

The accuracy of the model is: 89.29 %


In [30]:
# Can we improve the accuracy by using different combinations of hidden layers? To vary one attribute and see the results while keeping other attributes
#unchanged, we can use grid search function."Cv = " is cross validation parameter. How much divisions of the dataset do you want for 
# your grid search ( all these divisions are going to be evaluated and each time they're going to have different training and test data)
p = {
 'hidden_layer_sizes' : (
     (10,5),(10,), (20, 6), (100, 10), (5,)
 )
}
nn = MLPClassifier (
    max_iter = 5000,
    random_state = 0
)
gs = GridSearchCV(nn, p, cv = 3)
gs.fit(X_train,y_train)

In [32]:
print(gs.cv_results_['params'])
print(gs.cv_results_['mean_test_score'])

[{'hidden_layer_sizes': (10, 5)}, {'hidden_layer_sizes': (10,)}, {'hidden_layer_sizes': (20, 6)}, {'hidden_layer_sizes': (100, 10)}, {'hidden_layer_sizes': (5,)}]
[0.88549127 0.89487179 0.87956461 0.88894587 0.90621886]


In [37]:
best_nn = gs.best_estimator_
y_pred = best_nn.predict(X_test)
best_acc = accuracy_score(y_pred,y_test)
print("The new improved accuracy of the model using GridSearch is:", round((best_acc*100),2), "%")

The new improved accuracy of the model using GridSearch is: 90.29 %
