## Weather Prediction Using Fully Connected Neural Networks.

In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical

In [75]:
data = pd.read_csv("weather-data/Weather Data.csv")

In [76]:
data.shape

(99516, 23)

In [77]:
data.head()

Unnamed: 0,row ID,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,Row0,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0
1,Row1,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0
2,Row2,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0
3,Row3,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,55.0,23.0,1009.2,1005.4,,,20.6,28.9,No,0
4,Row4,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,48.0,19.0,1013.4,1010.1,,,16.3,25.5,No,0


In [78]:
data.describe()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainTomorrow
count,99073.0,99286.0,98537.0,56985.0,52199.0,93036.0,98581.0,97681.0,98283.0,97010.0,89768.0,89780.0,61944.0,59514.0,98902.0,97612.0,99516.0
mean,12.176266,23.218513,2.353024,5.46132,7.61509,39.976966,14.004849,18.650464,68.866376,51.433296,1017.684638,1015.286204,4.447985,4.519122,16.970041,21.68134,0.224677
std,6.390882,7.115072,8.487866,4.16249,3.783008,13.581524,8.902323,8.801827,19.074951,20.777616,7.110166,7.045189,2.88658,2.716618,6.488961,6.931681,0.417372
min,-8.5,-4.1,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,980.5,978.2,0.0,0.0,-7.0,-5.1,0.0
25%,7.6,17.9,0.0,2.6,4.8,31.0,7.0,13.0,57.0,37.0,1013.0,1010.5,1.0,2.0,12.3,16.6,0.0
50%,12.0,22.6,0.0,4.8,8.4,39.0,13.0,19.0,70.0,52.0,1017.7,1015.3,5.0,5.0,16.7,21.1,0.0
75%,16.8,28.2,0.8,7.4,10.6,48.0,19.0,24.0,83.0,65.0,1022.4,1020.0,7.0,7.0,21.5,26.4,0.0
max,33.9,48.1,371.0,86.2,14.5,135.0,130.0,87.0,100.0,100.0,1041.0,1039.6,9.0,9.0,40.2,46.7,1.0


In [79]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99516 entries, 0 to 99515
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   row ID         99516 non-null  object 
 1   Location       99516 non-null  object 
 2   MinTemp        99073 non-null  float64
 3   MaxTemp        99286 non-null  float64
 4   Rainfall       98537 non-null  float64
 5   Evaporation    56985 non-null  float64
 6   Sunshine       52199 non-null  float64
 7   WindGustDir    92995 non-null  object 
 8   WindGustSpeed  93036 non-null  float64
 9   WindDir9am     92510 non-null  object 
 10  WindDir3pm     96868 non-null  object 
 11  WindSpeed9am   98581 non-null  float64
 12  WindSpeed3pm   97681 non-null  float64
 13  Humidity9am    98283 non-null  float64
 14  Humidity3pm    97010 non-null  float64
 15  Pressure9am    89768 non-null  float64
 16  Pressure3pm    89780 non-null  float64
 17  Cloud9am       61944 non-null  float64
 18  Cloud3

#### Drop columns 
Drop these columns/features because these columns have too much null values and are not as import as other columns, so we can drop them.

In [80]:
data.drop("Sunshine", axis=1, inplace=True)
data.drop("Evaporation", axis=1, inplace=True)

In [81]:
# Drop it because it useless 
data.drop("row ID", axis=1, inplace=True)

### Data Impuation
**Cant Use This approach**: `data[one].fillna(data[one].mode()[0], inplace=True)` because it's going to deprecate, so we have to use the below one.

**Use This Approach**: `data[one] = data[one].fillna(data[one].mode()[0])`

In [82]:
for one in data:
    if data[one].dtype == "object":
        data[one] = data[one].fillna(data[one].mode()[0])
    else:
        data[one] = data[one].fillna(data[one].mean()) 

In [83]:
data.sample(8)

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
18538,Penrith,12.9,24.4,0.0,WSW,69.0,N,WSW,4.0,20.0,68.866376,51.433296,1017.684638,1015.286204,4.447985,4.519122,20.4,13.8,No,0
73582,Woomera,8.3,20.4,0.0,S,35.0,S,SSE,20.0,20.0,56.0,26.0,1023.7,1021.1,1.0,1.0,13.2,19.7,No,0
73827,Woomera,7.8,23.9,0.0,NNE,30.0,NE,S,20.0,11.0,34.0,19.0,1023.5,1018.9,4.447985,4.519122,17.8,22.9,No,0
61769,GoldCoast,10.6,20.8,0.0,SSE,28.0,SSW,E,7.0,13.0,74.0,57.0,1024.6,1021.3,4.447985,4.519122,16.0,19.7,No,0
27592,Williamtown,4.6,17.6,0.0,W,56.0,WNW,WNW,31.0,31.0,40.0,29.0,1018.9,1016.5,1.0,2.0,12.5,16.7,No,0
27920,Williamtown,13.5,22.5,0.8,WNW,15.0,WNW,SSE,13.0,11.0,88.0,77.0,1025.1,1022.1,7.0,7.0,18.5,21.7,No,1
68062,MountGambier,7.2,14.2,2.4,SSW,33.0,WNW,S,7.0,17.0,92.0,78.0,1024.4,1024.0,2.0,7.0,12.0,12.4,Yes,0
16309,NorfolkIsland,13.3,20.6,0.0,NNE,33.0,N,NW,13.0,13.0,66.0,65.0,1022.3,1019.4,7.0,8.0,19.0,19.5,No,1


In [84]:
data.nunique()

Location          49
MinTemp          387
MaxTemp          501
Rainfall         602
WindGustDir       16
WindGustSpeed     68
WindDir9am        16
WindDir3pm        16
WindSpeed9am      44
WindSpeed3pm      45
Humidity9am      102
Humidity3pm      102
Pressure9am      537
Pressure3pm      537
Cloud9am          11
Cloud3pm          11
Temp9am          433
Temp3pm          492
RainToday          2
RainTomorrow       2
dtype: int64

#### Convert Object/Categorical data into numerical data
#### we also call it OHE (One Hot Encoding)

In [85]:
data = pd.get_dummies(data, columns=["Location", "WindDir9am", "WindDir3pm", "WindGustDir", "RainToday"], drop_first=True)
# we can also using this to dynamically import all object variables
# data = pd.get_dummies(data, columns=data.select_dtypes(include='object').columns, drop_first=True)

In [86]:
data.shape

(99516, 109)

In [87]:
data.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW,RainToday_Yes
0,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,...,False,False,False,False,False,False,True,False,False,False
1,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,...,False,False,False,False,False,False,False,True,False,False
2,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,...,False,False,False,False,False,False,True,False,False,False
3,14.6,29.7,0.2,56.0,19.0,24.0,55.0,23.0,1009.2,1005.4,...,False,False,False,False,False,False,False,True,False,False
4,7.7,26.7,0.0,35.0,6.0,17.0,48.0,19.0,1013.4,1010.1,...,False,False,False,False,False,False,True,False,False,False


#### Min - Max Scaling
We never do the normalizaation of **Target Variable**, **Object Variables** and **Booleans** etc...  
We only do the normalization of the numerical columns and only in the features(training data)

##### #First let's separate the input data and label(targeted variable) because we only have to 
##### #apply the min-max scaling on the input features and only on the numerical features in the training data  
IMPORTANT: We haven't devided the data into chunks for training and testing  
like 80% for training and 20% for testing etc... we will do after the the Min - Max scaling step

In [88]:
input_data = data.drop("RainTomorrow", axis=1)
label = data["RainTomorrow"]

In [89]:
input_data.shape, label.shape

((99516, 108), (99516,))

In [90]:
input_data.select_dtypes(include=["float"]).shape

(99516, 14)

In [91]:
# It will include all float32 and float64, but in this case we have all in flaot64
float_values = input_data.select_dtypes(include=["float"])

In [92]:
# float_values = float_values / float_values.max()

for one in float_values: 
    input_data[one]= float_values[one] / float_values[one].max()

In [93]:
input_data

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW,RainToday_Yes
0,0.395280,0.476091,0.001617,0.325926,0.153846,0.275862,0.71,0.22,0.968012,0.968738,...,False,False,False,False,False,False,True,False,False,False
1,0.218289,0.521830,0.000000,0.325926,0.030769,0.252874,0.44,0.25,0.970797,0.969411,...,False,False,False,False,False,False,False,True,False,False
2,0.516224,0.671518,0.002695,0.303704,0.053846,0.229885,0.82,0.33,0.970989,0.967680,...,False,False,False,False,False,False,True,False,False,False
3,0.430678,0.617464,0.000539,0.414815,0.146154,0.275862,0.55,0.23,0.969452,0.967103,...,False,False,False,False,False,False,False,True,False,False
4,0.227139,0.555094,0.000000,0.259259,0.046154,0.195402,0.48,0.19,0.973487,0.971624,...,False,False,False,False,False,False,True,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99511,0.235988,0.430353,0.000000,0.303704,0.146154,0.298851,0.56,0.32,0.987608,0.985283,...,False,False,False,False,False,False,False,False,False,False
99512,0.103245,0.453222,0.000000,0.229630,0.115385,0.149425,0.59,0.27,0.984342,0.982301,...,False,False,False,False,False,False,False,False,False,False
99513,0.082596,0.486486,0.000000,0.229630,0.100000,0.126437,0.51,0.24,0.984246,0.981435,...,False,False,False,False,False,False,False,False,False,False
99514,0.106195,0.525988,0.000000,0.162963,0.100000,0.103448,0.56,0.21,0.983189,0.980281,...,False,False,False,False,False,False,False,False,False,False


#### Splitting the whole Data into training and testing data

In [97]:
from sklearn.model_selection import train_test_split

In [98]:
x_train, x_test, y_train, y_test = train_test_split(input_data, label, test_size=0.2, random_state=42)

In [102]:
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(79612, 108) (79612,)
(19904, 108) (19904,)


#### Let's build the Model

In [105]:
model = Sequential()

# The data shape will be the number of features (training data not label)
model.add(Input(shape=(108, ) ))

model.add(Dense(128, activation="relu"))
model.add(Dense(64, activation="relu"))
model.add(Dense(32, activation="relu"))
model.add(Dense(2, activation="sigmoid"))

model.compile(loss="sparse_categorical_crossentropy", optimizer="sgd", metrics=["accuracy"])
model.summary()

#### Training 

In [103]:
x_train

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindGustDir_NW,WindGustDir_S,WindGustDir_SE,WindGustDir_SSE,WindGustDir_SSW,WindGustDir_SW,WindGustDir_W,WindGustDir_WNW,WindGustDir_WSW,RainToday_Yes
54844,0.407080,0.638254,0.000000,0.259259,0.030769,0.229885,0.740000,0.39,0.978290,0.976048,...,False,False,False,False,False,True,False,False,False,False
97612,0.359182,0.725572,0.006342,0.303704,0.069231,0.126437,0.688664,0.51,0.969452,0.967103,...,False,False,False,False,False,False,False,False,False,False
18682,0.345133,0.395010,0.005391,0.207407,0.000000,0.172414,0.990000,0.70,0.981556,0.981050,...,False,False,True,False,False,False,False,False,False,True
428,-0.011799,0.239085,0.000000,0.111111,0.046154,0.000000,0.990000,0.69,0.988665,0.986822,...,False,False,False,False,False,False,False,False,False,False
92461,0.374631,0.523909,0.000000,0.437037,0.030769,0.402299,0.590000,0.65,0.977603,0.976612,...,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,0.082596,0.318087,0.000000,0.318519,0.100000,0.275862,0.770000,0.42,0.986167,0.985187,...,False,False,False,False,True,False,False,False,False,False
54886,0.371681,0.580042,0.000000,0.325926,0.053846,0.195402,0.710000,0.29,0.973679,0.971239,...,False,False,False,False,False,False,False,False,True,False
76820,0.492625,0.677755,0.000000,0.340741,0.069231,0.298851,0.480000,0.32,0.968780,0.967872,...,False,False,False,True,False,False,False,False,False,False
860,0.312684,0.457380,0.000000,0.096296,0.030769,0.068966,0.860000,0.57,0.982421,0.981724,...,False,True,False,False,False,False,False,False,False,False


In [106]:
from time import time

tic = time()
model.fit(x_train, y_train, batch_size=200, epochs=10, verbose=1)
toc = time()

print("Model Taining took {} Secs".format(toc - tic))

Epoch 1/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7462 - loss: 0.5793
Epoch 2/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7745 - loss: 0.5092
Epoch 3/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7793 - loss: 0.4723
Epoch 4/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8004 - loss: 0.4466
Epoch 5/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8083 - loss: 0.4259
Epoch 6/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8198 - loss: 0.4099
Epoch 7/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8264 - loss: 0.3964
Epoch 8/10
[1m399/399[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8305 - loss: 0.3904
Epoch 9/10
[1m399/399[0m [32m━━━━━━━━

#### Let's predict 

In [107]:
y_pred_prob = model.predict(x_test)

[1m622/622[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step


In [113]:
# Let's check the accuracy now
from sklearn.metrics import accuracy_score

# y_pred = np.where(y_pred_prob > 0.5, 1, 0)

# we have to use this line because we didn't converted data into OHE (one hot encoding), just used the
# sparse_categorical_crossentropy, that was fine but now when we have to print the values, we have
# to this line to convert the output into into one-D Array.
y_pred = np.argmax(y_pred_prob, axis=1)

acc_scr = accuracy_score(y_pred, y_test)
rounded = round(acc_scr * 100, 2)

print("Accuracy Score: {}%".format(rounded))

Accuracy Score: 83.59%
