In [1]:
#This binary calssification model is used to predict whether it is going to rain the next day or not
#This model works with an accuracy of 77.85%.

In [2]:
#Importing the required modules for binary classification model
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
#We read from our dataset which is cities.csv file
#ds stands for dataset
ds=pd.read_csv('cities.csv')
ds.shape

(142241, 24)

In [4]:
#Information on the table given as dataset
ds.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [5]:
#we need to sort it to know how many null values each column has:
ds.count().sort_values()

Sunshine          74425
Evaporation       81398
Cloud3pm          85147
Cloud9am          88584
Pressure9am      128227
Pressure3pm      128260
WindDir9am       132228
WindGustDir      132911
WindGustSpeed    132971
WindDir3pm       138463
Humidity3pm      138631
Temp3pm          139515
WindSpeed3pm     139611
Humidity9am      140467
RainToday        140835
Rainfall         140835
WindSpeed9am     140893
Temp9am          141337
MinTemp          141604
MaxTemp          141919
Date             142241
Location         142241
RISK_MM          142241
RainTomorrow     142241
dtype: int64

In [6]:
#Here we drop all those colums which have less than 70% of 142241 rows
#That is all columns less than 99569 are dropped
#The column 'RISK_MM' can interrupt in identifying the rain tomorrow as it is most probable rain which can occur on next day
#It can be true or false as it is just a probability so we drop that column also.
ds=ds.drop(columns=['Sunshine','Evaporation','RISK_MM','Cloud3pm','Cloud9am'],axis=1)
ds.shape

(142241, 19)

In [7]:
#Hence we have dropped 5 columns.So total columns now are 19.
#Now we have 19 columns but still have null data in them.

In [8]:
#PREPROCESSING STAGE:
#The null data in numerical columns is replaced by median.
#The categorical data is replaced by required one.
num=[]
cat=[]
def is_number(column_name):
    for index,row in ds[column_name].iteritems(): 
        try:
            float(row)   # Type-casting the string to `float`.
                       # If string is not a valid `float`, 
                       # it'll raise `ValueError` exception
        except ValueError:
            return False
        return True
for i in ds.columns:
    if is_number(str(i)):
        num.append(i)
    else:
        cat.append(i)
print(num)
print(cat)

['MinTemp', 'MaxTemp', 'Rainfall', 'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Temp9am', 'Temp3pm']
['Date', 'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday', 'RainTomorrow']


In [9]:
for i in num:
    for index,row in ds[i].iteritems():
        if(row=='nan'or row==i or row=='NaN'):
            ds[i].loc[index]=0

In [10]:
#Here we convert the categorical data also into numerical based on dictionary given below
d={}
x=0
for i in cat[2:5]:
    for index,row in ds[i].iteritems():
        if row not in d:
            d[row]=x
            x+=1
    ds[i]=ds[i].map(d)
print(d)

{'W': 0, 'WNW': 1, 'WSW': 2, 'NE': 3, 'NNW': 4, 'N': 5, 'NNE': 6, 'SW': 7, 'ENE': 8, 'SSE': 9, 'S': 10, 'NW': 11, 'SE': 12, 'ESE': 13, nan: 14, 'E': 15, 'SSW': 16, 'WindGustDir': 17, 'WindDir9am': 18, 'WindDir3pm': 19}


In [11]:
#We drop few categorical data as cannot be converted to string and are no use for us and RainToday will specifically mislead the data  
ds=ds.drop(columns=['Location', 'Date','RainToday'],axis=1)
ds.shape

(142241, 16)

In [12]:
#Now we shuffle and split train and test set as 80% and 20% of the given data
train, test = train_test_split(ds, test_size = 0.2, random_state = 123)

In [13]:
#Few features of train set
train.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainTomorrow
7301,16.6,22.8,0.0,16,37.0,16,7,17.0,22.0,,22.0,1012.8,1013.0,17.3,20.2,No
21097,17.9,22.3,0.4,13,44.0,13,13,24.0,24.0,54.0,58.0,1023.7,1021.9,20.3,20.8,No
107312,12.8,20.1,0.0,14,,4,1,13.0,19.0,51.0,92.0,1011.8,1008.3,15.7,14.4,Yes
124067,7.9,13.2,12.6,7,69.0,16,7,26.0,17.0,74.0,58.0,1015.0,1016.5,9.8,12.0,No
68666,14.3,20.9,0.0,3,31.0,15,6,11.0,22.0,77.0,83.0,1013.8,1010.5,17.2,19.5,Yes


In [14]:
#Few features of test set
test.head()

Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainTomorrow
30929,20.8,26.9,29.6,15,26.0,9,15,13.0,15.0,90.0,68.0,1014.0,1013.4,22.5,26.4,Yes
120054,22.6,29.0,0.0,7,35.0,12,7,13.0,20.0,63.0,51.0,1016.8,1015.1,23.6,25.4,No
57991,-0.8,11.5,1.4,1,35.0,0,1,11.0,15.0,88.0,64.0,1024.1,1023.9,3.8,11.2,No
37904,10.2,33.5,0.0,2,26.0,15,1,15.0,9.0,48.0,7.0,1015.4,1011.4,21.6,31.9,No
41864,16.0,22.7,0.0,12,19.0,9,13,9.0,11.0,76.0,75.0,1028.2,1026.4,20.5,22.3,No


In [15]:
#Now we create feature set and target to train model.
x_train=train.drop('RainTomorrow',axis=1)
y=train['RainTomorrow'].copy()
y_train=y.map({'Yes':1,'No':0,'RainTomorrow':0})

In [16]:
#Same thing for test model 
x_test=test.drop('RainTomorrow',axis=1)
z=test['RainTomorrow'].copy()
y_test=z.map({'Yes':1,'No':0,'RainTomorrow':0})

In [17]:
#No.of columns
n=x_train.shape[1]
y_train=to_categorical(y_train)
y_test=to_categorical(y_test)

In [18]:
#Now we BUILD NEURAL NETWORK(both of model works and gives almostsame accracy)
model=keras.Sequential([
    keras.layers.Dense(10,activation=tf.nn.relu,input_shape=(n,)),
    keras.layers.Dense(10,activation=tf.nn.relu),
    keras.layers.Dense(2,activation=tf.nn.softmax)
])
'''
           (or)
model=keras.Sequential([
    keras.layers.Dense(60, input_dim=15, kernel_initializer='normal', activation='relu'),
    keras.layers.Dense(2, kernel_initializer='normal', activation='sigmoid')
])
'''

W0719 10:53:22.873744 11072 deprecation.py:506] From d:\anaconda\envs\tensorflow\lib\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


"\nmodel=keras.Sequential([\n    keras.layers.Dense(60, input_dim=15, kernel_initializer='normal', activation='relu'),\n    keras.layers.Dense(2, kernel_initializer='normal', activation='sigmoid')\n])\n"

In [19]:
#COMPILATION code:
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['accuracy'])

In [20]:
#TRAINING the model:
model.fit(x_train,y_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x2461fc10438>

In [21]:
#TESTING THE MODEL:
test_loss,test_acc=model.evaluate(x_test,y_test)
print("Accuracy of the model:",(test_acc*100))

Accuracy of the model: 77.84807682037354
