In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt
%matplotlib inline

In [3]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
import tensorflow as tf
from tensorflow import keras

In [5]:
from sklearn import preprocessing

In [46]:
weather_df = pd.read_csv("weatherHistory.csv") #importing dataset

In [47]:
weather_df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251,15.8263,0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259,15.8263,0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204,14.9569,0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269,15.8263,0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259,15.8263,0,1016.51,Partly cloudy throughout the day.


In [8]:
weather_df.columns #show all columns

Index(['Formatted Date', 'Summary', 'Precip Type', 'Temperature (C)',
       'Apparent Temperature (C)', 'Humidity', 'Wind Speed (km/h)',
       'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover',
       'Pressure (millibars)', 'Daily Summary'],
      dtype='object')

In [9]:
weather_df.shape

(96453, 12)

In [10]:
weather_df.describe()

Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
count,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0,96453.0
mean,11.932678,10.855029,0.734899,10.81064,187.509232,10.347325,0.0,1003.235956
std,9.551546,10.696847,0.195473,6.913571,107.383428,4.192123,0.0,116.969906
min,-21.822222,-27.716667,0.0,0.0,0.0,0.0,0.0,0.0
25%,4.688889,2.311111,0.6,5.8282,116.0,8.3398,0.0,1011.9
50%,12.0,12.0,0.78,9.9659,180.0,10.0464,0.0,1016.45
75%,18.838889,18.838889,0.89,14.1358,290.0,14.812,0.0,1021.09
max,39.905556,39.344444,1.0,63.8526,359.0,16.1,0.0,1046.38


## Preprocess DataSet

In [11]:
weather_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 96453 entries, 0 to 96452
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Formatted Date            96453 non-null  object 
 1   Summary                   96453 non-null  object 
 2   Precip Type               95936 non-null  object 
 3   Temperature (C)           96453 non-null  float64
 4   Apparent Temperature (C)  96453 non-null  float64
 5   Humidity                  96453 non-null  float64
 6   Wind Speed (km/h)         96453 non-null  float64
 7   Wind Bearing (degrees)    96453 non-null  int64  
 8   Visibility (km)           96453 non-null  float64
 9   Loud Cover                96453 non-null  int64  
 10  Pressure (millibars)      96453 non-null  float64
 11  Daily Summary             96453 non-null  object 
dtypes: float64(6), int64(2), object(4)
memory usage: 8.8+ MB


In [12]:
weather_df.isnull().any() #finding is there any null value attribute

Formatted Date              False
Summary                     False
Precip Type                  True
Temperature (C)             False
Apparent Temperature (C)    False
Humidity                    False
Wind Speed (km/h)           False
Wind Bearing (degrees)      False
Visibility (km)             False
Loud Cover                  False
Pressure (millibars)        False
Daily Summary               False
dtype: bool

In [13]:
weather_df.isnull().all() #finding is there fully null value attribute

Formatted Date              False
Summary                     False
Precip Type                 False
Temperature (C)             False
Apparent Temperature (C)    False
Humidity                    False
Wind Speed (km/h)           False
Wind Bearing (degrees)      False
Visibility (km)             False
Loud Cover                  False
Pressure (millibars)        False
Daily Summary               False
dtype: bool

In [14]:
round(100*(weather_df.isnull().sum()/len(weather_df.index)),2) #getting the percentage of null value

Formatted Date              0.00
Summary                     0.00
Precip Type                 0.54
Temperature (C)             0.00
Apparent Temperature (C)    0.00
Humidity                    0.00
Wind Speed (km/h)           0.00
Wind Bearing (degrees)      0.00
Visibility (km)             0.00
Loud Cover                  0.00
Pressure (millibars)        0.00
Daily Summary               0.00
dtype: float64

In [15]:
weather_df['Precip Type'].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

In [16]:
weather_df.loc[weather_df['Precip Type'].isnull(),'Precip Type']='rain' #set null value to rain

In [17]:
round(100*(weather_df.isnull().sum()/len(weather_df.index)),2)  #recheck the percentage of null values

Formatted Date              0.0
Summary                     0.0
Precip Type                 0.0
Temperature (C)             0.0
Apparent Temperature (C)    0.0
Humidity                    0.0
Wind Speed (km/h)           0.0
Wind Bearing (degrees)      0.0
Visibility (km)             0.0
Loud Cover                  0.0
Pressure (millibars)        0.0
Daily Summary               0.0
dtype: float64

In [18]:
weather_df.loc[weather_df['Precip Type']=='rain','Precip Type']=1 #setting the rain to 1
weather_df.loc[weather_df['Precip Type']=='snow','Precip Type']=0 #setting the snow to 0

In [19]:
weather_df_num=weather_df[list(weather_df.dtypes[weather_df.dtypes!='object'].index)]

In [20]:
weather_y = weather_df_num.pop('Temperature (C)')
weather_x = weather_df_num

# Spliting the data for training and testing

In [21]:
train_x,test_x,train_y,test_y = train_test_split(weather_x,weather_y,test_size = 0.2, random_state=4) #set 20% for testing

In [22]:
train_x.head()

Unnamed: 0,Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
70626,21.061111,0.31,12.558,110,16.1,0,1005.87
52457,25.016667,0.36,18.4989,352,10.3523,0,1025.36
90690,0.738889,0.89,17.1304,270,15.8263,0,1014.75
69528,13.772222,0.78,14.49,300,15.8263,0,1014.56
92419,23.288889,0.82,6.3917,357,16.1,0,1022.05


## Linear Regression Model

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
model = LinearRegression()
model.fit(train_x,train_y)

LinearRegression()

In [25]:
prediction = model.predict(test_x)

In [26]:
er_lr = np.mean((prediction-test_y)**2)
er_lr

0.9022743711593206

In [27]:
pd.DataFrame({'actual':test_y,
             'prediction':prediction,
             'diff':(test_y-prediction)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-3.355714,1.066825
86534,8.861111,9.418530,-0.557419
2082,9.805556,9.701321,0.104235
53130,27.222222,27.096837,0.125385
45196,17.705556,17.302053,0.403503
...,...,...,...
55387,-10.066667,-11.960987,1.894320
25976,9.972222,9.831699,0.140523
32423,12.777778,15.436904,-2.659127
26570,15.277778,16.307788,-1.030010


## Polynomial Regression Model

In [28]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree = 4)
x_poly = poly.fit_transform(train_x)

poly.fit(x_poly,train_y)
lin2 = LinearRegression()
lin2.fit(x_poly, train_y)

LinearRegression()

In [29]:
prediction2 = lin2.predict(poly.fit_transform(test_x))

In [30]:
er_pr = np.mean((prediction2-test_y)**2)
er_pr

0.1460947058589796

In [31]:
pd.DataFrame({'actual':test_y,
             'prediction':prediction2,
             'diff':(test_y-prediction2)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.182929,-0.105959
86534,8.861111,9.111593,-0.250482
2082,9.805556,9.600726,0.204830
53130,27.222222,27.152936,0.069286
45196,17.705556,17.757491,-0.051935
...,...,...,...
55387,-10.066667,-10.148966,0.082299
25976,9.972222,9.772182,0.200040
32423,12.777778,13.271602,-0.493824
26570,15.277778,15.421842,-0.144064


## Decision Tree Regressor

In [32]:
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(train_x,train_y)

DecisionTreeRegressor(random_state=0)

In [33]:
prediction3 = regressor.predict(test_x)

In [34]:
er_dt = np.mean((prediction3-test_y)**2)
er_dt

0.006845039361290016

In [35]:
pd.DataFrame({'actual':test_y,
             'prediction':prediction3,
             'diff':(test_y-prediction3)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.266667,-2.222222e-02
86534,8.861111,8.855556,5.555555e-03
2082,9.805556,9.788889,1.666667e-02
53130,27.222222,27.222222,0.000000e+00
45196,17.705556,17.705556,3.552714e-15
...,...,...,...
55387,-10.066667,-10.000000,-6.666667e-02
25976,9.972222,9.972222,0.000000e+00
32423,12.777778,12.777778,-6.217249e-14
26570,15.277778,15.277778,0.000000e+00


## Random Forest With Max Depth=10

In [36]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=10, random_state=0, n_estimators=100)
regr.fit(train_x, train_y)

RandomForestRegressor(max_depth=10, random_state=0)

In [37]:
prediction4 = regr.predict(test_x)

In [38]:
er_rf10 = np.mean((prediction4-test_y)**2)
er_rf10

0.007167585970076116

In [39]:
pd.DataFrame({'actual':test_y,
             'prediction':prediction4,
             'diff':(test_y-prediction4)})

Unnamed: 0,actual,prediction,diff
37443,-2.288889,-2.366102,0.077213
86534,8.861111,8.892260,-0.031149
2082,9.805556,9.847246,-0.041690
53130,27.222222,27.201229,0.020993
45196,17.705556,17.713716,-0.008160
...,...,...,...
55387,-10.066667,-10.099056,0.032389
25976,9.972222,9.913953,0.058269
32423,12.777778,12.781718,-0.003940
26570,15.277778,15.275648,0.002130


# Random Forest With Max Depth=50

In [40]:
from sklearn.ensemble import RandomForestRegressor
regr = RandomForestRegressor(max_depth=50, random_state=0, n_estimators=100)
regr.fit(train_x, train_y)

RandomForestRegressor(max_depth=50, random_state=0)

In [41]:
prediction5 = regr.predict(test_x)

In [42]:
er_rf50 = np.mean((prediction5-test_y)**2)
er_rf50

0.0015859321345731785

pd.DataFrame({'actual':test_y,
             'prediction':prediction4,
             'diff':(test_y-prediction4)})

In [43]:
print("ERROR \n\n")
print("LINEAR REGRESSION = ",round(er_lr,3),"\n")
print("POLYNOMIAL REGRESSION = ",round(er_pr,3),"\n")
print("DECISON TREE = ",round(er_dt,3),"\n")
print("RANDOM FOREST WITH MAX_DEPTH 10 = ",round(er_rf10,3),"\n")
print("RANDOM FOREST WITH MAX_DEPTH 50 = ",round(er_rf50,3),"\n")

ERROR 


LINEAR REGRESSION =  0.902 

POLYNOMIAL REGRESSION =  0.146 

DECISON TREE =  0.007 

RANDOM FOREST WITH MAX_DEPTH 10 =  0.007 

RANDOM FOREST WITH MAX_DEPTH 50 =  0.002 



# Thankyou