In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [2]:
weather_df = pd.read_csv('weatherHistory.csv')

In [3]:
weather_df.head()

Unnamed: 0,Formatted Date,Summary,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars),Daily Summary
0,2006-04-01 00:00:00.000 +0200,Partly Cloudy,rain,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13,Partly cloudy throughout the day.
1,2006-04-01 01:00:00.000 +0200,Partly Cloudy,rain,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63,Partly cloudy throughout the day.
2,2006-04-01 02:00:00.000 +0200,Mostly Cloudy,rain,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94,Partly cloudy throughout the day.
3,2006-04-01 03:00:00.000 +0200,Partly Cloudy,rain,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41,Partly cloudy throughout the day.
4,2006-04-01 04:00:00.000 +0200,Mostly Cloudy,rain,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51,Partly cloudy throughout the day.


In [4]:
weather_df.shape

(96453, 12)

In [5]:
weather_df.isna().sum()

Formatted Date                0
Summary                       0
Precip Type                 517
Temperature (C)               0
Apparent Temperature (C)      0
Humidity                      0
Wind Speed (km/h)             0
Wind Bearing (degrees)        0
Visibility (km)               0
Loud Cover                    0
Pressure (millibars)          0
Daily Summary                 0
dtype: int64

In [6]:
weather_df['Precip Type'].value_counts()

rain    85224
snow    10712
Name: Precip Type, dtype: int64

In [7]:
weather_df.loc[weather_df['Precip Type'].isnull(),'Precip Type'] = 'rain'

In [8]:
weather_df.isnull().sum()

Formatted Date              0
Summary                     0
Precip Type                 0
Temperature (C)             0
Apparent Temperature (C)    0
Humidity                    0
Wind Speed (km/h)           0
Wind Bearing (degrees)      0
Visibility (km)             0
Loud Cover                  0
Pressure (millibars)        0
Daily Summary               0
dtype: int64

In [9]:
weather_df.loc[weather_df['Precip Type']=='rain','Precip Type'] = 1
weather_df.loc[weather_df['Precip Type']=='snow','Precip Type'] = 0

In [10]:
weather_df.dtypes

Formatted Date               object
Summary                      object
Precip Type                  object
Temperature (C)             float64
Apparent Temperature (C)    float64
Humidity                    float64
Wind Speed (km/h)           float64
Wind Bearing (degrees)      float64
Visibility (km)             float64
Loud Cover                  float64
Pressure (millibars)        float64
Daily Summary                object
dtype: object

In [11]:
weather_df['Precip Type'] = pd.to_numeric(weather_df['Precip Type'])

In [12]:
weather_df.drop(['Formatted Date','Summary','Daily Summary'],axis=1,inplace=True)

In [13]:
weather_df.head()

Unnamed: 0,Precip Type,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Loud Cover,Pressure (millibars)
0,1,9.472222,7.388889,0.89,14.1197,251.0,15.8263,0.0,1015.13
1,1,9.355556,7.227778,0.86,14.2646,259.0,15.8263,0.0,1015.63
2,1,9.377778,9.377778,0.89,3.9284,204.0,14.9569,0.0,1015.94
3,1,8.288889,5.944444,0.83,14.1036,269.0,15.8263,0.0,1016.41
4,1,8.755556,6.977778,0.83,11.0446,259.0,15.8263,0.0,1016.51


In [14]:
weather_df.drop(['Loud Cover','Apparent Temperature (C)'],axis=1,inplace=True)

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [16]:
X_train,X_test,y_train,y_test = train_test_split(weather_df.drop('Temperature (C)',axis=1),weather_df['Temperature (C)'],test_size=0.2)

In [17]:
X_train = pd.DataFrame(scaler.fit_transform(X_train))

In [18]:
X_train.columns = weather_df.drop('Temperature (C)',axis=1).columns

In [19]:
 X_train.head()

Unnamed: 0,Precip Type,Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,0.353053,0.282053,1.367609,-1.376145,0.077821,0.074037
1,0.353053,-0.280525,-1.07612,1.046902,1.307417,0.106134
2,0.353053,1.151492,-0.68708,-0.062108,-1.720463,0.03704
3,0.353053,-0.229381,-0.041787,-1.478658,-0.087406,0.13215
4,0.353053,-0.024808,-0.232812,-0.639911,-0.560032,0.012038


In [20]:
model = RandomForestRegressor()
model.fit(X_train,y_train)

RandomForestRegressor()

In [21]:
X_test =pd.DataFrame(scaler.transform(X_test))

In [22]:
X_test.columns = X_train.columns

In [23]:
model.predict(X_test)

array([11.90811111, 13.96766667, 12.999     , ..., 11.6075    ,
       20.37627778, 14.83883333])

In [24]:
model.score(X_test,y_test)

0.7500467116172477

In [25]:
# tip - we can improve score by using various feature engineering techniques and hyperparameter optimization

In [26]:
import pickle

In [27]:
with open('scaler.pkl','wb') as f:
    pickle.dump(scaler,f)

In [28]:
with open('model.pkl','wb') as f:
    pickle.dump(model,f)