# Importing necessary libraries

In [1]:
# pandas
import pandas as pd

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor

# GridSearchCV to find optimal min_samples_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.feature_selection import RFE

# Importing train-test-split 
from sklearn.model_selection import train_test_split

# Evaluation metrics
from sklearn.metrics import *

# Ignoring warning
import sys
import warnings

if not sys.warnoptions:
    warnings.simplefilter("ignore")

# Importing the data

In [2]:
df=pd.read_csv('hour.csv',index_col=0)
df.head()

Unnamed: 0_level_0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [3]:
df.drop(columns=['dteday','yr','casual','registered'],axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,season,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
2,1,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
3,1,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
4,1,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
5,1,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


# Model building

In [4]:
df.dtypes

season          int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object

In [5]:
df_objects=['season','mnth','hr','holiday','weekday','workingday','weathersit']
for i in df_objects:
    df[i]=df[i].apply(lambda x:str(x))

In [6]:
df.dtypes

season         object
mnth           object
hr             object
holiday        object
weekday        object
workingday     object
weathersit     object
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object

# Building base model

In [7]:
def model(model_name,X,Y):
    x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3,random_state=123)
    model_name.fit(x_train,y_train)
    y_pred=model_name.predict(x_test)
    mse=mean_squared_error(y_test,y_pred)
    
    print('The rmse score for the model=',np.sqrt(mse))
    print('---------------------------------------')
    print('The r^2 value for the model=',r2_score(y_test,y_pred))

In [8]:
df1=df.copy()

In [9]:
x=df1.drop('cnt',axis=1)
y=df1['cnt']

In [10]:
model(LinearRegression(),x,y)

The rmse score for the model= 150.28181522182192
---------------------------------------
The r^2 value for the model= 0.32399854121719174


In [11]:
model(RandomForestRegressor(),x,y)

The rmse score for the model= 70.85078352596707
---------------------------------------
The r^2 value for the model= 0.8497466777115262


In [12]:
model(DecisionTreeRegressor(),x,y)

The rmse score for the model= 92.43828299062821
---------------------------------------
The r^2 value for the model= 0.744236533925386


In [13]:
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [14]:
data=pd.get_dummies(df1,drop_first=True)
data.head()

Unnamed: 0_level_0,temp,atemp,hum,windspeed,cnt,season_2,season_3,season_4,mnth_10,mnth_11,...,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6,workingday_1,weathersit_2,weathersit_3,weathersit_4
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.24,0.2879,0.81,0.0,16,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0.22,0.2727,0.8,0.0,40,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,0.22,0.2727,0.8,0.0,32,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,0.24,0.2879,0.75,0.0,13,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
5,0.24,0.2879,0.75,0.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [15]:
x1=data.drop('cnt',axis=1)
y1=data['cnt']

In [16]:
data.columns

Index(['temp', 'atemp', 'hum', 'windspeed', 'cnt', 'season_2', 'season_3',
       'season_4', 'mnth_10', 'mnth_11', 'mnth_12', 'mnth_2', 'mnth_3',
       'mnth_4', 'mnth_5', 'mnth_6', 'mnth_7', 'mnth_8', 'mnth_9', 'hr_1',
       'hr_10', 'hr_11', 'hr_12', 'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17',
       'hr_18', 'hr_19', 'hr_2', 'hr_20', 'hr_21', 'hr_22', 'hr_23', 'hr_3',
       'hr_4', 'hr_5', 'hr_6', 'hr_7', 'hr_8', 'hr_9', 'holiday_1',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'workingday_1', 'weathersit_2', 'weathersit_3',
       'weathersit_4'],
      dtype='object')

In [17]:
m1 = sm.OLS(y1,sm.add_constant(x1),family=sm.families.Gaussian())
model1 = m1.fit()
model1.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.631
Dependent Variable:,cnt,AIC:,212799.7616
Date:,2019-05-27 17:35,BIC:,213203.4385
No. Observations:,17379,Log-Likelihood:,-106350.0
Df Model:,51,F-statistic:,584.2
Df Residuals:,17327,Prob (F-statistic):,0.0
R-squared:,0.632,Scale:,12134.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,-27.4742,7.0955,-3.8721,0.0001,-41.3821,-13.5663
temp,195.1921,31.9175,6.1155,0.0000,132.6305,257.7537
atemp,103.9531,33.1547,3.1354,0.0017,38.9666,168.9397
hum,-114.9024,5.9797,-19.2155,0.0000,-126.6232,-103.1817
windspeed,-43.2351,7.6305,-5.6661,0.0000,-58.1916,-28.2787
season_2,39.4906,5.2578,7.5109,0.0000,29.1847,49.7964
season_3,28.8578,6.2248,4.6359,0.0000,16.6565,41.0590
season_4,66.0479,5.2858,12.4953,0.0000,55.6871,76.4086
mnth_10,3.9416,7.0161,0.5618,0.5743,-9.8106,17.6939

0,1,2,3
Omnibus:,1809.005,Durbin-Watson:,0.433
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3761.709
Skew:,0.665,Prob(JB):,0.0
Kurtosis:,4.85,Condition No.:,1.4323823030256348e+16


In [18]:
#Removing variables that are insignificant i,e removing variables which have p value >0.05
x2=x1.drop(['mnth_10','mnth_12','mnth_2','mnth_3','mnth_4','mnth_5','mnth_8','mnth_9','weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weathersit_4'],axis=1)

In [19]:
m2 = sm.OLS(y1,sm.add_constant(x2),family=sm.families.Gaussian())
model2 = m2.fit()
model2.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.63
Dependent Variable:,cnt,AIC:,212858.7973
Date:,2019-05-27 17:35,BIC:,213161.555
No. Observations:,17379,Log-Likelihood:,-106390.0
Df Model:,38,F-statistic:,778.6
Df Residuals:,17340,Prob (F-statistic):,0.0
R-squared:,0.630,Scale:,12185.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,-31.6897,6.7327,-4.7068,0.0000,-44.8866,-18.4929
temp,200.9236,30.5736,6.5718,0.0000,140.9963,260.8509
atemp,104.7610,32.8104,3.1929,0.0014,40.4494,169.0726
hum,-111.2083,5.8918,-18.8752,0.0000,-122.7568,-99.6598
windspeed,-43.5023,7.5930,-5.7292,0.0000,-58.3854,-28.6191
season_2,35.1906,3.0797,11.4266,0.0000,29.1541,41.2272
season_3,23.8376,4.0566,5.8762,0.0000,15.8863,31.7890
season_4,65.8857,3.0263,21.7707,0.0000,59.9538,71.8177
mnth_11,-15.6438,3.6701,-4.2624,0.0000,-22.8376,-8.4499

0,1,2,3
Omnibus:,1860.341,Durbin-Watson:,0.431
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3923.71
Skew:,0.677,Prob(JB):,0.0
Kurtosis:,4.894,Condition No.:,89.0


In [20]:
def vif_cal(input_data, dependent_col):
    vif_df = pd.DataFrame( columns = ['Var', 'Vif'])
    x_vars=input_data
    xvar_names=x_vars.columns
    for i in range(0,xvar_names.shape[0]):
        y=x_vars[xvar_names[i]] 
        x=x_vars[xvar_names.drop(xvar_names[i])]
        rsq=sm.OLS(y,x).fit().rsquared  
        vif=round(1/(1-rsq),2)
        vif_df.loc[i] = [xvar_names[i], vif]
    return vif_df.sort_values(by = 'Vif', axis=0, ascending=False, inplace=False)

In [21]:
vif_cal(x2,y1)

Unnamed: 0,Var,Vif
1,atemp,380.54
0,temp,375.48
2,hum,14.93
3,windspeed,3.68
33,holiday_1,0.58
18,hr_17,0.5
17,hr_16,0.5
16,hr_15,0.5
15,hr_14,0.5
14,hr_13,0.49


In [22]:
x3=x2.drop(['temp'],axis=1)
vif_cal(x3,y1)

Unnamed: 0,Var,Vif
0,atemp,25.14
1,hum,14.73
2,windspeed,3.52
32,holiday_1,0.58
17,hr_17,0.5
16,hr_16,0.5
15,hr_15,0.5
14,hr_14,0.5
12,hr_12,0.49
29,hr_7,0.49


In [23]:
x3['atemp']=(x2['atemp']**7)*10
x3['hum']=x2['hum']**5
x3['windspeed']=x2['windspeed']

In [24]:
x3.head()

Unnamed: 0_level_0,atemp,hum,windspeed,season_2,season_3,season_4,mnth_11,mnth_6,mnth_7,hr_1,...,hr_5,hr_6,hr_7,hr_8,hr_9,holiday_1,weekday_6,workingday_1,weathersit_2,weathersit_3
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.001639,0.348678,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0.001121,0.32768,0.0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,0.001121,0.32768,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,0.001639,0.237305,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,0.001639,0.237305,0.0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [25]:
vif_cal(x3,y1)
#After doing feature engineering now vif for all variables are low

Unnamed: 0,Var,Vif
2,windspeed,3.57
1,hum,2.78
0,atemp,2.43
32,holiday_1,0.57
15,hr_15,0.42
17,hr_17,0.42
16,hr_16,0.42
12,hr_12,0.42
13,hr_13,0.42
14,hr_14,0.42


In [26]:
x3.columns

Index(['atemp', 'hum', 'windspeed', 'season_2', 'season_3', 'season_4',
       'mnth_11', 'mnth_6', 'mnth_7', 'hr_1', 'hr_10', 'hr_11', 'hr_12',
       'hr_13', 'hr_14', 'hr_15', 'hr_16', 'hr_17', 'hr_18', 'hr_19', 'hr_2',
       'hr_20', 'hr_21', 'hr_22', 'hr_23', 'hr_3', 'hr_4', 'hr_5', 'hr_6',
       'hr_7', 'hr_8', 'hr_9', 'holiday_1', 'weekday_6', 'workingday_1',
       'weathersit_2', 'weathersit_3'],
      dtype='object')

In [27]:
model(LinearRegression(),x3,y1)

The rmse score for the model= 116.17066947319256
---------------------------------------
The r^2 value for the model= 0.5960498692346222


In [28]:
m3 = sm.OLS(y1,sm.add_constant(x3),family=sm.families.Gaussian())
model3 = m3.fit()
model3.summary2()

0,1,2,3
Model:,OLS,Adj. R-squared:,0.603
Dependent Variable:,cnt,AIC:,214083.3329
Date:,2019-05-27 17:35,BIC:,214378.3276
No. Observations:,17379,Log-Likelihood:,-107000.0
Df Model:,37,F-statistic:,713.2
Df Residuals:,17341,Prob (F-statistic):,0.0
R-squared:,0.603,Scale:,13075.0

0,1,2,3,4,5,6
,Coef.,Std.Err.,t,P>|t|,[0.025,0.975]
const,-5.4549,5.2922,-1.0307,0.3027,-15.8282,4.9184
atemp,11.4902,2.3203,4.9521,0.0000,6.9422,16.0381
hum,-81.2752,4.9437,-16.4403,0.0000,-90.9653,-71.5851
windspeed,-38.6645,7.5735,-5.1053,0.0000,-53.5093,-23.8198
season_2,97.4198,2.6136,37.2738,0.0000,92.2968,102.5427
season_3,120.5882,3.0365,39.7129,0.0000,114.6363,126.5400
season_4,107.1492,2.8442,37.6733,0.0000,101.5743,112.7241
mnth_11,-38.6625,3.7402,-10.3371,0.0000,-45.9936,-31.3314
mnth_6,12.8126,3.4602,3.7028,0.0002,6.0302,19.5949

0,1,2,3
Omnibus:,1890.058,Durbin-Watson:,0.402
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3901.356
Skew:,0.693,Prob(JB):,0.0
Kurtosis:,4.862,Condition No.:,35.0


### Random forest

In [29]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.3,random_state=123)

rf=RandomForestRegressor()
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
rmse

70.52490292878376

In [51]:
#Tuning max depth
parameters = {'max_depth': range(1, 30)}
model_rf = GridSearchCV(RandomForestRegressor(),parameters, cv=5)
model_rf.fit(x_train, y_train)

print(model_rf.best_estimator_)
print(model_rf.score(x_train, y_train))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=16,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
0.9567683977033107


In [52]:
#Tuning max depth
parameters = {'max_features': range(1,11)}
model_rf = GridSearchCV(RandomForestRegressor(),parameters, cv=5)
model_rf.fit(x_train, y_train)

print(model_rf.best_estimator_)
print(model_rf.score(x_train, y_train))

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features=10, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=None, oob_score=False,
           random_state=None, verbose=0, warm_start=False)
0.9722892884557659


In [56]:
rf=RandomForestRegressor(max_depth=16,max_features=10,n_estimators=200)
rf.fit(x_train,y_train)
y_pred=rf.predict(x_test)
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
rmse

67.1434663360404

In [60]:
from sklearn import metrics

In [75]:
neg_scores=cross_val_score(rf,x_train,y_train,cv=10,scoring='neg_mean_squared_error')
scores=-1*neg_scores
rmse=np.sqrt(scores)

In [76]:
print(rmse.mean())

67.83612727522998
