In [None]:
#Load libraries
import os
import pandas as pd
import numpy as np  
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import seaborn as sns
from random import randrange, uniform
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error,r2_score

# Reading the Data

In [None]:
# Set the working directory and read the daily dataset (day.csv).
# The directory can be overridden with the DAY_CSV_DIR environment variable so
# the notebook also runs on machines other than the author's; the original
# hard-coded location remains the fallback.
DATA_DIR = os.environ.get("DAY_CSV_DIR", "F://MBA//Edwisor")
# The working directory is still switched, as before, so any relative path used
# later in the session resolves against the data directory.
os.chdir(DATA_DIR)
data = pd.read_csv("day.csv")

# Understanding the Data

In [None]:
# Preview the first five rows to confirm the file parsed into the expected columns
data.head(n=5)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
data.describe()

In [None]:
# (number of rows, number of columns) of the frame as loaded
data.shape

# Outlier Analysis

In [None]:
# Box plot of the month column to look for outlying values
fig, ax = plt.subplots()
ax.boxplot(data['mnth'])
ax.set_ylabel('Month')

In [None]:
# Box plot of the season column to look for outlying values
fig, ax = plt.subplots()
ax.boxplot(data['season'])
ax.set_ylabel('season')

In [None]:
# Box plot of wind speed to look for outlying values
fig, ax = plt.subplots()
ax.boxplot(data['windspeed'])
ax.set_ylabel('windspeed')

In [None]:
# Box plot of the `hum` column (labelled humidity) to look for outlying values
fig, ax = plt.subplots()
ax.boxplot(data['hum'])
ax.set_ylabel('humidity')

# Visualization

In [None]:
# Distribution of wind speed over ten equal-width bins
fig, ax = plt.subplots()
ax.hist(data['windspeed'], bins=10)
ax.set_ylabel('Frequency')
ax.set_xlabel('windspeed')
plt.show()

In [None]:
# Distribution of the target `cnt` over ten equal-width bins
fig, ax = plt.subplots()
ax.hist(data['cnt'], bins=10)
ax.set_ylabel('Frequency')
ax.set_xlabel('cnt')
plt.show()

# Feature Engineering

In [None]:
# Prepare column types for modelling and discard columns that are not used.
#
# Every change is made on a fresh frame and `data` is rebound only on the last
# line. If this cell is run a second time, the drop below fails (the columns
# are already gone) and `data` is left exactly as it was; the earlier statement
# order modified columns in place before the drop had a chance to fail.
prepared = data.drop(['instant', 'casual', 'registered'], axis=1)

# Keep only the day of the month from the date. errors='coerce' turns values
# that cannot be parsed into NaT instead of raising.
prepared['dteday'] = pd.to_datetime(prepared['dteday'], errors='coerce').dt.day

# Target dtype per column: discrete codes become categories (factors), while
# yr, holiday and workingday are cast to integer.
column_dtypes = {
    'season': 'category',
    'yr': 'int',
    'mnth': 'category',
    'holiday': 'int',
    'workingday': 'int',
    'weekday': 'category',
    'weathersit': 'category',
    'dteday': 'category',
}
for column, dtype in column_dtypes.items():
    prepared[column] = prepared[column].astype(dtype)

data = prepared

In [None]:
# Check the first five rows after the type conversions and column removal
data.head(n=5)

# Feature Selection

In [None]:
# Input frame for the correlation plot.
# Plain assignment: df_corr is a second name for the same frame, not a copy.
df_corr = data

In [None]:
# Set the width and height of the plot
f, ax = plt.subplots(figsize=(7, 5))

# Generate the pairwise correlation matrix
corr = df_corr.corr()

# Plot using the seaborn library. The mask is all False, so every cell is drawn.
# The builtin `bool` replaces `np.bool`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (where it raises AttributeError).
sns.heatmap(corr, mask=np.zeros_like(corr, dtype=bool),
            cmap=sns.diverging_palette(220, 10, as_cmap=True),
            square=True, ax=ax)

In [None]:
# Remove `atemp` from the frame (dropped as a correlated variable)
data = data.drop(columns=['atemp'])

# Sampling

In [None]:
# Divide the data into train (80%) and test (20%) sets.
# A fixed random_state makes the shuffle, and therefore every metric computed
# from this split, repeatable across kernel restarts.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Modeling

In [None]:
# Linear regression: build the design frame with dummy (one-hot) columns
cat_names = ["season", "dteday", "weathersit", "mnth", "weekday"]

# Start from the non-categorical predictors. The target is re-attached last so
# it remains the final column, as before.
data_lm = data.drop(cat_names + ['cnt'], axis=1)
for col in cat_names:
    # dtype=float keeps the indicator columns numeric. Recent pandas versions
    # produce bool indicators by default; mixed with numeric columns these turn
    # the design matrix into an object array, which statsmodels' OLS refuses.
    data_lm = data_lm.join(pd.get_dummies(data[col], prefix=col, dtype=float))
data_lm = data_lm.join(data['cnt'])

In [None]:
# Fixed seed so the split and the metrics derived from it are repeatable
trainlm, testlm = train_test_split(data_lm, test_size=0.2, random_state=42)

# Select predictors by name instead of by position. The earlier slice 0:62
# stopped one column short of the target at position 63, so the predictor in
# column 62 was silently left out of the model.
feature_cols = [col for col in data_lm.columns if col != 'cnt']

# Cast to float so bool/uint8 indicator columns cannot turn the design matrix
# into an object array.
LM_model = sm.OLS(trainlm['cnt'], trainlm[feature_cols].astype(float)).fit()
predictions_LM = LM_model.predict(testlm[feature_cols].astype(float))
LM_model.summary()

In [None]:
# First two rows of the dummy-encoded training frame
trainlm.head(n=2)

In [None]:
# First two rows of the training frame used by the non-linear models
train.head(n=2)

In [None]:
# Support vector regression.
# Train on every predictor, i.e. all columns except the target `cnt`. The
# earlier slice 0:10 left out column 10, unlike the decision-tree and
# random-forest cells, which use columns 0:11.
svr_features = [col for col in train.columns if col != 'cnt']
fit_svr = SVR().fit(train[svr_features], train['cnt'])
predictions_SVR = fit_svr.predict(test[svr_features])

In [None]:
# Decision tree: a depth-2 regressor on the first eleven columns, with
# column 11 as the target
X_train_dt = train.iloc[:, :11]
y_train_dt = train.iloc[:, 11]
fit_DT = DecisionTreeRegressor(max_depth=2)
fit_DT.fit(X_train_dt, y_train_dt)
predictions_DT = fit_DT.predict(test.iloc[:, :11])

In [None]:
# Random forest with 200 trees.
# random_state pins the bootstrap samples and feature draws, so the fitted
# forest and the metrics computed from it are repeatable.
RFmodel = RandomForestRegressor(n_estimators=200, random_state=42).fit(train.iloc[:,0:11], train.iloc[:,11])
predictions_RF = RFmodel.predict(test.iloc[:,0:11])

# Evaluation

In [None]:
#Defining Mape function
def MAPE(y_act, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_act : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order as ``y_act``.

    Returns
    -------
    float
        ``mean(|y_act - y_pred| / |y_act|) * 100`` taken over the observations
        whose actual value is non-zero; ``nan`` if there are none.
    """
    # Work on plain float arrays so values are paired by position. With two
    # pandas objects the subtraction would align on index labels instead and
    # yield NaN wherever the labels differ.
    actual = np.asarray(y_act, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    # A zero actual has no defined percentage error, and dividing by it would
    # turn the whole mean into inf/nan, so those observations are left out.
    nonzero = actual != 0
    if not nonzero.any():
        return float('nan')

    mape = np.mean(np.abs((actual[nonzero] - predicted[nonzero]) / actual[nonzero])) * 100
    return mape

In [None]:
def RMSE(y_act, y_pred):
    """Return the root-mean-square error between observed and predicted values.

    Both arguments must support element-wise subtraction (for example NumPy
    arrays or pandas Series).
    """
    residuals = y_act - y_pred
    mean_sq_error = np.mean(np.square(residuals))
    return np.sqrt(mean_sq_error)

In [None]:
# Mean absolute percentage error of the linear model on its held-out rows
MAPE_LM = MAPE(testlm['cnt'], predictions_LM)
MAPE_LM
# Value noted from an earlier run: 19.39625%

In [None]:
# Root-mean-square error of the linear model on its held-out rows
RMSE_LM = RMSE(testlm['cnt'], predictions_LM)
RMSE_LM
# Value noted from an earlier run: 867.33

In [None]:
# Mean absolute percentage error of the decision tree on the test rows
MAPE_DT = MAPE(test['cnt'], predictions_DT)
MAPE_DT
# Value noted from an earlier run: 20.6616%

In [None]:
# Root-mean-square error of the decision tree on the test rows
RMSE_DT = RMSE(test['cnt'], predictions_DT)
RMSE_DT
# Value noted from an earlier run: 1146.90

In [None]:
# Mean absolute percentage error of the random forest on the test rows
# (target addressed by its label, as in the decision-tree cells)
MAPE_RF = MAPE(test['cnt'], predictions_RF)
MAPE_RF
# Value noted from an earlier run: 17.6433%

In [None]:
# Root-mean-square error of the random forest on the test rows
# (target addressed by its label, as in the decision-tree cells)
RMSE_RF = RMSE(test['cnt'], predictions_RF)
RMSE_RF
# Value noted from an earlier run: 710.85

In [None]:
# Mean absolute percentage error of the support vector regressor on the test rows
# (target addressed by its label, as in the decision-tree cells)
MAPE_SVR = MAPE(test['cnt'], predictions_SVR)
MAPE_SVR
# Value noted from an earlier run: 18.41555%

In [None]:
# Root-mean-square error of the support vector regressor on the test rows
# (target addressed by its label, as in the decision-tree cells)
RMSE_SVR = RMSE(test['cnt'], predictions_SVR)
RMSE_SVR
# Value noted from an earlier run: 2012.82

In [None]:
# R^2 of the linear model. r2_score expects (y_true, y_pred) and is not
# symmetric in its arguments, so the observed values must come first.
r2_LM = r2_score(testlm['cnt'], predictions_LM)
r2_LM
# The 0.79989 noted from an earlier run was computed with the arguments reversed.

In [None]:
# R^2 of the decision tree. r2_score expects (y_true, y_pred) and is not
# symmetric in its arguments, so the observed values must come first.
r2_DT = r2_score(test['cnt'], predictions_DT)
r2_DT
# The 0.47284 noted from an earlier run was computed with the arguments reversed.

In [None]:
# R^2 of the random forest. r2_score expects (y_true, y_pred) and is not
# symmetric in its arguments, so the observed values must come first.
r2_RF = r2_score(test['cnt'], predictions_RF)
r2_RF
# The 0.85239 noted from an earlier run was computed with the arguments reversed.

In [None]:
# R^2 of the support vector regressor. r2_score expects (y_true, y_pred) and is
# not symmetric in its arguments, so the observed values must come first.
r2_SVR = r2_score(test['cnt'], predictions_SVR)
r2_SVR
# The 0.6249 noted from an earlier run was computed with the arguments reversed.