In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import numpy

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [4]:
df = pd.read_csv('bike-data.csv')

In [5]:
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,1/1/11,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,1/1/11,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,1/1/11,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,1/1/11,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,1/1/11,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


# Drop Redundant Variables

In [6]:
df.drop(['casual', 'registered','instant'], axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1/1/11,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,1/1/11,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,1/1/11,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,1/1/11,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,1/1/11,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


# Change into Categorical types

In [8]:
df.dtypes

dteday         object
season          int64
yr              int64
mnth            int64
hr              int64
holiday         int64
weekday         int64
workingday      int64
weathersit      int64
temp          float64
atemp         float64
hum           float64
windspeed     float64
cnt             int64
dtype: object

In [9]:
df['dteday'] = pd.to_datetime(df['dteday'])
df['season'] = df['season'].astype("category")
df['weekday'] = df['weekday'].astype("category")
df['mnth'] = df['mnth'].astype("category")

In [10]:
df.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,16
1,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,40
2,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,32
3,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,13
4,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,1


# Check for Null Values

In [11]:
df.isna().sum(axis = 0)

dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

In [12]:
df.isnull().sum()

dteday        0
season        0
yr            0
mnth          0
hr            0
holiday       0
weekday       0
workingday    0
weathersit    0
temp          0
atemp         0
hum           0
windspeed     0
cnt           0
dtype: int64

# Correlation Matrix and Multicollinearity

In [13]:
df.corr()

Unnamed: 0,yr,hr,holiday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
yr,1.0,-0.003867,0.006692,-0.002196,-0.019157,0.040913,0.039222,-0.083546,-0.00874,0.250495
hr,-0.003867,1.0,0.000479,0.002285,-0.020203,0.137603,0.13375,-0.276498,0.137252,0.394071
holiday,0.006692,0.000479,1.0,-0.252471,-0.017036,-0.02734,-0.030973,-0.010588,0.003988,-0.030927
workingday,-0.002196,0.002285,-0.252471,1.0,0.044672,0.05539,0.054667,0.015688,-0.01183,0.030284
weathersit,-0.019157,-0.020203,-0.017036,0.044672,1.0,-0.10264,-0.105563,0.41813,0.026226,-0.142426
temp,0.040913,0.137603,-0.02734,0.05539,-0.10264,1.0,0.987672,-0.069881,-0.023125,0.404772
atemp,0.039222,0.13375,-0.030973,0.054667,-0.105563,0.987672,1.0,-0.051918,-0.062336,0.400929
hum,-0.083546,-0.276498,-0.010588,0.015688,0.41813,-0.069881,-0.051918,1.0,-0.290105,-0.322911
windspeed,-0.00874,0.137252,0.003988,-0.01183,0.026226,-0.023125,-0.062336,-0.290105,1.0,0.093234
cnt,0.250495,0.394071,-0.030927,0.030284,-0.142426,0.404772,0.400929,-0.322911,0.093234,1.0


In [14]:
df.drop('cnt', axis=1).corrwith(df.cnt)

yr            0.250495
hr            0.394071
holiday      -0.030927
workingday    0.030284
weathersit   -0.142426
temp          0.404772
atemp         0.400929
hum          -0.322911
windspeed     0.093234
dtype: float64

In [15]:
df.drop('temp', axis=1, inplace=True)
df.drop('dteday', axis=1, inplace=True)

In [16]:
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,atemp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.2879,0.81,0.0,16
1,1,0,1,1,0,6,0,1,0.2727,0.8,0.0,40
2,1,0,1,2,0,6,0,1,0.2727,0.8,0.0,32
3,1,0,1,3,0,6,0,1,0.2879,0.75,0.0,13
4,1,0,1,4,0,6,0,1,0.2879,0.75,0.0,1


In [17]:
df = df.rename(columns={"yr": "year", "mnth": "month", "hr": "hour", "hum": "humidity", "cnt": "total", "weathersit": "weather_code", "atemp":"temp"})

In [18]:
df.head()

Unnamed: 0,season,year,month,hour,holiday,weekday,workingday,weather_code,temp,humidity,windspeed,total
0,1,0,1,0,0,6,0,1,0.2879,0.81,0.0,16
1,1,0,1,1,0,6,0,1,0.2727,0.8,0.0,40
2,1,0,1,2,0,6,0,1,0.2727,0.8,0.0,32
3,1,0,1,3,0,6,0,1,0.2879,0.75,0.0,13
4,1,0,1,4,0,6,0,1,0.2879,0.75,0.0,1


# Testing and Training Data

In [19]:
df = pd.get_dummies(df)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('total', axis=1), df.total, test_size=0.33, random_state=5)

# Linear Regression

In [25]:

reg = LinearRegression()
  
reg = reg.fit(X_train, y_train)

y_pred_lr = reg.predict(X_test)


print("R2: ",r2_score(y_test, y_pred_lr))

#RMSE 
print("RMSE: ",np.sqrt(mean_squared_error(y_test, y_pred_lr, squared = 0)))

print("MSE: ",np.sqrt(mean_squared_error(y_test, y_pred_lr)))

R2:  0.40065430181583117
RMSE:  11.949501503368722
MSE:  142.79058617901134


# Random Forest

In [22]:
 
rf = RandomForestRegressor()
rf = rf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)


print("R2: ",r2_score(y_test, y_pred_rf))

 
print("RMSE: ",np.sqrt(mean_squared_error(y_test, y_pred_rf, squared=0)))

print("MSE: ",np.sqrt(mean_squared_error(y_test, y_pred_rf)))

R2:  0.9454611477913304
RMSE:  6.563068521946557
MSE:  43.07386842376577


# Gradient Boost

In [23]:
gb = GradientBoostingRegressor()

gb = gb.fit(X_train, y_train)
 
y_pred_gb = gb.predict(X_test)

print("R2: ",r2_score(y_test, y_pred_gb))

print("RMSE: ",np.sqrt(mean_squared_error(y_test, y_pred_gb, squared = 0)))

print("MSE: ",np.sqrt(mean_squared_error(y_test, y_pred_gb)))

R2:  0.8528616985113731
RMSE:  8.411275824521162
MSE:  70.74956099617415
