**End to End Project - Bikes Assessment - Basic - Importing the libraries**

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
import matplotlib.pyplot as plt
import os

In [2]:
np.random.seed(42)

**Loading the data**

In [3]:
filePath = ("/cxldata/datasets/project/bikes.csv")

In [4]:
bikesData = pd.read_csv(filePath)
bikesData.to_csv("Bikedata.csv", index=False)

In [5]:
bikesData.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


**Perform EDA on the Dataset**

In [6]:
bikesData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     17379 non-null  int64  
 1   dteday      17379 non-null  object 
 2   season      17379 non-null  int64  
 3   yr          17379 non-null  int64  
 4   mnth        17379 non-null  int64  
 5   hr          17379 non-null  int64  
 6   holiday     17379 non-null  int64  
 7   weekday     17379 non-null  int64  
 8   workingday  17379 non-null  int64  
 9   weathersit  17379 non-null  int64  
 10  temp        17379 non-null  float64
 11  atemp       17379 non-null  float64
 12  hum         17379 non-null  float64
 13  windspeed   17379 non-null  float64
 14  casual      17379 non-null  int64  
 15  registered  17379 non-null  int64  
 16  cnt         17379 non-null  int64  
dtypes: float64(4), int64(12), object(1)
memory usage: 2.3+ MB


In [7]:
bikesData['yr'].value_counts()

1    8734
0    8645
Name: yr, dtype: int64

In [8]:
bikesData.describe()

Unnamed: 0,instant,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,8690.0,2.50164,0.502561,6.537775,11.546752,0.02877,3.003683,0.682721,1.425283,0.496987,0.475775,0.627229,0.190098,35.676218,153.786869,189.463088
std,5017.0295,1.106918,0.500008,3.438776,6.914405,0.167165,2.005771,0.465431,0.639357,0.192556,0.17185,0.19293,0.12234,49.30503,151.357286,181.387599
min,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,0.0,0.0,0.0,1.0
25%,4345.5,2.0,0.0,4.0,6.0,0.0,1.0,0.0,1.0,0.34,0.3333,0.48,0.1045,4.0,34.0,40.0
50%,8690.0,3.0,1.0,7.0,12.0,0.0,3.0,1.0,1.0,0.5,0.4848,0.63,0.194,17.0,115.0,142.0
75%,13034.5,3.0,1.0,10.0,18.0,0.0,5.0,1.0,2.0,0.66,0.6212,0.78,0.2537,48.0,220.0,281.0
max,17379.0,4.0,1.0,12.0,23.0,1.0,6.0,1.0,4.0,1.0,1.0,1.0,0.8507,367.0,886.0,977.0


**Cleaning the data - Dropping unwanted features**

In [9]:
columnsToDrop = ['instant', 'casual', 'registered', 'atemp', 'dteday']

In [10]:
bikesData = bikesData.drop(columnsToDrop, axis = 1)

In [11]:
bikesData.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt
0,1,0,1,0,0,6,0,1,0.24,0.81,0.0,16
1,1,0,1,1,0,6,0,1,0.22,0.8,0.0,40
2,1,0,1,2,0,6,0,1,0.22,0.8,0.0,32
3,1,0,1,3,0,6,0,1,0.24,0.75,0.0,13
4,1,0,1,4,0,6,0,1,0.24,0.75,0.0,1


**Divide Dataset into Train and Test set**

In [12]:
np.random.seed(42)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
bikesData['dayCount'] = pd.Series(range(bikesData.shape[0]))/24

In [15]:
bikesData.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,dayCount
0,1,0,1,0,0,6,0,1,0.24,0.81,0.0,16,0.0
1,1,0,1,1,0,6,0,1,0.22,0.8,0.0,40,0.041667
2,1,0,1,2,0,6,0,1,0.22,0.8,0.0,32,0.083333
3,1,0,1,3,0,6,0,1,0.24,0.75,0.0,13,0.125
4,1,0,1,4,0,6,0,1,0.24,0.75,0.0,1,0.166667


In [16]:
train_set, test_set = train_test_split(bikesData, test_size=0.3, random_state=42)

In [17]:
import warnings
warnings.filterwarnings(action= 'ignore')

In [18]:
train_set.sort_values('dayCount', axis= 0, inplace=True)
test_set.sort_values('dayCount', axis= 0, inplace=True)

In [19]:
train_set.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,dayCount
1,1,0,1,1,0,6,0,1,0.22,0.8,0.0,40,0.041667
2,1,0,1,2,0,6,0,1,0.22,0.8,0.0,32,0.083333
4,1,0,1,4,0,6,0,1,0.24,0.75,0.0,1,0.166667
6,1,0,1,6,0,6,0,1,0.22,0.8,0.0,2,0.25
7,1,0,1,7,0,6,0,1,0.2,0.86,0.0,3,0.291667


In [20]:
train_set.shape

(12165, 13)

In [21]:
test_set.shape

(5214, 13)

In [22]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

**Feature Scale the Dataset**   (Basic - Cleaning the data - Feature Scaling)

In [23]:
columnsToScale = ['temp', 'hum', 'windspeed']

In [24]:
scaler = StandardScaler()

In [25]:
train_set[columnsToScale] = scaler.fit_transform(train_set[columnsToScale])
test_set[columnsToScale] = scaler.transform(test_set[columnsToScale])

In [26]:
display_scores(train_set)
display_scores(test_set)

Scores:        season  yr  mnth  hr  holiday  weekday  workingday  weathersit  \
1           1   0     1   1        0        6           0           1   
2           1   0     1   2        0        6           0           1   
4           1   0     1   4        0        6           0           1   
6           1   0     1   6        0        6           0           1   
7           1   0     1   7        0        6           0           1   
...       ...  ..   ...  ..      ...      ...         ...         ...   
17372       1   1    12  17        0        1           1           2   
17373       1   1    12  18        0        1           1           2   
17374       1   1    12  19        0        1           1           2   
17376       1   1    12  21        0        1           1           1   
17378       1   1    12  23        0        1           1           1   

           temp       hum  windspeed  cnt    dayCount  
1     -1.440143  0.892500  -1.552670   40    0.041667  
2  

**Train various Models on the Dataset**

In [27]:
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [28]:
trainingCols = train_set.drop('cnt', axis=1)
trainingCols.head()

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,windspeed,dayCount
1,1,0,1,1,0,6,0,1,-1.440143,0.8925,-1.55267,0.041667
2,1,0,1,2,0,6,0,1,-1.440143,0.8925,-1.55267,0.083333
4,1,0,1,4,0,6,0,1,-1.336557,0.633846,-1.55267,0.166667
6,1,0,1,6,0,6,0,1,-1.440143,0.8925,-1.55267,0.25
7,1,0,1,7,0,6,0,1,-1.543729,1.202885,-1.55267,0.291667


In [29]:
trainingLabels = train_set['cnt'].copy()
trainingLabels.head()

1    40
2    32
4     1
6     2
7     3
Name: cnt, dtype: int64

**Train DecisionTree Model**

In [30]:
dec_reg = DecisionTreeRegressor(random_state=42)

In [31]:
dt_mae_scores = -cross_val_score(dec_reg,
                               trainingCols,trainingLabels,
                               cv=10, scoring="neg_mean_absolute_error")
display_scores(dt_mae_scores)

Scores: [42.94494659 50.37222679 36.95891537 44.26211997 46.99589154 71.98026316
 58.19901316 48.87417763 50.84868421 96.46217105]
Mean: 54.7898409457034
Standard deviation: 16.563759407187572


In [32]:
dt_mse_scores = np.sqrt(-cross_val_score(dec_reg,
                               trainingCols,trainingLabels,
                               cv=10, scoring="neg_mean_squared_error"))
display_scores(dt_mse_scores)

Scores: [ 65.39786583  77.67402864  60.57274567  73.73250527  75.48574011
 113.22922285  96.5884429   82.11639785  86.86752618 149.13680359]
Mean: 88.0801278896052
Standard deviation: 24.927341207369675


**Train Linear Regression Model**

In [33]:
lin_reg = LinearRegression()

In [34]:
lr_mae_scores = -cross_val_score(lin_reg, trainingCols, trainingLabels,
                                cv=10, scoring="neg_mean_absolute_error")
display_scores(lr_mae_scores)

Scores: [ 66.96340699  80.48809095 113.84704981  93.17230086  76.11197672
  96.5220689  133.13798218 158.02254734 158.90195479 127.15674717]
Mean: 110.43241256942319
Standard deviation: 31.42696570529541


In [35]:
lr_mse_scores = np.sqrt(-cross_val_score(lin_reg, trainingCols, trainingLabels,
                                cv=10, scoring="neg_mean_squared_error"))
display_scores(lr_mse_scores)

Scores: [ 84.63836676 111.12038541 131.88324414 119.16350622 105.17621319
 127.72562924 174.97188817 187.31691741 205.60028279 164.30585678]
Mean: 141.19022901181862
Standard deviation: 37.55565075919544


**Train Random Forest Model**

In [36]:
forest_reg = RandomForestRegressor(n_estimators=150, random_state=42)

In [None]:
rf_mae_scores = -cross_val_score(forest_reg, trainingCols, trainingLabels,
                                cv=10, scoring="neg_mean_absolute_error")
display_scores(rf_mae_scores)

In [None]:
rf_mse_scores = np.sqrt(-cross_val_score(forest_reg, trainingCols, trainingLabels,
                                cv=10, scoring="neg_mean_squared_error"))
display_scores(rf_mse_scores)

**Fine Tune the Models**  (Choosing set of hyperparameter combinations for Grid Search)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
param_grid = [{'n_estimators': [120, 150], 'max_features':[10,12], 'max_depth':[15, 28]}]

**Defining GridSearchCV**

In [None]:
grid_search = GridSearchCV(forest_reg, param_grid=param_grid, cv=5, scoring="neg_mean_squared_error")

**Run GridSearchCV**

In [None]:
grid_search.fit(trainingCols, trainingLabels)

In [None]:
print("Best hyperparameters", grid_search.best_params_)
print("Best Estimator", grid_search.best_estimator_)
print("Best Score (Negative MSE):", grid_search.best_score_)

**Knowing Feature Importances**

In [None]:
feature_importances = grid_search.best_estimator_.feature_importances_
print(feature_importances)

**Evaluate the Models** (Preparing to test the final model on Test dataset)

In [None]:
final_model = grid_search.best_estimator_

In [None]:
test_set.sort_values('dayCount', axis= 0, inplace=True)

In [None]:
test_x_cols = (test_set.drop('cnt', axis=1)).columns.values

In [None]:
test_x_cols

In [None]:
test_y_cols = 'cnt'

In [None]:
X_test = test_set.loc[:,test_x_cols]

In [None]:
y_test = test_set.loc[:, test_y_cols]

**Make Predictions on the Test dataset using Final Model**

In [None]:
test_set.loc[:, 'predictedCounts_test'] = final_model.predict(X_test)

In [None]:
mse = mean_squared_error(y_test, test_set.loc[:,'predictedCounts_test'])

In [None]:
final_mse = np.sqrt(mse)
final_mse

In [None]:
test_set.head()

In [None]:
times = [9,18]
for time in times:
    fig = plt.figure(figsize=(8, 6))
    fig.clf()
    ax = fig.gca()
    test_set_freg_time = test_set[test_set.hr == time]
    test_set_freg_time.plot(kind = 'line', x = 'dayCount', y = 'cnt', ax = ax)
    test_set_freg_time.plot(kind = 'line', x = 'dayCount', y = 'predictedCounts_test', ax =ax)
    plt.show()

<font size=5>**Author:**</font>

- **Prince Raj**