# Car Rental Demand Forecast
Group Members:

Rekha C R

Abhijith P

Ramya K V

Joel Sunny

In [83]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [84]:
# Read the datset into python
train= pd.read_csv('train_E1GspfA.csv')
test=pd.read_csv('test_6QvDdzb.csv')

In [85]:
test

Unnamed: 0,date,hour
0,2021-03-01,0
1,2021-03-01,1
2,2021-03-01,2
3,2021-03-01,3
4,2021-03-01,5
...,...,...
7645,2022-03-28,19
7646,2022-03-28,20
7647,2022-03-28,21
7648,2022-03-28,22


In [86]:
# Checking the datatypes of varaibles
train.dtypes

date      object
hour       int64
demand     int64
dtype: object

In [87]:
#Converting date variable to datetime object
train['date'] = pd.to_datetime(train['date'])
train.dtypes

date      datetime64[ns]
hour               int64
demand             int64
dtype: object

In [88]:
#Extracting datetime features from the date variable__train data
train['year'] = train['date'].apply(lambda x:x.year)
train['month'] = train['date'].apply(lambda x:x.month)
train['day'] = train['date'].apply(lambda x:x.day)
train['dayOfWeek'] = train['date'].apply(lambda x:x.dayofweek)
train['isQuarterDate'] = train['date'].apply(lambda x:x.quarter)
train['isWeekend'] = train['dayOfWeek'].apply(lambda x:1 if x>4 else 0)
train.drop(['date'], axis=1, inplace = True)

train

Unnamed: 0,hour,demand,year,month,day,dayOfWeek,isQuarterDate,isWeekend
0,9,91,2018,8,18,5,3,1
1,10,21,2018,8,18,5,3,1
2,13,23,2018,8,18,5,3,1
3,14,104,2018,8,18,5,3,1
4,15,81,2018,8,18,5,3,1
...,...,...,...,...,...,...,...,...
18242,19,95,2021,2,28,6,1,1
18243,20,88,2021,2,28,6,1,1
18244,21,39,2021,2,28,6,1,1
18245,22,104,2021,2,28,6,1,1


In [89]:
#Converting date variable to datetime object
test['date'] = pd.to_datetime(test['date'])  
#Extracting datetime features from the date variable_test data
test['year'] = test['date'].apply(lambda x:x.year)
test['month'] = test['date'].apply(lambda x:x.month)
test['day'] = test['date'].apply(lambda x:x.day)
test['dayOfWeek'] = test['date'].apply(lambda x:x.dayofweek)
test['isQuarterDate'] = test['date'].apply(lambda x:x.quarter)
test['isWeekend'] = test['dayOfWeek'].apply(lambda x:1 if x>4 else 0)
test.drop(['date'], axis=1, inplace = True)
test

Unnamed: 0,hour,year,month,day,dayOfWeek,isQuarterDate,isWeekend
0,0,2021,3,1,0,1,0
1,1,2021,3,1,0,1,0
2,2,2021,3,1,0,1,0
3,3,2021,3,1,0,1,0
4,5,2021,3,1,0,1,0
...,...,...,...,...,...,...,...
7645,19,2022,3,28,0,1,0
7646,20,2022,3,28,0,1,0
7647,21,2022,3,28,0,1,0
7648,22,2022,3,28,0,1,0


In [90]:
#function to identify the timing of the day
def timeOfDay(n):
    if n in range(1,4):
        return 'Late Night'
    elif n in range(4,7):
        return 'Early Morning'
    elif n in range(7,12):
        return 'Morning'
    elif n in range(12,15):
        return 'Afternoon'
    elif n in range(15,18):
        return 'Evening'
    elif n in range(18,25) or n == 0:
        return 'Night'

In [91]:
train['typeOfDay'] = train['hour'].apply(lambda x: timeOfDay(x))
train

Unnamed: 0,hour,demand,year,month,day,dayOfWeek,isQuarterDate,isWeekend,typeOfDay
0,9,91,2018,8,18,5,3,1,Morning
1,10,21,2018,8,18,5,3,1,Morning
2,13,23,2018,8,18,5,3,1,Afternoon
3,14,104,2018,8,18,5,3,1,Afternoon
4,15,81,2018,8,18,5,3,1,Evening
...,...,...,...,...,...,...,...,...,...
18242,19,95,2021,2,28,6,1,1,Night
18243,20,88,2021,2,28,6,1,1,Night
18244,21,39,2021,2,28,6,1,1,Night
18245,22,104,2021,2,28,6,1,1,Night


In [92]:
#Applying the same function to test data
test['typeOfDay'] = test['hour'].apply(lambda x: timeOfDay(x))
test.head()

Unnamed: 0,hour,year,month,day,dayOfWeek,isQuarterDate,isWeekend,typeOfDay
0,0,2021,3,1,0,1,0,Night
1,1,2021,3,1,0,1,0,Late Night
2,2,2021,3,1,0,1,0,Late Night
3,3,2021,3,1,0,1,0,Late Night
4,5,2021,3,1,0,1,0,Early Morning


In [93]:
#function for converting the months to seasons
def monthToSeasons(x):
    if x in[9,10,11]:
        return 'Spring'
    elif x in [12,1,2]:
        return 'Summer'
    elif x in [3,4,5]:
        return 'Autumn'
    elif x in [6,7,8]:
        return 'Winter'

In [94]:
train['Season'] = train['month'].apply(monthToSeasons)
train

Unnamed: 0,hour,demand,year,month,day,dayOfWeek,isQuarterDate,isWeekend,typeOfDay,Season
0,9,91,2018,8,18,5,3,1,Morning,Winter
1,10,21,2018,8,18,5,3,1,Morning,Winter
2,13,23,2018,8,18,5,3,1,Afternoon,Winter
3,14,104,2018,8,18,5,3,1,Afternoon,Winter
4,15,81,2018,8,18,5,3,1,Evening,Winter
...,...,...,...,...,...,...,...,...,...,...
18242,19,95,2021,2,28,6,1,1,Night,Summer
18243,20,88,2021,2,28,6,1,1,Night,Summer
18244,21,39,2021,2,28,6,1,1,Night,Summer
18245,22,104,2021,2,28,6,1,1,Night,Summer


In [95]:
#Applying the same to the test data
test['Season'] = test['month'].apply(monthToSeasons)
test

Unnamed: 0,hour,year,month,day,dayOfWeek,isQuarterDate,isWeekend,typeOfDay,Season
0,0,2021,3,1,0,1,0,Night,Autumn
1,1,2021,3,1,0,1,0,Late Night,Autumn
2,2,2021,3,1,0,1,0,Late Night,Autumn
3,3,2021,3,1,0,1,0,Late Night,Autumn
4,5,2021,3,1,0,1,0,Early Morning,Autumn
...,...,...,...,...,...,...,...,...,...
7645,19,2022,3,28,0,1,0,Night,Autumn
7646,20,2022,3,28,0,1,0,Night,Autumn
7647,21,2022,3,28,0,1,0,Night,Autumn
7648,22,2022,3,28,0,1,0,Night,Autumn


In [96]:
#Converting the type of day to numerical values in train data

train['typeOfDay'].replace({'Afternoon': 0, 'Morning' : 1, 'Night':2, 'Evening':3, 'Early Morning':4, 'Late Night':5}, 
                          inplace=True)
train['Season'].replace({'Spring':0, 'Summer':1, 'Autumn':2, 'Winter':3}, inplace=True)

In [97]:
#Converting the type of day to numerical values in test data

test['typeOfDay'].replace({'Afternoon': 0, 'Morning' : 1, 'Night':2, 'Evening':3, 'Early Morning':4, 'Late Night':5}, 
                          inplace=True)
test['Season'].replace({'Spring':0, 'Summer':1, 'Autumn':2, 'Winter':3}, inplace=True)

In [98]:
#Splitting the dataset into X and Y

y = train['demand'].copy()
X = train.drop('demand',axis=1).copy()

In [99]:
#Handling the cyclical data. After normalizing cyclical features we have to use sin and cos functions

X['hour_sin'] = np.sin(X['hour']*(2.*np.pi/24))
X['hour_cos'] = np.cos(X['hour']*(2.*np.pi/24))
X['month_sin'] = np.sin(X['month']*(2.*np.pi/12))
X['month_cos'] = np.cos(X['month']*(2.*np.pi/12))
X['dayOfWeek'] = np.cos(X['dayOfWeek'])
X.drop(['hour', 'month'], axis=1, inplace=True)

In [100]:
#Carrying out the same for test data

test['hour_sin'] = np.sin(test['hour']*(2.*np.pi/24))
test['hour_cos'] = np.cos(test['hour']*(2.*np.pi/24))
test['month_sin'] = np.sin((test['month'])*(2.*np.pi/12))
test['month_cos'] = np.cos((test['month'])*(2.*np.pi/12))

test['dayOfWeek'] = np.cos(test['dayOfWeek'])

test.drop(['hour', 'month'], axis=1, inplace=True)

In [101]:
#Checking the Variation Inflation Factor
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_info = pd.DataFrame()
vif_info['VIF'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]
vif_info['Colums'] = X.columns
vif_info.sort_values('VIF', ascending = False)

Unnamed: 0,VIF,Colums
0,27.278847,year
3,17.245119,isQuarterDate
6,7.561146,Season
5,4.828747,typeOfDay
1,4.246301,day
9,3.186494,month_sin
10,2.978869,month_cos
4,1.769235,isWeekend
8,1.636522,hour_cos
2,1.278743,dayOfWeek


In [102]:
#Dropping the highest VIF columns
X.drop(['year', 'isQuarterDate'], axis=1, inplace=True)

In [103]:
X.head(3)

Unnamed: 0,day,dayOfWeek,isWeekend,typeOfDay,Season,hour_sin,hour_cos,month_sin,month_cos
0,18,0.283662,1,1,3,0.707107,-0.707107,-0.866025,-0.5
1,18,0.283662,1,1,3,0.5,-0.866025,-0.866025,-0.5
2,18,0.283662,1,0,3,-0.258819,-0.965926,-0.866025,-0.5


In [104]:
#splitting the dataset for training the model and validating
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42,test_size=0.2)

In [105]:
#Linear model
from sklearn import linear_model
lr=linear_model.LinearRegression()
model=lr.fit(X_train,y_train)
predictions=model.predict(X_test)
from sklearn.metrics import mean_squared_error
MSE_lin_reg=mean_squared_error(predictions,y_test)
print('Mean of squared errors of the linear regression model is ',round(MSE_lin_reg,2))

Mean of squared errors of the linear regression model is  1407.3


In [106]:
#polynomial regression model 
from sklearn.preprocessing import PolynomialFeatures  
poly_regs= PolynomialFeatures(degree= 2)

#converting feature matrix into polynomial feature matrix
x_poly= poly_regs.fit_transform(X_train)  
poly =linear_model.LinearRegression()  
model=poly.fit(X_train, y_train) 
predictions=model.predict(X_test)
MSE_poly_reg=mean_squared_error(predictions,y_test)
print('Mean of squared errors of the Polynomial regression model is ',round(MSE_poly_reg,2))

Mean of squared errors of the Polynomial regression model is  1407.3


In [107]:
#Ridge regression model 
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=0.05,normalize=True)
model=ridge_reg.fit(X_train, y_train)
predictions = model.predict(X_test)
MSE_ridg_reg=mean_squared_error(predictions,y_test)
print('Mean of squared errors of the Ridge regression model is ',round(MSE_ridg_reg,2))

Mean of squared errors of the Ridge regression model is  1406.95


In [108]:
#Lasso regression model
from sklearn.linear_model import Lasso
lasso = Lasso()
model=lasso.fit(X_train, y_train)
predictions = model.predict(X_test)
MSE_lasso=mean_squared_error(predictions,y_test)
print('Mean of squared errors of the LASSO regression model is ',round(MSE_lasso,2))

Mean of squared errors of the LASSO regression model is  1417.42


In [109]:
#Decisiontree model
from sklearn.tree import DecisionTreeRegressor
dt=DecisionTreeRegressor()
dt_model=dt.fit(X_train,y_train)
dt_pred=dt_model.predict(X_test)
MSE_dt=mean_squared_error(dt_pred,y_test)
print('Mean of squared errors of the Random Forest model is ',round(MSE_dt,2))

Mean of squared errors of the Random Forest model is  2102.03


In [110]:
#Randomforest model
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()
rf_model=rf.fit(X_train,y_train)
rf_pred=rf_model.predict(X_test)
MSE_rf=mean_squared_error(rf_pred,y_test)

print('Mean of squared errors of the Random Forest model is ',round(MSE_rf,2))

Mean of squared errors of the Random Forest model is  1174.55


In [111]:
#Gradientboost model
from sklearn.ensemble import GradientBoostingRegressor
gb=GradientBoostingRegressor()
gb_model=gb.fit(X_train,y_train)
gb_pred=gb_model.predict(X_test)
MSE_gb=mean_squared_error(gb_pred,y_test)
print('Mean of squared errors of the Gradient Boosting model is ',round(MSE_gb,2))

Mean of squared errors of the Gradient Boosting model is  1207.66


In [112]:
from xgboost import XGBRegressor
xgb=XGBRegressor()
xgb_model=xgb.fit(X_train,y_train)
xgb_pred=xgb_model.predict(X_test)
MSE_xgb=mean_squared_error(xgb_pred,y_test)
print('Mean of squared errors of the XGBoost model is ',round(MSE_xgb,2))

Mean of squared errors of the XGBoost model is  1140.77


In [113]:
from sklearn.ensemble import AdaBoostRegressor
adb=AdaBoostRegressor()
adb_model=adb.fit(X_train,y_train)
adb_pred=adb_model.predict(X_test)
MSE_adb=mean_squared_error(adb_pred,y_test)
print('Mean of squared errors of the AdaBoost model is ',round(MSE_adb,2))

Mean of squared errors of the AdaBoost model is  1321.95


In [114]:
#Means squred error for xgboost algorithm showed better score and hence tuning the paramaeters of xgboost
xgb.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)>

In [115]:
#learning rate
eta=[0.3,0.35,0.4]
#minimum loss reduction required to make a split
gamma=[0,10,20]
#maximum depth of a tree
max_depth=[3,5,7,9]
#minimum sum of weights of all observations required in a child
min_child_weight=[0,0.5,0.9]
# Create the param grid
param_grid = {
    'eta':eta,
    'gamma':gamma,
    'max_depth':max_depth,
    'min_child_weight':min_child_weight
}

In [116]:
from sklearn.model_selection import GridSearchCV
xgb_Grid = GridSearchCV(estimator = xgb, param_grid = param_grid, verbose=2,n_jobs=4,cv=3)
xgb_Grid.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                    callbacks=None, colsample_bylevel=1,
                                    colsample_bynode=1, colsample_bytree=1,
                                    early_stopping_rounds=None,
                                    enable_categorical=False, eval_metric=None,
                                    gamma=0, gpu_id=-1, grow_policy='depthwise',
                                    importance_type=None,
                                    interaction_constraints='',
                                    learning_rate=0.300000012, max_bin=256,
                                    max_cat_to_onehot=4, max_delta_step=0,
                                    max_depth=6, max_leaves=0,
                                    min_child_weight=1, missing=nan,
                                    monotone_constraints='()', n_estimators=100,
                                    n_jobs=0, num_p

In [117]:
xgb_Grid.best_params_ 

{'eta': 0.3, 'gamma': 0, 'max_depth': 5, 'min_child_weight': 0}

In [118]:
from sklearn import metrics
xgb=XGBRegressor(learning_rate=0.30,gamma=0,max_depth=3,min_child_weight=0)
xgb_model=xgb.fit(X_train,y_train)
xgb_pred=xgb_model.predict(X_test)
MSE_xgb=mean_squared_error(xgb_pred,y_test)
print('Mean of squared errors of the XGBoost model is ',round(MSE_xgb,2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, xgb_pred)))

Mean of squared errors of the XGBoost model is  1161.18
Root Mean Squared Error: 34.07615363431173


XGB Regressor gave best reult with RMSE value 34.07