In [4]:
import pandas as pd

## Model Training

In [5]:
df = pd.read_csv(r'C:\Users\prati\Desktop\Project\Bike sharing demand Prediction\notebook\data\hour.csv')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [6]:
#dropping id column as it is irrelevant to our analysis
df=df.drop(labels=['instant','dteday','casual','registered'],axis=1)

In [7]:
## Seprating Independent and dependent features
X = df.drop(labels=['cnt'],axis=1)
Y = df[['cnt']]

In [8]:
#checking target variable
Y

Unnamed: 0,cnt
0,16
1,40
2,32
3,13
4,1
...,...
17374,119
17375,89
17376,90
17377,61


In [9]:
# Defining which columns should be ordinal-encoded and which should be scaled
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [10]:
numerical_cols

Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
       'weathersit', 'temp', 'atemp', 'hum', 'windspeed'],
      dtype='object')

In [15]:
# for handling Missing Values
from sklearn.impute import SimpleImputer 

# for Feature Scaling
from sklearn.preprocessing import StandardScaler 

# for Ordinal Encoding
from sklearn.preprocessing import OrdinalEncoder 

## for creating pipelines
from sklearn.pipeline import Pipeline

#for combining pipelines
from sklearn.compose import ColumnTransformer

In [17]:
# Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)

# Categorigal Pipeline
cat_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='most_frequent')),
    ('ordinalencoder',OrdinalEncoder(categories=[])),
    ('scaler',StandardScaler())
    ]

)

preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols),
('cat_pipeline',cat_pipeline,categorical_cols)
])

In [18]:
## Train test split
from sklearn import preprocessing

# Example usage of a preprocessing module
scaler = preprocessing.StandardScaler()

from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=30)

In [19]:
#scaling dataset
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [20]:
#checking for scaling
X_train.head()

Unnamed: 0,num_pipeline__season,num_pipeline__yr,num_pipeline__mnth,num_pipeline__hr,num_pipeline__holiday,num_pipeline__weekday,num_pipeline__workingday,num_pipeline__weathersit,num_pipeline__temp,num_pipeline__atemp,num_pipeline__hum,num_pipeline__windspeed
0,-0.454097,-0.998932,-0.444446,-0.804873,-0.17262,0.9954,0.679808,-0.663381,-0.192936,-0.12599,1.319069,-0.699798
1,1.351162,-0.998932,1.00795,-0.804873,5.79307,-1.001062,-1.471003,-0.663381,-0.296368,-0.213548,1.630755,-1.550843
2,-1.356727,-0.998932,-1.606364,1.500483,-0.17262,1.494516,-1.471003,0.903007,-0.503231,-0.389244,1.630755,0.272591
3,-1.356727,-0.998932,-1.025405,0.780059,-0.17262,0.9954,0.679808,-0.663381,-0.813525,-0.91633,-0.70689,0.272591
4,0.448532,1.001069,0.717471,-0.228534,-0.17262,0.9954,0.679808,-0.663381,1.151674,1.280153,0.591802,-0.457108


In [21]:
## Model Training

from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor 
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor

#for metric evaluation
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [22]:
#function to evaluate model using mae,rmse and R2 score  
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [23]:
## Training  multiple models

models={
    'LinearRegression':LinearRegression(),
    'Random Forest':RandomForestRegressor(),
    'Extra Trees Regressor':ExtraTreesRegressor(),
    'Lightgbm':LGBMRegressor(),
    'XGboost':xgb.XGBRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 141.8271103556539
MAE: 105.74549274614643
R2 score 38.45593358957478




  model.fit(X_train,y_train)


Random Forest
Model Training Performance
RMSE: 45.406524307656106
MAE: 26.814079547737773
R2 score 93.69181823988312




  model.fit(X_train,y_train)


Extra Trees Regressor
Model Training Performance
RMSE: 43.07910150478457
MAE: 26.029228998849256
R2 score 94.32192725157745


Lightgbm
Model Training Performance
RMSE: 42.83793831431912
MAE: 26.800450116722306
R2 score 94.38532268936302


XGboost
Model Training Performance
RMSE: 42.36604454824734
MAE: 26.73253996898022
R2 score 94.50834158262793




In [24]:
#printing list of models used.
model_list

['LinearRegression',
 'Random Forest',
 'Extra Trees Regressor',
 'Lightgbm',
 'XGboost']