### Essential libraries

In [1]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
import pytz # type: ignore
import time

### Libraries for preprocessing & metrics

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### Preprocessing steps

In [3]:
data = pd.read_csv("Data.csv")
print("The head of the data is:")
data.head(5)


The head of the data is:


Unnamed: 0,number_people,date,timestamp,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,mixed_datetime,date_column,time_column
0,37,2015-08-14 17:00:11-07:00,61211,4,0,0,71.76,0,0,8,17,2015-08-14 17:00:11-07:00,14-08-2015,17:00:11
1,45,2015-08-14 17:20:14-07:00,62414,4,0,0,71.76,0,0,8,17,2015-08-14 17:20:14-07:00,14-08-2015,17:20:14
2,40,2015-08-14 17:30:15-07:00,63015,4,0,0,71.76,0,0,8,17,2015-08-14 17:30:15-07:00,14-08-2015,17:30:15
3,44,2015-08-14 17:40:16-07:00,63616,4,0,0,71.76,0,0,8,17,2015-08-14 17:40:16-07:00,14-08-2015,17:40:16
4,45,2015-08-14 17:50:17-07:00,64217,4,0,0,71.76,0,0,8,17,2015-08-14 17:50:17-07:00,14-08-2015,17:50:17


In [4]:
# Finding and Handling Duplicate Data

data.duplicated().sum()

0

In [5]:
# Finding and Handling Missing Values

data.isnull().sum()

number_people           0
date                    0
timestamp               0
day_of_week             0
is_weekend              0
is_holiday              0
temperature             0
is_start_of_semester    0
is_during_semester      0
month                   0
hour                    0
mixed_datetime          0
date_column             0
time_column             0
dtype: int64

In [6]:
# Dropping the columns that are Un-Necessary

data.drop('timestamp' , axis = 1 , inplace=True)
data.drop('date' , axis = 1 , inplace=True)
data.drop('mixed_datetime' , axis = 1 , inplace=True)
data.head()

Unnamed: 0,number_people,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,date_column,time_column
0,37,4,0,0,71.76,0,0,8,17,14-08-2015,17:00:11
1,45,4,0,0,71.76,0,0,8,17,14-08-2015,17:20:14
2,40,4,0,0,71.76,0,0,8,17,14-08-2015,17:30:15
3,44,4,0,0,71.76,0,0,8,17,14-08-2015,17:40:16
4,45,4,0,0,71.76,0,0,8,17,14-08-2015,17:50:17


In [7]:
# Identifying the Dependent Variable
Y = data.iloc[ : , : 1]
Y

Unnamed: 0,number_people
0,37
1,45
2,40
3,44
4,45
...,...
62179,23
62180,21
62181,25
62182,18


In [8]:
# Identifying the Independent Variables
X = data.iloc[ : , 1 : ]
X

Unnamed: 0,day_of_week,is_weekend,is_holiday,temperature,is_start_of_semester,is_during_semester,month,hour,date_column,time_column
0,4,0,0,71.76,0,0,8,17,14-08-2015,17:00:11
1,4,0,0,71.76,0,0,8,17,14-08-2015,17:20:14
2,4,0,0,71.76,0,0,8,17,14-08-2015,17:30:15
3,4,0,0,71.76,0,0,8,17,14-08-2015,17:40:16
4,4,0,0,71.76,0,0,8,17,14-08-2015,17:50:17
...,...,...,...,...,...,...,...,...,...,...
62179,5,1,0,61.07,0,1,3,18,18-03-2017,18:42:28
62180,5,1,0,61.07,0,1,3,18,18-03-2017,18:52:35
62181,5,1,0,56.71,0,1,3,19,18-03-2017,19:02:40
62182,5,1,0,56.71,0,1,3,19,18-03-2017,19:12:47


In [9]:
X['date_column'] = pd.to_datetime(X['date_column'])
X['year'] = X['date_column'].dt.year
X['Day'] = X['date_column'].dt.day
X['time_column'] = pd.to_datetime(X['time_column'])
X['minutes'] = X['time_column'].dt.minute
X['seconds'] = X['time_column'].dt.second


  X['date_column'] = pd.to_datetime(X['date_column'])
  X['time_column'] = pd.to_datetime(X['time_column'])


In [10]:
X.drop('date_column' , axis = 1 , inplace=True)
X.drop('time_column' , axis = 1 , inplace=True)

In [11]:
# spliting dataset

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X , Y , test_size=0.25 , random_state= 15)

# for dataconversion error
Y_train = np.ravel(Y_train)
Y_test = np.ravel(Y_test)
print("The size of X_train is: ",X_train.shape)
print("The size of X_test is: ", X_test.shape)
print("The size of Y_train is: ", Y_train.shape)
print("The size of Y_test is: " , Y_test.shape)

The size of X_train is:  (46638, 12)
The size of X_test is:  (15546, 12)
The size of Y_train is:  (46638,)
The size of Y_test is:  (15546,)


### LinearRegression

In [None]:
from sklearn.linear_model import LinearRegression

params = {'fit_intercept': [True, False], 'copy_X': [True, False], 'n_jobs': [1,10, 20, 40, 50, 100], 'positive': [False, True]}
results = []
bestparavalue = {}

for parameter, values in params.items():
    bestmae = float('inf')
    bestmse = float('inf')
    bestrmse = float('inf')
    bestR2 = -float('inf')
    besttime = float('inf')
    best_value = {}

    for value in values:
        start = time.time()
        LR = LinearRegression(**{parameter: value})
        LR.fit(X_train, Y_train)
        Y_pred = LR.predict(X_test)
        end = time.time()
        
        prediction_time = end - start
        lrmae = mean_absolute_error(Y_test, Y_pred)
        lrmse = mean_squared_error(Y_test, Y_pred)
        lrrmse = np.sqrt(lrmse)
        lrr2 = r2_score(Y_test, Y_pred)
        
        # Update best values
        if lrmae < bestmae:
            bestmae = lrmae
            best_value["mae"] = value
        if lrmse < bestmse:
            bestmse = lrmse
            best_value["mse"] = value
        if lrrmse < bestrmse:
            bestrmse = lrrmse
            best_value["rmse"] = value
        if lrr2 > bestR2:
            bestR2 = lrr2
            best_value["R2"] = value
        if prediction_time < besttime:
            besttime = prediction_time
            best_value["time"] = value

        results.append({
            'parameter & value': {parameter: value},
            'MAE': lrmae,
            'MSE': lrmse,
            'RMSE': lrrmse,
            'R2-Score': lrr2,
            'Prediction Time (s)': prediction_time,
        })
    bestparavalue[parameter] = best_value

results_df = pd.DataFrame(results)
print(results_df,"\n")

print("Best Performing Parameter values")
bestparavalueDFLR = pd.DataFrame(bestparavalue)
print(bestparavalueDFLR)


#### Re-Training Linear Reggreison with Best Parameter

In [None]:
best_r2_parameters = bestparavalueDFLR.loc['R2']

# Extracting the parameter values
fit_intercept_best = best_r2_parameters['fit_intercept']
copy_X_best = best_r2_parameters['copy_X']
n_jobs_best = best_r2_parameters['n_jobs']
positive_best = best_r2_parameters['positive']

LR_best = LinearRegression(fit_intercept=fit_intercept_best, copy_X=copy_X_best, n_jobs=n_jobs_best, positive=positive_best)
LR_best.fit(X_train, Y_train)

Y_pred_best_r2_LR = LR_best.predict(X_test)

#### Using Best Trained LR model

In [None]:
input_data = {
    'day_of_week': [4, 5, 3],
    'is_weekend': [0, 1, 0],
    'is_holiday': [0, 0, 0],
    'temperature': [71.76, 63.57, 55.67],
    'is_start_of_semester': [0, 0, 0],
    'is_during_semester': [0, 1, 0],
    'month': [8, 2, 6],
    'hour': [17, 15, 1],
    'year': [2015, 2016, 2016],
    'Day': [8, 13, 23],
    'minutes': [00, 20, 56],
    'seconds': [11, 11, 31]
}
input_df = pd.DataFrame(input_data)
Y_pred_input_data = LR_best.predict(input_df)
predictions = Y_pred_input_data.ravel().astype(int)
for prediction in predictions:
    print(prediction, "people")

In [None]:
#Applying Support Vector Regression

from sklearn.svm import SVR

Svm = SVR(C=2, kernel='linear')
Svm.fit(X_train, Y_train)
Y_pred1 = Svm.predict(X_test)
Svmmae = mean_absolute_error(Y_test, Y_pred1)
Svmmse = mean_squared_error(Y_test, Y_pred1)
Svmrmse = np.sqrt(Svmmse)
Svmr2 = r2_score(Y_test, Y_pred1)

print(f'Mean Absolute Error (MAE): {Svmmae}')
print(f'Mean Squared Error (MSE): {Svmmse}')
print(f'Root Mean Squared Error (RMSE): {Svmrmse}')
print(f'R2-Score: {Svmr2}')

### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor 

params = {
    'n_estimators': [50, 100, 150, 300],
    'criterion': ['squared_error','absolute_error', 'friedman_mse'],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10, 30],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': [None, 'sqrt', 'log2', 3],
    'bootstrap': [True, False]}
results=[]
bestparavalue={}

for parameter, values in params.items():
    bestmae = float('inf')
    bestmse = float('inf')
    bestrmse = float('inf')
    bestR2 = -float('inf')
    besttime = float('inf')
    best_value = {}
    for value in values:
        start_time = time.time()
        currentParameter = {parameter: value}
        RFR = RandomForestRegressor(**currentParameter, random_state=42, n_jobs=-1)
        RFR.fit(X_train,Y_train)
        Y_pred3 = RFR.predict(X_test)
        end_time = time.time()
        prediction_time = end_time - start_time
        RFRmae = mean_absolute_error(Y_test, Y_pred3)
        RFRmse = mean_squared_error(Y_test, Y_pred3)
        RFRrmse = np.sqrt(RFRmse)
        RFRr2 = r2_score(Y_test, Y_pred3)

        # Update best values
        if RFRmae < bestmae:
            bestmae = RFRmae
            best_value["mae"] = value
        if RFRmse < bestmse:
            bestmse = RFRmse
            best_value["mse"] = value
        if RFRrmse < bestrmse:
            bestrmse = RFRrmse
            best_value["rmse"] = value
        if RFRr2 > bestR2:
            bestR2 = RFRr2
            best_value["R2"] = value
        if prediction_time < besttime:
            besttime = prediction_time
            best_value["time"] = value
            
        results.append({
            'parameter & value': f"{parameter}={value}",
            'MAE': RFRmae,
            'MSE': RFRmse,
            'RMSE': RFRrmse,
            'R2-Score': RFRr2,
            'Prediction Time (s)': prediction_time
        })
    bestparavalue[parameter] = best_value
results_df = pd.DataFrame(results)
print(results_df)

print("Best Performing Parameter values")
bestparavalueDFRFR = pd.DataFrame(bestparavalue)
print(bestparavalueDFRFR)

#### Re-Training Random Forest Reggresor with Best Parameter

In [None]:
best_r2_parameters = bestparavalueDFRFR.loc['R2']

# Extracting the parameter values
n_estimators_best = best_r2_parameters['n_estimators']
criterion_best = best_r2_parameters['criterion']
if np.isnan(best_r2_parameters['max_depth']):
    max_depth_best=None
else:
    max_depth_best=best_r2_parameters['max_depth']
min_samples_split_best = best_r2_parameters['min_samples_split']
min_samples_leaf_best = best_r2_parameters['min_samples_leaf']
max_features_best = best_r2_parameters['max_features']
bootstrap_best = best_r2_parameters['bootstrap']

RFRBest = RandomForestRegressor(n_estimators=n_estimators_best, criterion=criterion_best, max_depth=max_depth_best, min_samples_split=min_samples_split_best,min_samples_leaf=min_samples_leaf_best,max_features=max_features_best,bootstrap=bootstrap_best, random_state=42, n_jobs=-1)
RFRBest.fit(X_train, Y_train)

Y_pred_best_r2_RFR = RFRBest.predict(X_test)

#### Using Best Trained RFR model

In [None]:
input_data = {
    'day_of_week': [4, 5, 3],
    'is_weekend': [0, 1, 0],
    'is_holiday': [0, 0, 0],
    'temperature': [71.76, 63.57, 55.67],
    'is_start_of_semester': [0, 0, 0],
    'is_during_semester': [0, 1, 0],
    'month': [8, 2, 6],
    'hour': [17, 15, 1],
    'year': [2015, 2016, 2016],
    'Day': [8, 13, 23],
    'minutes': [00, 20, 56],
    'seconds': [11, 11, 31]
}
input_df = pd.DataFrame(input_data)

input_df = pd.DataFrame(input_data)
Y_pred_input_data = RFRBest.predict(input_df)
predictions = Y_pred_input_data.ravel().astype(int)
for prediction in predictions:
    print(prediction, "people")

### XGBRegressor

In [None]:
from xgboost import XGBRegressor

boosters=["gbtree","gblinear","dart"]
params = {
    'n_estimators': 200,
    'learning_rate': 0.1,
    'max_depth': 10,
}
results=[]
for booster in boosters:
    start_time = time.time()
    xgb = XGBRegressor(booster=booster,**params)
    xgb.fit(X_train,Y_train)

    Y_pred4 = xgb.predict(X_test)
    end_time = time.time()
    prediction_time = end_time - start_time
    XGBmae = mean_absolute_error(Y_test, Y_pred4)
    XGBmse = mean_squared_error(Y_test, Y_pred4)
    XGBrmse = np.sqrt(XGBmse)
    XGBr2 = r2_score(Y_test, Y_pred4)

    results.append({
        'booster': booster,
        'MAE': XGBmae,
        'MSE': XGBmse,
        'RMSE': XGBrmse,
        'R2-Score': XGBr2,
        'Prediction Time (s)': prediction_time
    })

results_df = pd.DataFrame(results)
print(results_df)

#### Using gbtree as booster we got best performance with least time

In [None]:
# using it as booster

xgbFinal = XGBRegressor(booster='gbtree',**params)
xgbFinal.fit(X_train,Y_train)
input_data = {
    'day_of_week': [4, 5, 3],
    'is_weekend': [0, 1, 0],
    'is_holiday': [0, 0, 0],
    'temperature': [71.76, 63.57, 55.67],
    'is_start_of_semester': [0, 0, 0],
    'is_during_semester': [0, 1, 0],
    'month': [8, 2, 6],
    'hour': [17, 15, 1],
    'year': [2015, 2016, 2016],
    'Day': [8, 13, 23],
    'minutes': [00, 20, 56],
    'seconds': [11, 11, 31]
}
input_df = pd.DataFrame(input_data)

Y_pred_input_data = xgbFinal.predict(input_df)
predictions = Y_pred_input_data.ravel().astype(int)
for prediction in predictions:
    print(prediction, "people")

### AdaBoostRegressor

In [13]:

from sklearn.ensemble import AdaBoostRegressor # type: ignore
from sklearn.tree import DecisionTreeRegressor # type: ignore
from sklearn.linear_model import LinearRegression, Lasso, Ridge # type: ignore
from sklearn.svm import SVR # type: ignore
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor # type: ignore
from sklearn.neighbors import KNeighborsRegressor # type: ignore
from sklearn.neural_network import MLPRegressor # type: ignore
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # type: ignore

baseEstimators=[ DecisionTreeRegressor(max_depth=10), LinearRegression(),Lasso(),Ridge(),RandomForestRegressor(n_estimators=200,max_depth=10),GradientBoostingRegressor(),ExtraTreesRegressor(),KNeighborsRegressor(),MLPRegressor(max_iter=1000)]
bestmae = float('inf')
bestmse = float('inf')
bestrmse = float('inf')
bestR2 = -float('inf')
besttime = float('inf')
best_value_ADA = {}
results = []
for estimator in baseEstimators:
    start_time = time.time()
    ada = AdaBoostRegressor(estimator=estimator)
    ada.fit(X_train,Y_train)

    Y_pred5 = ada.predict(X_test)
    end_time = time.time()
    prediction_time = end_time - start_time
    adamae = mean_absolute_error(Y_test, Y_pred5)
    adamse = mean_squared_error(Y_test, Y_pred5)
    adarmse = np.sqrt(adamse)
    adar2 = r2_score(Y_test, Y_pred5)

    if adamae < bestmae:
        bestmae = adamae
        best_value_ADA["mae"] = estimator
    if adamse < bestmse:
        bestmse = adamse
        best_value_ADA["mse"] = estimator
    if adarmse < bestrmse:
        bestrmse = adarmse
        best_value_ADA["rmse"] = estimator
    if adar2 > bestR2:
        bestR2 = adar2
        best_value_ADA["R2"] = estimator
    if prediction_time < besttime:
        besttime = prediction_time
        best_value_ADA["time"] = estimator

    results.append({
        'Base Estimator': estimator,
        'MAE': adamae,
        'MSE': adamse,
        'RMSE': adarmse,
        'R2-Score': adar2,
        'Prediction Time (s)': prediction_time
    })

results_df = pd.DataFrame(results)
print(results_df)

print("Best Performing Parameter values")
best_value_ADA = pd.DataFrame(best_value_ADA)
print(best_value_ADA)

#### re-training ADABOOST with best parameters

In [None]:
best_r2_baseEstimater = best_value_ADA.loc['R2']

In [None]:
input_data = {
    'day_of_week': [4, 5, 3],
    'is_weekend': [0, 1, 0],
    'is_holiday': [0, 0, 0],
    'temperature': [71.76, 63.57, 55.67],
    'is_start_of_semester': [0, 0, 0],
    'is_during_semester': [0, 1, 0],
    'month': [8, 2, 6],
    'hour': [17, 15, 1],
    'year': [2015, 2016, 2016],
    'Day': [8, 13, 23],
    'minutes': [00, 20, 56],
    'seconds': [11, 11, 31]
}
input_df = pd.DataFrame(input_data)

input_df = pd.DataFrame(input_data)
Y_pred_input_data = RFRBest.predict(input_df)
predictions = Y_pred_input_data.ravel().astype(int)
for prediction in predictions:
    print(prediction, "people")