##Importing Libraries


In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.svm import LinearSVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import pickle
from xgboost import XGBRegressor
import sys


##Filenames

In [2]:
file1 = "/content/drive/MyDrive/Data Science Projects/Cameron_headcount_prediction/headcounts.csv"
file2= "/content/drive/MyDrive/Data Science Projects/Cameron_headcount_prediction/winter2023_headcounts.csv"

In [3]:
cols1 = ['Timestamp','Date','Time','Basement','Main_floor','Comments']
cols2 = ['Timestamp','Day','Time','Basement','Main_floor', "Second_floor","Third_floor","Fourth_floor" ]

Coverting csv files into dataframes

In [4]:
df1 = pd.read_csv(file1, names = cols1)
df2 = pd.read_csv(file2, names = cols2)

In [5]:
df1 = df1.drop([0])
df2 = df2.drop([0])

In [6]:
df1=df1.drop("Comments", axis = 1)

In [7]:
df1["Second_floor"] =0
df1["Third_floor"] =0
df1["Fourth_floor"] =0


In [8]:
df1

Unnamed: 0,Timestamp,Date,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
1,"Monday, May 8, 2023 at 6:33:58 PM",5/8/2023,6:30 pm,17,3,0,0,0
2,"Monday, May 8, 2023 at 7:38:16 PM",5/8/2023,7:30 pm,7,1,0,0,0
3,"Monday, May 8, 2023 at 8:31:10 PM",5/8/2023,8:30 pm,5,0,0,0,0
4,"Monday, May 8, 2023 at 9:34:52 PM",5/8/2023,9:30 pm,2,0,0,0,0
5,"Tuesday, May 9, 2023 at 6:31:10 PM",5/9/2023,6:30 pm,13,8,0,0,0
...,...,...,...,...,...,...,...,...
124,6/5/2023 21:32:55,6/5/2023,9:30 pm,13,4,0,0,0
125,6/6/2023 18:37:28,6/6/2023,6:30 pm,21,8,0,0,0
126,6/6/2023 19:31:35,6/6/2023,7:30 pm,19,9,0,0,0
127,6/6/2023 20:34:41,6/6/2023,8:30 pm,16,5,0,0,0


In [9]:
df2

Unnamed: 0,Timestamp,Day,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
1,1/30/2023 17:55:22,Monday,5:30 pm,78,126,115,43,47
2,1/30/2023 20:26:32,Monday,8:00 pm,64,34,44,17,6
3,1/30/2023 21:18:53,Monday,9:30 pm (basement only),62,,,,
4,1/31/2023 17:45:30,Tuesday,5:30 pm,80,123,136,43,36
5,1/31/2023 20:41:47,Tuesday,8:00 pm,65,32,17,10,12
...,...,...,...,...,...,...,...,...
126,4/25/2023 18:12:08,Tuesday,5:30 pm,45,40,25,13,12
127,4/26/2023 17:39:40,Wednesday,5:30 pm,54,33,21,11,6
128,4/26/2023 19:57:45,Wednesday,8:00 pm,28,8,10,3,3
129,4/27/2023 18:37:30,Thursday,5:30 pm,51,24,15,8,3


In [10]:
def organizing_datetime1(data):
  data= data.drop("Timestamp", axis = 1)
  data['Date']= pd.to_datetime(data["Date"])

  data['Year'] = data['Date'].dt.year
  data['Month'] = data['Date'].dt.month
  data["Day"] = data["Date"].dt.day

  #replacing wrong values for summer months
  data["Month"] = data["Month"].replace([8,2],5)

  #coverting strings into integera
  data["Month"]= data["Month"].astype(int)
  data["Day"]= data["Day"].astype(int)
  data["Year"]= data["Year"].astype(int)
  data["Basement"]= data["Basement"].astype(int)
  data["Main_floor"]= data["Main_floor"].astype(int)

  #coverting time into float
  data["Time"] = data['Time'].map({"5:30 pm (Fri - Sat - Sun only)": "17.50", "6:30 pm":"18.50", "7:30 pm":"19.50", "8:30 pm":"20.50", "9:30 pm":"21.50"})

  #rearranging columns
  new_cols = ['Day','Month','Year','Time','Basement','Main_floor', "Second_floor","Third_floor","Fourth_floor"]
  data = data.reindex(columns=new_cols)

  return data

In [11]:
def organizing_datetime2(data):
  data['Timestamp']= pd.to_datetime(data["Timestamp"])
  data= data.fillna(0)

  data['Year'] = data['Timestamp'].dt.year
  data['Month'] = data['Timestamp'].dt.month
  data["Day"] = data["Timestamp"].dt.day

  #coverting strings into integera
  data["Month"]= data["Month"].astype(int)
  data["Day"]= data["Day"].astype(int)
  data["Year"]= data["Year"].astype(int)
  data["Basement"]= data["Basement"].astype(int)
  data["Main_floor"]= data["Main_floor"].astype(int)
  data["Second_floor"]= data["Second_floor"].astype(int)
  data["Third_floor"]= data["Third_floor"].astype(int)
  data["Fourth_floor"]= data["Fourth_floor"].astype(int)

  #coverting time into float
  data["Time"] = data['Time'].map({"5:00 pm (weekends only)":"17.00","5:30 pm": "17.50", "8:00 pm":"20.00", "9:30 pm (basement only)":"21.50"})

  #rearranging columns
  new_cols = ['Day','Month','Year','Time','Basement','Main_floor', "Second_floor","Third_floor","Fourth_floor"]
  data = data.reindex(columns=new_cols)


  return data

In [12]:
data1 = organizing_datetime1(df1)
data1

Unnamed: 0,Day,Month,Year,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
1,8,5,2023,18.50,17,3,0,0,0
2,8,5,2023,19.50,7,1,0,0,0
3,8,5,2023,20.50,5,0,0,0,0
4,8,5,2023,21.50,2,0,0,0,0
5,9,5,2023,18.50,13,8,0,0,0
...,...,...,...,...,...,...,...,...,...
124,5,6,2023,21.50,13,4,0,0,0
125,6,6,2023,18.50,21,8,0,0,0
126,6,6,2023,19.50,19,9,0,0,0
127,6,6,2023,20.50,16,5,0,0,0


In [13]:
data2 = organizing_datetime2(df2)
data2

Unnamed: 0,Day,Month,Year,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
1,30,1,2023,17.50,78,126,115,43,47
2,30,1,2023,20.00,64,34,44,17,6
3,30,1,2023,21.50,62,0,0,0,0
4,31,1,2023,17.50,80,123,136,43,36
5,31,1,2023,20.00,65,32,17,10,12
...,...,...,...,...,...,...,...,...,...
126,25,4,2023,17.50,45,40,25,13,12
127,26,4,2023,17.50,54,33,21,11,6
128,26,4,2023,20.00,28,8,10,3,3
129,27,4,2023,17.50,51,24,15,8,3


In [14]:
df_merged = data2.append(data1)
df_merged

  df_merged = data2.append(data1)


Unnamed: 0,Day,Month,Year,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
1,30,1,2023,17.50,78,126,115,43,47
2,30,1,2023,20.00,64,34,44,17,6
3,30,1,2023,21.50,62,0,0,0,0
4,31,1,2023,17.50,80,123,136,43,36
5,31,1,2023,20.00,65,32,17,10,12
...,...,...,...,...,...,...,...,...,...
124,5,6,2023,21.50,13,4,0,0,0
125,6,6,2023,18.50,21,8,0,0,0
126,6,6,2023,19.50,19,9,0,0,0
127,6,6,2023,20.50,16,5,0,0,0


In [15]:
df_merged.to_csv(r'/content/drive/MyDrive/Data Science Projects/Cameron_headcount_prediction/merged_data.csv')


In [16]:
data = df_merged.copy()

In [17]:
data.isnull().sum()

Day             0
Month           0
Year            0
Time            0
Basement        0
Main_floor      0
Second_floor    0
Third_floor     0
Fourth_floor    0
dtype: int64

In [18]:
data["Time"].value_counts()/len(data)

17.50    0.271318
20.00    0.209302
21.50    0.158915
18.50    0.116279
19.50    0.112403
20.50    0.112403
17.00    0.019380
Name: Time, dtype: float64

In [19]:
data["Month"].value_counts()/len(data)

5    0.391473
2    0.213178
3    0.151163
4    0.116279
6    0.104651
1    0.023256
Name: Month, dtype: float64

In [20]:
data

Unnamed: 0,Day,Month,Year,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
1,30,1,2023,17.50,78,126,115,43,47
2,30,1,2023,20.00,64,34,44,17,6
3,30,1,2023,21.50,62,0,0,0,0
4,31,1,2023,17.50,80,123,136,43,36
5,31,1,2023,20.00,65,32,17,10,12
...,...,...,...,...,...,...,...,...,...
124,5,6,2023,21.50,13,4,0,0,0
125,6,6,2023,18.50,21,8,0,0,0
126,6,6,2023,19.50,19,9,0,0,0
127,6,6,2023,20.50,16,5,0,0,0


####Splitting data into train and test

In [21]:
y = data[['Basement','Main_floor', "Second_floor","Third_floor","Fourth_floor"]]
X= data[['Day','Month','Year','Time']]


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=42,stratify = data["Time"])

In [42]:
X_train

Unnamed: 0,Day,Month,Year,Time
38,13,2,2023,17.50
9,10,5,2023,18.50
116,17,4,2023,17.50
81,27,5,2023,18.50
47,16,2,2023,17.50
...,...,...,...,...
42,14,2,2023,20.00
41,14,2,2023,17.50
37,12,2,2023,20.00
95,30,5,2023,19.50


In [46]:
y_train

Unnamed: 0,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
38,139,124,104,37,48
9,15,10,0,0,0
116,117,89,65,21,20
81,17,6,0,0,0
47,88,144,121,48,43
...,...,...,...,...,...
42,69,41,14,9,4
41,109,134,115,43,34
37,67,0,0,0,0
95,21,7,0,0,0


In [23]:
X_train['Time'].value_counts() / len(X_train)

17.50    0.271845
20.00    0.208738
21.50    0.160194
18.50    0.116505
19.50    0.111650
20.50    0.111650
17.00    0.019417
Name: Time, dtype: float64

In [24]:
X_test['Time'].value_counts() / len(X_test)

17.50    0.269231
20.00    0.211538
21.50    0.153846
19.50    0.115385
18.50    0.115385
20.50    0.115385
17.00    0.019231
Name: Time, dtype: float64

#Training different models

##1. Random forest regression

In [25]:
rf_reg = RandomForestRegressor(random_state =42,max_features=2, n_estimators=30)
rf_reg.fit(X_train, y_train)
prediction_rf = rf_reg.predict(X_test)


###Evaluating model

In [26]:
def evaluate_model(reg):
  kscores_rf = cross_val_score(reg, X_test, y_test,scoring = "neg_mean_squared_error", cv= 10)
  rf_rmse_kscores = np.sqrt(-kscores_rf)
  return rf_rmse_kscores.mean()/(y_test.max() - y_test.min())

In [27]:
result = evaluate_model(rf_reg)
result

Basement        0.128261
Main_floor      0.116448
Second_floor    0.125532
Third_floor     0.316072
Fourth_floor    0.421430
dtype: float64

###Fine tuning hyperparameters using OpenGridCV

In [28]:
param_grid = [
    {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
    {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
  ]
grid_search = GridSearchCV(rf_reg, param_grid, scoring = "neg_mean_squared_error", return_train_score=True,cv=10)
grid_search.fit(X_test, y_test)

In [29]:
#best parmeters which we put into our model later on.
grid_search.best_params_

{'max_features': 2, 'n_estimators': 30}

##2. k-Nearest Neighbors


In [30]:
knr_reg = KNeighborsRegressor()
knr_reg.fit(X_train, y_train)


###Evaluating model

In [31]:
result2 = evaluate_model(knr_reg)
result2

Basement        0.197389
Main_floor      0.179208
Second_floor    0.193189
Third_floor     0.486423
Fourth_floor    0.648564
dtype: float64

##3. Support vector regression - too bad don't use

In [32]:
#sv_reg = LinearSVR()
# define the direct multioutput wrapper model
#wrapper = MultiOutputRegressor(sv_reg)

In [33]:
#wrapper.fit(X_train,y_train)

In [34]:
#Its too bad
#result3 = evaluate_model(wrapper)
#result3

##Downloading the trained model

In [37]:
# Saving model to disk
pickle.dump(rf_reg, open('model.pkl','wb'))

In [43]:
example = {"Day": [17,2],"Month":[5,6], "Year": [2023, 2023], "Time":[17.50, 18.50]}
example_df = pd.DataFrame(data = example)

In [48]:
#Comparing results
model = pickle.load(open("/content/drive/MyDrive/Data Science Projects/Cameron_headcount_prediction/model.pkl",'rb'))
result = model.predict(example_df)
print(pd.DataFrame(result))

           0          1         2         3    4
0  29.700000  23.466667  3.666667  1.366667  1.4
1  24.133333  14.266667  0.000000  0.000000  0.0


In [47]:
data.query("Day in(2,17)")

Unnamed: 0,Day,Month,Year,Time,Basement,Main_floor,Second_floor,Third_floor,Fourth_floor
10,2,2,2023,17.5,94,148,171,45,34
11,2,2,2023,20.0,61,43,42,14,19
12,2,2,2023,21.5,65,0,0,0,0
50,17,2,2023,17.5,68,54,51,24,13
64,2,3,2023,17.5,71,84,100,47,31
65,2,3,2023,20.0,70,25,16,5,4
83,17,3,2023,17.5,94,115,124,43,28
116,17,4,2023,17.5,117,89,65,21,20
40,17,5,2023,18.5,42,26,0,0,0
41,17,5,2023,19.5,32,17,0,0,0
