**Data Preprocessing**
---
---

In [1]:
#Importing pymongo (Connecting MongoDB with Python) as well as other ML libraries
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

from pymongo import MongoClient
from datetime import datetime
import random



In [2]:
#Connect to Database
client = MongoClient("mongodb+srv://webServer:hkSEd64DH1wujNPD@slugmetercluster.de0aesc.mongodb.net/")

In [3]:
#Get the database
db = client.SlugMeterTest

In [4]:
#Get the timestamp data
TimeStamps = db.Times
stamps = TimeStamps.find()

In [5]:
#Importing DB as dataframe
df1 = pd.DataFrame(list(stamps))

In [6]:
# Add Month, Day, Hour, Minute, isHoliday, isWeekend as columns in dataframe 
#Encoding for Day: Mon:0, Tues:1, Wed:2, Thur:3, Fri:4, Sat:5, Sun:6
index = 0

for i in df1['timestamp']:
    try:
        d = datetime.strptime(str(i), '%Y-%m-%d %H:%M:%S.%f')
    except ValueError:
        d = datetime.strptime(str(i), '%Y-%m-%d %H:%M:%S')
    # d = datetime.strptime(str(i), '%Y-%m-%d %H:%M:%S.%f')
    df1.loc[index, 'Month'] = d.month
    df1.loc[index, 'Day'] = d.weekday()
    df1.loc[index, 'Hour'] = d.hour
    df1.loc[index, 'Minute'] = d.minute

    #Set weekends
    if(d.weekday() == 5 or d.weekday() == 6):
        df1.loc[index, 'isWeekend'] = 1
    else:
        df1.loc[index, 'isWeekend'] = 0

    #Set holidays
    if(d.month == 11 and (d.day == 10 or d.day == 23 or d.day == 24)):
        df1.iloc[index, 4] = 0
        df1.loc[index, 'isHoliday'] = 1
    elif(d.month == 12 and (d.day == i for i in range(25, 32))):
        df1.iloc[index, 4] = 0
        df1.loc[index, 'isHoliday'] = 1
    elif(d.month == 1 and d.day == 1):
        df1.iloc[index, 4] = 0
        df1.loc[index, 'isHoliday'] = 1
    else:
        df1.loc[index, 'isHoliday'] = 0
    index+=1

In [7]:
# Set minutes in 10-minute intervals
index = 0

for i in df1['Minute']:
    df1.loc[index, 'Minute'] = df1.loc[index, 'Minute'] // 10 * 10
    index+=1

In [8]:
#Set the number of people based on 10-minute time intervals
df1['timestamp'] = pd.to_datetime(df1['timestamp'])
diffs = df1['timestamp'] - df1['timestamp'].shift()
laps = diffs > pd.Timedelta('10 min')
periods = laps.cumsum().apply(lambda x: '{}'.format(x+1))
df1['Num_of_people'] = periods.astype(int)

In [9]:
#Get the number of people in the gym (divide the number in half each hour, set to random number high number if passes gym capacity)
for i in range(len(df1)):
    if(df1.loc[i, 'Num_of_people'] > 82):
        df1.loc[i, 'Num_of_people'] = random.randrange(48, 78)
    else:
        df1.loc[i, 'Num_of_people'] = (df1.loc[i, 'Num_of_people'] // 2)

In [10]:
#Zero out the number of people if the gym is not open (based on the hours posted on UCSC's website)
#Encoding: Mon:0, Tues:1, Wed:2, Thur:3, Fri:4, Sat:5, Sun:6

df1.loc[(df1.Day < 4) & ((df1.Hour < 7) | (df1.Hour == 23)), "Num_of_people"] = 0 #Mon-Thurs (6am - 11pm) 
df1.loc[(df1.Day == 4) & (((df1.Hour < 7 )) | (df1.Hour >= 22)), "Num_of_people"] = 0 #Fri (6am-10pm)
df1.loc[(df1.Day >= 5) & (((df1.Hour <= 8)) | (df1.Hour >= 20)), "Num_of_people"] = 0 #Sat/Sun (8am-8pm)


In [11]:
#Remove unecessary columns (Only care about the timestamps and dates)
df1 = df1.drop(columns = '_id')
df1 = df1.drop(columns = 'isEntry')
df1 = df1.drop(columns='timestamp')
df1 = df1.drop(columns='isEntrance')

**ML DATABASE**
---
---

In [12]:
# df = pd.get_dummies(df, columns = ['Day'])
# df = pd.get_dummies(df, columns = ['hours'])

In [13]:
df2 = pd.read_csv('gym_data.csv')

In [14]:
# Add Minute as column in dataframe 
index = 0

for i in df2['date']:
    df2.loc[index, 'Minute'] = i[14:16]
    index+=1

In [15]:
#Match names to original dataframe (df)
df2 = df2.rename(columns={'day_of_week':'Day', 'month':'Month', 'hour':'Hour', 'number_people':'Num_of_people', 'is_weekend':'isWeekend'})

In [16]:
#Create a seperate column for holidays
#If it is a holiday, zero-out the Num_of_people
index = 0
for actual_date in df2['date']:
    actual_date = pd.to_datetime(actual_date)
    if(actual_date.month == 11 and actual_date.day == 10):
        df2.iloc[index, 0] = 0
        df2.loc[index, 'isHoliday'] = 1
    elif(actual_date.month == 11 and (actual_date.day == 23 or actual_date.day == 24)):
        df2.iloc[index, 0] = 0
        df2.loc[index, 'isHoliday'] = 1
    elif (actual_date.month == 12 and any(actual_date.day == i for i in range(25, 32))):
        df2.iloc[index, 0] = 0
        df2.loc[index, 'isHoliday'] = 1
    elif(actual_date.month == 1 and actual_date.day == 1):
        df2.iloc[index, 0] = 0
        df2.loc[index, 'isHoliday'] = 1
    else:
        df2.loc[index, 'isHoliday'] = 0
    index+=1

In [17]:
#Remove unecessary columns
df2 = df2.drop(columns = 'timestamp')
df2 = df2.drop(columns = 'is_holiday')
df2 = df2.drop(columns='temperature')
df2 = df2.drop(columns='is_start_of_semester')
df2 = df2.drop(columns='is_during_semester')
df2 = df2.drop(columns='date')

**Combine the Dataframes**
---
---

In [18]:
dataframes = [df1, df2]
df = pd.concat(dataframes)

In [19]:
#Drop any NaN values 
df = df.dropna()

**Data Exploration**
---
---

In [20]:
# #Function to plot interactive plots using Plotly (Reference: https://medium.com/mlearning-ai/forecasting-timeseries-using-machine-learning-deep-learning-446eccc6eb6d)
# def plotl(df, x, y, title):
#     fig = px.histogram(df, x=x, y=y, title=title)
#     fig.show()

# plotl(df, 'Month', df['Num_of_people'], 'Number of people (Year)')

People seemed to stop going to the gym as the years passed. Seems like a steady decline, could be an issue with how the data was obtained

In [21]:
# #Creating a bar chart to show num of people during each day
# plt.bar(df['Day'], df['Num_of_people'], width = 0.6)
# plt.xlabel('Day of Week (Mon:0, Tues:1, Wed:2, Thur:3, Fri:4, Sat:5, Sun:6)')
# plt.ylabel('Num of People')
# plt.title('Number of people (Day)')

# #Show plot
# plt.show()

Number of people seems to be evenly disbursed throughout the days of the week. There is more activity towards the start/middle of the week and it falls off during the weekend

In [22]:
# #Creating a bar chart to show num of people during each hour
# plt.bar(df['Hour'], df['Num_of_people'], width = 0.6)
# plt.xlabel('Hour of Day')
# plt.ylabel('Num of People')
# plt.title('Number of people (Hour)')

# #Show plot
# plt.show()

As suspected, people aren't going to the early in the morning, definitely a spike around noon and high activities throughout the day.

**Model Training**

---

In [23]:
#Create a subset of original dataframe to train model based on hours/day
df_hour_day = df

In [24]:
#Seperate features/number of people
X = df_hour_day.drop('Num_of_people', axis='columns') # Get day/hour Weights
y = df_hour_day.Num_of_people # Get number of people

In [25]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [26]:
# Split data into train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size = 0.25, random_state=12)

In [27]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean, std

model = BaggingRegressor(n_jobs=1, random_state=12)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

MAE: -7.707 (0.093)


In [28]:
model.fit(X_train, y_train)

In [29]:
y_pred = model.predict(X_test)

# Print the predictions along with actual weights
pred = pd.DataFrame()
pred['Prediction'] = y_pred
pred['Actual'] = y_test.values
pred

Unnamed: 0,Prediction,Actual
0,10.000000,25
1,40.366667,39
2,58.681667,51
3,0.000000,0
4,0.300000,0
...,...,...
18175,45.116667,40
18176,55.023810,93
18177,0.100000,0
18178,0.000000,0


In [30]:
#Print the MSE and COD of both training/test
ridge_mse_test = mean_squared_error(y_test, model.predict(X_test))
ridge_cod_test = r2_score(y_test, model.predict(X_test))
ridge_mse_train = mean_squared_error(y_train, model.predict(X_train))
ridge_cod_train = r2_score(y_train, model.predict(X_train))


print("TRAIN MSE: " + str(ridge_mse_train))
print("TRAIN COD: " + str(ridge_cod_train))
print("TEST MSE: " + str(ridge_mse_test))
print("TEST COD: " + str(ridge_cod_test))
print("MODEL SCORE: " + str(model.score(X_test, y_test)))

TRAIN MSE: 79.33818035043484
TRAIN COD: 0.8648821654635226
TEST MSE: 160.68172133250548
TEST COD: 0.7255008833419236
MODEL SCORE: 0.7255008833419236


**JS STUFF**
---
---

In [31]:
# # Create a React component to display the model's predictions
# class ModelPredictions extends React.Component {
#   render() {
#     return (
#       <div>
#         The model predicts that the probability of the class is {this.props.prediction}.
#       </div>
#     );
#   }
# }

# # Render the React component in the Jupyter notebook
# ReactDOM.render(<ModelPredictions prediction={model.predict(data)} />, document.getElementById('root'));

**PREDICTIONS**
---
---

In [32]:
#Import module to get current date
import datetime

In [33]:
#Get current date
todays_date = datetime.date.today()
 
# # printing original date
# print("The original date is : " + str(todays_date)[:10])
 
# Initialize how many days ahead you want to predcit
weekday_idx = 7

next_week_dates = []
# Compute next week's dates and add to list
for day in range(weekday_idx):
    days_delta = day - todays_date.weekday()
    if days_delta < 7:
        days_delta += 1

    next_week_dates.append(todays_date + datetime.timedelta(days_delta))
 
# # printing result
# print("Dates of next week: " + str(next_week_dates))

In [34]:
#Create a list of inputs for the next 7 days
#Contains [Month, date, day_of_week, is_weekend, is_holiday]
model_input = list(range(len(next_week_dates)))

index = 0
for i in next_week_dates:
    model_input[index] = [i.month]
    model_input[index].append(i.day)
    model_input[index].append(i.weekday())

    #Check if date is a weekend
    if(i.weekday() == 5 or i.weekday() == 6):
        model_input[index].append(1)
    else:
        model_input[index].append(0)

    #Check if date is a holiday
    if(i.month == 11 and i.day == 10):
        model_input[index].append(1)
    elif(i.month == 11 and (i.day == 23 or i.day == 24)):
        model_input[index].append(1)
    elif (i.month == 12 and any(i.day == j for j in range(25, 32))):
        model_input[index].append(1)
    elif(i.month == 1 and i.day == 1):
        model_input[index].append(1)
    else:
        model_input[index].append(0)
    
    index+=1

print(model_input)

[[11, 28, 1, 0, 0], [11, 29, 2, 0, 0], [11, 30, 3, 0, 0], [12, 1, 4, 0, 0], [12, 2, 5, 1, 0], [12, 3, 6, 1, 0], [12, 4, 0, 0, 0]]


In [51]:
#Create a list of model predictions
#Format: [[Month, day, day_of_week, hour, minute, is_weekend, is_holiday, prediction]]
model_output = list()

day_index = 0
for main in model_input:
    for hour in range(0, 24):
        value_over_hour = []
        for minute in range(0, 60, 10):
            prediction = model.predict([[main[0], main[2], hour, minute, main[3], main[4]]])
            value_over_hour.append(prediction[0])
        
        average_over_hour = sum(value_over_hour) / len(value_over_hour)
        model_output.append([main[0], main[1], main[2], hour, main[3], main[4], round(average_over_hour)])
    day_index += 1

In [52]:
print(model_output)

[[11, 28, 1, 0, 0, 0, 22], [11, 28, 1, 1, 0, 0, 27], [11, 28, 1, 2, 0, 0, 1], [11, 28, 1, 3, 0, 0, 1], [11, 28, 1, 4, 0, 0, 1], [11, 28, 1, 5, 0, 0, 1], [11, 28, 1, 6, 0, 0, 1], [11, 28, 1, 7, 0, 0, 1], [11, 28, 1, 8, 0, 0, 1], [11, 28, 1, 9, 0, 0, 1], [11, 28, 1, 10, 0, 0, 1], [11, 28, 1, 11, 0, 0, 1], [11, 28, 1, 12, 0, 0, 1], [11, 28, 1, 13, 0, 0, 1], [11, 28, 1, 14, 0, 0, 1], [11, 28, 1, 15, 0, 0, 1], [11, 28, 1, 16, 0, 0, 1], [11, 28, 1, 17, 0, 0, 1], [11, 28, 1, 18, 0, 0, 1], [11, 28, 1, 19, 0, 0, 1], [11, 28, 1, 20, 0, 0, 1], [11, 28, 1, 21, 0, 0, 1], [11, 28, 1, 22, 0, 0, 1], [11, 28, 1, 23, 0, 0, 1], [11, 29, 2, 0, 0, 0, 20], [11, 29, 2, 1, 0, 0, 28], [11, 29, 2, 2, 0, 0, 19], [11, 29, 2, 3, 0, 0, 19], [11, 29, 2, 4, 0, 0, 19], [11, 29, 2, 5, 0, 0, 19], [11, 29, 2, 6, 0, 0, 19], [11, 29, 2, 7, 0, 0, 19], [11, 29, 2, 8, 0, 0, 19], [11, 29, 2, 9, 0, 0, 19], [11, 29, 2, 10, 0, 0, 19], [11, 29, 2, 11, 0, 0, 19], [11, 29, 2, 12, 0, 0, 19], [11, 29, 2, 13, 0, 0, 19], [11, 29, 2, 14,