**Data Preprocessing**

---

In [1]:
#Importing pymongo (Connecting MongoDB with Python) as well as other ML libraries
import math
import os
import random
from datetime import datetime

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pymongo import MongoClient
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler



In [2]:
#Connect to Database
# The connection string embeds the database username and password, so it is read
# from the environment instead of being committed with the notebook.
# NOTE(review): the password that was previously hard-coded here should be rotated.
mongo_uri = os.environ.get("SLUGMETER_MONGODB_URI")
if not mongo_uri:
    raise RuntimeError(
        "Set the SLUGMETER_MONGODB_URI environment variable to the MongoDB connection string"
    )
client = MongoClient(mongo_uri)

In [3]:
# Select the SlugMeterTest database on the connected client.
db = client["SlugMeterTest"]

In [4]:
# Handle to the Times collection, plus a cursor over all of its documents.
TimeStamps = db["Times"]
stamps = TimeStamps.find()

In [5]:
#Importing DB as dataframe
# The query is issued inside this cell instead of reusing the `stamps` cursor:
# a cursor can only be iterated once, so re-running the cell with an already
# consumed cursor would silently build an empty DataFrame.
df = pd.DataFrame(list(TimeStamps.find()))

In [6]:
# Add Month, Day, Hour, Minute as columns in dataframe
# Day is the weekday number (Mon:0 ... Sun:6).
# pd.to_datetime also accepts timestamps that carry fractional seconds, which the
# former fixed '%Y-%m-%d %H:%M:%S' strptime pattern rejected, and the vectorised
# .dt accessors do not depend on the frame having a 0..n-1 index.
# The new columns are integer-typed.
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['Month'] = parsed_timestamps.dt.month
df['Day'] = parsed_timestamps.dt.weekday
df['Hour'] = parsed_timestamps.dt.hour
df['Minute'] = parsed_timestamps.dt.minute

In [7]:
# Build a running counter from the timestamps: it starts at 1 and increases by
# one whenever a row is more than 10 minutes later than the row before it.
df['timestamp'] = pd.to_datetime(df['timestamp'])
diffs = df['timestamp'].diff()
laps = diffs > pd.Timedelta(minutes=10)
periods = laps.cumsum() + 1
df['Num_of_people'] = periods.astype(int)

In [8]:
#Get the number of people in the gym (halve the counter, or set it to a random high number if it passes gym capacity)
# The column is addressed by name: the previous positional index (7) read the value
# by name but wrote it by position, which only works while 'Num_of_people' happens
# to be the eighth column of the frame.
GYM_CAPACITY = 82
over_capacity = df['Num_of_people'] > GYM_CAPACITY

# Rows at or below capacity: integer-halve the counter.
df.loc[~over_capacity, 'Num_of_people'] = df.loc[~over_capacity, 'Num_of_people'] // 2

if over_capacity.any():
    # Rows above capacity: one independent draw from 48..77 per row, in row order.
    df.loc[over_capacity, 'Num_of_people'] = [
        random.randrange(48, 78) for _ in range(int(over_capacity.sum()))
    ]

In [9]:
#Zero out the number of people if the gym is not open (based on the hours posted on UCSC's website)
#Encoding: Mon:0, Tues:1, Wed:2, Thur:3, Fri:4, Sat:5, Sun:6
# The opening bounds follow the hours stated on each line. The previous conditions
# (Hour < 7 on weekdays, Hour <= 8 on weekends) also zeroed the first open hour,
# contradicting the 6am / 8am opening times written next to them.
# NOTE(review): confirm the posted hours; the code now matches the comments.
df.loc[(df.Day < 4) & ((df.Hour < 6) | (df.Hour >= 23)), "Num_of_people"] = 0  #Mon-Thurs (6am - 11pm)
df.loc[(df.Day == 4) & ((df.Hour < 6) | (df.Hour >= 22)), "Num_of_people"] = 0  #Fri (6am-10pm)
df.loc[(df.Day >= 5) & ((df.Hour < 8) | (df.Hour >= 20)), "Num_of_people"] = 0  #Sat/Sun (8am-8pm)


In [10]:
# Remove the columns that are no longer needed ('_id', 'isEntry' and the raw
# 'timestamp') in a single call; the derived date-part columns stay.
df = df.drop(columns=['_id', 'isEntry', 'timestamp'])

---

USE THIS FOR THE ML DATABASE

In [None]:
# Convert Timestamps into Ints
# NOTE(review): this cell reads df['timestamp'], but the preprocessing section above
# drops that column — presumably this targets a differently-shaped frame; confirm.
index = 0

for i in df['timestamp']:
    # Parse the value (through its string form) as 'YYYY-MM-DD HH:MM:SS' and echo it.
    d = datetime.strptime(str(i), '%Y-%m-%d %H:%M:%S')
    print(d)
    # Drop the last six characters, then join characters 0-1, 3-4 and the final two.
    # NOTE(review): for a string in the format parsed above this appears to produce a
    # non-numeric value (so int() would raise) — confirm the expected input format.
    info = i[:-6]
    new_info = info[:2] + "" + info[3:5] + "" + info[-2:]
    new_info = int(new_info)
    #print(new_info)
    # The write-back below is commented out, so new_info is currently discarded.
    #df.loc[index, 'time'] = new_info
    index+=1

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
#Create a separate column for all times in terms of hours
# Vectorised floor division replaces the per-row df.loc loop, which was slow and
# only correct for a 0..n-1 index. The result keeps the integer dtype of 'time'.
# Assumes 'time' holds HHMMSS-style integers — TODO confirm.
df['hours'] = df['time'] // 10000

In [None]:
#Create a separate column for the month of the year
# Vectorised through the .dt accessor instead of a per-row df.loc loop, so it does
# not depend on the frame having a 0..n-1 index.
# Assumes df['date'] holds date-like values that pd.to_datetime can parse — TODO confirm.
df['Month'] = pd.to_datetime(df['date']).dt.month

In [None]:
#Create a separate column for the day of the week
#Encoding: Mon:0, Tues:1, Wed:2, Thur:3, Fri:4, Sat:5, Sun:6
# Vectorised through the .dt accessor instead of a per-row df.loc loop, so it does
# not depend on the frame having a 0..n-1 index.
# Assumes df['date'] holds date-like values that pd.to_datetime can parse — TODO confirm.
df['Day'] = pd.to_datetime(df['date']).dt.weekday

In [None]:
# One-hot encode the weekday and hour columns (Day_* indicators first, then hours_*).
# NOTE(review): get_dummies replaces 'Day' and 'hours' with indicator columns, while
# the model-training section later selects 'hours' and 'Day' by name — confirm
# whether this encoding step is still meant to run.
df = pd.get_dummies(df, columns=['Day', 'hours'])

In [None]:
# Load gym_data.csv (path relative to the working directory); a later cell reads
# its 'number_people' column.
data = pd.read_csv('gym_data.csv')

In [None]:
# Show a bounded, richly rendered preview instead of printing the whole frame.
data.head()

In [None]:
# Copy the head-counts from the CSV into the frame's 'Num_of_people' column.
# (The former header comment described the day-of-week column; it was a copy/paste leftover.)
# Index-aligned assignment replaces the per-row df.loc loop: rows are matched on
# index label as before, but labels that exist only in `data` no longer append
# extra, otherwise-empty rows to df.
# NOTE(review): df rows without a matching label in `data` become NaN — confirm
# both frames are expected to have the same length.
df['Num_of_people'] = data['number_people']

In [None]:
#Create a separate column for holidays
#If it is a holiday, zero-out the Num_of_people
# Holidays covered: Nov 10, Nov 23-24, Dec 25-31 and Jan 1.
holiday_dates = pd.to_datetime(df['date'])
holiday_month = holiday_dates.dt.month
holiday_day = holiday_dates.dt.day

is_holiday = (
    ((holiday_month == 11) & holiday_day.isin([10, 23, 24]))
    | ((holiday_month == 12) & holiday_day.between(25, 31))
    | ((holiday_month == 1) & (holiday_day == 1))
)

# 1 on holidays, 0 otherwise.
df['isHoliday'] = is_holiday.astype(int)

# The count column is addressed by name: the previous code wrote to positional
# column 4 (and mixed positional and label row addressing), which silently hits a
# different column as soon as the column order changes.
df.loc[is_holiday, 'Num_of_people'] = 0

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

**Data Exploration**

---

In [None]:
# # Function to plot interactive plots using Plotly (Reference: https://medium.com/mlearning-ai/forecasting-timeseries-using-machine-learning-deep-learning-446eccc6eb6d)
# def plotl(df, x, y, title):
#     fig = px.line(df, x=x, y=y, title=title)
#     fig.show()

In [None]:
# plotl(df, 'date', df['Num_of_people'], 'Number of people (Year)')

Attendance appears to decline steadily over the years. This may reflect fewer people going to the gym, but it could also be an artifact of how the data was collected.

In [None]:
# # Creating a bar chart to show num of people during each day
# plt.bar(df['Day'], df['Num_of_people'], width = 0.6)
# plt.xlabel('Day of Week (Mon:0, Tues:1, Wed:2, Thur:3, Fri:4, Sat:5, Sun:6)')
# plt.ylabel('Num of People')
# plt.title('Number of people (Day)')

# # Show plot
# plt.show()

The number of people is spread fairly evenly across the days of the week. There is somewhat more activity towards the start and middle of the week, and it falls off during the weekend.

In [None]:
# # Creating a bar chart to show num of people during each hour
# plt.bar(df['hours'], df['Num_of_people'], width = 0.6)
# plt.xlabel('Hour of Day')
# plt.ylabel('Num of People')
# plt.title('Number of people (Hour)')

# # Show plot
# plt.show()

As suspected, people aren't going to the gym early in the morning; there is a definite spike around noon and high activity throughout the rest of the day.

**Model Training**

---

In [None]:
# Show the first two rows before building the model inputs.
df.head(2)

In [None]:
# Subset of the frame used to train the hour/day model: four feature columns
# plus the 'Num_of_people' target.
model_columns = ['time', 'hours', 'Day', 'isHoliday', 'Num_of_people']
df_hour_day = df[model_columns]

In [None]:
# Separate the feature matrix from the target (the number of people).
target_column = 'Num_of_people'
X = df_hour_day.drop(columns=target_column)  # every column except the target
y = df_hour_day[target_column]               # number of people

In [None]:
# Standardise every feature column of X.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before any train/test split.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Split data into train test split
# The raw features are split first and the scaler is then fitted on the training
# rows only; fitting it on the full data set leaks test-set statistics into
# training. random_state makes the split reproducible across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.28, random_state=12)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import BaggingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean, std

# Bagging regressor evaluated with 10-fold cross-validation repeated 3 times.
model = BaggingRegressor(n_jobs=10, random_state=12)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# 'neg_mean_absolute_error' yields the NEGATED error (higher is better), so
# n_scores holds values <= 0.
n_scores = cross_val_score(model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise')
# Flip the sign when reporting so the number printed under "MAE" is the actual
# mean absolute error rather than its negative.
print('MAE: %.3f (%.3f)' % (-mean(n_scores), std(n_scores)))

In [None]:
# Fit the model on the training split.
model.fit(X_train, y_train)

In [None]:
# Score the fitted model on the held-out split (for scikit-learn regressors,
# score() reports the coefficient of determination, R^2).
model.score(X_test, y_test)

In [None]:
# Predict on the held-out rows and show the predictions next to the true values.
y_pred = model.predict(X_test)

pred = pd.DataFrame({'Prediction': y_pred, 'Actual': y_test.values})
pred

In [None]:
# Report the mean squared error (MSE) and coefficient of determination (COD, R^2)
# on the training and test splits. Each split is predicted once and reused.
# The ridge_* names refer to whatever estimator `model` currently is.
train_predictions = model.predict(X_train)
test_predictions = model.predict(X_test)

ridge_mse_test = mean_squared_error(y_test, test_predictions)
ridge_cod_test = r2_score(y_test, test_predictions)
ridge_mse_train = mean_squared_error(y_train, train_predictions)
ridge_cod_train = r2_score(y_train, train_predictions)

for label, value in (
    ("TRAIN MSE: ", ridge_mse_train),
    ("TRAIN COD: ", ridge_cod_train),
    ("TEST MSE: ", ridge_mse_test),
    ("TEST COD: ", ridge_cod_test),
):
    print(label + str(value))

In [None]:
# Predict for one hand-written sample, given in the same column order as X.
# The model was trained on standardised features, so the raw values have to go
# through the fitted scaler first; passing them unscaled compares raw numbers
# against thresholds learned in standardised space.
sample = pd.DataFrame([[1720, 1, 0, 0]], columns=X.columns)
model.predict(scaler.transform(sample))

---

OLD STUFF

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=12,
)


In [None]:
# Standardise the features (StandardScaler centres each column and scales it to
# unit variance) using statistics learned from the training split only.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Establish model: rebinds `model` to a random-forest regressor; n_jobs=-1 asks
# scikit-learn to use all available cores.
model = RandomForestRegressor(n_jobs=-1)

In [None]:
# Sweep n_estimators from 10 to 190 in steps of 10, refitting the model each time
# and recording its score on the test split - this will take a minute or so.
estimators = np.arange(10, 200, 10)
scores = []
for n in estimators:
    # set_params and fit both return the estimator, so the calls can be chained.
    refit_model = model.set_params(n_estimators=n).fit(X_train, y_train)
    scores.append(refit_model.score(X_test, y_test))

plt.title("Effect of n_estimators")
plt.xlabel("n_estimator")
plt.ylabel("score")
plt.plot(estimators, scores)

In [None]:
# # Create a React component to display the model's predictions
# class ModelPredictions extends React.Component {
#   render() {
#     return (
#       <div>
#         The model predicts that the probability of the class is {this.props.prediction}.
#       </div>
#     );
#   }
# }

# # Render the React component in the Jupyter notebook
# ReactDOM.render(<ModelPredictions prediction={model.predict(data)} />, document.getElementById('root'));