 # Importing Significant Packages

In [None]:
import pandas as pd
import pickle 
import matplotlib.pyplot as plt
import statsmodels
import numpy as np
import seaborn as sns
from numpy import array
from scipy.stats import norm
from statsmodels.formula.api import logit
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder 
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn import preprocessing

# Data Exploration 

In [None]:
# Load the raw occupancy measurements from the CSV file in the working directory.
data = pd.read_csv("occupancy.csv")

In [None]:
# Preview the first rows to inspect the columns of the raw frame.
data.head()

In [None]:
# Split the combined "date" string on whitespace into a calendar-date part
# ("Date") and a clock-time part ("Time").
data[["Date", "Time"]] = data["date"].str.split(expand=True)
data.head()

In [None]:
# The combined "date" column is redundant now that Date and Time exist.
data = data.drop(columns="date")
data.head()

In [None]:
# Looking for the first occurrence of a weekend: display rows from position 5913
# onward (the weekend is reported to begin at the 5914th observation).
data.iloc[5913:]

In [None]:
# The weekend is reported to end at the 8794th observation, with the remaining
# rows being weekdays; display rows from position 8792 onward to check.
data.iloc[8792:]

In [None]:
# Add a binary "Day" column, defaulting every row to "weekday"; weekend rows
# are relabelled separately.
data["Day"] = "weekday"
data.head()

In [None]:
# Setting the days that are weekends manually.
# .loc slices by index label and includes both endpoints, so labels 5914 through
# 8793 are relabelled. This assumes the default 0..n-1 index from read_csv -- TODO confirm.
data.loc[5914:8793, "Day"] = "weekend"
# Show the relabelled span plus one row on either side as a visual check.
data.iloc[5913:8795]

In [None]:
# pd.to_timedelta needs an "HH:MM:SS" string, so append a zero seconds field
# to the "HH:MM" values.
data["Time"] = data["Time"].add(":00")
data.head()

In [None]:
# Convert the "HH:MM:SS" strings to timedeltas (time elapsed since midnight) so
# they can be binned numerically.
data["Time"] = pd.to_timedelta(data["Time"])
data.head()

In [None]:
# Setting a new categorical variable to specify the time of day.
# Bin edges: midnight, 07:00, 19:00 and end of day; both outer bins are "Night".
b = pd.to_timedelta(['00:00:00', '07:00:00', '19:00:00', '24:00:00'])
l = ['Night', 'Day', 'Night']
# pd.cut bins are right-closed by default, so a reading at exactly 00:00:00
# falls outside (00:00, 07:00] and becomes NaN (and is later dropped).
# include_lowest=True closes the first bin on the left so midnight readings are
# labelled "Night". ordered=False is required because "Night" is repeated.
data["dayoftime"] = pd.cut(
    data["Time"], bins=b, labels=l, ordered=False, include_lowest=True
)
data.head()

The dataframe is now cleaner with respect to the date: the timestamp has been split into separate Date and Time columns, and the Day (weekday/weekend) and dayoftime (Day/Night) parameters have been added from EDA.

In [None]:
# Optional export of the cleaned frame; left disabled.
#data.to_csv("cleaned_occupancy.csv", index=False)

# Checking for Disruptive Noise/Bias

In [None]:
# Report, per column, whether any value is missing. (data.any() only tests for
# truthy values, which says nothing about NaN entries.)
print(data.isna().any())

In [None]:
# DataFrame.info() writes its summary (dtypes and non-null counts) to stdout
# itself and returns None, so wrapping it in print() only adds a stray "None".
data.info()

# Removing any rows with NaN values
## Ensures a better estimation by eliminating nonexistent values

In [None]:
# (rows, columns) before dropping rows with missing values.
data.shape

In [None]:
# Discard, in place, every row that has a missing value in any listed column.
data.dropna(
    axis=0,
    how="any",
    subset=[
        'Temperature', 'Humidity', 'Light', 'CO2', 'HumidityRatio',
        'Occupancy', 'Date', 'Time', 'Day', 'dayoftime',
    ],
    inplace=True,
)

In [None]:
# (rows, columns) after dropping rows with missing values, for comparison.
data.shape

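## As noticed, 8 rows were removed because they had a missing value in at least one column
### Running print(data.info()), we can see that the dayoftime column had only 10800 non-null values, which accounts for the removed NaN rows
### TODO: look at the removed rows (likely the readings at exactly 00:00:00, since the first pd.cut bin excludes its lower edge by default)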

# One-Hot Encode Nominal Feature Vectors

In [None]:
# Nominal columns that will be one-hot encoded.
columns = ['Day', 'dayoftime']

# Take just those columns into their own frame and show its size and contents.
columns_encoding = data.reindex(columns=columns)
print(columns_encoding.shape)
print(columns_encoding)

In [None]:
# Replace each nominal column with one indicator column per category value.
columns_encoding = pd.get_dummies(columns_encoding)
print(columns_encoding)

In [None]:
# Append the one-hot indicator columns to the original frame (column-wise,
# aligned on the row index), then drop the raw nominal columns they replace.
frames = [data, columns_encoding]
resulting_df = pd.concat(frames, axis=1)
resulting_df = resulting_df.drop(['Day', 'dayoftime'], axis=1)
# head must be *called*; printing the bare method object does not give the
# short row preview.
print(resulting_df.head())

In [None]:
# Drop the calendar-date string; it is not used as a model feature.
df = resulting_df.drop(columns=['Date'])

# Later cells select a numeric "hour" feature, but no visible cell creates it.
# Derive it from the hours component of the Time timedelta when it is missing.
if "hour" not in df.columns:
    df["hour"] = df["Time"].dt.components["hours"]
# The raw timedelta column is not a plain numeric feature for the estimators
# used below, and its information is now carried by "hour".
if "Time" in df.columns:
    df = df.drop(columns=["Time"])
# head must be *called*; printing the bare method object does not give the
# short row preview.
print(df.head())

# Algorithm

In [None]:
# Quick unlabelled heat-map of the pairwise correlation matrix.
plt.matshow(df.corr())
plt.show()

In [None]:
# Pairwise correlation table, displayed as the cell output.
df.corr()

In [None]:
# Larger, labelled version of the correlation heat-map.
# Compute the matrix once and take the tick labels from its own columns: the
# set returned by select_dtypes(['number']) can disagree with the columns that
# corr() actually reports (e.g. boolean indicator columns), which would
# misalign or miscount the labels.
corr = df.corr()
f = plt.figure(figsize=(19, 15))
plt.matshow(corr, fignum=f.number)
plt.xticks(range(len(corr.columns)), corr.columns, fontsize=14, rotation=45)
plt.yticks(range(len(corr.columns)), corr.columns, fontsize=14)
cb = plt.colorbar()
cb.ax.tick_params(labelsize=14)
plt.title('Correlation Matrix', fontsize=16)

# We choose not to do PCA here, even though some features are correlated, because we want to preserve the original features in order to compute feature importances

In [None]:
# Two-stage pipeline: tree-based feature selection followed by a random-forest
# classifier. Note that `clf` is rebound by a later cell before this pipeline
# is ever fitted.
clf = Pipeline(
    steps=[
        ("feature_selection", SelectFromModel(estimator=ExtraTreesClassifier())),
        ("classification", RandomForestClassifier()),
    ]
)

In [None]:
# Target is the Occupancy column; every other column is a candidate feature.
y = df["Occupancy"]
X = df.drop("Occupancy", axis=1)

In [None]:
# Fit an extra-trees ensemble on all candidate features and show the
# per-feature importance scores (fit returns the fitted estimator itself).
clf = ExtraTreesClassifier().fit(X, y)
clf.feature_importances_

In [None]:
# Wrap the already-fitted ensemble as a feature selector (prefit=True skips refitting).
# NOTE(review): `model` is not referenced by any later cell visible here.
model = SelectFromModel(clf, prefit=True)

In [None]:
# List the features from most to least important: argsort gives ascending
# order, so the index array is reversed before ranking from 1.
for rank, col_idx in enumerate(np.argsort(clf.feature_importances_)[::-1], start=1):
    print("#", rank, "Feature:", X.columns[col_idx], ":", clf.feature_importances_[col_idx])

# We see that our top 5 features are Light, CO2, Temperature, Hour, and Humidity Ratio; let us use these features to train

In [None]:
# Keep only the five selected features, in this order (Light first); the
# target y is left unchanged.
X = X.loc[:, ["Light", "CO2", "Temperature", "hour", "HumidityRatio"]]

In [None]:
# Display the reduced feature frame.
X

In [None]:
# Standardise each feature to zero mean and unit variance; the result is a
# NumPy array, and the target y is left unchanged.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# in the next cell, so test rows influence the scaling statistics -- consider
# fitting on the training split only.
X = preprocessing.StandardScaler().fit(X).transform(X)

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Summary statistics of the target. The classes are not balanced, so model
# selection below is scored with balanced accuracy.
y.describe()

## SVM

In [None]:
# Grid search over the SVM regularisation strength and kernel, ranked by
# balanced accuracy; verbose=3 logs progress per fit.
estimator = SVC()
param_grid = {
    "C": [1, 10, 100, 1000],
    "kernel": ["linear", "poly", "rbf"],
}

search = GridSearchCV(
    estimator,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    verbose=3,
)

In [None]:
# Run the cross-validated grid search on the training split only.
search.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate cross-validation results, best-ranked first.
pd.DataFrame(search.cv_results_).sort_values(by = 'rank_test_score')

SVC Best Estimator: {'C': 100, 'kernel': 'poly'}

Cross-validated balanced accuracy (mean over 5 folds): 0.991054

In [None]:
# Keep a handle on the fitted SVM search before `search` is reused below.
svc_clf = search

In [None]:
# Score the best SVM on the held-out test split.
# NOTE(review): best_estimator_.score is the classifier's default score (mean
# accuracy), not the balanced accuracy used for model selection; svc_clf.score
# would apply the search's scorer -- confirm which metric is intended.
svc_clf.best_estimator_.score(X_test, y_test)

## Random Forests

In [None]:
# Grid search over the random-forest split criterion and ensemble size, ranked
# by balanced accuracy; verbose=3 logs progress per fit.
estimator = RandomForestClassifier()
param_grid = {
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 100, 1000],
}

search = GridSearchCV(
    estimator,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    verbose=3,
)

In [None]:
# Run the cross-validated grid search on the training split only.
search.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate cross-validation results, best-ranked first.
pd.DataFrame(search.cv_results_).sort_values(by = 'rank_test_score')

RF Best Estimator: {'criterion': 'entropy', 'n_estimators': 1000}

Cross-validated balanced accuracy (mean over 5 folds): 0.990605

In [None]:
# Keep a handle on the fitted random-forest search before `search` is reused below.
rf_clf = search

In [None]:
# Score the best random forest on the held-out test split.
# NOTE(review): best_estimator_.score is the classifier's default score (mean
# accuracy), not the balanced accuracy used for model selection -- confirm
# which metric is intended.
rf_clf.best_estimator_.score(X_test, y_test)

## Removing Light

In [None]:
# Light is column 0 of the scaled array (it was listed first when the feature
# columns were selected), so keeping columns 1 onward removes it. The target y
# is left unchanged.
X = X[:, 1:]

In [None]:
# Redo the 80/20 split on the reduced feature set, with the same random_state
# as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

## SVM (again)

In [None]:
# Grid search for the SVM without the Light feature (C grid stops at 100
# here), ranked by balanced accuracy; verbose=3 logs progress per fit.
estimator = SVC()
param_grid = {
    "C": [1, 10, 100],
    "kernel": ["linear", "poly", "rbf"],
}

search = GridSearchCV(
    estimator,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    verbose=3,
)

In [None]:
# Run the cross-validated grid search on the training split only.
search.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate cross-validation results, best-ranked first.
pd.DataFrame(search.cv_results_).sort_values(by = 'rank_test_score')

SVC Best Estimator: {'C': 100, 'kernel': 'rbf'}

Cross-validated balanced accuracy (mean over 5 folds): 0.977083

In [None]:
# Keep a handle on the fitted no-Light SVM search before `search` is reused below.
svc_clf_no_light = search

In [None]:
# Score the best no-Light SVM on the held-out test split.
# NOTE(review): best_estimator_.score is the classifier's default score (mean
# accuracy), not the balanced accuracy used for model selection -- confirm
# which metric is intended.
svc_clf_no_light.best_estimator_.score(X_test, y_test)

## Random Forests (again)

In [None]:
# Grid search for the random forest without the Light feature, ranked by
# balanced accuracy; verbose=3 logs progress per fit.
estimator = RandomForestClassifier()
param_grid = {
    "criterion": ["gini", "entropy"],
    "n_estimators": [10, 100, 1000],
}

search = GridSearchCV(
    estimator,
    param_grid=param_grid,
    scoring="balanced_accuracy",
    verbose=3,
)

In [None]:
# Run the cross-validated grid search on the training split only.
search.fit(X_train, y_train)

In [None]:
# Tabulate the per-candidate cross-validation results, best-ranked first.
pd.DataFrame(search.cv_results_).sort_values(by = 'rank_test_score')

RF Best Estimator: {'criterion': 'entropy', 'n_estimators': 1000}

Cross-validated balanced accuracy (mean over 5 folds): 0.983292

In [None]:
# Keep a handle on the fitted no-Light random-forest search.
rf_clf_no_light = search

In [None]:
# Score the best no-Light random forest on the held-out test split.
# NOTE(review): best_estimator_.score is the classifier's default score (mean
# accuracy), not the balanced accuracy used for model selection -- confirm
# which metric is intended.
rf_clf_no_light.best_estimator_.score(X_test, y_test)

# Future alterations to Project
As the dataset categories imply, the given projections can only be fairly extrapolated to rooms of similar occupancy and cannot be generalized to all room occupancies. A modification to this project would be to include higher-occupancy rooms with specific temporal features such as seasonal change, a notion of geographic location that determines the tendency for room occupancy, and external uncontrolled environments/diseases. This would yield a more viable and predictive model based on surrounding particles and the state of the system, which is significant in a pandemic, as those who are immunocompromised would greatly benefit from the space. A limitation of this project is that the numerical readings and the type of environment may be distorted by noise in the system. An improvement to this project would be to first classify the rooms into categories relating to setting, temperature readings, etc., in which dense rooms and lower-reading rooms will on average differ by the same amount (instead of checking based on day). Also, combining additional room statistics from other datasets might help establish a foundational trend seen among all rooms with similar features.