In [1]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import statsmodels.api as sm
from scipy.stats import chi2_contingency
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report, confusion_matrix

In [2]:
# load the raw readings from the CSV file (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer='Occupancy.csv')


In [3]:
# parse the 'date' column into a new 'Datetime' column; nothing is dropped here,
# so the original 'date' column remains available alongside it
data['Datetime'] = pd.to_datetime(data['date'])


In [4]:
# count the missing entries in every column (summing down the rows)
data.isna().sum(axis=0)

date             0
Temperature      0
Humidity         0
Light            0
CO2              0
HumidityRatio    0
Occupancy        0
Datetime         0
dtype: int64

In [5]:
# binary flag: 1 when the Light reading is above zero, 0 otherwise
data['Light_on_off'] = (data['Light'] > 0).astype(int)

# hour of the day taken from the parsed timestamp
data['Hour'] = data['Datetime'].dt.hour

# day-of-the-week name for every reading
data['DOTW'] = data['Datetime'].dt.day_name()

# the categorical day names are mapped to numeric codes: Monday=1 ... Sunday=7
day_mapping = {
    day_name: code
    for code, day_name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}
data['DOTW_encoded'] = data['DOTW'].map(day_mapping)

# binary flag: 1 for Monday-Friday (codes 1-5), 0 for Saturday/Sunday
data['Weekday'] = (data['DOTW_encoded'] < 6).astype(int)

# binary flag: 1 when the hour is between 7 and 18 inclusive, 0 otherwise
data['Working_Hours'] = data['Hour'].between(7, 18).astype(int)


In [6]:
# function to fit a logistic regression on ONE predictor column and report its accuracy
def runreg(var, input_y, num):
    """Score a single predictor column with a logistic regression.

    Parameters
    ----------
    var : str
        Name of the column of the notebook-level ``data`` frame that is used
        as the only feature.
    input_y : array-like
        Target values; must have as many rows as ``data``.
    num : int
        Position of the predictor in the caller's list. It is not used in the
        computation and is kept so existing calls keep working.

    Returns
    -------
    list
        ``[var, accuracy]`` where ``accuracy`` is the accuracy on the held-out
        20% of rows, rounded to 8 decimal places.
    """
    # single-column feature frame, read from the module-level `data`
    features = pd.DataFrame(data[var])

    # 80/20 train/test split; the fixed seed makes the split reproducible
    feat_train, feat_test, target_train, target_test = train_test_split(
        features, input_y, test_size=0.2, random_state=42
    )

    # fit on the training rows, then score predictions for the held-out rows
    classifier = LogisticRegression(max_iter=500).fit(feat_train, target_train)
    test_accuracy = accuracy_score(target_test, classifier.predict(feat_test))

    return [var, round(test_accuracy, 8)]

In [7]:
# LOGISTIC REGRESSION: DEPENDENT VARIABLES ARE BINARY

# function to score one combined model and then every listed predictor on its own
def repeat_reg1(x_indep, y_dep, factor_list):
    """Collect test accuracies for a combined model and for each single predictor.

    Parameters
    ----------
    x_indep : DataFrame
        Feature columns used together in the combined model.
    y_dep : array-like
        Binary target values, one per row of ``x_indep``.
    factor_list : list of str
        Column names that are passed, one at a time, to ``runreg``.

    Returns
    -------
    list of list
        The first entry is ``["Grouped Predictors", accuracy]`` for the combined
        model (accuracy rounded to 8 decimals); it is followed by the result of
        ``runreg`` for every name in ``factor_list``, in the same order.
    """
    # 80/20 train/test split; the fixed seed makes the split reproducible
    train_x, test_x, train_y, test_y = train_test_split(
        x_indep, y_dep, test_size=0.2, random_state=42
    )

    # combined model: fit on the training rows, score on the held-out rows
    combined_model = LogisticRegression(max_iter=500)
    combined_model.fit(train_x, train_y)
    combined_accuracy = accuracy_score(test_y, combined_model.predict(test_x))

    results = [["Grouped Predictors", round(combined_accuracy, 8)]]

    # one single-predictor regression per factor; positions are numbered from 1
    results.extend(
        runreg(factor, y_dep, position)
        for position, factor in enumerate(factor_list, start=1)
    )

    return results
    

In [8]:
# Target (Y): Occupancy (binary 0/1)
# Features of the combined model (X):
#     Float: Temperature, Humidity, Light, CO2
#     Integer: DOTW_encoded, Hour
# Scored individually only:
#     Binary: Light_on_off, Weekday, Working_Hours

# target column and the feature frame for the combined model
y_dep1 = data["Occupancy"]
x_indep1 = data[[
    "Temperature",
    "Humidity",
    "Light",
    "CO2",
    "DOTW_encoded",
    "Hour",
]]

# every column that also gets its own single-predictor regression
factor_list1 = [
    "Temperature",
    "Humidity",
    "Light",
    "CO2",
    "DOTW_encoded",
    "Hour",
    "Light_on_off",
    "Weekday",
    "Working_Hours",
]

# combined model first, then one regression per entry of factor_list1
acc_list_occupancy = repeat_reg1(x_indep=x_indep1, y_dep=y_dep1, factor_list=factor_list1)


In [9]:
# Target (Y): Weekday (binary 0/1)
# Features of the combined model (X):
#     Float: Temperature, Humidity, Light, CO2
#     Integer: Hour
#     Binary: Occupancy
# Scored individually only:
#     Binary: Light_on_off, Working_Hours

# target column and the feature frame for the combined model
y_dep2 = data["Weekday"]
x_indep2 = data[[
    "Temperature",
    "Humidity",
    "Light",
    "CO2",
    "Hour",
    "Occupancy",
]]

# every column that also gets its own single-predictor regression
factor_list2 = [
    "Temperature",
    "Humidity",
    "Light",
    "CO2",
    "Hour",
    "Light_on_off",
    "Occupancy",
    "Working_Hours",
]

# combined model first, then one regression per entry of factor_list2
acc_list_weekday = repeat_reg1(x_indep=x_indep2, y_dep=y_dep2, factor_list=factor_list2)


In [10]:
# Target (Y): Working_Hours (binary 0/1)
# Features of the combined model (X):
#     Float: Temperature, Humidity, Light, CO2
#     Integer: DOTW_encoded
#     Binary: Occupancy
# Scored individually only:
#     Binary: Light_on_off, Weekday

# target column and the feature frame for the combined model
y_dep3 = data["Working_Hours"]
x_indep3 = data[[
    "Temperature",
    "Humidity",
    "Light",
    "CO2",
    "DOTW_encoded",
    "Occupancy",
]]

# every column that also gets its own single-predictor regression
factor_list3 = [
    "Temperature",
    "Humidity",
    "Light",
    "CO2",
    "DOTW_encoded",
    "Light_on_off",
    "Occupancy",
    "Weekday",
]

# combined model first, then one regression per entry of factor_list3
acc_list_hours = repeat_reg1(x_indep=x_indep3, y_dep=y_dep3, factor_list=factor_list3)


In [11]:
# Build one table per target from its accuracy list: index the rows by predictor,
# then transpose so predictors become columns and the single row holds the scores.

# Occupancy as target
df1 = pd.DataFrame(
    acc_list_occupancy,
    columns=['Predictor(s)', 'Accuracy Score for Occupancy'],
).set_index('Predictor(s)')
test1 = df1.T

# Weekday as target
df2 = pd.DataFrame(
    acc_list_weekday,
    columns=['Predictor(s)', 'Accuracy Score for Weekday'],
).set_index('Predictor(s)')
test2 = df2.T

# Working_Hours as target
df3 = pd.DataFrame(
    acc_list_hours,
    columns=['Predictor(s)', 'Accuracy Score for Working_Hours'],
).set_index('Predictor(s)')
test3 = df3.T

In [12]:
# stack the three single-row tables; the outer join keeps every predictor column,
# leaving NaN where a predictor was not used for a given target
df_concat = pd.concat([test1, test2, test3], axis=0, join='outer')
df_concat

Predictor(s),Grouped Predictors,Temperature,Humidity,Light,CO2,DOTW_encoded,Hour,Light_on_off,Weekday,Working_Hours,Occupancy
Accuracy Score for Occupancy,0.992218,0.812014,0.776265,0.989786,0.79037,0.776265,0.776265,0.845331,0.776265,0.776265,
Accuracy Score for Weekday,0.816634,0.812743,0.707442,0.705253,0.760944,,0.705253,0.705253,,0.705253,0.705253
Accuracy Score for Working_Hours,0.899562,0.745866,0.508268,0.899562,0.650778,0.521158,,0.899562,0.521158,,0.744893
