# Research Practicum

This notebook builds a linear regression model of room occupancy from the minimum Wi-Fi connection count per hour, per day, per room.

<b> GET DATA </b>

In [1]:
#import pandas package to read and merge csv files
import pandas as pd
#import csv package for reading from and writing to csv files
import csv
# Import package numpy for numeric computing
import numpy as np
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# read data from csv file into a data frame
# use double backslash to prevent unicode error from '\U' characters
# column names are supplied explicitly through names=; later cells refer to
# 'ass' as associated devices and 'auth' as authenticated devices
# NOTE(review): this is an absolute, machine-specific path and the recorded run
# below failed with "does not exist" -- consider a configurable data directory
wifi_df = pd.read_csv("D:\\Users\\Elayne Ruane\\Documents\\CSI MA\\research_practicum\\cleaned_data\\full.csv", names=['room', 'event_time', 'ass', 'auth'])

OSError: File b'D:\\Users\\Elayne Ruane\\Documents\\CSI MA\\research_practicum\\cleaned_data\\full.csv' does not exist

In [None]:
# check data loaded into data frame correctly
wifi_df.head()

In [None]:
# check data loaded into data frame correctly
wifi_df.tail()

<b> CLEAN DATA </b>

Convert timestamp to epoch time.

In [None]:
import time
from dateutil.parser import parse

def convert_to_epoch(df, column):
    '''function that reads in a dataframe with a column containing values in timestamp format and converts those values to epoch format

    Each value is parsed with dateutil's parse() and converted to whole seconds
    since the epoch with time.mktime(), which interprets the parsed time in the
    local timezone of the machine running the notebook.

    requires module time and parse function from dateutil.parser

    parameters
    ----------
    df is a dataframe; the values in `column` are overwritten in place
    column is a string that denotes the name of the column containing value in timestamp format

    returns
    -------
    the same dataframe object that was passed in
    '''

    # Convert every value first and assign the whole column once. This avoids
    # DataFrame.set_value (removed from pandas) and label-based row lookups,
    # so the function also works when the index is not 0..n-1.
    df[column] = [int(time.mktime(parse(stamp).timetuple())) for stamp in df[column]]
    return df

In [None]:
convert_to_epoch(wifi_df, 'event_time')

In [None]:
## Original code used to create convert_to_epoch() function above

#import time
#from dateutil.parser import parse

#for i in range(wifi_log_data.shape[0]):
#    x = wifi_log_data["event_time"][i]
#    y = parse(x)
#    epoch = int(time.mktime(y.timetuple()))
#    wifi_log_data.set_value(i,"event_time",epoch)

Clean Room Identifiers

In [None]:
def room_number(df, room_column):
    '''function that reads in a dataframe with a column containing room information in the format 'campus > building > roomcode-xxx'
    and replaces the values in the column with just the room ID which is the last character of the string in that column.

    Only the final character is kept, so room IDs are assumed to be a single
    character. Missing values in the column stay missing.

    parameters
    ----------
    df is a dataframe; the values in `room_column` are overwritten in place
    room_column is a string that denotes the name of the column containing the room strings

    returns
    -------
    the same dataframe object that was passed in
    '''
    # Vectorised string slice: keeps the last character of every value without
    # DataFrame.set_value (removed from pandas) or per-row label lookups.
    df[room_column] = df[room_column].str[-1:]
    return df

In [None]:
room_number(wifi_df, 'room')

In [None]:
wifi_df.head()

Add building.

In [None]:
wifi_df['building'] = 'school of computer science'

In [None]:
wifi_df.head()

Clean Occupancy Data

In [None]:
# put survey data in a dataframe
occupancy_df = pd.read_csv("D:\\Users\\Elayne Ruane\\Documents\\CSI MA\\research_practicum\\cleaned_data\\survey_data.csv")

In [None]:
occupancy_df.head()

In [None]:
# delete column 'Unnamed: 0'
# errors='ignore' keeps this cell safe to re-run once the column is already gone
occupancy_df = occupancy_df.drop('Unnamed: 0', axis=1, errors='ignore')

In [None]:
occupancy_df.head()

Convert epoch time into a human-readable format.

In [None]:
# convert 'event_time' values from EPOCH to DATETIME
wifi_df['event_time'] = pd.to_datetime(wifi_df.event_time, unit='s')
# use event_time as dataframe index 
wifi_df.set_index('event_time', inplace=True)

In [None]:
wifi_df.head()

In [None]:
# create two new columns, event_hour and event_day
# event_hour is the hour of day and event_day is the day of the month, both
# read from the datetime index
# NOTE(review): day-of-month alone does not distinguish months; if the log spans
# more than one month, different dates would share an event_day -- confirm the date range
wifi_df['event_hour'] = wifi_df.index.hour
wifi_df['event_day'] = wifi_df.index.day

In [None]:
wifi_df.head()

In [None]:
# convert 'event_time' values from EPOCH to DATETIME
occupancy_df['event_time'] = pd.to_datetime(occupancy_df.event_time, unit='s')
# use event_time as dataframe index 
occupancy_df.set_index('event_time', inplace=True)

In [None]:
occupancy_df.head()

In [None]:
# create two new columns, event_hour and event_day
# event_hour is the hour of day and event_day is the day of the month, both
# read from the datetime index; these are the keys later used to merge with the Wi-Fi data
# NOTE(review): day-of-month alone does not distinguish months; if the survey spans
# more than one month, different dates would share an event_day -- confirm the date range
occupancy_df['event_hour'] = occupancy_df.index.hour
occupancy_df['event_day'] = occupancy_df.index.day

In [None]:
occupancy_df.head()

<b> DATA ANALYSIS </b>

Survey data contains one recorded value per room, per day, per hour. Here, we take the minimum Wi-Fi reading per hour, per day, per room so that it can be matched against the survey data.

In [None]:
# Collapse the Wi-Fi log to one row per (room, event_day, event_hour).
# .min() is applied to every remaining column independently, so the 'ass' and
# 'auth' minima within a group may come from different log rows.
df_min_conn = wifi_df.groupby(['room', 'event_day', 'event_hour'], as_index=False).min()

In [None]:
df_min_conn.tail()

In [None]:
# merge data into single dataframe
df_min_conn['room'] = df_min_conn['room'].astype(int)
full_df = pd.merge(df_min_conn, occupancy_df, on=['room', 'event_day', 'event_hour'], how='inner')

full_df.head(15)

In [None]:
# add column for number of estimated occupants based on room capacity * occupancy rate

def estimate_occ(df, room, occupancy_rate, capacities=None):
    '''function that calculates the estimated number of room occupants

    The estimate is occupancy rate * room capacity and is written to a column
    named 'est_occupants' on the dataframe passed in. Nothing is returned.

    parameters
    ----------
    df is a dataframe with columns room and occupancy_rate
    room is a string denoting a column in df that contains INT values representing room IDs
    occupancy_rate is a string denoting a column in df that contains DECIMAL values that represent the estimated room occupancy rate
    capacities is an optional dict mapping room ID to room capacity; by default
        rooms two and three have a capacity of 90 and room four has a capacity of 220

    raises
    ------
    ValueError if any row has a room ID with no known capacity; in that case
    the dataframe is left unmodified
    '''
    if capacities is None:
        capacities = {2: 90, 3: 90, 4: 220}

    room_ids = df[room]

    # Validate every room before writing anything, so a bad row cannot leave
    # the dataframe half-filled.
    unknown = room_ids[~room_ids.isin(list(capacities))]
    if not unknown.empty:
        raise ValueError('Incorrect room number:', unknown.iloc[0])

    # Vectorised replacement for the per-row DataFrame.set_value calls
    # (set_value has been removed from pandas).
    df['est_occupants'] = df[occupancy_rate] * room_ids.map(capacities)
            

In [None]:
estimate_occ(full_df, 'room', 'occupancy')

In [None]:
full_df.head()

In [None]:
# look at correlations for estimated occupants and associated devices
full_df[['ass', 'auth', 'est_occupants']].corr()

In [None]:
# Scatter each connection count against the estimated occupants, labelling each
# panel with the correlation between the two plotted columns.
fig, axs = plt.subplots(1, 2, sharey=True, figsize=(15, 8))
for ax, column in zip(axs, ['ass', 'auth']):
    # Series.corr returns the pairwise coefficient directly, so the
    # DataFrame.as_matrix() lookup (removed from pandas) is no longer needed.
    r_value = full_df[column].corr(full_df['est_occupants'])
    full_df.plot(kind='scatter', x=column, y='est_occupants', label='%.3f' % r_value, ax=ax)

Linear Regression Model

In [None]:
import statsmodels.formula.api as sm

In [None]:
# can also use associated but higher correlation with authenticated 
lm = sm.ols(formula='est_occupants ~ auth', data=full_df).fit()

In [None]:
print(lm.params)

In [None]:
print(lm.summary())

Test the model.

In [None]:
full_df.describe()

In [None]:
test_df = pd.DataFrame({'auth':[30]}) #taking the mean value (rounded to decimal)
test_df.head()

In [None]:
lm.predict(test_df)

Plot the model

In [None]:
minmax_df = pd.DataFrame({'auth': [full_df.auth.min(), full_df.auth.max()]})
minmax_df.head()

In [None]:
#predict values based on the maximum and minimum value experienced.

predictions = lm.predict(minmax_df)
predictions

In [None]:
full_df.plot(kind='scatter', x='auth', y='est_occupants')
plt.plot(minmax_df, predictions, c='red', linewidth=2)

Predictions for entire dataframe

In [None]:
predict_df = pd.DataFrame({'real_val': full_df['est_occupants'], 'predict': lm.predict(full_df)})

In [None]:
predict_df

In [None]:
predict_df[['predict', 'real_val']].corr()

In [None]:
# calculate average difference
# i.e. the mean absolute difference between the survey-based estimate and the
# model prediction, computed on whole columns rather than row by row
abs_diff = (predict_df['real_val'] - predict_df['predict']).abs()
avg_diff = float(abs_diff.mean())

print(avg_diff)

Add linear predictions to dataframe

In [None]:
# Predict every row in one call instead of calling the model once per row and
# writing each value with DataFrame.set_value (removed from pandas).
full_df['linear_predict'] = lm.predict(full_df)

In [None]:
full_df.head()

In [None]:
# add column to dataframe for prediction category
full_df['cat_predict'] = None

In [None]:
def set_occupancy_category(prediction, max_room_cap):
    '''function that converts the predictions to a category

    The predicted head count is divided by the room capacity and the resulting
    ratio is snapped to one of the categories 0.0, 0.25, 0.5, 0.75 or 1.0.
    Each cut-off is the midpoint between two neighbouring categories, rounded
    up to two decimals (0.13, 0.38, 0.63, 0.88).

    Parameters
    ----------
    prediction: a float or int that is the predicted number
    max_room_cap: an int that is the max capacity of the room

    Returns
    -------
    The prediction category

    '''
    # calculate the occupancy rate and assign to variable 'ratio'
    ratio = prediction / max_room_cap

    # (exclusive upper bound, category) pairs in ascending order. The third
    # bound is 0.63 so that ratios between 0.5 and 0.63 map to the nearer
    # category 0.5 instead of 0.75.
    cutoffs = ((0.13, 0.0), (0.38, 0.25), (0.63, 0.5), (0.88, 0.75))
    for upper_bound, category in cutoffs:
        if ratio < upper_bound:
            return category
    return 1.0
    
# Rooms two and three hold 90 people, room four holds 220. Rows for any other
# room are left without a category (None), as before.
room_capacity = {2: 90, 3: 90, 4: 220}
# Build the whole column in one pass; this replaces the per-row
# DataFrame.set_value calls (set_value has been removed from pandas).
full_df['cat_predict'] = [
    set_occupancy_category(predicted, room_capacity[room_id]) if room_id in room_capacity else None
    for room_id, predicted in zip(full_df['room'], full_df['linear_predict'])
]

In [None]:
def set_occupancy_category(df, room, linear_predict, cat_predict):
    '''function that converts linear predictions to a defined category and updates the dataframe passed through

    NOTE: this definition rebinds the name set_occupancy_category, replacing
    the scalar helper of the same name defined in the previous cell.

    Each prediction is divided by the capacity of its room (90 for rooms two
    and three, 220 for room four) and the ratio is snapped to one of the
    categories 0.0, 0.25, 0.5, 0.75 or 1.0. Each cut-off is the midpoint
    between two neighbouring categories, rounded up to two decimals.

    Parameters
    ----------
    df: a dataframe
    room: a string that is the column in df containing room id values of type INT
    linear_predict: a string that is the column in df containing linear predictions
    cat_predict: a string that is the column in df that will contain category predictions

    Raises
    ------
    ValueError if a row has a room id with no known capacity; df is left
    unmodified in that case

    '''
    # Room four is 220 here to agree with the capacity used by the other cells.
    capacities = {2: 90, 3: 90, 4: 220}
    # (exclusive upper bound, category) pairs in ascending order
    cutoffs = ((0.13, 0.0), (0.38, 0.25), (0.63, 0.5), (0.88, 0.75))

    categories = []
    for room_id, prediction in zip(df[room], df[linear_predict]):
        # Fail loudly on an unknown room rather than reusing the capacity of a
        # previous row (or hitting an unbound variable on the first row).
        if room_id not in capacities:
            raise ValueError('Incorrect room number:', room_id)

        # calculate the occupancy rate and assign to variable 'ratio'
        ratio = prediction / capacities[room_id]

        # assign category based on ratio
        for upper_bound, category in cutoffs:
            if ratio < upper_bound:
                categories.append(category)
                break
        else:
            categories.append(1.0)

    # set category values in df in one assignment (DataFrame.set_value has
    # been removed from pandas)
    df[cat_predict] = categories

In [None]:
full_df.head()

Check accuracy of model according to survey data

In [None]:
# 1 where the predicted category equals the surveyed occupancy category, else 0.
# Whole-column comparison; replaces the per-row DataFrame.set_value calls
# (set_value has been removed from pandas).
full_df['accurate'] = (full_df['occupancy'] == full_df['cat_predict']).astype(int)

In [None]:
full_df.head()

In [None]:
accuracy = full_df['accurate'].sum()/full_df.shape[0]
accuracy

### Remove intercept to check if accuracy is better