# Research Practicum

This notebook contains a model which uses the max value per hour per day per room.

<b> GET DATA </b>

In [109]:
#import pandas package to read and merge csv files
import pandas as pd
#import csv package for reading from and writing to csv files
import csv
# Import package numpy for numeric computing
import numpy as np
# Import package matplotlib for visualisation/plotting
import matplotlib.pyplot as plt
%matplotlib inline

# Get wifi logs data

In [110]:
# read data from csv file into a data frame
# code is now OS agnostic

import os

a = '..' # removed slash
b = 'cleaned_data' # removed slash
c = 'full.csv'

print(os.path.join(a, b, c))

wifi_df = pd.read_csv(os.path.join(a, b, c), names=['room', 'event_time', 'ass', 'auth'])

../cleaned_data/full.csv


In [111]:
# check data loaded into data frame correctly
wifi_df.head()

Unnamed: 0,room,event_time,ass,auth
0,Belfield > Computer Science > B-002,Mon Nov 02 20:32:06 GMT+00:00 2015,0,0
1,Belfield > Computer Science > B-002,Mon Nov 02 20:37:10 GMT+00:00 2015,0,0
2,Belfield > Computer Science > B-002,Mon Nov 02 20:42:12 GMT+00:00 2015,0,0
3,Belfield > Computer Science > B-002,Mon Nov 02 20:47:14 GMT+00:00 2015,0,0
4,Belfield > Computer Science > B-002,Mon Nov 02 20:52:11 GMT+00:00 2015,0,0


In [112]:
# check data loaded into data frame correctly
wifi_df.tail()

Unnamed: 0,room,event_time,ass,auth
12412,Belfield > Computer Science > B-004,Tue Nov 17 11:01:59 GMT+00:00 2015,18,18
12413,Belfield > Computer Science > B-004,Tue Nov 17 11:06:37 GMT+00:00 2015,32,32
12414,Belfield > Computer Science > B-004,Tue Nov 17 11:13:19 GMT+00:00 2015,39,39
12415,Belfield > Computer Science > B-004,Tue Nov 17 11:21:11 GMT+00:00 2015,42,42
12416,Belfield > Computer Science > B-004,Tue Nov 17 11:26:07 GMT+00:00 2015,46,46


<b> CLEAN DATA </b>

Convert timestamp to epoch time.

In [113]:
import time
from dateutil.parser import parse

def convert_to_epoch(df, column):
    '''function that reads in a dataframe with a column containing values in timestamp format and converts those values to epoch forma
   
    requires module time and parse function from dateutil.parser
    
    paramaters
    ----------
    df is a dataframe
    column is a string that denotes the name of the column containing value in timestamp format
    '''
    
    #for loop that iterates through each row in the dataframe
    for i in range(df.shape[0]):
        # variable 'x' is assigned the value from the column and row 'i'
        x = df[column][i]
        # variable 'y' is assigned the result of variable 'x' passed through the parse method 
        y = parse(x)
        # variable 'epoch' is assigned 'y' value converted to epoch time
        epoch = int(time.mktime(y.timetuple()))
        # set column value to value of variable 'epoch'
        df.set_value(i, column, epoch)
    return df

In [114]:
convert_to_epoch(wifi_df, 'event_time').head()

Unnamed: 0,room,event_time,ass,auth
0,Belfield > Computer Science > B-002,1446496326,0,0
1,Belfield > Computer Science > B-002,1446496630,0,0
2,Belfield > Computer Science > B-002,1446496932,0,0
3,Belfield > Computer Science > B-002,1446497234,0,0
4,Belfield > Computer Science > B-002,1446497531,0,0


Clean Room Identifiers

In [115]:
def room_number(df, room_column):
    '''function that reads in a dataframe with a column containing room information in the format 'campus > building > roomcode-xxx' 
    and replaces the values in the column with just the room ID which is the last character of the string in that column.    
    '''
    # for loop that iterates through each row in the df
    for i in range(df.shape[0]):
        # selects last character of the string in the room_column which is the room ID
        df.set_value(i, room_column, df[room_column][i][-1:])
    return df

In [116]:
room_number(wifi_df, 'room').head()

Unnamed: 0,room,event_time,ass,auth
0,2,1446496326,0,0
1,2,1446496630,0,0
2,2,1446496932,0,0
3,2,1446497234,0,0
4,2,1446497531,0,0


Add building.

In [117]:
wifi_df['building'] = 'school of computer science'

In [118]:
wifi_df.head()

Unnamed: 0,room,event_time,ass,auth,building
0,2,1446496326,0,0,school of computer science
1,2,1446496630,0,0,school of computer science
2,2,1446496932,0,0,school of computer science
3,2,1446497234,0,0,school of computer science
4,2,1446497531,0,0,school of computer science


# Get occupancy data

In [119]:
# put survey data in a dataframe

a = '..' # removed slash
b = 'cleaned_data' # removed slash
c = 'survey_data.csv'

print(os.path.join(a, b, c))

occupancy_df = pd.read_csv(os.path.join(a, b, c))

../cleaned_data/survey_data.csv


In [120]:
occupancy_df.head()

Unnamed: 0.1,Unnamed: 0,room,event_time,occupancy,building
0,0,4,1446454800,0.25,school of computer science
1,1,2,1446454800,0.25,school of computer science
2,2,3,1446454800,0.25,school of computer science
3,3,4,1446458400,0.5,school of computer science
4,4,2,1446458400,0.5,school of computer science


In [121]:
# delete column 'Unnamed: 0'
del occupancy_df['Unnamed: 0']

In [122]:
occupancy_df.head()

Unnamed: 0,room,event_time,occupancy,building
0,4,1446454800,0.25,school of computer science
1,2,1446454800,0.25,school of computer science
2,3,1446454800,0.25,school of computer science
3,4,1446458400,0.5,school of computer science
4,2,1446458400,0.5,school of computer science


Convert EPCOH time into human-readable format.

In [123]:
# convert 'event_time' values from EPOCH to DATETIME
wifi_df['event_time'] = pd.to_datetime(wifi_df.event_time, unit='s')
# use event_time as dataframe index 
wifi_df.set_index('event_time', inplace=True)

In [124]:
wifi_df.head()

Unnamed: 0_level_0,room,ass,auth,building
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-11-02 20:32:06,2,0,0,school of computer science
2015-11-02 20:37:10,2,0,0,school of computer science
2015-11-02 20:42:12,2,0,0,school of computer science
2015-11-02 20:47:14,2,0,0,school of computer science
2015-11-02 20:52:11,2,0,0,school of computer science


In [125]:
# create two new columns, event_hour and event_day
wifi_df['event_hour'] = wifi_df.index.hour
wifi_df['event_day'] = wifi_df.index.day

In [126]:
wifi_df.head()

Unnamed: 0_level_0,room,ass,auth,building,event_hour,event_day
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-02 20:32:06,2,0,0,school of computer science,20,2
2015-11-02 20:37:10,2,0,0,school of computer science,20,2
2015-11-02 20:42:12,2,0,0,school of computer science,20,2
2015-11-02 20:47:14,2,0,0,school of computer science,20,2
2015-11-02 20:52:11,2,0,0,school of computer science,20,2


In [127]:
# convert 'event_time' values from EPOCH to DATETIME
occupancy_df['event_time'] = pd.to_datetime(occupancy_df.event_time, unit='s')
# use event_time as dataframe index 
occupancy_df.set_index('event_time', inplace=True)

In [128]:
occupancy_df.head()

Unnamed: 0_level_0,room,occupancy,building
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-11-02 09:00:00,4,0.25,school of computer science
2015-11-02 09:00:00,2,0.25,school of computer science
2015-11-02 09:00:00,3,0.25,school of computer science
2015-11-02 10:00:00,4,0.5,school of computer science
2015-11-02 10:00:00,2,0.5,school of computer science


In [129]:
# create two new columns, event_hour and event_day
occupancy_df['event_hour'] = occupancy_df.index.hour
occupancy_df['event_day'] = occupancy_df.index.day

In [130]:
occupancy_df.head()

Unnamed: 0_level_0,room,occupancy,building,event_hour,event_day
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-11-02 09:00:00,4,0.25,school of computer science,9,2
2015-11-02 09:00:00,2,0.25,school of computer science,9,2
2015-11-02 09:00:00,3,0.25,school of computer science,9,2
2015-11-02 10:00:00,4,0.5,school of computer science,10,2
2015-11-02 10:00:00,2,0.5,school of computer science,10,2


# Get timetable data

In [131]:
# read data from csv file into a data frame
# code is now OS agnostic

import os

a = '..' # removed slash
b = 'cleaned_data' # removed slash
c = 'timetable.csv'

print(os.path.join(a, b, c))

timetable_df = pd.read_csv(os.path.join(a, b, c))

../cleaned_data/timetable.csv


In [132]:
del timetable_df['Unnamed: 0']

# convert 'event_time' values from EPOCH to DATETIME
timetable_df['event_time'] = pd.to_datetime(timetable_df.event_time, unit='s')
# use event_time as dataframe index 
timetable_df.set_index('event_time', inplace=True)

# create two new columns, event_hour and event_day
timetable_df['event_hour'] = timetable_df.index.hour
timetable_df['event_day'] = timetable_df.index.day

timetable_df.head()

Unnamed: 0_level_0,room,module,nb_reg_stud,building,event_hour,event_day
event_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-11-02 09:00:00,2,COMP30190,29.0,school of computer science,9,2
2015-11-03 09:00:00,2,,,school of computer science,9,3
2015-11-04 09:00:00,2,COMP30190,29.0,school of computer science,9,4
2015-11-05 09:00:00,2,,,school of computer science,9,5
2015-11-06 09:00:00,2,COMP30220,38.0,school of computer science,9,6


# Merge data

Survey data contains one recorded value per room, per day, per hour. Here, we take the max reading per hour, per day, per room.

In [133]:
df_max_conn = wifi_df.groupby(['room', 'event_day', 'event_hour'], as_index=False).median()

In [134]:
df_max_conn.tail()

Unnamed: 0,room,event_day,event_hour,ass,auth
1051,4,17,7,0.0,0.0
1052,4,17,8,0.0,0.0
1053,4,17,9,72.0,72.0
1054,4,17,10,85.0,85.0
1055,4,17,11,39.0,39.0


In [135]:
# merge data into single dataframe
df_max_conn['room'] = df_max_conn['room'].astype(int)
df = pd.merge(df_max_conn, occupancy_df, on=['room', 'event_day', 'event_hour'], how='inner')
df = pd.merge(df, timetable_df, on=['room', 'event_day', 'event_hour'], how='inner')

df.head(15)

Unnamed: 0,room,event_day,event_hour,ass,auth,occupancy,building_x,module,nb_reg_stud,building_y
0,2,3,9,2.0,2.0,0.0,school of computer science,,,school of computer science
1,2,3,10,29.0,29.0,0.5,school of computer science,COMP30110,45.0,school of computer science
2,2,3,11,27.0,27.0,0.5,school of computer science,COMP30260,18.0,school of computer science
3,2,3,12,16.0,16.0,0.5,school of computer science,COMP40370,27.0,school of computer science
4,2,3,13,13.0,13.0,0.0,school of computer science,,,school of computer science
5,2,3,14,47.0,47.0,0.75,school of computer science,COMP30240,42.0,school of computer science
6,2,3,15,35.0,35.0,0.25,school of computer science,COMP30240,42.0,school of computer science
7,2,3,16,36.5,36.5,0.25,school of computer science,COMP40370,27.0,school of computer science
8,2,4,9,14.0,14.0,0.25,school of computer science,COMP30190,29.0,school of computer science
9,2,4,10,15.0,15.0,0.25,school of computer science,COMP40660,53.0,school of computer science


In [136]:
# add column for number of estimated occupants based on room capacity * occupancy rate

def estimate_occ(df,room, occupancy_rate):
    '''function that caluclates the estimated number of room occupants
    
    parameters
    ----------
    df is a dataframe with columns room and occupancy_rate
    room is a string denoting a column in df that contains INT values representing room IDs
    occupancy_rate is a string denoting a column in df that contains DECIMAL values that represent the estimated room occupancy rate
    
    '''
    #for loop that iterates through each row of the df
    for i in range(df.shape[0]):
        
        #room two and three have capacity of 90
        if df[room][i] == 2 or df[room][i] == 3:
            # calculate estimated occupants for row, assign to variable 'est'
            est = df[occupancy_rate][i] * 90
            #set value in new column
            df.set_value(i, 'est_occupants', est)
        
        #room four has a capcity of 220
        elif df[room][i] == 4:
            est = df[occupancy_rate][i] * 220
            df.set_value(i, 'est_occupants', est)
        
        else:
            raise ValueError('Incorrect room number:', df[room][i])
            

In [139]:
estimate_occ(df, 'room', 'occupancy')

df.head()

Unnamed: 0,room,event_day,event_hour,ass,auth,occupancy,building_x,module,nb_reg_stud,building_y,est_occupants
0,2,3,9,2.0,2.0,0.0,school of computer science,,,school of computer science,0.0
1,2,3,10,29.0,29.0,0.5,school of computer science,COMP30110,45.0,school of computer science,45.0
2,2,3,11,27.0,27.0,0.5,school of computer science,COMP30260,18.0,school of computer science,45.0
3,2,3,12,16.0,16.0,0.5,school of computer science,COMP40370,27.0,school of computer science,45.0
4,2,3,13,13.0,13.0,0.0,school of computer science,,,school of computer science,0.0


# Linear Regression Model

In [33]:
import statsmodels.formula.api as sm

In [166]:
# can also use associated but higher correlation with authenticated 
lm = sm.ols(formula='est_occupants ~ auth + C(module) + nb_reg_stud + room + event_hour', data=df).fit()

In [167]:
print(lm.params)

Intercept                                  5.078542e+00
C(module)[T.COMP10130]                    -2.390210e+01
C(module)[T.COMP10280]                    -2.367260e+01
C(module)[T.COMP20010]                    -2.200963e+01
C(module)[T.COMP20020]                    -1.577075e+01
C(module)[T.COMP20070]                     4.105967e+00
C(module)[T.COMP20110]                     1.061144e+01
C(module)[T.COMP20130]                    -7.031593e+00
C(module)[T.COMP30010]                    -2.514546e+01
C(module)[T.COMP30060]                    -3.312754e+01
C(module)[T.COMP30070]                    -3.770922e+01
C(module)[T.COMP30080]                    -4.412250e+00
C(module)[T.COMP30110]                    -3.660668e+00
C(module)[T.COMP30170]                     1.156012e-01
C(module)[T.COMP30190]                    -1.126574e+01
C(module)[T.COMP30220]                    -9.352082e+00
C(module)[T.COMP30240]                     7.447524e+00
C(module)[T.COMP30250]                    -3.680

In [168]:
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:          est_occupants   R-squared:                       0.711
Model:                            OLS   Adj. R-squared:                  0.608
Method:                 Least Squares   F-statistic:                     6.887
Date:                Fri, 15 Jul 2016   Prob (F-statistic):           3.30e-16
Time:                        11:38:04   Log-Likelihood:                -673.61
No. Observations:                 153   AIC:                             1429.
Df Residuals:                     112   BIC:                             1553.
Df Model:                          40                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------------------------------

### We can see from the above that no feature is found to be significant apart from the number of authenticated devices and room id.

In [169]:
lm = sm.ols(formula='est_occupants ~ auth + nb_reg_stud', data=df).fit()

In [170]:
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:          est_occupants   R-squared:                       0.581
Model:                            OLS   Adj. R-squared:                  0.575
Method:                 Least Squares   F-statistic:                     104.0
Date:                Fri, 15 Jul 2016   Prob (F-statistic):           4.68e-29
Time:                        11:38:17   Log-Likelihood:                -702.02
No. Observations:                 153   AIC:                             1410.
Df Residuals:                     150   BIC:                             1419.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [95.0% Conf. Int.]
-------------------------------------------------------------------------------
Intercept       4.3104      3.429      1.257      

In [173]:
lm = sm.ols(formula='est_occupants ~ auth + event_hour', data=df).fit()

In [174]:
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:          est_occupants   R-squared:                       0.586
Model:                            OLS   Adj. R-squared:                  0.582
Method:                 Least Squares   F-statistic:                     150.9
Date:                Fri, 15 Jul 2016   Prob (F-statistic):           1.55e-41
Time:                        11:38:52   Log-Likelihood:                -976.50
No. Observations:                 216   AIC:                             1959.
Df Residuals:                     213   BIC:                             1969.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept     13.3315      8.734      1.526      0.1

In [178]:
lm = sm.ols(formula='est_occupants ~ auth + room', data=df).fit()

lm.params

Intercept   -10.310480
auth          0.832804
room          5.703769
dtype: float64

In [176]:
print(lm.summary())

                            OLS Regression Results                            
Dep. Variable:          est_occupants   R-squared:                       0.601
Model:                            OLS   Adj. R-squared:                  0.597
Method:                 Least Squares   F-statistic:                     160.1
Date:                Fri, 15 Jul 2016   Prob (F-statistic):           3.62e-43
Time:                        11:39:55   Log-Likelihood:                -972.69
No. Observations:                 216   AIC:                             1951.
Df Residuals:                     213   BIC:                             1962.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
Intercept    -10.3105      5.708     -1.806      0.0

# Test with intercept

In [191]:
lm.predict(df)

array([   2.7626661 ,   25.24836557,   23.5827582 ,   14.42191767,
         11.92350662,   40.23883188,   30.24518767,   31.4943932 ,
         12.7563103 ,   13.58911399,    7.7594882 ,   30.24518767,
         21.08434715,   20.25154346,    5.26107715,   16.92032873,
          3.17906794,   29.41238399,   28.99598214,   18.16953425,
         10.25789925,   26.91397293,   41.07163556,   25.66476741,
          2.7626661 ,    4.01187162,   15.25472136,   12.7563103 ,
          9.84149741,   28.5795803 ,   28.5795803 ,   18.58593609,
         13.58911399,   15.25472136,    7.7594882 ,   15.25472136,
         30.24518767,   10.67430109,   13.17271215,   56.06210188,
          1.92986241,   21.50074899,   16.92032873,   27.74677662,
          6.92668452,   37.74042083,   35.65841162,   30.24518767,
         11.92350662,    9.84149741,    9.42509557,   33.99280425,
         16.08752504,   14.83831951,   26.08116925,   21.08434715,
          5.26107715,   24.41556188,   25.66476741,   13.58911

In [181]:
df['prediction_max'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'prediction_max', lm.predict(df)[i])

In [186]:
# add column to dataframe for prediction category
df['cat_predict'] = None

In [187]:
def set_occupancy_category(df, room, linear_predict, cat_predict):
    '''function that converts linear predictions to a defined category and updates the dataframe passed through
    
    Parameters
    ----------
    df: a dataframe
    room: a string that is the column in df containing room id values of type INT
    linear_predict: a string that is the column in df containing linear predictions
    cat_predict: a string that is the column in df that will containing category predictions
    
    '''
    
    for i in range(df.shape[0]):
        
        # assign room capacity
        if df[room][i] == 2 or df[room][i] == 3:
            cap = 90
        elif df[room][i] == 4:
            cap = 200
            
        # calculate the occupancy rate and assign to variable 'ratio'
        ratio = df[linear_predict][i]/ cap
        
        # assign category based on ratio
        if ratio < 0.13:
            cat = 0.0
        elif ratio < 0.38:
            cat =  0.25
        elif ratio < 0.5:
            cat =  0.5
        elif ratio < 0.88:
            cat =  0.75
        else:
            cat =  1.0
        
        # set category value in df
        df.set_value(i, cat_predict, cat)

In [188]:
set_occupancy_category(df, 'room', 'prediction_max', 'cat_predict')

In [189]:
df.head()

Unnamed: 0,room,event_day,event_hour,ass,auth,occupancy,building_x,module,nb_reg_stud,building_y,est_occupants,prediction_max,cat_predict
0,2,3,9,2.0,2.0,0.0,school of computer science,,,school of computer science,0.0,2.76267,0.0
1,2,3,10,29.0,29.0,0.5,school of computer science,COMP30110,45.0,school of computer science,45.0,25.2484,0.25
2,2,3,11,27.0,27.0,0.5,school of computer science,COMP30260,18.0,school of computer science,45.0,23.5828,0.25
3,2,3,12,16.0,16.0,0.5,school of computer science,COMP40370,27.0,school of computer science,45.0,14.4219,0.25
4,2,3,13,13.0,13.0,0.0,school of computer science,,,school of computer science,0.0,11.9235,0.25


In [190]:
df['accurate'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'accurate', 1 if df['occupancy'][i] == df['cat_predict'][i] else 0)

accuracy = df['accurate'].sum()/df.shape[0]
accuracy

0.5972222222222222

# Test with no intercept

In [192]:
lm.params

Intercept   -10.310480
auth          0.832804
room          5.703769
dtype: float64

In [193]:
df['prediction_max'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'prediction_max', df['auth'][i] * lm.params['auth'] + df['room'][i] * lm.params['room'])

In [194]:
set_occupancy_category(df, 'room', 'prediction_max', 'cat_predict')

In [195]:
df['accurate'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'accurate', 1 if df['occupancy'][i] == df['cat_predict'][i] else 0)

accuracy = df['accurate'].sum()/df.shape[0]
accuracy

0.4074074074074074

In [196]:
df.head()

Unnamed: 0,room,event_day,event_hour,ass,auth,occupancy,building_x,module,nb_reg_stud,building_y,est_occupants,prediction_max,cat_predict,accurate
0,2,3,9,2.0,2.0,0.0,school of computer science,,,school of computer science,0.0,13.0731,0.25,0
1,2,3,10,29.0,29.0,0.5,school of computer science,COMP30110,45.0,school of computer science,45.0,35.5588,0.5,1
2,2,3,11,27.0,27.0,0.5,school of computer science,COMP30260,18.0,school of computer science,45.0,33.8932,0.25,0
3,2,3,12,16.0,16.0,0.5,school of computer science,COMP40370,27.0,school of computer science,45.0,24.7324,0.25,0
4,2,3,13,13.0,13.0,0.0,school of computer science,,,school of computer science,0.0,22.234,0.25,0


# Test with 3 categories and intercept

In [202]:
df['prediction_max'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'prediction_max', lm.predict(df)[i])

set_occupancy_category(df, 'room', 'prediction_max', 'cat_predict')

df.head()

Unnamed: 0,room,event_day,event_hour,ass,auth,occupancy,building_x,module,nb_reg_stud,building_y,est_occupants,prediction_max,cat_predict,accurate,est_occupants_newcat,prediction_max_newcat,accurate2
0,2,3,9,2.0,2.0,0.0,school of computer science,,,school of computer science,0.0,2.76267,0.0,0,empty,half,0
1,2,3,10,29.0,29.0,0.5,school of computer science,COMP30110,45.0,school of computer science,45.0,25.2484,0.25,1,half,half,1
2,2,3,11,27.0,27.0,0.5,school of computer science,COMP30260,18.0,school of computer science,45.0,23.5828,0.25,0,half,half,1
3,2,3,12,16.0,16.0,0.5,school of computer science,COMP40370,27.0,school of computer science,45.0,14.4219,0.25,0,half,half,1
4,2,3,13,13.0,13.0,0.0,school of computer science,,,school of computer science,0.0,11.9235,0.25,0,empty,half,0


In [203]:
df['est_occupants_newcat'] = None
df['prediction_max_newcat'] = None

for i in range(df.shape[0]):
    if df['occupancy'][i] == 0.0:
        df.set_value(i, 'est_occupants_newcat', 'empty')
    elif df['occupancy'][i] == 0.25 or df['occupancy'][i] == 0.5:
        df.set_value(i, 'est_occupants_newcat', 'half')
    elif df['occupancy'][i] == 0.75 or df['occupancy'][i] == 1.0:
        df.set_value(i, 'est_occupants_newcat', 'full')
    else:
        raise ValueError('wrong occupancy:', df['occupancy'][i])
    if df['cat_predict'][i] == 0.0:
        df.set_value(i, 'prediction_max_newcat', 'empty')
    elif df['cat_predict'][i] == 0.25 or df['cat_predict'][i] == 0.5:
        df.set_value(i, 'prediction_max_newcat', 'half')
    elif df['cat_predict'][i] == 0.75 or df['cat_predict'][i] == 1.0:
        df.set_value(i, 'prediction_max_newcat', 'full')
    else:
        raise ValueError('wrong occupancy:', df['cat_predict'][i])

In [204]:
df['accurate2'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'accurate2', 1 if df['est_occupants_newcat'][i] == df['prediction_max_newcat'][i] else 0)

accuracy2 = df['accurate2'].sum()/df.shape[0]
accuracy2

0.7453703703703703

# Test with 3 categories and no intercept

In [207]:
df['prediction_max'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'prediction_max', df['auth'][i] * lm.params['auth'] + df['room'][i] * lm.params['room'])

set_occupancy_category(df, 'room', 'prediction_max', 'cat_predict')

In [208]:
df['est_occupants_newcat'] = None
df['prediction_max_newcat'] = None

for i in range(df.shape[0]):
    if df['occupancy'][i] == 0.0:
        df.set_value(i, 'est_occupants_newcat', 'empty')
    elif df['occupancy'][i] == 0.25 or df['occupancy'][i] == 0.5:
        df.set_value(i, 'est_occupants_newcat', 'half')
    elif df['occupancy'][i] == 0.75 or df['occupancy'][i] == 1.0:
        df.set_value(i, 'est_occupants_newcat', 'full')
    else:
        raise ValueError('wrong occupancy:', df['occupancy'][i])
    if df['cat_predict'][i] == 0.0:
        df.set_value(i, 'prediction_max_newcat', 'empty')
    elif df['cat_predict'][i] == 0.25 or df['cat_predict'][i] == 0.5:
        df.set_value(i, 'prediction_max_newcat', 'half')
    elif df['cat_predict'][i] == 0.75 or df['cat_predict'][i] == 1.0:
        df.set_value(i, 'prediction_max_newcat', 'full')
    else:
        raise ValueError('wrong occupancy:', df['cat_predict'][i])

In [209]:
df['accurate2'] = None

for i in range(df.shape[0]):
    df.set_value(i, 'accurate2', 1 if df['est_occupants_newcat'][i] == df['prediction_max_newcat'][i] else 0)

accuracy2 = df['accurate2'].sum()/df.shape[0]
accuracy2

0.6111111111111112