#  This notebook looks at the relationship between window proximity and computer usage, and between dual monitors and computer usage, for the semesters from spring 2016 through fall 2017.

In [2]:
# Imports
import cPickle
import os.path
import pandas as pd
from scipy.stats import stats

In [5]:
# Load utilization summary data into a dataframe.
# The context manager closes the file even if unpickling raises.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load LibData.pkl from a trusted source.
with open(os.path.join('..', 'data', 'LibData.pkl'), 'rb') as pkl:
    utilization = cPickle.load(pkl)

### The utilization data is stored as the fraction of each hour that a computer was in use. So if an entry lists 0.68, that means the computer was in use for 68% of that hour.

In [6]:
# Get the slice of the utilization data belonging to each semester.

# Semester boundaries: each semester starts on the first instant of
# January (spring) or July (fall).
Spr2016 = pd.Timestamp('2016-01-01 00:00:00')
Fall2016 = pd.Timestamp('2016-07-01 00:00:00')
Spr2017 = pd.Timestamp('2017-01-01 00:00:00')
Fall2017 = pd.Timestamp('2017-07-01 00:00:00')
Spr2018 = pd.Timestamp('2018-01-01 00:00:00')

# Each semester is the half-open interval [start, next start).
# truncate(before, after) keeps BOTH endpoints, so a row stamped exactly on a
# boundary (e.g. 2016-07-01 00:00:00) would be counted in two adjacent
# semesters; an explicit mask assigns every row to exactly one semester.
# Assumes utilization is indexed by timestamp, as the Timestamp bounds imply.
timestamps = utilization.index

spring2016 = utilization[(timestamps >= Spr2016) & (timestamps < Fall2016)]
fall2016 = utilization[(timestamps >= Fall2016) & (timestamps < Spr2017)]

spring2017 = utilization[(timestamps >= Spr2017) & (timestamps < Fall2017)]
fall2017 = utilization[(timestamps >= Fall2017) & (timestamps < Spr2018)]

### We are using the means (averages) of the percentages of computer use, rather than the median, because nighttime hours (all with 0 use) make the median permanently 0, and we cannot run comparisons on that.

In [7]:
# Average every column of each semester's data over its rows, then gather
# the four resulting series side by side in one dataframe for easier viewing.
s16, f16 = spring2016.mean(axis=0), fall2016.mean(axis=0)
s17, f17 = spring2017.mean(axis=0), fall2017.mean(axis=0)

# One column per semester, keyed by the short semester label.
semesterColumns = {}
semesterColumns['s16'] = s16
semesterColumns['f16'] = f16
semesterColumns['s17'] = s17
semesterColumns['f17'] = f17
SemesterMeans = pd.DataFrame(semesterColumns)

# Preview the first few rows.
SemesterMeans.head()

Unnamed: 0,f16,f17,s16,s17
BL001,0.35984,0.033467,0.044504,0.043709
BL002,0.07294,0.043494,0.071121,0.092427
CITI001,0.117084,0.049552,0.053626,0.051997
CITI002,0.246369,0.042365,0.065337,0.046405
CITI003,0.241809,0.039901,0.054258,0.037921


## Null Hypothesis: Computers with >1 monitor have the same average usage percentages as those with 1 or fewer since Spring 2016. We'll reject the null hypothesis at the 5% significance level (p-value <= 0.05).

* We'll run this test on every semester from spring 2016 onward, since that is when most of the dual monitors seem to have been added.

In [10]:
# Get the attributes, indexed by computer name (computerName also stays
# available as an ordinary column).
attributes = pd.read_csv(os.path.join('..', 'data', 'computerAttributes.csv'))
attributes.index = attributes['computerName']

# Keep only the machines that require a logon.
# A single vectorized comparison over the column replaces a separate
# .loc row lookup per machine, and it also works when a computer name is
# duplicated (the per-row lookup would return several rows and fail there).
# `== True` is kept deliberately so non-boolean values compare as before.
requiresLogonMask = (attributes['requiresLogon'] == True).values
machinesOfInterest = attributes.index[requiresLogonMask].tolist()


In [11]:
# Assemble usage stats: restrict each semester's column of means to the
# machines of interest.
s16Util, s17Util, f16Util, f17Util = [
    SemesterMeans[semester][machinesOfInterest]
    for semester in ('s16', 's17', 'f16', 'f17')
]



In [12]:
# Split into dual monitors and single monitors.
#
# Group membership depends only on the attributes table, not on the semester,
# so the two label lists are built once and reused for all four semesters
# instead of being re-derived eight times with a per-machine .loc lookup.
# NOTE(review): machines whose numMonitors is 0 (or missing) land in neither
# group, whereas the stated null hypothesis lumps machines with one or fewer
# monitors together -- confirm which grouping is intended.
monitorCounts = attributes.loc[machinesOfInterest, 'numMonitors']
dualMonitorMachines = monitorCounts.index[(monitorCounts > 1).values].tolist()
singleMonitorMachines = monitorCounts.index[(monitorCounts == 1).values].tolist()

spr16DualUsage = s16Util[dualMonitorMachines]
spr16SingUsage = s16Util[singleMonitorMachines]

spr17DualUsage = s17Util[dualMonitorMachines]
spr17SingUsage = s17Util[singleMonitorMachines]

f16DualUsage = f16Util[dualMonitorMachines]
f16SingUsage = f16Util[singleMonitorMachines]

f17DualUsage = f17Util[dualMonitorMachines]
f17SingUsage = f17Util[singleMonitorMachines]

In [13]:
# Two-sample t-test of dual-monitor vs. single-monitor usage, per semester.
# The dual-monitor group is passed first, so a positive statistic means the
# dual-monitor machines have the higher mean usage.
s16Results = stats.ttest_ind(spr16DualUsage, spr16SingUsage)
s17Results = stats.ttest_ind(spr17DualUsage, spr17SingUsage)
f16Results = stats.ttest_ind(f16DualUsage, f16SingUsage)
f17Results = stats.ttest_ind(f17DualUsage, f17SingUsage)

# Print each label followed by its result. print(...) with a single argument
# behaves the same under Python 2 and Python 3, unlike the bare `print x`
# statement, which is a syntax error on Python 3.
for label, result in (("Spring16: ", s16Results),
                      ("\n Spring17:", s17Results),
                      ("\nFall16: ", f16Results),
                      ("\nFall17: ", f17Results)):
    print(label)
    print(result)


Spring16: 
Ttest_indResult(statistic=2.0766947104500439, pvalue=0.038738410483323521)

 Spring17:
Ttest_indResult(statistic=5.0033608030267382, pvalue=9.9473296541892113e-07)

Fall16: 
Ttest_indResult(statistic=2.1022847929803099, pvalue=0.036418087078301423)

Fall17: 
Ttest_indResult(statistic=2.5600373693375338, pvalue=0.010989362194974371)


# We can reject the null hypothesis for all semesters! Each of these p-values is less than 0.05, so the computers with dual monitors are NOT used the same amount as the others. In fact, the positive t-statistics show that they are used more.

# Now let's run the same test for the attribute 'adjacentWindow'. That is, we assume as our null hypothesis that an adjacent window has no effect on the percentage of computer use. If we get p-values lower than 0.05, we can reject the null hypothesis.

In [14]:
# Split into window/no window.
#
# Whether a machine sits next to a window depends only on the attributes
# table, not on the semester, so the two label lists are built once and
# reused for all four semesters instead of being re-derived eight times with
# a per-machine .loc lookup.
# Machines whose adjacentWindow value equals neither True nor False (e.g. a
# missing value) land in neither group, exactly as with explicit == checks
# per machine.
windowFlags = attributes.loc[machinesOfInterest, 'adjacentWindow']
windowMachines = windowFlags.index[(windowFlags == True).values].tolist()
noWindowMachines = windowFlags.index[(windowFlags == False).values].tolist()

s16Window = s16Util[windowMachines]
s16NoWindow = s16Util[noWindowMachines]

s17Window = s17Util[windowMachines]
s17NoWindow = s17Util[noWindowMachines]

f16Window = f16Util[windowMachines]
f16NoWindow = f16Util[noWindowMachines]

f17Window = f17Util[windowMachines]
f17NoWindow = f17Util[noWindowMachines]

In [15]:
# Two-sample t-test of window-adjacent vs. non-adjacent usage, per semester.
# The window group is passed first, so a negative statistic means the
# window-adjacent machines have the lower mean usage.
# NOTE: these names overwrite the dual-monitor results computed earlier in
# the notebook; re-run that cell if those values are needed again.
s16Results = stats.ttest_ind(s16Window, s16NoWindow)
s17Results = stats.ttest_ind(s17Window, s17NoWindow)
f16Results = stats.ttest_ind(f16Window, f16NoWindow)
f17Results = stats.ttest_ind(f17Window, f17NoWindow)

# Print each label followed by its result. print(...) with a single argument
# behaves the same under Python 2 and Python 3, unlike the bare `print x`
# statement, which is a syntax error on Python 3.
for label, result in (("Spring16: ", s16Results),
                      ("\n Spring17:", s17Results),
                      ("\nFall16: ", f16Results),
                      ("\nFall17: ", f17Results)):
    print(label)
    print(result)


Spring16: 
Ttest_indResult(statistic=-2.643069825031886, pvalue=0.0086752333508418444)

 Spring17:
Ttest_indResult(statistic=-1.8609377355596997, pvalue=0.063793571814633673)

Fall16: 
Ttest_indResult(statistic=-1.7815654216925538, pvalue=0.075896242969613795)

Fall17: 
Ttest_indResult(statistic=-1.8238717642417666, pvalue=0.069229772721319402)


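# For three of the four semesters (all but spring 2016) the p-values are above 0.05, so we fail to reject the null hypothesis for those semesters. Spring 2016 is the exception: its p-value (about 0.009) is below 0.05, and the negative t-statistic indicates that window-adjacent computers were used less that semester. Overall, window proximity does not appear to have a consistent, significant effect on computer use.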