In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
#
# some functions
#
def text_display(text, font_size):
    """Show ``text`` in the notebook wrapped in an HTML ``<font size=N>`` tag.

    ``font_size`` is formatted with ``%d``; ``text`` is inserted verbatim
    (no escaping), so it may itself contain HTML markup.
    """
    opening_tag = '<font size=%d>' % font_size
    markup = ''.join([opening_tag, text, '</font>'])
    display(HTML(markup))

def table_display(table_data):
    """Show an iterable of rows as a plain HTML table in the notebook.

    Each row is an iterable of cell values; every value is converted with
    ``str`` and placed in its own ``<td>``. Values are not HTML-escaped.
    """
    row_markup = []
    for row in table_data:
        cells = '</td><td>'.join(map(str, row))
        row_markup.append('<td>' + cells + '</td>')
    body = '</tr><tr>'.join(row_markup)
    display(HTML('<table><tr>' + body + '</tr></table>'))

In [2]:
from information_boards import ssDriversStatisticsTripBased2009_ap_fpath
from information_boards import ssDriversStatisticsTripBased2010_ap_fpath

# Read the 2009 and 2010 CSV files named by the two information_boards path
# constants into one DataFrame per year.
Y2009_df, Y2010_df = map(
    pd.read_csv,
    (ssDriversStatisticsTripBased2009_ap_fpath,
     ssDriversStatisticsTripBased2010_ap_fpath),
)

In [3]:
# Coerce the identifier / indicator / calendar columns of both yearly frames
# to integers by applying Python's int() element-wise.
int_columns = ['driverID', 'apIn', 'year', 'month', 'day', 'hour', 'month^2']
for yearly_df in (Y2009_df, Y2010_df):
    for int_col in int_columns:
        yearly_df[int_col] = yearly_df[int_col].apply(int)

In [4]:
# Model specifications: the dependent variable plus the regressor lists
# used by the M1-M3 regressions further down.
dep_v, ib_impact = 'apQTime', ['apIn']
control_variables = ['apEconomicProfit', 'apProductivity']
learning_variables = ['month', 'month^2']
# M1: impact indicator + weekend flag
m1_inDepV = ib_impact + ['weekend']
# M2: impact indicator + controls
m2_inDepV = ib_impact + control_variables
# M3: M2's regressors + the month / month^2 terms (a new list, not an alias of M2)
m3_inDepV = m2_inDepV + learning_variables

In [5]:
# M1, 2009 data: OLS of dep_v on the m1_inDepV regressors.
# NOTE(review): pd.ols was deprecated in pandas 0.18 and removed in 0.20, so
# this cell needs an older pandas -- confirm the environment's pinned version.
dataset = Y2009_df
regressors = dict((name, dataset[name]) for name in m1_inDepV)
model = pd.ols(y=dataset[dep_v], x=regressors)
model

  exec(code_obj, self.user_global_ns, self.user_ns)



-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <apIn> + <weekend> + <intercept>

Number of Observations:         98997
Number of Degrees of Freedom:   3

R-squared:         0.0101
Adj R-squared:     0.0101

Rmse:             19.6984

F-stat (2, 98994):   504.2253, p-value:     0.0000

Degrees of Freedom: model 2, resid 98994

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
          apIn     1.4533     0.1312      11.07     0.0000     1.1960     1.7105
       weekend    -4.1372     0.1376     -30.06     0.0000    -4.4069    -3.8675
     intercept    41.3035     0.0869     475.03     0.0000    41.1331    41.4740
---------------------------------End of Summary---------------------------------

In [6]:
# M1, 2010 data: OLS of dep_v on the m1_inDepV regressors.
dataset = Y2010_df
regressors = dict((name, dataset[name]) for name in m1_inDepV)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <apIn> + <weekend> + <intercept>

Number of Observations:         102415
Number of Degrees of Freedom:   3

R-squared:         0.0033
Adj R-squared:     0.0033

Rmse:             16.4904

F-stat (2, 102412):   170.5382, p-value:     0.0000

Degrees of Freedom: model 2, resid 102412

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
          apIn     1.3906     0.1046      13.29     0.0000     1.1855     1.5957
       weekend    -1.4971     0.1148     -13.04     0.0000    -1.7223    -1.2720
     intercept    33.9311     0.0743     456.96     0.0000    33.7856    34.0767
---------------------------------End of Summary---------------------------------

In [7]:
# Hour dummies: one 0/1 column 'H<hour>' per hour 0-23, skipping hours 2-5,
# added to both yearly frames.
excluded_hours = (2, 3, 4, 5)
hours = []
for hour_value in range(24):
    if hour_value in excluded_hours:
        continue
    hours.append('H%d' % hour_value)
for Hhour in hours:
    hour_value = int(Hhour[1:])
    for yearly_df in (Y2009_df, Y2010_df):
        yearly_df[Hhour] = np.where(yearly_df['hour'] == hour_value, 1, 0)

# Driver dummies: one 0/1 column per distinct driverID, named by the ID as a string.
Y2009_drivers = list(map(str, set(Y2009_df['driverID'])))
Y2010_drivers = list(map(str, set(Y2010_df['driverID'])))
for yearly_df, driver_labels in ((Y2009_df, Y2009_drivers), (Y2010_df, Y2010_drivers)):
    for did in driver_labels:
        yearly_df[did] = np.where(yearly_df['driverID'] == int(did), 1, 0)

# Variant without driver dummies: m2a_inDepV = ib_impact + ['weekend'] + hours[:-1]

# M2a regressors; the last hour and the last driver label are left out of each
# list ([:-1]) so one dummy per group is omitted.
m2a_shared = ib_impact + ['weekend'] + hours[:-1]
m2a_inDepV_Y2009 = m2a_shared + Y2009_drivers[:-1]
m2a_inDepV_Y2010 = m2a_shared + Y2010_drivers[:-1]

# m2a_inDepV_Y2010 = ib_impact + control_variables + learning_variables + Y2010_drivers[:-1] + hours[:-1]

In [9]:
# M2a, 2009 data: OLS of dep_v on the m2a_inDepV_Y2009 regressors.
dataset = Y2009_df
regressors = dict((name, dataset[name]) for name in m2a_inDepV_Y2009)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [10]:
# M2a, 2010 data: OLS of dep_v on the m2a_inDepV_Y2010 regressors.
dataset = Y2010_df
regressors = dict((name, dataset[name]) for name in m2a_inDepV_Y2010)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [19]:
# M2, 2009 data: OLS of dep_v on the m2_inDepV regressors.
dataset = Y2009_df
regressors = dict((name, dataset[name]) for name in m2_inDepV)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <apEconomicProfit> + <apIn> + <apProductivity> + <intercept>

Number of Observations:         98997
Number of Degrees of Freedom:   4

R-squared:         0.7382
Adj R-squared:     0.7382

Rmse:             10.1303

F-stat (3, 98993): 93041.8139, p-value:     0.0000

Degrees of Freedom: model 3, resid 98993

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
apEconomicProfit    -1.7939     0.0076    -234.82     0.0000    -1.8089    -1.7790
          apIn    -1.1833     0.0677     -17.47     0.0000    -1.3161    -1.0506
apProductivity    -0.4150     0.0088     -47.35     0.0000    -0.4322    -0.3978
     intercept    43.2614     0.2066     209.40     0.0000    42.8564    43.6663
--------------------------

In [20]:
# M2, 2010 data: OLS of dep_v on the m2_inDepV regressors.
dataset = Y2010_df
regressors = dict((name, dataset[name]) for name in m2_inDepV)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <apEconomicProfit> + <apIn> + <apProductivity> + <intercept>

Number of Observations:         102415
Number of Degrees of Freedom:   4

R-squared:         0.7032
Adj R-squared:     0.7032

Rmse:              8.9983

F-stat (3, 102411): 80893.5949, p-value:     0.0000

Degrees of Freedom: model 3, resid 102411

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
apEconomicProfit    -1.6102     0.0071    -227.03     0.0000    -1.6241    -1.5963
          apIn    -1.1986     0.0574     -20.90     0.0000    -1.3110    -1.0862
apProductivity    -0.2885     0.0074     -38.75     0.0000    -0.3031    -0.2739
     intercept    36.6359     0.1912     191.60     0.0000    36.2611    37.0107
-----------------------

In [21]:
# M3, 2009 data: OLS of dep_v on the m3_inDepV regressors.
dataset = Y2009_df
regressors = dict((name, dataset[name]) for name in m3_inDepV)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <apEconomicProfit> + <apIn> + <apProductivity> + <month>
             + <month^2> + <intercept>

Number of Observations:         98997
Number of Degrees of Freedom:   6

R-squared:         0.7409
Adj R-squared:     0.7409

Rmse:             10.0778

F-stat (5, 98991): 56614.9012, p-value:     0.0000

Degrees of Freedom: model 5, resid 98991

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
apEconomicProfit    -1.8082     0.0076    -237.36     0.0000    -1.8232    -1.7933
          apIn    -1.1013     0.0674     -16.33     0.0000    -1.2334    -0.9691
apProductivity    -0.3908     0.0088     -44.64     0.0000    -0.4079    -0.3736
         month     0.4595     0.0448      10.26     0.0000     0.3717   

In [22]:
# M3, 2010 data: OLS of dep_v on the m3_inDepV regressors.
dataset = Y2010_df
regressors = dict((name, dataset[name]) for name in m3_inDepV)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <apEconomicProfit> + <apIn> + <apProductivity> + <month>
             + <month^2> + <intercept>

Number of Observations:         102415
Number of Degrees of Freedom:   6

R-squared:         0.7070
Adj R-squared:     0.7069

Rmse:              8.9419

F-stat (5, 102409): 49410.8062, p-value:     0.0000

Degrees of Freedom: model 5, resid 102409

-----------------------Summary of Estimated Coefficients------------------------
      Variable       Coef    Std Err     t-stat    p-value    CI 2.5%   CI 97.5%
--------------------------------------------------------------------------------
apEconomicProfit    -1.6268     0.0071    -230.29     0.0000    -1.6407    -1.6130
          apIn    -1.1520     0.0570     -20.21     0.0000    -1.2638    -1.0403
apProductivity    -0.2640     0.0074     -35.53     0.0000    -0.2785    -0.2494
         month    -0.7998     0.0338     -23.63     0.0000    -0.8662

In [23]:
# Driver dummies: one 0/1 column per distinct driverID in each yearly frame,
# named by the ID as a string.
Y2009_drivers = list(map(str, set(Y2009_df['driverID'])))
Y2010_drivers = list(map(str, set(Y2010_df['driverID'])))
for yearly_df, driver_labels in ((Y2009_df, Y2009_drivers), (Y2010_df, Y2010_drivers)):
    for did in driver_labels:
        yearly_df[did] = np.where(yearly_df['driverID'] == int(did), 1, 0)

# M4 regressors: impact + controls + learning terms + driver dummies, with the
# last driver label left out of each list ([:-1]).
m4_shared = ib_impact + control_variables + learning_variables
m4_inDepV_Y2009 = m4_shared + Y2009_drivers[:-1]
m4_inDepV_Y2010 = m4_shared + Y2010_drivers[:-1]

In [24]:
# M4, 2009 data: OLS of dep_v on the 2009 regressor list.
# Fix: this cell previously used m4_inDepV_Y2010 (copy-paste from the 2010
# cell), so the 2009 frame was regressed on the driver dummies selected for
# 2010. The year-matched list m4_inDepV_Y2009 is used instead, so the omitted
# driver dummy is the one chosen for this year's frame.
dataset = Y2009_df
#
model = pd.ols(y=dataset[dep_v],
               x={v: dataset[v] for v in m4_inDepV_Y2009})
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [25]:
# M4, 2010 data: OLS of dep_v on the m4_inDepV_Y2010 regressors.
dataset = Y2010_df
regressors = dict((name, dataset[name]) for name in m4_inDepV_Y2010)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [26]:
# Mean apQTime per hour value in the 2010 frame.
Y2010_df['apQTime'].groupby(Y2010_df['hour']).mean()

hour
0     38.633642
1     41.506782
6     35.655536
7     42.931750
8     43.954012
9     42.066855
10    35.382890
11    39.754814
12    31.421338
13    29.786306
14    35.044678
15    29.394071
16    24.721657
17    36.920173
18    38.886517
19    41.139870
20    31.871471
21    30.120433
22    27.529222
23    26.160455
Name: apQTime, dtype: float64

In [27]:
# Hour dummies: one 0/1 column 'H<hour>' per hour 0-23, skipping hours 2-5,
# added to both yearly frames.
excluded_hours = (2, 3, 4, 5)
hours = []
for hour_value in range(24):
    if hour_value in excluded_hours:
        continue
    hours.append('H%d' % hour_value)
for Hhour in hours:
    hour_value = int(Hhour[1:])
    for yearly_df in (Y2009_df, Y2010_df):
        yearly_df[Hhour] = np.where(yearly_df['hour'] == hour_value, 1, 0)

# M5 regressors: impact + controls + learning terms + driver dummies + hour
# dummies, with the last driver label and the last hour left out ([:-1]).
m5_shared = ib_impact + control_variables + learning_variables
m5_inDepV_Y2009 = m5_shared + Y2009_drivers[:-1] + hours[:-1]
m5_inDepV_Y2010 = m5_shared + Y2010_drivers[:-1] + hours[:-1]

In [28]:
# M5, 2009 data: OLS of dep_v on the m5_inDepV_Y2009 regressors.
dataset = Y2009_df
regressors = dict((name, dataset[name]) for name in m5_inDepV_Y2009)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [29]:
# M5, 2010 data: OLS of dep_v on the 2010 regressor list.
# Fix: this cell previously used m5_inDepV_Y2009 (copy-paste from the 2009
# cell), so the 2010 frame was regressed on the driver dummies selected for
# 2009. The year-matched list m5_inDepV_Y2010 is used instead, so the omitted
# driver dummy is the one chosen for this year's frame.
dataset = Y2010_df
#
model = pd.ols(y=dataset[dep_v],
               x={v: dataset[v] for v in m5_inDepV_Y2010})
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [32]:
# M6 regressors: impact + controls + learning terms, then driver dummies
# (last label omitted), hour dummies (last hour omitted) and the weekend flag.
m6_head = ib_impact + control_variables + learning_variables
m6_tail = hours[:-1] + ['weekend']
m6_inDepV_Y2009 = m6_head + Y2009_drivers[:-1] + m6_tail
m6_inDepV_Y2010 = m6_head + Y2010_drivers[:-1] + m6_tail

In [33]:
# M6, 2009 data: OLS of dep_v on the m6_inDepV_Y2009 regressors.
dataset = Y2009_df
regressors = dict((name, dataset[name]) for name in m6_inDepV_Y2009)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [34]:
# M6, 2010 data: OLS of dep_v on the m6_inDepV_Y2010 regressors.
dataset = Y2010_df
regressors = dict((name, dataset[name]) for name in m6_inDepV_Y2010)
model = pd.ols(y=dataset[dep_v], x=regressors)
model


-------------------------Summary of Regression Analysis-------------------------

Formula: Y ~ <10123> + <10128> + <10146> + <10171> + <10172> + <1022> + <10534>
             + <10602> + <10604> + <10648> + <10677> + <1079> + <10850> + <10904>
             + <10969> + <10990> + <1108> + <11587> + <11805> + <1198> + <12047>
             + <1213> + <12185> + <12304> + <12357> + <12392> + <12443>
             + <12494> + <12530> + <12590> + <12900> + <12929> + <13041> + <13147>
             + <1318> + <13468> + <13508> + <13539> + <13612> + <13701> + <13733>
             + <1378> + <13799> + <14023> + <1406> + <14088> + <14186> + <14291>
             + <1432> + <14372> + <14452> + <14541> + <14603> + <14613> + <14829>
             + <14889> + <14897> + <14942> + <15015> + <15333> + <15339> + <15377>
             + <15412> + <15467> + <15478> + <15527> + <15561> + <15603> + <15753>
             + <1580> + <15907> + <16105> + <16244> + <1631> + <16344> + <16429>
             + <16443> + <1

In [35]:
# Number of distinct driverID values in the 2009 frame.
len(Y2009_df['driverID'].unique())

348

In [36]:
# Number of distinct driverID values in the 2010 frame.
len(Y2010_df['driverID'].unique())

348