In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
def significance(pvalue):
    """Return the significance marker for a p-value.

    '***' when pvalue < 0.01, '**' when pvalue < 0.05, '*' when pvalue < 0.1,
    and the empty string otherwise.
    """
    # Cut-offs are checked from strictest to loosest; the first match wins.
    for cutoff, stars in ((0.01, '***'), (0.05, '**'), (0.1, '*')):
        if pvalue < cutoff:
            return stars
    return ''

In [3]:
def display_res(Y2009, Y2010, inDepV, fixEF=None):
    """Fit one OLS specification on both yearly frames and print the results side by side.

    The dependent variable is the column named by the module-level ``dep_v``,
    which must be assigned before this function is called.

    Parameters
    ----------
    Y2009, Y2010 : pandas.DataFrame
        Frames containing the ``dep_v`` column and every regressor column.
    inDepV : list of str
        Regressor columns; their estimates (plus the constant) are printed.
    fixEF : pair of sequences of str, optional
        Extra regressor columns for (Y2009, Y2010). They are included in the
        fit but not printed. Defaults to no extra columns.

    Output (printed, nothing is returned)
    -------------------------------------
    For each reported regressor: ``name:coef2009,coef2010`` with significance
    stars, followed by the two standard errors in parentheses; then N,
    R-squared, adjusted R-squared and the F-statistic for both years.
    """
    if fixEF is None:
        fixEF = ([], [])
    reported = list(inDepV)
    results = []
    for i, df in enumerate([Y2009, Y2010]):
        y = df[dep_v]
        X = sm.add_constant(df[reported + list(fixEF[i])])
        # Rows with missing values are dropped by statsmodels before fitting.
        results.append(sm.OLS(y, X, missing='drop').fit())
    res2009, res2010 = results
    for idv in reported + ['const']:
        coef2009 = '%.4f' % res2009.params[idv] + significance(res2009.pvalues[idv])
        coef2010 = '%.4f' % res2010.params[idv] + significance(res2010.pvalues[idv])
        print('%s:' % idv + ','.join([coef2009, coef2010]))
        # Read the standard errors directly; params / tvalues is NaN when a
        # coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('')

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

# Night Safari

In [4]:
# Read the 2009 and 2010 "Filtered-" CSV files located via the
# information_boards directory and prefix constants.
from information_boards import (statisticsAllDrivers_ns_dpath,
                                statisticsAllDriversMonth_ns2023_prefix)
Y2009, Y2010 = (
    pd.read_csv('%s/Filtered-%s2009.csv' % (statisticsAllDrivers_ns_dpath,
                                            statisticsAllDriversMonth_ns2023_prefix)),
    pd.read_csv('%s/Filtered-%s2010.csv' % (statisticsAllDrivers_ns_dpath,
                                            statisticsAllDriversMonth_ns2023_prefix)),
)

In [6]:
# Number of distinct driverID values in each year's frame.
# Formatted into one string so the output is the same on Python 2 and Python 3.
print('%d %d' % (len(set(Y2009['driverID'])), len(set(Y2010['driverID']))))

6753 7051


In [7]:
# Models
# dep_v is read as a global by display_res, so this cell must run before any
# display_res call.
dep_v = 'QTime/locTrip'
ib_impact = ['locInRatio']
# Control-variable sets: cv0 is the base set, cv1-cv3 add further columns.
cv0 = ['wleTripNumber', 'locTripNumber', 'wleProductivity']
cv1 = cv0 + ['EP/locTrip']
cv2 = cv0 + ['locProductivity']
cv3 = cv0 + ['EP/locTrip', 'locProductivity']
# Regressor lists per model; each is a fresh list so editing one cannot
# change another.
m1_inDepV = list(ib_impact)
m2a_inDepV = ib_impact + cv0
m2b_inDepV = ib_impact + cv1
m2c_inDepV = ib_impact + cv2
m2d_inDepV = ib_impact + cv3
# m3_inDepV is used by the M3 and M4 cells. The list is reconstructed from the
# regressor names printed in the stored M3 output.
# NOTE(review): confirm the loaded CSVs contain 'timePassed' and 'timePassed^2'.
m3_inDepV = m2d_inDepV + ['timePassed', 'timePassed^2']

In [9]:
%time
print ''
# M1
display_res(Y2009, Y2010, m1_inDepV)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.01 µs

locInRatio:3.0727***,1.2414***
(0.2138),(0.1866)
const:17.8716***,14.5067***
(0.1173),(0.1009)

N:15038,15544
R-squared:0.0136,0.0028
Adj R-squared:0.0135,0.0028
F-statistics:206.6339***,44.2624***


In [10]:
%time
print ''
# M2a
display_res(Y2009, Y2010, m2a_inDepV)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.87 µs

locInRatio:4.6564***,2.4652***
(0.2119),(0.1863)
wleTripNumber:-0.3477***,-0.2295***
(0.0366),(0.0313)
locTripNumber:2.4692***,1.7242***
(0.3074),(0.2778)
wleProductivity:-0.7204***,-0.6013***
(0.0219),(0.0184)
const:34.4833***,29.9574***
(0.5810),(0.5338)

N:15038,15544
R-squared:0.1239,0.0967
Adj R-squared:0.1237,0.0965
F-statistics:531.7154***,415.9042***


In [11]:
%time
print ''
# M2b
display_res(Y2009, Y2010, m2b_inDepV)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs

locInRatio:-0.7348***,-0.8618***
(0.1094),(0.0945)
wleTripNumber:-0.2809***,-0.1989***
(0.0184),(0.0157)
locTripNumber:1.9760***,1.3660***
(0.1543),(0.1390)
wleProductivity:-0.0232**,0.0018
(0.0115),(0.0096)
EP/locTrip:-2.0802***,-1.8979***
(0.0098),(0.0088)
const:16.3094***,12.6642***
(0.3041),(0.2788)

N:15038,15544
R-squared:0.7792,0.7739
Adj R-squared:0.7791,0.7739
F-statistics:10608.4430***,10638.6195***


In [12]:
%time
print ''
# M2c
display_res(Y2009, Y2010, m2c_inDepV)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs

locInRatio:1.1257***,-0.2402**
(0.1168),(0.1016)
wleTripNumber:-0.3224***,-0.2348***
(0.0199),(0.0169)
locTripNumber:1.2141***,0.7708***
(0.1673),(0.1502)
wleProductivity:0.0707***,0.0733***
(0.0126),(0.0105)
locProductivity:-1.3150***,-1.1117***
(0.0070),(0.0057)
const:48.7972***,43.2823***
(0.3250),(0.2965)

N:15038,15544
R-squared:0.7407,0.7362
Adj R-squared:0.7406,0.7361
F-statistics:8589.2696***,8674.0555***


In [13]:
%time
print ''
# M2d
display_res(Y2009, Y2010, m2d_inDepV)

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 5.01 µs

locInRatio:-0.2865***,-0.7854***
(0.1053),(0.0903)
wleTripNumber:-0.2937***,-0.2115***
(0.0176),(0.0150)
locTripNumber:1.6532***,1.1131***
(0.1479),(0.1330)
wleProductivity:0.0494***,0.0606***
(0.0112),(0.0093)
EP/locTrip:-1.3754***,-1.2457***
(0.0211),(0.0190)
locProductivity:-0.5133***,-0.4384***
(0.0137),(0.0114)
const:28.0544***,23.8617***
(0.4282),(0.3956)

N:15038,15544
R-squared:0.7980,0.7934
Adj R-squared:0.7979,0.7933
F-statistics:9894.1475***,9946.1739***


In [17]:
# M3
# NOTE(review): m3_inDepV must already be defined when this cell runs. The
# regressors listed in the stored output suggest it is the M2d list plus
# 'timePassed' and 'timePassed^2' -- confirm on a fresh kernel.
# NOTE(review): the stored output reports N:69958,97232, unlike the
# 15038/15544 of the neighbouring cells, so it may come from a different
# data load -- verify before relying on it.
display_res(Y2009, Y2010, m3_inDepV)

locInRatio:-4.3341***,-4.3223***
(0.0785),(0.0604)
wleTripNumber:-0.0003,-0.0025***
(0.0004),(0.0003)
locTripNumber:0.0375***,0.0740***
(0.0068),(0.0044)
wleProductivity:0.0506***,0.0473***
(0.0058),(0.0038)
EP/locTrip:-1.4359***,-1.1051***
(0.0097),(0.0078)
locProductivity:-0.8191***,-0.8007***
(0.0082),(0.0060)
timePassed:0.3462***,-1.0850***
(0.0220),(0.0435)
timePassed^2:-0.0510***,0.0273***
(0.0021),(0.0012)
const:51.7958***,59.0955***
(0.2370),(0.4050)

N:69958,97232
R-squared:0.8020,0.7889
Adj R-squared:0.8020,0.7889
F-statistics:35416.2855***,45410.8602***


# Zoo

In [18]:
# Rebind Y2009/Y2010 to the CSV files selected by the ns1519 prefix; every
# later display_res call uses these frames until they are rebound again.
from information_boards import (statisticsAllDrivers_ns_dpath,
                                statisticsAllDriversMonth_ns1519_prefix)
Y2009, Y2010 = (
    pd.read_csv('%s/%s2009.csv' % (statisticsAllDrivers_ns_dpath,
                                   statisticsAllDriversMonth_ns1519_prefix)),
    pd.read_csv('%s/%s2010.csv' % (statisticsAllDrivers_ns_dpath,
                                   statisticsAllDriversMonth_ns1519_prefix)),
)

In [19]:
# Number of distinct driverID values in each year's frame.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('The number of drivers: (Y2009, %d), (Y2010, %d)' % (len(set(Y2009['driverID'])), len(set(Y2010['driverID']))))

The number of drivers: (Y2009, 9836), (Y2010, 11018)


In [20]:
# M1: dep_v regressed on m1_inDepV, using the frames currently bound to
# Y2009 / Y2010.
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-474.7671***,-620.3563***
(28.3808),(28.9798)
const:484.8799***,623.9428***
(21.1517),(22.6268)

N:17795,19251
R-squared:0.0155,0.0233
Adj R-squared:0.0154,0.0232
F-statistics:279.8417***,458.2374***


In [16]:
# M2a: dep_v regressed on m2a_inDepV, using the frames currently bound to
# Y2009 / Y2010.
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:-517.2908***,-652.4528***
(29.6313),(30.1250)
wleTripNumber:8.4251***,2.5459
(3.2056),(3.1103)
locTripNumber:-96.8063***,-63.9445***
(22.0812),(21.5070)
wleProductivity:1.9392**,0.1815
(0.8335),(0.2720)
const:508.1354***,698.0090***
(37.4400),(33.1498)

N:17795,19251
R-squared:0.0170,0.0240
Adj R-squared:0.0168,0.0238
F-statistics:77.1047***,118.4940***


In [17]:
# M2b: dep_v regressed on m2b_inDepV, using the frames currently bound to
# Y2009 / Y2010.
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-22.0277***,-19.0202***
(4.4853),(3.1130)
wleTripNumber:-0.2294,0.4446
(0.4815),(0.3177)
locTripNumber:1.1342,-1.3515
(3.3178),(2.1976)
wleProductivity:-0.0634,0.0116
(0.1252),(0.0278)
EP/locTrip:-2.2357***,-2.1444***
(0.0025),(0.0016)
const:39.5927***,29.3285***
(5.6476),(3.4224)

N:17795,19251
R-squared:0.9778,0.9898
Adj R-squared:0.9778,0.9898
F-statistics:156954.1517***,374076.2246***


# Night Safari

In [18]:
# Rebind Y2009/Y2010 to the CSV files selected by the ns2000 prefix; every
# later cell uses these frames.
from information_boards import (statisticsAllDrivers_ns_dpath,
                                statisticsAllDriversMonth_ns2000_prefix)
Y2009, Y2010 = (
    pd.read_csv('%s/%s2009.csv' % (statisticsAllDrivers_ns_dpath,
                                   statisticsAllDriversMonth_ns2000_prefix)),
    pd.read_csv('%s/%s2010.csv' % (statisticsAllDrivers_ns_dpath,
                                   statisticsAllDriversMonth_ns2000_prefix)),
)

In [19]:
# Number of distinct driverID values in each year's frame.
# Single-argument print(...) gives the same output on Python 2 and Python 3.
print('The number of drivers: (Y2009, %d), (Y2010, %d)' % (len(set(Y2009['driverID'])), len(set(Y2010['driverID']))))

The number of drivers: (Y2009, 13572), (Y2010, 14852)


In [15]:
def display_res(Y2009, Y2010, inDepV, fixEF=None):
    """Fit one OLS specification on both yearly frames and print the results side by side.

    The dependent variable is the column named by the module-level ``dep_v``,
    which must be assigned before this function is called.

    Parameters
    ----------
    Y2009, Y2010 : pandas.DataFrame
        Frames containing the ``dep_v`` column and every regressor column.
    inDepV : list of str
        Regressor columns; their estimates (plus the constant) are printed.
    fixEF : pair of sequences of str, optional
        Extra regressor columns for (Y2009, Y2010), e.g. fixed-effect dummies.
        They are included in the fit but not printed. Defaults to no extra
        columns, so three-argument calls keep working after this cell has
        been run.

    Output (printed, nothing is returned)
    -------------------------------------
    For each reported regressor: ``name:coef2009,coef2010`` with significance
    stars, followed by the two standard errors in parentheses; then N,
    R-squared, adjusted R-squared and the F-statistic for both years.
    """
    if fixEF is None:
        fixEF = ([], [])
    reported = list(inDepV)
    results = []
    for i, df in enumerate([Y2009, Y2010]):
        y = df[dep_v]
        X = sm.add_constant(df[reported + list(fixEF[i])])
        # Rows with missing values are dropped by statsmodels before fitting.
        results.append(sm.OLS(y, X, missing='drop').fit())
    res2009, res2010 = results
    for idv in reported + ['const']:
        coef2009 = '%.4f' % res2009.params[idv] + significance(res2009.pvalues[idv])
        coef2010 = '%.4f' % res2010.params[idv] + significance(res2010.pvalues[idv])
        print('%s:' % idv + ','.join([coef2009, coef2010]))
        # Read the standard errors directly; params / tvalues is NaN when a
        # coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('')

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

In [17]:
# M4: add one 0/1 indicator column per driverID and pass them to display_res
# as extra regressors.
def _add_driver_dummies(df):
    """Return (df plus one 0/1 column per distinct driverID, list of those column names).

    The indicator columns are named str(driverID) and are built in a single
    get_dummies/concat step instead of one DataFrame insert per driver.
    """
    dummies = pd.get_dummies(df['driverID']).astype(np.uint8)
    dummies.columns = [str(did) for did in dummies.columns]
    names = list(dummies.columns)
    # Drop indicator columns left over from an earlier run of this cell so
    # re-running it does not create duplicate column labels.
    stale = [name for name in names if name in df.columns]
    return pd.concat([df.drop(stale, axis=1), dummies], axis=1), names


Y2009, Y2009_drivers = _add_driver_dummies(Y2009)
Y2010, Y2010_drivers = _add_driver_dummies(Y2010)
# The last driver's indicator is left out of each list (reference category),
# because display_res also adds a constant.
# NOTE(review): the fit still has one regressor per driver; the stored output
# shows this cell was interrupted, so the regression itself may be too large.
# NOTE(review): m3_inDepV must already be defined when this cell runs.
display_res(Y2009, Y2010, m3_inDepV, (Y2009_drivers[:-1], Y2010_drivers[:-1]))

KeyboardInterrupt: 

In [None]:
# M5
# NOTE(review): `hours` is not used by the code in this cell; it is kept
# unchanged in case later cells rely on it.
hours = ['H%d' % x for x in range(24) if x not in [2, 3, 4, 5]]
# One 'M<month>' label per distinct value of the 'month' column, in ascending
# order.
month2009 = ['M%d' % m for m in sorted(set(Y2009['month']))]
month2010 = ['M%d' % m for m in sorted(set(Y2010['month']))]
# Add a 0/1 indicator column for each month label. The loops run over the
# month labels, not over `hours`, so the column names match what they encode.
for Mmonth in month2009:
    Y2009[Mmonth] = np.where(Y2009['month'] == int(Mmonth[1:]), 1, 0)
for Mmonth in month2010:
    Y2010[Mmonth] = np.where(Y2010['month'] == int(Mmonth[1:]), 1, 0)