In [2]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [3]:
def significance(pvalue):
    if pvalue < 0.01:
        num_stars = 3
    elif pvalue < 0.05:
        num_stars = 2
    elif pvalue < 0.1:
        num_stars = 1
    else:
        num_stars = 0    
    return '*' * num_stars

In [4]:
def display_res(Y2009, Y2010, inDepV):
    results = []
    for df in [Y2009, Y2010]:
        y = df[dep_v]
        X = df[inDepV]
        X = sm.add_constant(X)
        results.append(sm.OLS(y, X, missing='drop').fit())
    res2009, res2010 = results
    for idv in inDepV + ['const']:
        coef2009 = '%.4f' % res2009.params[idv] + significance(res2009.pvalues[idv])
        coef2010 = '%.4f' % res2010.params[idv] + significance(res2010.pvalues[idv])
        print '%s:' % idv + ','.join([coef2009, coef2010])
        ste2009 = res2009.params[idv] / res2009.tvalues[idv]
        ste2010 = res2010.params[idv] / res2010.tvalues[idv]
        print '(%.4f),(%.4f)' % (ste2009, ste2010)
    print

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print 'N:%d,%d' % (res2009.nobs, res2010.nobs)
    print 'R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared)
    print 'Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj)
    print 'F-statistics:%s,%s' % (fvalue2009, fvalue2010)

# Airport

In [5]:
from information_boards import statisticsAllDrivers_ap_dpath
from information_boards import statisticsAllDriversMonth_ap_prefix
Y2009 = pd.read_csv('%s/Filtered-%s2009.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversMonth_ap_prefix))
Y2010 = pd.read_csv('%s/Filtered-%s2010.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversMonth_ap_prefix))

In [6]:
# Models
dep_v = 'QTime/locTrip'
ib_impact = ['locInRatio']
cv0 = ['wleTripNumber', 'locTripNumber', 'wleProductivity']
cv1 = cv0 + ['EP/locTrip']
cv2 = cv0 + ['locProductivity']
cv3 = cv0 + ['EP/locTrip', 'locProductivity']
m1_inDepV = ib_impact
m2a_inDepV = ib_impact + cv0
m2b_inDepV = ib_impact + cv1
m2c_inDepV = ib_impact + cv2
m2d_inDepV = ib_impact + cv3

In [7]:
# M1
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-0.6104***,-2.1813***
(0.1361),(0.1006)
const:34.3033***,29.9898***
(0.0962),(0.0760)

N:66868,93349
R-squared:0.0003,0.0050
Adj R-squared:0.0003,0.0050
F-statistics:20.1282***,470.6080***


In [8]:
# M2a
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:2.3964***,0.6774***
(0.1573),(0.1182)
wleTripNumber:0.0082***,0.0004
(0.0009),(0.0006)
locTripNumber:0.1900***,0.2395***
(0.0153),(0.0098)
wleProductivity:-1.1420***,-0.9325***
(0.0126),(0.0084)
const:57.8461***,50.8026***
(0.3457),(0.2557)

N:66868,93349
R-squared:0.1234,0.1410
Adj R-squared:0.1233,0.1410
F-statistics:2352.9538***,3831.8654***


In [9]:
# M2b
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-4.5698***,-4.2848***
(0.0866),(0.0679)
wleTripNumber:0.0066***,0.0041***
(0.0005),(0.0003)
locTripNumber:0.0012,0.0322***
(0.0083),(0.0056)
wleProductivity:-0.1512***,-0.1533***
(0.0073),(0.0051)
EP/locTrip:-2.2332***,-1.9875***
(0.0055),(0.0045)
const:37.0529***,33.0904***
(0.1935),(0.1503)

N:66868,93349
R-squared:0.7448,0.7242
Adj R-squared:0.7448,0.7242
F-statistics:39030.1826***,49014.2002***


In [10]:
# M2c
display_res(Y2009, Y2010, m2c_inDepV)

locInRatio:-2.5806***,-3.5058***
(0.0863),(0.0654)
wleTripNumber:-0.0080***,-0.0092***
(0.0005),(0.0003)
locTripNumber:0.0791***,0.1247***
(0.0083),(0.0054)
wleProductivity:0.0837***,0.1204***
(0.0075),(0.0051)
locProductivity:-1.8789***,-1.5560***
(0.0047),(0.0033)
const:75.6563***,66.3665***
(0.1929),(0.1440)

N:66868,93349
R-squared:0.7417,0.7421
Adj R-squared:0.7417,0.7421
F-statistics:38392.6170***,53724.9202***


In [11]:
# M2d
display_res(Y2009, Y2010, m2d_inDepV)

locInRatio:-3.9993***,-4.1708***
(0.0796),(0.0613)
wleTripNumber:-0.0012**,-0.0035***
(0.0005),(0.0003)
locTripNumber:0.0292***,0.0722***
(0.0076),(0.0050)
wleProductivity:0.0393***,0.0618***
(0.0069),(0.0048)
EP/locTrip:-1.2132***,-0.9559***
(0.0104),(0.0082)
locProductivity:-0.9858***,-0.9155***
(0.0088),(0.0063)
const:55.8949***,51.4412***
(0.2441),(0.1852)

N:66868,93349
R-squared:0.7854,0.7752
Adj R-squared:0.7854,0.7752
F-statistics:40777.7101***,53647.1135***


In [17]:
# M3
display_res(Y2009, Y2010, m3_inDepV)

locInRatio:-4.3341***,-4.3223***
(0.0785),(0.0604)
wleTripNumber:-0.0003,-0.0025***
(0.0004),(0.0003)
locTripNumber:0.0375***,0.0740***
(0.0068),(0.0044)
wleProductivity:0.0506***,0.0473***
(0.0058),(0.0038)
EP/locTrip:-1.4359***,-1.1051***
(0.0097),(0.0078)
locProductivity:-0.8191***,-0.8007***
(0.0082),(0.0060)
timePassed:0.3462***,-1.0850***
(0.0220),(0.0435)
timePassed^2:-0.0510***,0.0273***
(0.0021),(0.0012)
const:51.7958***,59.0955***
(0.2370),(0.4050)

N:69958,97232
R-squared:0.8020,0.7889
Adj R-squared:0.8020,0.7889
F-statistics:35416.2855***,45410.8602***


# Zoo

In [18]:
from information_boards import statisticsAllDrivers_ns_dpath
from information_boards import statisticsAllDriversMonth_ns1519_prefix
Y2009 = pd.read_csv('%s/%s2009.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns1519_prefix))
Y2010 = pd.read_csv('%s/%s2010.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns1519_prefix))

In [19]:
print 'The number of drivers: (Y2009, %d), (Y2010, %d)' % (len(set(Y2009['driverID'])), len(set(Y2010['driverID'])))

The number of drivers: (Y2009, 9836), (Y2010, 11018)


In [20]:
# M1
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-474.7671***,-620.3563***
(28.3808),(28.9798)
const:484.8799***,623.9428***
(21.1517),(22.6268)

N:17795,19251
R-squared:0.0155,0.0233
Adj R-squared:0.0154,0.0232
F-statistics:279.8417***,458.2374***


In [16]:
# M2a
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:-517.2908***,-652.4528***
(29.6313),(30.1250)
wleTripNumber:8.4251***,2.5459
(3.2056),(3.1103)
locTripNumber:-96.8063***,-63.9445***
(22.0812),(21.5070)
wleProductivity:1.9392**,0.1815
(0.8335),(0.2720)
const:508.1354***,698.0090***
(37.4400),(33.1498)

N:17795,19251
R-squared:0.0170,0.0240
Adj R-squared:0.0168,0.0238
F-statistics:77.1047***,118.4940***


In [17]:
# M2b
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-22.0277***,-19.0202***
(4.4853),(3.1130)
wleTripNumber:-0.2294,0.4446
(0.4815),(0.3177)
locTripNumber:1.1342,-1.3515
(3.3178),(2.1976)
wleProductivity:-0.0634,0.0116
(0.1252),(0.0278)
EP/locTrip:-2.2357***,-2.1444***
(0.0025),(0.0016)
const:39.5927***,29.3285***
(5.6476),(3.4224)

N:17795,19251
R-squared:0.9778,0.9898
Adj R-squared:0.9778,0.9898
F-statistics:156954.1517***,374076.2246***


# Night Safari

In [18]:
from information_boards import statisticsAllDrivers_ns_dpath
from information_boards import statisticsAllDriversMonth_ns2000_prefix
Y2009 = pd.read_csv('%s/%s2009.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns2000_prefix))
Y2010 = pd.read_csv('%s/%s2010.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns2000_prefix))

In [19]:
print 'The number of drivers: (Y2009, %d), (Y2010, %d)' % (len(set(Y2009['driverID'])), len(set(Y2010['driverID'])))

The number of drivers: (Y2009, 13572), (Y2010, 14852)


In [15]:
def display_res(Y2009, Y2010, inDepV, fixEF):
    results = []
    for i, df in enumerate([Y2009, Y2010]):
        y = df[dep_v]
        X = df[inDepV + fixEF[i]]
        X = sm.add_constant(X)
        results.append(sm.OLS(y, X, missing='drop').fit())
    res2009, res2010 = results
    for idv in inDepV + ['const']:
        coef2009 = '%.4f' % res2009.params[idv] + significance(res2009.pvalues[idv])
        coef2010 = '%.4f' % res2010.params[idv] + significance(res2010.pvalues[idv])
        print '%s:' % idv + ','.join([coef2009, coef2010])
        ste2009 = res2009.params[idv] / res2009.tvalues[idv]
        ste2010 = res2010.params[idv] / res2010.tvalues[idv]
        print '(%.4f),(%.4f)' % (ste2009, ste2010)
    print

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print 'N:%d,%d' % (res2009.nobs, res2010.nobs)
    print 'R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared)
    print 'Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj)
    print 'F-statistics:%s,%s' % (fvalue2009, fvalue2010)

In [17]:
# M4
Y2009_drivers = [str(did) for did in set(Y2009['driverID'])]
for did in Y2009_drivers:
    Y2009[did] = np.where(Y2009['driverID'] == int(did), 1, 0)
Y2010_drivers = [str(did) for did in set(Y2010['driverID'])]
for did in Y2010_drivers:
    Y2010[did] = np.where(Y2010['driverID'] == int(did), 1, 0)
display_res(Y2009, Y2010, m3_inDepV, (Y2009_drivers[:-1], Y2010_drivers[:-1]))

KeyboardInterrupt: 

In [None]:
# M5
hours = ['H%d' % x for x in range(24) if x not in [2, 3, 4, 5]]
month2009 = ['M%d' % m for m in Y2009['month']]
month2010 = ['M%d' % m for m in Y2010['month']]
for Mmonth in hours:
    Y2009[Mmonth] = np.where(Y2009['month'] == int(Mmonth[1:]), 1, 0)
    Y2010[Mmonth] = np.where(Y2010['month'] == int(Mmonth[1:]), 1, 0)