In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
def significance(pvalue):
    """Return the significance stars for a p-value.

    '***' below 0.01, '**' below 0.05, '*' below 0.1, otherwise an empty
    string. All comparisons are strict, so a p-value exactly equal to a
    threshold falls into the next, weaker band.
    """
    # Thresholds are checked from strictest to loosest; the first hit wins.
    for threshold, stars in ((0.01, '***'), (0.05, '**'), (0.1, '*')):
        if pvalue < threshold:
            return stars
    return ''

In [3]:
def display_res(Y2009, Y2010, inDepV):
    """Fit one OLS specification on both yearly tables and print the results.

    For each of ``Y2009`` and ``Y2010`` the column named by the module-level
    ``dep_v`` is regressed on the ``inDepV`` columns plus an intercept added
    by ``sm.add_constant``. Rows with missing values are dropped by
    statsmodels (``missing='drop'``).

    Printed output, 2009 value first and 2010 value second:
      * a ``name:coef,coef`` line for every regressor and for ``const``,
        each coefficient followed by the stars from ``significance``;
      * a ``(se),(se)`` line with the matching standard errors;
      * N, R-squared, adjusted R-squared and the F statistic (with stars).

    Parameters
    ----------
    Y2009, Y2010 : pandas.DataFrame
        Must contain the ``dep_v`` column and every column in ``inDepV``.
    inDepV : list of str
        Regressor column names. Must be a list, because ``['const']`` is
        concatenated to it.

    Returns
    -------
    None
        Everything is printed. Each print call takes a single argument, so it
        behaves the same as a Python 2 print statement and as the Python 3
        print function.
    """
    results = []
    for df in [Y2009, Y2010]:
        y = df[dep_v]
        X = sm.add_constant(df[inDepV])
        results.append(sm.OLS(y, X, missing='drop').fit())
    res2009, res2010 = results
    for idv in inDepV + ['const']:
        coef2009 = '%.4f' % res2009.params[idv] + significance(res2009.pvalues[idv])
        coef2010 = '%.4f' % res2010.params[idv] + significance(res2010.pvalues[idv])
        print('%s:' % idv + ','.join([coef2009, coef2010]))
        # Take the standard errors straight from the fit: rebuilding them as
        # params / tvalues is 0/0 (NaN) when a coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    print('')

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

# Night Safari (ns2023 files)

In [4]:
from information_boards import statisticsAllDrivers_ns_dpath
from information_boards import statisticsAllDriversMonth_ns2023_prefix
# Read the 2009 and 2010 tables from <dpath>/<prefix><year>.csv.
_ns2023_source = (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns2023_prefix)
Y2009 = pd.read_csv('%s/%s2009.csv' % _ns2023_source)
Y2010 = pd.read_csv('%s/%s2010.csv' % _ns2023_source)

In [5]:
# Number of distinct driverID values in each year, 2009 first.
print('%d %d' % (len(set(Y2009['driverID'])), len(set(Y2010['driverID']))))

7212 7533


In [8]:
# Models: the dependent variable (read as a global by display_res) and the
# regressor list of every specification.
dep_v = 'QTime/locTrip'
ib_impact = ['locInRatio']
# Control sets; cv1-cv3 extend the base set cv0.
cv0 = ['wleTripNumber', 'locTripNumber', 'wleProductivity']
# cv0 = ['wleTripNumber', 'wleProductivity']
cv1 = cv0 + ['EP/locTrip']
cv2 = cv0 + ['locProductivity']
cv3 = cv0 + ['EP/locTrip', 'locProductivity']
m1_inDepV = ib_impact
m2a_inDepV = ib_impact + cv0
m2b_inDepV = ib_impact + cv1
m2c_inDepV = ib_impact + cv2
m2d_inDepV = ib_impact + cv3
# The M3 and M4 cells use m3_inDepV, which was otherwise undefined on a fresh
# kernel. NOTE(review): this list is reconstructed from the regressors printed
# in the saved M3 output (M2d plus the two timePassed terms) - confirm that
# the loaded CSVs actually contain both columns.
m3_inDepV = m2d_inDepV + ['timePassed', 'timePassed^2']

In [9]:
%time
print ''
# M1
display_res(Y2009, Y2010, m1_inDepV)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs

locInRatio:4.8096***,3.2166***
(0.2151),(0.1922)
const:18.4624***,15.1418***
(0.1140),(0.1006)

N:18874,19628
R-squared:0.0258,0.0141
Adj R-squared:0.0258,0.0140
F-statistics:499.8282***,279.9740***


In [10]:
%time
print ''
# M2a
display_res(Y2009, Y2010, m2a_inDepV)

CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 3.81 µs

locInRatio:6.4157***,4.4363***
(0.2130),(0.1909)
wleTripNumber:-0.1536***,-0.1428***
(0.0229),(0.0195)
locTripNumber:1.0802***,1.1329***
(0.1407),(0.1298)
wleProductivity:-0.7547***,-0.6140***
(0.0191),(0.0164)
const:35.7590***,30.6639***
(0.4654),(0.4421)

N:18874,19628
R-squared:0.1214,0.0993
Adj R-squared:0.1212,0.0991
F-statistics:651.9886***,541.0054***


In [11]:
%time
print ''
# M2b
display_res(Y2009, Y2010, m2b_inDepV)

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 6.91 µs

locInRatio:-0.8715***,-0.8567***
(0.1053),(0.0888)
wleTripNumber:-0.1026***,-0.0761***
(0.0109),(0.0089)
locTripNumber:0.7425***,0.5908***
(0.0669),(0.0589)
wleProductivity:-0.0233**,0.0132*
(0.0095),(0.0078)
EP/locTrip:-2.0894***,-1.9182***
(0.0082),(0.0070)
const:16.5432***,12.3769***
(0.2338),(0.2114)

N:18874,19628
R-squared:0.8014,0.8144
Adj R-squared:0.8013,0.8144
F-statistics:15226.4785***,17224.2559***


In [12]:
%time
print ''
# M2c
display_res(Y2009, Y2010, m2c_inDepV)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 6.91 µs

locInRatio:1.8731***,0.7029***
(0.1276),(0.1138)
wleTripNumber:-0.1891***,-0.1571***
(0.0135),(0.0115)
locTripNumber:0.8621***,0.7516***
(0.0828),(0.0763)
wleProductivity:0.0738***,0.0994***
(0.0121),(0.0103)
locProductivity:-1.3718***,-1.1870***
(0.0073),(0.0062)
const:49.9033***,44.3759***
(0.2838),(0.2693)

N:18874,19628
R-squared:0.6961,0.6891
Adj R-squared:0.6961,0.6890
F-statistics:8645.5292***,8698.0809***


In [13]:
%time
print ''
# M2d
display_res(Y2009, Y2010, m2d_inDepV)

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 7.15 µs

locInRatio:-0.5781***,-0.7773***
(0.1037),(0.0876)
wleTripNumber:-0.1203***,-0.0884***
(0.0107),(0.0088)
locTripNumber:0.7543***,0.5991***
(0.0656),(0.0582)
wleProductivity:0.0344***,0.0536***
(0.0096),(0.0079)
EP/locTrip:-1.7007***,-1.6412***
(0.0161),(0.0138)
locProductivity:-0.3208***,-0.2178***
(0.0115),(0.0094)
const:23.4250***,17.5345***
(0.3364),(0.3050)

N:18874,19628
R-squared:0.8093,0.8194
Adj R-squared:0.8092,0.8193
F-statistics:13343.0919***,14835.1400***


In [17]:
# M3: per the coefficient list in the saved output, the M2d regressors plus
# timePassed and timePassed^2. m3_inDepV must already be defined when this
# cell runs.
# NOTE(review): the N values in the saved output differ from the M1-M2d runs
# above, so that output may come from a different data load - confirm.
display_res(Y2009, Y2010, m3_inDepV)

locInRatio:-4.3341***,-4.3223***
(0.0785),(0.0604)
wleTripNumber:-0.0003,-0.0025***
(0.0004),(0.0003)
locTripNumber:0.0375***,0.0740***
(0.0068),(0.0044)
wleProductivity:0.0506***,0.0473***
(0.0058),(0.0038)
EP/locTrip:-1.4359***,-1.1051***
(0.0097),(0.0078)
locProductivity:-0.8191***,-0.8007***
(0.0082),(0.0060)
timePassed:0.3462***,-1.0850***
(0.0220),(0.0435)
timePassed^2:-0.0510***,0.0273***
(0.0021),(0.0012)
const:51.7958***,59.0955***
(0.2370),(0.4050)

N:69958,97232
R-squared:0.8020,0.7889
Adj R-squared:0.8020,0.7889
F-statistics:35416.2855***,45410.8602***


# Zoo

In [18]:
from information_boards import statisticsAllDrivers_ns_dpath
from information_boards import statisticsAllDriversMonth_ns1519_prefix
# Read the 2009 and 2010 tables from <dpath>/<prefix><year>.csv.
# NOTE(review): this section is titled "Zoo", yet both the directory and the
# prefix names carry "ns" - confirm these are the intended files.
_ns1519_source = (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns1519_prefix)
Y2009 = pd.read_csv('%s/%s2009.csv' % _ns1519_source)
Y2010 = pd.read_csv('%s/%s2010.csv' % _ns1519_source)

In [19]:
# Number of distinct driverID values in each year's table.
print('The number of drivers: (Y2009, %d), (Y2010, %d)'
      % tuple(len(set(df['driverID'])) for df in (Y2009, Y2010)))

The number of drivers: (Y2009, 9836), (Y2010, 11018)


In [20]:
# M1 on the tables loaded in this section: dep_v regressed on locInRatio only.
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-474.7671***,-620.3563***
(28.3808),(28.9798)
const:484.8799***,623.9428***
(21.1517),(22.6268)

N:17795,19251
R-squared:0.0155,0.0233
Adj R-squared:0.0154,0.0232
F-statistics:279.8417***,458.2374***


In [16]:
# M2a on the tables loaded in this section: locInRatio plus the cv0 controls
# (wleTripNumber, locTripNumber, wleProductivity).
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:-517.2908***,-652.4528***
(29.6313),(30.1250)
wleTripNumber:8.4251***,2.5459
(3.2056),(3.1103)
locTripNumber:-96.8063***,-63.9445***
(22.0812),(21.5070)
wleProductivity:1.9392**,0.1815
(0.8335),(0.2720)
const:508.1354***,698.0090***
(37.4400),(33.1498)

N:17795,19251
R-squared:0.0170,0.0240
Adj R-squared:0.0168,0.0238
F-statistics:77.1047***,118.4940***


In [17]:
# M2b on the tables loaded in this section: the M2a regressors plus EP/locTrip.
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-22.0277***,-19.0202***
(4.4853),(3.1130)
wleTripNumber:-0.2294,0.4446
(0.4815),(0.3177)
locTripNumber:1.1342,-1.3515
(3.3178),(2.1976)
wleProductivity:-0.0634,0.0116
(0.1252),(0.0278)
EP/locTrip:-2.2357***,-2.1444***
(0.0025),(0.0016)
const:39.5927***,29.3285***
(5.6476),(3.4224)

N:17795,19251
R-squared:0.9778,0.9898
Adj R-squared:0.9778,0.9898
F-statistics:156954.1517***,374076.2246***


# Night Safari (ns2000 files)

In [18]:
from information_boards import statisticsAllDrivers_ns_dpath
from information_boards import statisticsAllDriversMonth_ns2000_prefix
# Read the 2009 and 2010 tables from <dpath>/<prefix><year>.csv.
_ns2000_source = (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns2000_prefix)
Y2009 = pd.read_csv('%s/%s2009.csv' % _ns2000_source)
Y2010 = pd.read_csv('%s/%s2010.csv' % _ns2000_source)

In [19]:
# Number of distinct driverID values in each year's table.
print('The number of drivers: (Y2009, %d), (Y2010, %d)'
      % tuple(len(set(df['driverID'])) for df in (Y2009, Y2010)))

The number of drivers: (Y2009, 13572), (Y2010, 14852)


In [15]:
def display_res(Y2009, Y2010, inDepV, fixEF=None):
    """Fit one OLS specification on both yearly tables and print the results.

    This redefinition replaces the earlier three-argument ``display_res`` and
    adds optional fixed-effect columns. For each table the column named by the
    module-level ``dep_v`` is regressed on ``inDepV`` plus that year's
    fixed-effect columns and an intercept added by ``sm.add_constant``. Rows
    with missing values are dropped by statsmodels (``missing='drop'``).

    Only the ``inDepV`` coefficients and ``const`` are printed (coefficient
    with ``significance`` stars, then standard errors in parentheses),
    followed by N, R-squared, adjusted R-squared and the F statistic. The 2009
    value always comes first.

    Parameters
    ----------
    Y2009, Y2010 : pandas.DataFrame
        Must contain ``dep_v``, every column in ``inDepV`` and that year's
        fixed-effect columns.
    inDepV : list of str
        Regressor column names whose coefficients are reported.
    fixEF : pair of sequences of str, optional
        ``fixEF[0]`` / ``fixEF[1]`` name the extra control columns for 2009 /
        2010. Defaults to ``None`` (no extra controls), so the three-argument
        calls elsewhere in the notebook keep working after this cell has run.

    Returns
    -------
    None
        Everything is printed. Each print call takes a single argument, so it
        behaves the same as a Python 2 print statement and as the Python 3
        print function.
    """
    results = []
    for i, df in enumerate([Y2009, Y2010]):
        # Fixed-effect columns enter the design matrix but are left out of the
        # coefficient table printed below.
        controls = list(fixEF[i]) if fixEF is not None else []
        y = df[dep_v]
        X = sm.add_constant(df[inDepV + controls])
        results.append(sm.OLS(y, X, missing='drop').fit())
    res2009, res2010 = results
    for idv in inDepV + ['const']:
        coef2009 = '%.4f' % res2009.params[idv] + significance(res2009.pvalues[idv])
        coef2010 = '%.4f' % res2010.params[idv] + significance(res2010.pvalues[idv])
        print('%s:' % idv + ','.join([coef2009, coef2010]))
        # Take the standard errors straight from the fit: rebuilding them as
        # params / tvalues is 0/0 (NaN) when a coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    print('')

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

In [17]:
# M4: the M3 regressors plus one 0/1 indicator column per driver.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, i.e. the
# run was stopped before it finished.
def _add_driver_dummies(df):
    """Add one 0/1 column per distinct driverID to ``df`` in place.

    Columns are named ``str(driverID)``. Returns the list of column names.
    """
    names = [str(driver) for driver in set(df['driverID'])]
    for name in names:
        df[name] = np.where(df['driverID'] == int(name), 1, 0)
    return names


Y2009_drivers = _add_driver_dummies(Y2009)
Y2010_drivers = _add_driver_dummies(Y2010)
# The last indicator of each year is left out of the regression
# (presumably as the reference driver - confirm).
display_res(Y2009, Y2010, m3_inDepV, (Y2009_drivers[:-1], Y2010_drivers[:-1]))

KeyboardInterrupt: 

In [None]:
# M5
# NOTE(review): this cell has no execution count and looks unfinished.
#   * `hours` holds the labels 'H0', 'H1', 'H6', ..., 'H23' (2-5 are left out),
#     but the loop below compares each label's number with the 'month' column,
#     so the 'H..' columns it creates are month indicators. Confirm whether an
#     hour column, or 'M..' labels, was intended.
#   * month2009 / month2010 hold one 'M<month>' label per row and are not used
#     in the lines shown here.
hours = ['H%d' % x for x in range(24) if x not in [2, 3, 4, 5]]
month2009 = ['M%d' % m for m in Y2009['month']]
month2010 = ['M%d' % m for m in Y2010['month']]
for Mmonth in hours:
    # Adds a 0/1 column named after the label to both tables, in place.
    Y2009[Mmonth] = np.where(Y2009['month'] == int(Mmonth[1:]), 1, 0)
    Y2010[Mmonth] = np.where(Y2010['month'] == int(Mmonth[1:]), 1, 0)