In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
from information_boards import statisticsAllDrivers_ap_dpath
from information_boards import statisticsAllDriversMonth_ap_prefix
# Load the per-year statistics CSVs; both paths share the same directory and
# file-name prefix and differ only in the trailing year.
Y2009, Y2010 = (
    pd.read_csv('%s/%s%d.csv' % (statisticsAllDrivers_ap_dpath,
                                 statisticsAllDriversMonth_ap_prefix,
                                 year))
    for year in (2009, 2010)
)

In [4]:
# Quick sanity check: available columns and the number of distinct drivers per year.
print(Y2009.columns)
print('The number of drivers: (2009, %d), (2010, %d)' % tuple(
    len(set(frame['driverID'])) for frame in (Y2009, Y2010)))

Index([u'year', u'month', u'driverID', u'wleTripNumber', u'wleOperatingHour',
       u'wleFare', u'wleProductivity', u'locTripNumber', u'locInNumber',
       u'locOutNumber', u'locQTime', u'locEP', u'locDuration', u'locFare',
       u'QTime/locTrip', u'EP/locTrip', u'locProductivity', u'locInRatio',
       u'timePassed', u'timePassed^2'],
      dtype='object')
The number of drivers: (2009, 29057), (2010, 30598)


In [5]:
def significance(pvalue):
    """Return the significance marker for a p-value.

    '***' for p < 0.01, '**' for p < 0.05, '*' for p < 0.1 and an empty
    string otherwise (all comparisons are strict).
    """
    # Cut-offs are checked from the strictest to the loosest; first match wins.
    for cutoff, marker in ((0.01, '***'), (0.05, '**'), (0.1, '*')):
        if pvalue < cutoff:
            return marker
    return ''

In [6]:
def display_res(Y2009, Y2010, inDepV):
    """Fit the same OLS model on both yearly frames and print the results side by side.

    For each frame the module-level ``dep_v`` column is regressed on the
    ``inDepV`` columns plus an added constant; rows with missing values are
    dropped (``missing='drop'``).

    Parameters
    ----------
    Y2009, Y2010 : DataFrame
        Frames containing ``dep_v`` and every column named in ``inDepV``.
    inDepV : sequence of str
        Names of the independent-variable columns.

    Prints, for every regressor and then the constant, one line
    ``name:coef2009,coef2010`` (coefficients carry significance stars) followed
    by a line with the two standard errors in parentheses; then a blank line
    and the N, R-squared, adjusted R-squared and F-statistic of both fits.
    Returns None.
    """
    regressors = list(inDepV)
    fits = []
    for df in (Y2009, Y2010):
        X = sm.add_constant(df[regressors])
        fits.append(sm.OLS(df[dep_v], X, missing='drop').fit())
    res2009, res2010 = fits

    for idv in regressors + ['const']:
        coefs = ['%.4f' % res.params[idv] + significance(res.pvalues[idv])
                 for res in fits]
        print('%s:' % idv + ','.join(coefs))
        # Read the standard errors straight from the fit; reconstructing them
        # as params / tvalues yields NaN when a coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    print('')

    fvalue2009, fvalue2010 = ['%.4f' % res.fvalue + significance(res.f_pvalue)
                              for res in fits]
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

In [7]:
# Model specifications
# Dependent variable shared by every regression.
dep_v = 'QTime/locTrip'
# Regressor that appears in every model.
ib_impact = ['locInRatio']
# Control-variable sets: cv0 is the base set, cv1-cv3 extend it.
cv0 = ['wleTripNumber', 'locTripNumber', 'wleProductivity']
cv1 = cv0 + ['EP/locTrip']
cv2 = cv0 + ['locProductivity']
cv3 = cv1 + ['locProductivity']
learning_variables = ['timePassed', 'timePassed^2']
# M1 uses ib_impact alone; M2a-M2d add one control set each; M3 extends M2d
# with the learning variables.
m1_inDepV = ib_impact
m2a_inDepV, m2b_inDepV, m2c_inDepV, m2d_inDepV = (
    ib_impact + controls for controls in (cv0, cv1, cv2, cv3)
)
m3_inDepV = m2d_inDepV + learning_variables

# Airport

In [8]:
# M1: regress dep_v on ib_impact only (m1_inDepV).
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-41.1420***,-34.7586***
(1.0871),(0.7697)
const:64.8325***,55.9198***
(0.8431),(0.6119)

N:185317,212362
R-squared:0.0077,0.0095
Adj R-squared:0.0077,0.0095
F-statistics:1432.2321***,2039.3006***


In [9]:
# M2a: ib_impact plus the base controls cv0 (m2a_inDepV).
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:-53.4923***,-45.7522***
(1.2971),(0.9301)
wleTripNumber:-0.0032,0.0077**
(0.0048),(0.0030)
locTripNumber:-0.4175***,-0.3978***
(0.0337),(0.0235)
wleProductivity:-0.0046,-0.0012
(0.0058),(0.0037)
const:78.6297***,67.7941***
(1.2012),(0.8709)

N:185317,212362
R-squared:0.0093,0.0116
Adj R-squared:0.0093,0.0116
F-statistics:435.4124***,622.0262***


In [11]:
# M2b: ib_impact plus cv1, i.e. the base controls and 'EP/locTrip' (m2b_inDepV).
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-10.4891***,-7.9056***
(0.1958),(0.2005)
wleTripNumber:0.0103***,0.0106***
(0.0007),(0.0006)
locTripNumber:-0.0457***,-0.0258***
(0.0051),(0.0050)
wleProductivity:0.0002,-0.0009
(0.0009),(0.0008)
EP/locTrip:-2.4962***,-2.2461***
(0.0009),(0.0011)
const:35.8706***,30.0135***
(0.1813),(0.1878)

N:185317,212362
R-squared:0.9776,0.9544
Adj R-squared:0.9776,0.9544
F-statistics:1615648.2723***,889854.9742***


In [12]:
# M2c: ib_impact plus cv2, i.e. the base controls and 'locProductivity' (m2c_inDepV).
display_res(Y2009, Y2010, m2c_inDepV)

locInRatio:-52.3326***,-44.9076***
(1.2949),(0.9265)
wleTripNumber:-0.0159***,-0.0048
(0.0048),(0.0030)
locTripNumber:-0.4070***,-0.3891***
(0.0336),(0.0234)
wleProductivity:-0.0026,0.0006
(0.0058),(0.0037)
locProductivity:-0.3849***,-0.4610***
(0.0134),(0.0110)
const:89.2153***,81.4648***
(1.2536),(0.9263)

N:185317,212362
R-squared:0.0137,0.0197
Adj R-squared:0.0137,0.0197
F-statistics:515.6996***,854.4595***


In [13]:
# M2d: ib_impact plus cv3, i.e. the base controls, 'EP/locTrip' and 'locProductivity' (m2d_inDepV).
display_res(Y2009, Y2010, m2d_inDepV)

locInRatio:-10.4891***,-7.9130***
(0.1958),(0.2005)
wleTripNumber:0.0103***,0.0108***
(0.0007),(0.0006)
locTripNumber:-0.0457***,-0.0259***
(0.0051),(0.0050)
wleProductivity:0.0002,-0.0009
(0.0009),(0.0008)
EP/locTrip:-2.4962***,-2.2464***
(0.0009),(0.0011)
locProductivity:0.0000,0.0066***
(0.0020),(0.0024)
const:35.8705***,29.8140***
(0.1900),(0.2012)

N:185317,212362
R-squared:0.9776,0.9544
Adj R-squared:0.9776,0.9544
F-statistics:1346366.2948***,741570.2641***


In [14]:
# M3: the M2d regressors plus learning_variables ('timePassed', 'timePassed^2') (m3_inDepV).
display_res(Y2009, Y2010, m3_inDepV)

locInRatio:-10.3557***,-7.8260***
(0.1958),(0.2004)
wleTripNumber:0.0107***,0.0110***
(0.0007),(0.0006)
locTripNumber:-0.0452***,-0.0256***
(0.0051),(0.0050)
wleProductivity:0.0004,-0.0009
(0.0009),(0.0008)
EP/locTrip:-2.4963***,-2.2466***
(0.0009),(0.0011)
locProductivity:0.0007,0.0070***
(0.0020),(0.0024)
timePassed:0.7357***,-0.4262**
(0.0671),(0.1739)
timePassed^2:-0.0956***,0.0037
(0.0065),(0.0049)
const:35.3959***,35.9179***
(0.2294),(1.4970)

N:185317,212362
R-squared:0.9776,0.9545
Adj R-squared:0.9776,0.9545
F-statistics:1011834.0692***,557147.0841***


In [15]:
def display_res(Y2009, Y2010, inDepV, fixEF=None):
    """Fit the same OLS model on both yearly frames, optionally with extra columns.

    For each frame the module-level ``dep_v`` column is regressed on the
    ``inDepV`` columns, the frame's ``fixEF`` columns and an added constant;
    rows with missing values are dropped (``missing='drop'``).

    Parameters
    ----------
    Y2009, Y2010 : DataFrame
        Frames containing ``dep_v`` and every requested regressor column.
    inDepV : sequence of str
        Independent variables whose coefficients are printed.
    fixEF : pair of sequences of str, optional
        ``fixEF[0]`` / ``fixEF[1]`` name additional columns included in the
        2009 / 2010 regression respectively; they enter the fit but their
        coefficients are not printed.  Defaults to no extra columns, so the
        three-argument calls made earlier in the notebook keep working after
        this definition replaces the previous one.

    Prints, for every ``inDepV`` regressor and then the constant, one line
    ``name:coef2009,coef2010`` (coefficients carry significance stars) followed
    by a line with the two standard errors in parentheses; then a blank line
    and the N, R-squared, adjusted R-squared and F-statistic of both fits.
    Returns None.
    """
    if fixEF is None:
        fixEF = ([], [])
    regressors = list(inDepV)
    fits = []
    for i, df in enumerate((Y2009, Y2010)):
        X = sm.add_constant(df[regressors + list(fixEF[i])])
        fits.append(sm.OLS(df[dep_v], X, missing='drop').fit())
    res2009, res2010 = fits

    for idv in regressors + ['const']:
        coefs = ['%.4f' % res.params[idv] + significance(res.pvalues[idv])
                 for res in fits]
        print('%s:' % idv + ','.join(coefs))
        # Read the standard errors straight from the fit; reconstructing them
        # as params / tvalues yields NaN when a coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    print('')

    fvalue2009, fvalue2010 = ['%.4f' % res.fvalue + significance(res.f_pvalue)
                              for res in fits]
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

In [None]:
# M4: the M3 regressors plus one 0/1 indicator column per driver.
# NOTE(review): this adds one dense column per distinct driver to each frame
# (the driver counts printed earlier are roughly 29k and 30k) -- confirm that
# this fits in memory before running.
def _add_driver_dummies(frame):
    """Add a 0/1 column named str(driverID) per distinct driver; return the names.

    The columns are written into `frame` in place.
    """
    labels = [str(driver) for driver in set(frame['driverID'])]
    for label in labels:
        frame[label] = np.where(frame['driverID'] == int(label), 1, 0)
    return labels


Y2009_drivers = _add_driver_dummies(Y2009)
Y2010_drivers = _add_driver_dummies(Y2010)
# The last driver of each list is left out of the regression ([:-1]),
# presumably to serve as the reference category -- TODO confirm.
display_res(Y2009, Y2010, m3_inDepV, (Y2009_drivers[:-1], Y2010_drivers[:-1]))

In [None]:
# M5
# Labels 'H0'..'H23', skipping 2, 3, 4 and 5.
hours = ['H%d' % x for x in range(24) if x not in [2, 3, 4, 5]]
# One 'M<month>' label per row of each frame; neither list is used in the
# lines visible here.
month2009 = ['M%d' % m for m in Y2009['month']]
month2010 = ['M%d' % m for m in Y2010['month']]
# Adds a 0/1 column per label to both frames, set to 1 where the row's 'month'
# equals the number after the label's first character.
# NOTE(review): the loop variable is called Mmonth and is compared against
# 'month', yet it iterates over `hours` ('H..' labels, values 0-23) -- it looks
# like month labels were intended here; confirm before relying on these columns.
for Mmonth in hours:
    Y2009[Mmonth] = np.where(Y2009['month'] == int(Mmonth[1:]), 1, 0)
    Y2010[Mmonth] = np.where(Y2010['month'] == int(Mmonth[1:]), 1, 0)