In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [2]:
from information_boards import (statisticsAllDrivers_ap_dpath,
                                statisticsAllDriversMonth_ap_prefix)
# Load the 2009 and 2010 CSVs located by the imported directory and file-prefix constants.
Y2009 = pd.read_csv(
    '%s/%s2009.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversMonth_ap_prefix))
Y2010 = pd.read_csv(
    '%s/%s2010.csv' % (statisticsAllDrivers_ap_dpath, statisticsAllDriversMonth_ap_prefix))

In [3]:
# Row counts of the freshly loaded frames, before any filtering.
len(Y2009), len(Y2010)

(185317, 212362)

In [4]:
def _drop_outliers(frame):
    """Return `frame` without rows lying more than 3 standard deviations from a column mean.

    Columns are processed one after another, and the mean/std of each column is
    computed on the rows that survived the previous columns. The identifier
    columns 'year', 'month' and 'driverID' are not screened. Rows whose value is
    NaN in the screened column are kept (the "> 3 std" test is False for NaN).
    """
    for col in frame.columns:
        if col in ['year', 'month', 'driverID']:
            continue
        deviation = np.abs(frame[col] - frame[col].mean())
        frame = frame[~(deviation > (3 * frame[col].std()))]
    return frame


Y2009 = _drop_outliers(Y2009)
Y2010 = _drop_outliers(Y2010)

In [5]:
# Row counts after the 3-sigma outlier filter above.
len(Y2009), len(Y2010)

(150268, 175587)

In [6]:
# Show the available columns and the number of distinct driverID values per year.
print(Y2009.columns)
print('The number of drivers: (Y2009, %d), (Y2010, %d)'
      % tuple(len(set(frame['driverID'])) for frame in (Y2009, Y2010)))

Index([u'year', u'month', u'driverID', u'wleTripNumber', u'wleOperatingHour',
       u'wleFare', u'wleProductivity', u'locTripNumber', u'locInNumber',
       u'locOutNumber', u'locQTime', u'locEP', u'locDuration', u'locFare',
       u'QTime/locTrip', u'EP/locTrip', u'locProductivity', u'locInRatio',
       u'timePassed', u'timePassed^2'],
      dtype='object')
The number of drivers: (Y2009, 27339), (Y2010, 28930)


In [7]:
def significance(pvalue):
    """Return the star marker for a p-value.

    '***' when pvalue < 0.01, '**' when pvalue < 0.05, '*' when pvalue < 0.1,
    and the empty string otherwise (including NaN, which fails every comparison).
    """
    # Thresholds are checked from strictest to loosest; the first match wins.
    for threshold, stars in ((0.01, '***'), (0.05, '**'), (0.1, '*')):
        if pvalue < threshold:
            return stars
    return ''

In [8]:
def display_res(Y2009, Y2010, inDepV, fixEF=None):
    """Fit one OLS model per year and print the two fits side by side.

    Parameters
    ----------
    Y2009, Y2010 : pandas.DataFrame
        Frames containing the dependent-variable column and every regressor column.
    inDepV : list of str
        Regressor columns whose coefficients are reported. An intercept column
        named 'const' is appended with ``sm.add_constant`` and reported last.
    fixEF : pair of sequences of str, optional
        ``(columns_for_2009, columns_for_2010)``: extra regressor columns that
        enter the fit but are not printed. ``None`` (the default) adds none.

    The dependent variable is the notebook-level global ``dep_v``, looked up at
    call time. Rows with missing values are dropped by ``missing='drop'``.

    Prints, for every reported regressor, the coefficient with significance
    stars and the standard error in parentheses on the next line, followed by
    N, R-squared, adjusted R-squared and the F-statistic. Returns None.
    """
    if fixEF is None:
        fixEF = ([], [])
    reported = list(inDepV)

    results = []
    for i, df in enumerate([Y2009, Y2010]):
        X = sm.add_constant(df[reported + list(fixEF[i])])
        results.append(sm.OLS(df[dep_v], X, missing='drop').fit())
    res2009, res2010 = results

    for idv in reported + ['const']:
        coefs = ['%.4f' % res.params[idv] + significance(res.pvalues[idv])
                 for res in results]
        print('%s:' % idv + ','.join(coefs))
        # bse is the standard error itself; params / tvalues gives NaN when a
        # coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    print('')

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

In [9]:
# Model specifications: dep_v is the dependent variable; each m*_inDepV is a regressor list.
dep_v = 'QTime/locTrip'
ib_impact = ['locInRatio']
learning_variables = ['timePassed', 'timePassed^2']

# Control-variable sets: cv0 is the base set, cv1/cv2 add one column each, cv3 adds both.
cv0 = ['wleTripNumber', 'locTripNumber', 'wleProductivity']
cv1 = cv0 + ['EP/locTrip']
cv2 = cv0 + ['locProductivity']
cv3 = cv1 + ['locProductivity']

# Each model extends an earlier one; list concatenation always builds a new list.
m1_inDepV = ib_impact
m2a_inDepV = m1_inDepV + cv0
m2b_inDepV = m2a_inDepV + ['EP/locTrip']
m2c_inDepV = m2a_inDepV + ['locProductivity']
m2d_inDepV = m2b_inDepV + ['locProductivity']
m3_inDepV = m2d_inDepV + learning_variables

# Airport

In [10]:
# Keep only the rows whose locTripNumber is at least 4.
Y2009 = Y2009[Y2009['locTripNumber'].ge(4)]
Y2010 = Y2010[Y2010['locTripNumber'].ge(4)]

In [11]:
# Row counts after the locTripNumber >= 4 filter.
len(Y2009), len(Y2010)

(69958, 97232)

In [21]:
# Number of distinct driverID values in each year's frame.
tuple(len(set(frame['driverID'])) for frame in (Y2009, Y2010))

(9836, 11018)

In [12]:
# M1: QTime/locTrip regressed on locInRatio only (m1_inDepV), fitted separately per year.
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-1.3050***,-2.6842***
(0.1411),(0.1035)
const:35.1536***,30.5233***
(0.1003),(0.0787)

N:69958,97232
R-squared:0.0012,0.0069
Adj R-squared:0.0012,0.0069
F-statistics:85.5634***,672.7028***


In [13]:
# M2a: M1 plus the base controls cv0 (wleTripNumber, locTripNumber, wleProductivity).
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:1.9127***,0.4302***
(0.1632),(0.1218)
wleTripNumber:0.0054***,-0.0018***
(0.0009),(0.0006)
locTripNumber:0.2610***,0.2850***
(0.0144),(0.0090)
wleProductivity:-0.9212***,-0.6672***
(0.0113),(0.0071)
const:52.7453***,43.7269***
(0.3237),(0.2307)

N:69958,97232
R-squared:0.1083,0.1174
Adj R-squared:0.1082,0.1173
F-statistics:2123.4591***,3232.7436***


In [14]:
# M2b: M2a plus EP/locTrip (control set cv1).
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-5.0473***,-4.5972***
(0.0851),(0.0666)
wleTripNumber:0.0054***,0.0035***
(0.0005),(0.0003)
locTripNumber:0.0099,0.0375***
(0.0074),(0.0049)
wleProductivity:-0.0925***,-0.0925***
(0.0061),(0.0040)
EP/locTrip:-2.2784***,-2.0254***
(0.0051),(0.0042)
const:35.8891***,31.6040***
(0.1702),(0.1270)

N:69958,97232
R-squared:0.7658,0.7430
Adj R-squared:0.7658,0.7430
F-statistics:45757.1577***,56220.4188***


In [15]:
# M2c: M2a plus locProductivity (control set cv2).
display_res(Y2009, Y2010, m2c_inDepV)

locInRatio:-2.7872***,-3.6701***
(0.0891),(0.0662)
wleTripNumber:-0.0069***,-0.0081***
(0.0005),(0.0003)
locTripNumber:0.0830***,0.1240***
(0.0078),(0.0049)
wleProductivity:0.0542***,0.0746***
(0.0066),(0.0041)
locProductivity:-1.8813***,-1.5435***
(0.0046),(0.0032)
const:76.6298***,67.3526***
(0.1846),(0.1335)

N:69958,97232
R-squared:0.7386,0.7436
Adj R-squared:0.7386,0.7436
F-statistics:39530.2444***,56400.4434***


In [16]:
# M2d: M2a plus both EP/locTrip and locProductivity (control set cv3).
display_res(Y2009, Y2010, m2d_inDepV)

locInRatio:-4.4851***,-4.4408***
(0.0792),(0.0608)
wleTripNumber:-0.0002,-0.0024***
(0.0004),(0.0003)
locTripNumber:0.0263***,0.0671***
(0.0069),(0.0045)
wleProductivity:0.0312***,0.0359***
(0.0058),(0.0038)
EP/locTrip:-1.3888***,-1.0776***
(0.0097),(0.0078)
locProductivity:-0.8627***,-0.8268***
(0.0082),(0.0059)
const:53.4230***,49.9315***
(0.2290),(0.1755)

N:69958,97232
R-squared:0.7981,0.7857
Adj R-squared:0.7981,0.7857
F-statistics:46098.5272***,59401.9037***


In [17]:
# M3: M2d plus the learning terms timePassed and timePassed^2.
display_res(Y2009, Y2010, m3_inDepV)

locInRatio:-4.3341***,-4.3223***
(0.0785),(0.0604)
wleTripNumber:-0.0003,-0.0025***
(0.0004),(0.0003)
locTripNumber:0.0375***,0.0740***
(0.0068),(0.0044)
wleProductivity:0.0506***,0.0473***
(0.0058),(0.0038)
EP/locTrip:-1.4359***,-1.1051***
(0.0097),(0.0078)
locProductivity:-0.8191***,-0.8007***
(0.0082),(0.0060)
timePassed:0.3462***,-1.0850***
(0.0220),(0.0435)
timePassed^2:-0.0510***,0.0273***
(0.0021),(0.0012)
const:51.7958***,59.0955***
(0.2370),(0.4050)

N:69958,97232
R-squared:0.8020,0.7889
Adj R-squared:0.8020,0.7889
F-statistics:35416.2855***,45410.8602***


# Zoo

In [18]:
from information_boards import (statisticsAllDrivers_ns_dpath,
                                statisticsAllDriversMonth_ns1519_prefix)
# Rebind Y2009/Y2010 to the CSVs located by the ns directory and the ns1519 file prefix.
# NOTE(review): no outlier or locTripNumber filtering is visible for these frames, unlike
# the frames loaded first -- confirm this is intended.
Y2009 = pd.read_csv(
    '%s/%s2009.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns1519_prefix))
Y2010 = pd.read_csv(
    '%s/%s2010.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns1519_prefix))

In [19]:
# Number of distinct driverID values in each year's frame.
print('The number of drivers: (Y2009, %d), (Y2010, %d)'
      % tuple(len(set(frame['driverID'])) for frame in (Y2009, Y2010)))

The number of drivers: (Y2009, 9836), (Y2010, 11018)


In [20]:
# M1: QTime/locTrip regressed on locInRatio only (m1_inDepV), fitted separately per year.
display_res(Y2009, Y2010, m1_inDepV)

locInRatio:-474.7671***,-620.3563***
(28.3808),(28.9798)
const:484.8799***,623.9428***
(21.1517),(22.6268)

N:17795,19251
R-squared:0.0155,0.0233
Adj R-squared:0.0154,0.0232
F-statistics:279.8417***,458.2374***


In [16]:
# M2a: M1 plus the base controls cv0 (wleTripNumber, locTripNumber, wleProductivity).
display_res(Y2009, Y2010, m2a_inDepV)

locInRatio:-517.2908***,-652.4528***
(29.6313),(30.1250)
wleTripNumber:8.4251***,2.5459
(3.2056),(3.1103)
locTripNumber:-96.8063***,-63.9445***
(22.0812),(21.5070)
wleProductivity:1.9392**,0.1815
(0.8335),(0.2720)
const:508.1354***,698.0090***
(37.4400),(33.1498)

N:17795,19251
R-squared:0.0170,0.0240
Adj R-squared:0.0168,0.0238
F-statistics:77.1047***,118.4940***


In [17]:
# M2b: M2a plus EP/locTrip (control set cv1).
display_res(Y2009, Y2010, m2b_inDepV)

locInRatio:-22.0277***,-19.0202***
(4.4853),(3.1130)
wleTripNumber:-0.2294,0.4446
(0.4815),(0.3177)
locTripNumber:1.1342,-1.3515
(3.3178),(2.1976)
wleProductivity:-0.0634,0.0116
(0.1252),(0.0278)
EP/locTrip:-2.2357***,-2.1444***
(0.0025),(0.0016)
const:39.5927***,29.3285***
(5.6476),(3.4224)

N:17795,19251
R-squared:0.9778,0.9898
Adj R-squared:0.9778,0.9898
F-statistics:156954.1517***,374076.2246***


# Night Safari

In [18]:
from information_boards import (statisticsAllDrivers_ns_dpath,
                                statisticsAllDriversMonth_ns2000_prefix)
# Rebind Y2009/Y2010 to the CSVs located by the ns directory and the ns2000 file prefix.
Y2009 = pd.read_csv(
    '%s/%s2009.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns2000_prefix))
Y2010 = pd.read_csv(
    '%s/%s2010.csv' % (statisticsAllDrivers_ns_dpath, statisticsAllDriversMonth_ns2000_prefix))

In [19]:
# Number of distinct driverID values in each year's frame.
print('The number of drivers: (Y2009, %d), (Y2010, %d)'
      % tuple(len(set(frame['driverID'])) for frame in (Y2009, Y2010)))

The number of drivers: (Y2009, 13572), (Y2010, 14852)


In [15]:
def display_res(Y2009, Y2010, inDepV, fixEF=None):
    """Fit one OLS model per year and print the two fits side by side.

    Parameters
    ----------
    Y2009, Y2010 : pandas.DataFrame
        Frames containing the dependent-variable column and every regressor column.
    inDepV : list of str
        Regressor columns whose coefficients are reported. An intercept column
        named 'const' is appended with ``sm.add_constant`` and reported last.
    fixEF : pair of sequences of str, optional
        ``(columns_for_2009, columns_for_2010)``: extra regressor columns (for
        example fixed-effect indicators) that enter the fit but are not printed.
        ``None`` (the default) adds none, so three-argument calls keep working.

    The dependent variable is the notebook-level global ``dep_v``, looked up at
    call time. Rows with missing values are dropped by ``missing='drop'``.

    Prints, for every reported regressor, the coefficient with significance
    stars and the standard error in parentheses on the next line, followed by
    N, R-squared, adjusted R-squared and the F-statistic. Returns None.
    """
    if fixEF is None:
        fixEF = ([], [])
    reported = list(inDepV)

    results = []
    for i, df in enumerate([Y2009, Y2010]):
        X = sm.add_constant(df[reported + list(fixEF[i])])
        results.append(sm.OLS(df[dep_v], X, missing='drop').fit())
    res2009, res2010 = results

    for idv in reported + ['const']:
        coefs = ['%.4f' % res.params[idv] + significance(res.pvalues[idv])
                 for res in results]
        print('%s:' % idv + ','.join(coefs))
        # bse is the standard error itself; params / tvalues gives NaN when a
        # coefficient is exactly zero.
        print('(%.4f),(%.4f)' % (res2009.bse[idv], res2010.bse[idv]))
    print('')

    fvalue2009 = '%.4f' % res2009.fvalue + significance(res2009.f_pvalue)
    fvalue2010 = '%.4f' % res2010.fvalue + significance(res2010.f_pvalue)
    print('N:%d,%d' % (res2009.nobs, res2010.nobs))
    print('R-squared:%.4f,%.4f' % (res2009.rsquared, res2010.rsquared))
    print('Adj R-squared:%.4f,%.4f' % (res2009.rsquared_adj, res2010.rsquared_adj))
    print('F-statistics:%s,%s' % (fvalue2009, fvalue2010))

In [17]:
# M4: M3 regressors plus driver fixed effects (one 0/1 indicator column per driverID).
def _with_driver_dummies(df):
    """Return ``(frame, names)``: `df` plus one 0/1 column per distinct driverID.

    The indicator columns are named ``str(driverID)`` and ``names`` lists them in
    ascending driverID order. All indicators are built in one ``pd.get_dummies``
    call and attached with a single ``pd.concat``, instead of inserting
    thousands of columns one at a time. Columns of the same name left over from
    an earlier run of this cell are replaced rather than duplicated.
    """
    dummies = pd.get_dummies(df['driverID']).astype(int)
    dummies.columns = [str(did) for did in dummies.columns]
    taken = set(dummies.columns)
    kept = df[[cn for cn in df.columns if cn not in taken]]
    return pd.concat([kept, dummies], axis=1), list(dummies.columns)


Y2009, Y2009_drivers = _with_driver_dummies(Y2009)
Y2010, Y2010_drivers = _with_driver_dummies(Y2010)
# One indicator per year is left out as the reference category to avoid perfect
# collinearity with the intercept; with the sorted names this is the largest driverID.
# NOTE(review): the fit still uses one regressor per driver and may remain very slow.
display_res(Y2009, Y2010, m3_inDepV, (Y2009_drivers[:-1], Y2010_drivers[:-1]))

KeyboardInterrupt: 

In [None]:
# M5
# NOTE(review): this cell looks unfinished. `hours` holds the labels 'H0', 'H1', 'H6', ..., 'H23'
# (2-5 are excluded), yet the loop below compares the numeric part of each label with the
# `month` column, so it creates 'H*' columns flagging month == 0, 1, 6, ..., 23.
# Month indicators named 'M<month>' were presumably intended -- TODO confirm and fix.
hours = ['H%d' % x for x in range(24) if x not in [2, 3, 4, 5]]
# One 'M<month>' label per row (not de-duplicated); neither list is used in the visible
# part of this cell.
month2009 = ['M%d' % m for m in Y2009['month']]
month2010 = ['M%d' % m for m in Y2010['month']]
for Mmonth in hours:
    # int(Mmonth[1:]) strips the leading letter and parses the remaining digits.
    Y2009[Mmonth] = np.where(Y2009['month'] == int(Mmonth[1:]), 1, 0)
    Y2010[Mmonth] = np.where(Y2010['month'] == int(Mmonth[1:]), 1, 0)