In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
#
# some functions
#
def text_display(text, font_size):
    """Show ``text`` in the notebook output as HTML inside a ``<font>`` tag.

    ``font_size`` is substituted into the tag's ``size`` attribute with ``%d``.
    ``text`` is concatenated unchanged, so any markup it contains is rendered
    by the browser rather than escaped.
    """
    opening_tag = '<font size=%d>' % font_size
    markup = opening_tag + text + '</font>'
    display(HTML(markup))

def table_display(table_data):
    """Show ``table_data`` in the notebook output as an HTML table.

    ``table_data`` is an iterable of rows and each row an iterable of cells.
    Every row becomes one ``<tr>`` and every cell one ``<td>``; cells are
    converted with ``str()`` and inserted without escaping. When there are no
    rows, a single empty ``<tr>`` is still emitted.
    """
    row_markup = []
    for row in table_data:
        cells = '</td><td>'.join(map(str, row))
        row_markup.append('<td>{}</td>'.format(cells))
    body = '</tr><tr>'.join(row_markup)
    display(HTML('<table><tr>{}</tr></table>'.format(body)))

# Questions
* Q1: What changed after the information boards were set up?
    * H1: Queueing time at the airport in 2009 and 2010 is different
    * H2: Economic profit in 2009 and 2010 is different
    * H3: Productivity in 2009 and 2010 is different

* Q2: Is the impact of information boards valid?
    * Simple regression
    * Multivariate regression
    
# Variables
* All values are totals aggregated over a given period
* An AP trip is a trip that departs from the airport

| Variable        | Description |
| ---------------- |------------------|
| tripNumber       | The number of trips |
| operatingHour    | The operating hour (unit hour) |
| Fare             | The amount of fare (unit S$\$$) |
| apNumber         | The number of AP trips |
| apInNumber       | The number of AP trips whose previous trip ended at the airport |
| apOutNumber      | The difference between apNumber and apInNumber |
| apDuration       | The amount of time passengers are on board (unit minute) |
| apQTime          | The amount of time spent waiting at the airport to pick up passengers (unit minute) |
| apFare           | The amount of fare earned through AP trips (unit S$\$$) |
| apEconomicProfit | The difference between apFare and opportunity cost (unit S$\$$) |

# Derived variables for analysis
| Variable             | Description |
| --------------------- |------------------|
| QTime/apTrip          | The average queuing time at the airport per AP trip, i.e. apQTime divided by apNumber (unit minute) |
| economicProfit/apTrip | The average economic profit per AP trip (unit S$\$$) |
| Productivity          | The rate of fare per operating hour (unit S$\$$ / hour) |
| apProductivity        | The rate of apFare per hour of apDuration plus apQTime, with the minutes converted to hours (unit S$\$$ / hour) |

In [2]:
from information_boards import ssDriversStatisticsDayBased_ap_fpath
# Read the day-based driver statistics CSV into a DataFrame; the path constant
# is imported from the project's information_boards package.
df = pd.read_csv(ssDriversStatisticsDayBased_ap_fpath)
# Preview the first rows to check which columns were loaded.
display(df.head())

Unnamed: 0,year,month,day,driverID,tripNumber,operatingHour,Fare,apNumber,apDuration,apFare,apEconomicProfit,apQTime,apInNumber,apOutNumber,QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
0,2009,1,1,8240,22,6.816667,274,1,12.0,12,72.400709,25.083333,0,1,25.083333,72.400709,40.195599,19.41573
1,2009,1,1,16511,22,11.0,264,1,19.0,17,38.680422,34.7,0,1,34.7,38.680422,24.0,18.994413
2,2009,1,1,8327,4,3.966667,80,2,40.0,46,-526.53657,107.816667,0,2,53.908333,-263.268285,20.168067,18.671778
3,2009,1,1,8360,17,10.7,186,2,44.0,39,-947.522072,96.033333,0,2,48.016667,-473.761036,17.383178,16.710307
4,2009,1,1,8380,6,5.083333,78,1,26.0,22,657.139597,22.116667,0,1,22.116667,657.139597,15.344262,27.433322


In [3]:
# Show the overall mean of each column, one HTML table per group of variables:
# general trip measures, airport (AP) trip measures, then the derived ratios.
# `attributes` and `stats` end up holding the last group's values.
for attributes in (
    ['tripNumber','operatingHour','Fare'],
    ['apNumber','apInNumber','apOutNumber', 'apQTime','apDuration','apFare','apEconomicProfit'],
    ['QTime/apTrip', 'economicProfit/apTrip', 'Productivity', 'apProductivity'],
):
    stats = [np.mean(df[attr]) for attr in attributes]
    # First row: column names; second row: their means.
    table_display([attributes, stats])

0,1,2
tripNumber,operatingHour,Fare
20.7517159579,9.50398798283,237.288602419


0,1,2,3,4,5,6
apNumber,apInNumber,apOutNumber,apQTime,apDuration,apFare,apEconomicProfit
1.48652672649,0.763249317206,0.723277409286,51.1175071401,33.0808770972,30.1942848225,-316.058436358


0,1,2,3
QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
35.3979132848,-242.161622648,25.3206228802,22.5094033858


In [4]:
# Partition the day-based records by year.
Y2009 = df[df['year'] == 2009]
Y2010 = df[df['year'] == 2010]
# statistics: (mean, standard deviation) of every derived variable per year,
# and a two-sample t-test of the 2010 values against the 2009 values
attributes = ['QTime/apTrip', 'economicProfit/apTrip', 'Productivity', 'apProductivity']
Y2009_avg_std, Y2010_avg_std = (
    [(yearly[name].mean(), yearly[name].std()) for name in attributes]
    for yearly in (Y2009, Y2010)
)
t_test = [ttest_ind(Y2010[name], Y2009[name]) for name in attributes]
# display: header, one "mean(std)" row per year, the difference of the means
# (2010 minus 2009), and the t-test as "statistic(p-value)"
summary_rows = [['Year'] + attributes]
for label, avg_std in (('2009', Y2009_avg_std), ('2010', Y2010_avg_std)):
    summary_rows.append([label] + ['%.2f(%.2f)' % (avg, std) for avg, std in avg_std])
summary_rows.append(
    ['Diff.'] + ['%.2f' % (later[0] - earlier[0])
                 for earlier, later in zip(Y2009_avg_std, Y2010_avg_std)]
)
summary_rows.append(['t-test'] + ['%.2f(%.2f)' % (t_stat, p_val) for t_stat, p_val in t_test])
table_display(summary_rows)

0,1,2,3,4
Year,QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
2009,38.70(17.77),-272.16(717.88),24.02(5.43),21.26(6.69)
2010,32.74(15.07),-218.05(668.71),26.37(6.05),23.51(6.78)
Diff.,-5.95,54.11,2.35,2.25
t-test,-102.59(0.00),22.03(0.00),114.49(0.00),94.04(0.00)


In [8]:
from information_boards import ssDriversStatisticsMonthBased2009_ap_fpath
from information_boards import ssDriversStatisticsMonthBased2010_ap_fpath

# Load the month-based driver statistics, one CSV per year.
Y2009_df = pd.read_csv(ssDriversStatisticsMonthBased2009_ap_fpath)
Y2010_df = pd.read_csv(ssDriversStatisticsMonthBased2010_ap_fpath)

# Every column except the two pivot keys becomes one item of the panel.
items = [column for column in Y2009_df.columns if column not in ['month', 'driverID']]

# For each item, pivot to a month x driverID table and stack the tables into a Panel.
# NOTE(review): pd.Panel was deprecated in pandas 0.20 and removed in 0.25, so this
# cell only runs on an older pandas.
Y2009_pd = pd.Panel({
    item: Y2009_df.pivot(index='month', columns='driverID', values=item)
    for item in items
})
Y2010_pd = pd.Panel({
    item: Y2010_df.pivot(index='month', columns='driverID', values=item)
    for item in items
})

<class 'pandas.core.panel.Panel'>
Dimensions: 15 (items) x 11 (major_axis) x 711 (minor_axis)
Items axis: Fare to tripNumber
Major_axis axis: 1 to 12
Minor_axis axis: 87 to 49415