In [1]:
import __init__
#
from IPython.display import HTML, display
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind
#
# some functions
#
def text_display(text, font_size):
    """Show ``text`` in the notebook wrapped in an HTML ``<font>`` tag.

    :param text: string placed verbatim (no escaping) between the font tags.
    :param font_size: number substituted into the tag's ``size`` attribute
        through a ``%d`` format.
    :return: None; the markup is rendered via ``display(HTML(...))``.
    """
    opening_tag = '<font size=%d>' % font_size
    markup = opening_tag + text + '</font>'
    display(HTML(markup))

def table_display(table_data):
    """Render an iterable of rows as a plain HTML table in the notebook.

    :param table_data: iterable of rows; each row is an iterable of cell
        values. Every value is converted with ``str`` and inserted verbatim
        (no escaping) into a ``<td>`` element.
    :return: None; the markup is rendered via ``display(HTML(...))``.
    """
    html_rows = []
    for row in table_data:
        # Joining on the closing/opening pair puts each value in its own cell.
        cells = '</td><td>'.join(str(value) for value in row)
        html_rows.append('<td>{}</td>'.format(cells))
    # The same trick one level up separates consecutive rows.
    body = '</tr><tr>'.join(html_rows)
    display(HTML('<table><tr>{}</tr></table>'.format(body)))

# Questions
* Q1: What changed after the information boards were set up?
    * H1: Queueing time at the airport in 2009 and 2010 is different
    * H2: Economic profit in 2009 and 2010 is different
    * H3: Productivity in 2009 and 2010 is different

* Q2: Is the impact of information boards valid?
    * Simple regression
    * Multivariate regression
    
# Variables
* All values are totals accumulated over a given period
* An AP trip is a trip that departs from the airport

| Variable        | Description |
| ---------------- |------------------|
| tripNumber       | The number of trips |
| operatingHour    | The number of operating hours (unit hour) |
| Fare             | The amount of fare (unit S$\$$) |
| apNumber         | The number of AP trips |
| apInNumber       | The number of AP trips whose previous trip ended at the airport |
| apOutNumber      | The difference between apNumber and apInNumber |
| apDuration       | The amount of time passengers are on board (unit minute) |
| apQTime          | The amount of time spent waiting (queuing) to pick up passengers for AP trips (unit minute) |
| apFare           | The amount of fare earned through AP trips (unit S$\$$) |
| apEconomicProfit | The difference between apFare and opportunity cost (unit S$\$$) |

# Derived variables for analysis
| Variable             | Description |
| --------------------- |------------------|
| QTime/apTrip          | The average queuing time at the airport (unit minute) |
| economicProfit/apTrip | The average economic profit per AP trip (unit S$\$$) |
| Productivity          | The rate of fare per operating hour (unit S$\$$ / hour) |
| apProductivity        | The rate of apFare per the sum of apDuration and apQTime, converted from minutes to hours (unit S$\$$ / hour) |

In [4]:
# The location of the statistics CSV is provided by the project's
# information_boards package.
from information_boards import ssDriversStatisticsDayBasedModi_ap_fpath
# Load the whole file into the DataFrame used by every later cell.
df = pd.read_csv(ssDriversStatisticsDayBasedModi_ap_fpath)
# Preview the first rows to check which columns were loaded.
display(df.head())

Unnamed: 0,year,month,day,driverID,tripNumber,operatingHour,Fare,apNumber,apDuration,apFare,apEconomicProfit,apQTime,apInNumber,apOutNumber,QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
0,2009,1,1,8240,22,6.816667,274,1,12.0,12,0.724007,25.083333,0,1,25.083333,0.724007,40.195599,19.41573
1,2009,1,1,16511,22,11.0,264,1,19.0,17,0.386804,34.7,0,1,34.7,0.386804,24.0,18.994413
2,2009,1,1,8327,4,3.966667,80,2,40.0,46,-5.265366,107.816667,0,2,53.908333,-2.632683,20.168067,18.671778
3,2009,1,1,8360,17,10.7,186,2,44.0,39,-9.475221,96.033333,0,2,48.016667,-4.73761,17.383178,16.710307
4,2009,1,1,8380,6,5.083333,78,1,26.0,22,6.571396,22.116667,0,1,22.116667,6.571396,15.344262,27.433322


# Average

In [5]:
# Column means, shown as one small table per group of related variables:
# overall activity, airport (AP) totals, and the derived per-trip / rate variables.
attribute_groups = [
    ['tripNumber','operatingHour','Fare'],
    ['apNumber','apInNumber','apOutNumber', 'apQTime','apDuration','apFare','apEconomicProfit'],
    ['QTime/apTrip', 'economicProfit/apTrip', 'Productivity', 'apProductivity'],
]
for attributes in attribute_groups:
    # First table row holds the column names, second row their means.
    stats = [np.mean(df[attr]) for attr in attributes]
    table_display([attributes, stats])

0,1,2
tripNumber,operatingHour,Fare
20.5815415104,9.51356016834,236.40508611


0,1,2,3,4,5,6
apNumber,apInNumber,apOutNumber,apQTime,apDuration,apFare,apEconomicProfit
1.49209613041,0.761818587817,0.73027754259,51.436393466,33.2073032872,30.3966309698,-3.20255776634


0,1,2,3
QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
35.5094687806,-2.45178834157,25.1918826042,22.5313039418


# Standard deviation

In [6]:
# Column standard deviations, one table per group of related variables.
# np.std is called with its default ddof=0 (population standard deviation);
# pandas' own Series.std() would default to ddof=1 instead.
attribute_groups = (
    ['tripNumber','operatingHour','Fare'],
    ['apNumber','apInNumber','apOutNumber', 'apQTime','apDuration','apFare','apEconomicProfit'],
    ['QTime/apTrip', 'economicProfit/apTrip', 'Productivity', 'apProductivity'],
)
for attributes in attribute_groups:
    stats = []
    for attr in attributes:
        stats.append(np.std(df[attr]))
    # First table row holds the column names, second row their deviations.
    table_display([attributes, stats])

0,1,2
tripNumber,operatingHour,Fare
7.29893772745,2.82046256045,75.5451044897


0,1,2,3,4,5,6
apNumber,apInNumber,apOutNumber,apQTime,apDuration,apFare,apEconomicProfit
0.738754429934,0.620429397624,0.824249611985,31.1182085348,17.7599411017,16.1323612755,9.37535889989


0,1,2,3
QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
16.6516425523,6.9523295689,5.81513439411,6.85364485303


# t-test

In [7]:
# Split the observations by calendar year so the two periods can be compared.
Y2009 = df[df['year'] == 2009]
Y2010 = df[df['year'] == 2010]
# statistics
attributes = ['QTime/apTrip', 'economicProfit/apTrip', 'Productivity', 'apProductivity']
Y2009_avg_std, Y2010_avg_std, t_test = [], [], []
for attr in attributes:
    Y2009_avg_std.append((Y2009[attr].mean(), Y2009[attr].std()))
    Y2010_avg_std.append((Y2010[attr].mean(), Y2010[attr].std()))
    # Independent two-sample t-test of 2010 against 2009 with scipy's default
    # settings, so a positive statistic means the 2010 mean is larger.
    # NOTE(review): the defaults assume equal variances; the yearly standard
    # deviations differ, so equal_var=False may be worth considering.
    t_test.append(ttest_ind(Y2010[attr], Y2009[attr]))
# display
# Cells read "mean(std)" for the yearly rows and "t(p)" for the test row.
# '%f' prints six decimals; swap in '%.2f' for a more compact table.
header_row = ['Year'] + attributes
row_2009 = ['2009'] + ['%f(%f)' % (a, v) for a, v in Y2009_avg_std]
row_2010 = ['2010'] + ['%f(%f)' % (a, v) for a, v in Y2010_avg_std]
diff_row = ['Diff.'] + ['%f' % (later[0] - earlier[0])
                        for later, earlier in zip(Y2010_avg_std, Y2009_avg_std)]
t_test_row = ['t-test'] + ['%f(%f)' % (t, p) for t, p in t_test]
table_display([header_row, row_2009, row_2010, diff_row, t_test_row])

0,1,2,3,4
Year,QTime/apTrip,economicProfit/apTrip,Productivity,apProductivity
2009,38.915466(17.891110),-2.704482(7.204491),23.883347(5.435751),21.269859(6.699606)
2010,32.870265(15.104609),-2.255984(6.743986),26.205827(5.896494),23.508759(6.811910)
Diff.,-6.045201,0.448498,2.322480,2.238900
t-test,-120.018634(0.000000),20.988980(0.000000),132.502021(0.000000),107.653457(0.000000)
