In [2]:
# PS2 - CE264

# importing the required libraries
from collections import OrderedDict    # For recording the model specification 
import pandas as pd                    # For file input/output
import numpy as np                     # For vectorized math operations
import pylogit as pl                   # For MNL model estimation and
                                       # conversion from wide to long format

# Load the wide-format survey file (comma-separated) from the working directory.
data_01 = pd.read_csv(filepath_or_buffer="Air_Travel_Survey.csv", sep=",")

In [3]:
# The first 14 columns of the wide table are passed to the converter as the
# individual-specific variables (ind_vars).
# NOTE(review): the cut-off 14 is positional -- confirm against the CSV column order.
ind_variables = list(data_01.columns[:14])

# Maps each long-format variable name to {alternative id: wide-format column}
# for the two alternatives (1 and 2).
alt_varying_variables = {
    u'aircraft_type': {1: 'a1aircraft', 2: 'a2aircraft'},
    u'departure_time': {1: 'a1departMAM', 2: 'a2departMAM'},
    u'connections': {1: 'a1connections', 2: 'a2connections'},
    u'travel_time': {1: 'a1travtime', 2: 'a2travtime'},
    u'arrival_time': {1: 'a1arriveMAM', 2: 'a2arriveMAM'},
    u'time_diff': {1: 'a1timediff', 2: 'a2timediff'},
    u'performance': {1: 'a1performance', 2: 'a2performance'},
    u'fare': {1: 'a1fare', 2: 'a2fare'},
    u'airline': {1: 'a1airline', 2: 'a2airline'},
}

In [4]:
# Alternative id -> wide-format column passed to the converter as the
# availability column for that alternative.
availability_variables = dict([(1, 'a1_AV'),
                               (2, 'a2_AV')])

In [5]:
# Column names used by the wide-to-long conversion and by every model below:
# the new alternative-id column, the observation-id column and the choice column.
custom_alt_id, obs_id_column, choice_column = (
    "alternative_id",
    "choiceSituationID",
    "choice",
)

# Reshape the wide survey table into pylogit's long format; the alternative id
# is written to the column named by `custom_alt_id`.
data_long = pl.convert_wide_to_long(
    wide_data=data_01,
    ind_vars=ind_variables,
    alt_specific_vars=alt_varying_variables,
    availability_vars=availability_variables,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
    new_alt_id_name=custom_alt_id,
)

In [102]:
def airline(i, data=None, col_offset=9):
    """Look up the per-airline value for the row at position ``i``.

    The row's ``'airline'`` code selects a column by position: for codes
    1 through 7 the value at column position ``code + col_offset`` of the
    same row is returned; for any other code the function returns 1.
    NOTE(review): with the default offset, positions 10-16 are presumably the
    respondent's frequent-flyer-membership columns for airlines 1-7 --
    confirm against the column order of the long-format frame.

    Parameters
    ----------
    i : int
        Zero-based row position (not an index label).
    data : pandas.DataFrame, optional
        Frame to read from. Defaults to the notebook-level ``data_long``.
    col_offset : int, optional
        Added to the airline code to obtain the column position. Default 9.

    Returns
    -------
    The cell value at ``(i, code + col_offset)`` when ``0 < code < 8``,
    otherwise the integer 1.
    """
    frame = data_long if data is None else data
    # Read the code by position so it refers to the same row as the
    # positional .iloc lookup below, even if the index is not 0..n-1.
    code = frame['airline'].iloc[i]
    if 0 < code < 8:
        return frame.iloc[i, code + col_offset]
    return 1

# Evaluate airline() at every row position to fill the 'FFP membership' column.
data_long['FFP membership'] = list(map(airline, range(data_long.shape[0])))

In [103]:
# Rescaled copies: travel time divided by 60 and fare divided by 100
# (presumably minutes -> hours and dollars -> hundreds of dollars; confirm units).
data_long["travel_time_hrs"] = data_long["travel_time"].div(60.0)
data_long["fare_100$"] = data_long["fare"].div(100.0)

# A priori model: every variable gets a single coefficient shared by both
# alternatives ('all_same'); `basic_names` supplies the labels for the summary.
# NOTE(review): this uses the raw `travel_time` and `fare` columns, not the
# rescaled `travel_time_hrs` / `fare_100$` columns created just above --
# confirm that is intended.
basic_specification = OrderedDict([
    ("connections", 'all_same'),
    ("travel_time", 'all_same'),
    ("fare", 'all_same'),
    ("FFP membership", 'all_same'),
])
basic_names = OrderedDict([
    ("connections", 'Connection'),
    ("travel_time", 'Time'),
    ("fare", 'Fare'),
    ("FFP membership", 'Membership'),
])

In [104]:
# Build the a priori MNL model from the long-format data and estimate it,
# starting all four coefficients at zero.
a_priori = pl.create_choice_model(
    data=data_long,
    alt_id_col=custom_alt_id,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
    specification=basic_specification,
    model_type="MNL",
    names=basic_names,
)
a_priori.fit_mle(init_vals=np.zeros(4))

Log-likelihood at zero: -4,868.6658
Initial Log-likelihood: -4,868.6658
Estimation Time for Point Estimation: 0.12 seconds.
Final log-likelihood: -3,968.9836


In [105]:
# Display the fit statistics and coefficient table of the a priori model.
a_priori.get_statsmodels_summary()

0,1,2,3
Dep. Variable:,choice,No. Observations:,7024.0
Model:,Multinomial Logit Model,Df Residuals:,7020.0
Method:,MLE,Df Model:,4.0
Date:,"Thu, 08 Feb 2018",Pseudo R-squ.:,0.185
Time:,12:09:06,Pseudo R-bar-squ.:,0.184
AIC:,7945.967,Log-Likelihood:,-3968.984
BIC:,7973.395,LL-Null:,-4868.666

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Connection,-0.5883,0.056,-10.595,0.000,-0.697,-0.479
Time,-0.0041,0.001,-7.145,0.000,-0.005,-0.003
Fare,-0.0050,0.000,-23.985,0.000,-0.005,-0.005
Membership,0.4222,0.041,10.275,0.000,0.342,0.503


In [107]:
## Model for likelihood ratio test
# Same variables as the a priori model, except that `connections` gets a
# separate coefficient for alternative 1 and alternative 2 ([1, 2]).
basic_specification = OrderedDict([
    ("connections", [1, 2]),
    ("travel_time", 'all_same'),
    ("fare", 'all_same'),
    ("FFP membership", 'all_same'),
])
basic_names = OrderedDict([
    ("connections", ['Connection A1', 'Connection A2']),
    ("travel_time", 'Time'),
    ("fare", 'Fare'),
    ("FFP membership", 'Membership'),
])

# Build and estimate the unrestricted model (five coefficients, all started
# at zero), then display its summary as the cell output.
likeli_test = pl.create_choice_model(
    data=data_long,
    alt_id_col=custom_alt_id,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
    specification=basic_specification,
    model_type="MNL",
    names=basic_names,
)
likeli_test.fit_mle(init_vals=np.zeros(5))

likeli_test.get_statsmodels_summary()

Log-likelihood at zero: -4,868.6658
Initial Log-likelihood: -4,868.6658
Estimation Time for Point Estimation: 0.05 seconds.
Final log-likelihood: -3,964.3559


0,1,2,3
Dep. Variable:,choice,No. Observations:,7024.0
Model:,Multinomial Logit Model,Df Residuals:,7019.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 08 Feb 2018",Pseudo R-squ.:,0.186
Time:,12:09:30,Pseudo R-bar-squ.:,0.185
AIC:,7938.712,Log-Likelihood:,-3964.356
BIC:,7972.997,LL-Null:,-4868.666

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Connection A1,-0.5370,0.058,-9.277,0.000,-0.650,-0.424
Connection A2,-0.6448,0.059,-11.002,0.000,-0.760,-0.530
Time,-0.0041,0.001,-7.115,0.000,-0.005,-0.003
Fare,-0.0050,0.000,-24.000,0.000,-0.005,-0.005
Membership,0.4203,0.041,10.233,0.000,0.340,0.501


In [109]:
## Model for t test
# Departure and arrival times divided by 60
# (presumably minutes after midnight -> hours; confirm units).
data_long["departure_hrs"] = data_long["departure_time"].div(60.0)
data_long["arrival_hrs"] = data_long["arrival_time"].div(60.0)

# t-test model: the a priori variables plus departure and arrival time, each
# with a single coefficient shared by both alternatives ('all_same').
# The "FFP membership" entry used to be assigned twice (copy-paste duplicate);
# it is now set once, which leaves both dictionaries unchanged.
basic_specification = OrderedDict()
basic_names = OrderedDict()

basic_specification["connections"] = 'all_same'
basic_names["connections"] = 'Connection'

basic_specification["travel_time"] = 'all_same'
basic_names["travel_time"] = 'Time'

basic_specification["fare"] = 'all_same'
basic_names["fare"] = 'Fare'

basic_specification["FFP membership"] = 'all_same'
basic_names["FFP membership"] = 'Membership'

basic_specification["departure_hrs"] = 'all_same'
basic_names["departure_hrs"] = 'Departure Time'

basic_specification["arrival_hrs"] = 'all_same'
basic_names["arrival_hrs"] = 'Arrival Time'

# Build and estimate the t-test model (six coefficients, all started at zero),
# then display its summary as the cell output.
t_test = pl.create_choice_model(
    data=data_long,
    alt_id_col=custom_alt_id,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
    specification=basic_specification,
    model_type="MNL",
    names=basic_names,
)
t_test.fit_mle(init_vals=np.zeros(6))

t_test.get_statsmodels_summary()

Log-likelihood at zero: -4,868.6658
Initial Log-likelihood: -4,868.6658
Estimation Time for Point Estimation: 0.05 seconds.
Final log-likelihood: -3,965.3954


0,1,2,3
Dep. Variable:,choice,No. Observations:,7024.0
Model:,Multinomial Logit Model,Df Residuals:,7018.0
Method:,MLE,Df Model:,6.0
Date:,"Thu, 08 Feb 2018",Pseudo R-squ.:,0.186
Time:,12:11:26,Pseudo R-bar-squ.:,0.184
AIC:,7942.791,Log-Likelihood:,-3965.395
BIC:,7983.933,LL-Null:,-4868.666

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Connection,-0.5846,0.056,-10.511,0.000,-0.694,-0.476
Time,-0.0039,0.001,-6.638,0.000,-0.005,-0.003
Fare,-0.0050,0.000,-23.967,0.000,-0.005,-0.005
Membership,0.4223,0.041,10.276,0.000,0.342,0.503
Departure Time,0.0246,0.009,2.665,0.008,0.007,0.043
Arrival Time,-0.0041,0.013,-0.324,0.746,-0.029,0.021


In [114]:
## Best Model
# Derived interaction columns used by the best-model specification.
# NOTE(review): `purpose`, `payment` and `classTicket` look like coded survey
# answers; confirm that none can be zero where they are used as a divisor and
# that dividing by / exponentiating with the codes is the intended scaling.
data_long["performance/purpose"] = data_long["performance"].div(10).div(data_long["purpose"])
data_long["affordability"] = data_long["fare"].div(data_long["income"].pow(data_long["payment"]))
data_long["time"] = data_long["travel_time_hrs"].mul(data_long["age"])
data_long["unit-price"] = data_long["fare"].div(data_long["classTicket"])

# Best model: alternative-specific coefficients ([1, 2]) for connections and
# FFP membership, shared coefficients ('all_same') for everything else.
basic_specification = OrderedDict([
    ("connections", [1, 2]),
    ("FFP membership", [1, 2]),
    ("departure_hrs", 'all_same'),
    # heterogeneity terms
    ("performance/purpose", 'all_same'),
    ("affordability", 'all_same'),
    ("time", 'all_same'),
    ("unit-price", 'all_same'),
])
basic_names = OrderedDict([
    ("connections", ['Connection A1', 'Connection A2']),
    ("FFP membership", ['Membership A1', 'Membership A2']),
    ("departure_hrs", 'Departure Time'),
    # heterogeneity terms
    ("performance/purpose", 'performance'),
    ("affordability", 'affordability'),
    ("time", 'time'),
    ("unit-price", 'unit-price'),
])

# Build and estimate the best model (nine coefficients, all started at zero),
# then display its summary as the cell output.
best = pl.create_choice_model(
    data=data_long,
    alt_id_col=custom_alt_id,
    obs_id_col=obs_id_column,
    choice_col=choice_column,
    specification=basic_specification,
    model_type="MNL",
    names=basic_names,
)
best.fit_mle(init_vals=np.zeros(9))

best.get_statsmodels_summary()

Log-likelihood at zero: -4,868.6658
Initial Log-likelihood: -4,868.6658
Estimation Time for Point Estimation: 0.06 seconds.
Final log-likelihood: -3,756.1497


0,1,2,3
Dep. Variable:,choice,No. Observations:,7024.0
Model:,Multinomial Logit Model,Df Residuals:,7015.0
Method:,MLE,Df Model:,9.0
Date:,"Thu, 08 Feb 2018",Pseudo R-squ.:,0.229
Time:,14:44:30,Pseudo R-bar-squ.:,0.227
AIC:,7530.299,Log-Likelihood:,-3756.15
BIC:,7592.013,LL-Null:,-4868.666

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Connection A1,-0.6651,0.059,-11.333,0.000,-0.780,-0.550
Connection A2,-0.6755,0.059,-11.508,0.000,-0.791,-0.560
Membership A1,0.4876,0.044,11.021,0.000,0.401,0.574
Membership A2,0.4134,0.044,9.430,0.000,0.328,0.499
Departure Time,0.0283,0.009,3.119,0.002,0.011,0.046
performance,0.2335,0.030,7.673,0.000,0.174,0.293
affordability,0.0023,0.001,2.777,0.005,0.001,0.004
time,-0.0527,0.008,-7.016,0.000,-0.067,-0.038
unit-price,-0.0085,0.000,-25.682,0.000,-0.009,-0.008


In [1]:
#best.get_statsmodels_summary().as_latex()