## Note: mode1-taxi, mode2-FHV, mode3-shared FHV, mode4-PT, mode5-walking

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import time
import timeit
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
import re

## Data

In [30]:
acs = pd.read_csv('final_acs_transportation_choice.csv')
acs.head()

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),P(mode6)
0,3.0,0.115434,17.843262,7.334361,3885.402712,478.622467,4316.681764
1,4.0,42.851015,140.89118,84.609811,6937.780033,2631.714648,779.153313
2,5.0,0.081377,13.158607,2.120444,1860.706347,40.495673,6312.437553
3,6.0,0.109017,7.637848,1.878344,1974.772111,198.118892,3086.483787
4,7.0,25.522468,142.614028,37.582529,28436.765508,2680.39788,5365.117587


In [39]:
mode_data = pd.read_csv('final_allMode_with_2017wage_cleaned_update.csv', index_col=0)
mode_data.head(10)

Unnamed: 0,DOlocationID,ODpair,PUlocationID,duration,mode,nest,price,2500,7500,12500,17500,22500,30000,42500,62500,87500,125000,225000
0,4,3-4,3,39.695,2,1,64.0,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
1,4,3-4,3,45.216667,3,1,61.5,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
2,4,3-4,3,83.0,4,2,5.5,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
3,4,3-4,3,225.933333,5,3,0.0,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
4,4,3-4,3,39.695,6,4,9.424,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
5,7,3-7,3,47.880952,1,1,43.157143,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029
6,7,3-7,3,30.521739,2,1,47.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029
7,7,3-7,3,37.159009,3,1,45.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029
8,7,3-7,3,69.0,4,2,5.5,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029
9,7,3-7,3,231.366667,5,3,0.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029


### Adding parking cost to private vehicles

In [40]:
mode_data[mode_data['mode']==6].head(10)

Unnamed: 0,DOlocationID,ODpair,PUlocationID,duration,mode,nest,price,2500,7500,12500,17500,22500,30000,42500,62500,87500,125000,225000
4,4,3-4,3,39.695,6,4,9.424,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
10,7,3-7,3,39.201346,6,4,7.7216,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029
15,9,3-9,3,19.547126,6,4,7.513497,0.083606,0.120072,0.216151,0.09407,0.138252,0.197336,0.223761,0.312863,0.195645,0.246908,0.171335
20,10,3-10,3,28.5,6,4,11.6614,0.12541,0.180108,0.324226,0.141106,0.207378,0.296005,0.335641,0.469295,0.293468,0.370363,0.257002
24,11,3-11,3,98.925,6,4,17.531072,0.167213,0.240144,0.432301,0.188141,0.276504,0.394673,0.447521,0.625727,0.391291,0.493817,0.34267
29,13,3-13,3,54.867754,6,4,12.175743,0.543441,0.780467,1.404978,0.611458,0.898637,1.282687,1.454445,2.033612,1.271694,1.604904,1.113677
34,14,3-14,3,58.197917,6,4,16.898312,0.459835,0.660395,1.188828,0.517387,0.760385,1.08535,1.230684,1.720748,1.076049,1.357996,0.942342
39,15,3-15,3,20.229487,6,4,5.782094,0.836064,1.200719,2.161505,0.940704,1.382518,1.973364,2.237607,3.128633,1.956453,2.469084,1.71335
44,16,3-16,3,19.294928,6,4,7.154699,0.668851,0.960575,1.729204,0.752563,1.106014,1.578691,1.790086,2.502907,1.565162,1.975267,1.37068
49,17,3-17,3,50.945,6,4,11.9776,0.877867,1.260755,2.26958,0.987739,1.451644,2.072033,2.349487,3.285065,2.054275,2.592538,1.799017


In [41]:
mode_data.shape[0]/5

59726.4

In [42]:
selected_zones_2 = [140,141, 237, 236, 263, 262, 43, 238, 239, 143,142,12, \
                  88, 261, 13, 87, 209, 231, 45, 232, 148, 144, 211, 125, 158,\
                  249, 114, 113, 79, 4, 224, 107, 234, 90, 68, 246, 186, 164, 100,\
                  170, 137, 233, 162, 161, 230, 48, 50, 163, 229]

In [43]:
selected_zones_3 = [12, 88, 261, 13, 87, 209, 231, 45, 232, 148, 144, 211, 125, 158, 249, 114, 113, 79, 4, 
                  224, 107, 234, 90, 68, 246, 186, 164, 100, 170, 137, 233, 162, 161, 230, 48, 50, 163, 229]

In [44]:
mode_data.loc[(mode_data['mode'] == 6) & (mode_data.DOlocationID.isin(selected_zones_3)),'price'] += 20

In [45]:
mode_data.loc[(mode_data['mode'] == 6) & (mode_data.DOlocationID.isin(selected_zones_2)) & ~(mode_data.DOlocationID.isin(selected_zones_3)),'price'] += 15

In [46]:
mode_data.loc[(mode_data['mode'] == 6) & ~(mode_data.DOlocationID.isin(selected_zones_2)) & ~(mode_data.DOlocationID.isin(selected_zones_3)),'price'] += 5

In [47]:
mode_data[mode_data['mode']==6].head(10)

Unnamed: 0,DOlocationID,ODpair,PUlocationID,duration,mode,nest,price,2500,7500,12500,17500,22500,30000,42500,62500,87500,125000,225000
4,4,3-4,3,39.695,6,4,29.424,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
10,7,3-7,3,39.201346,6,4,12.7216,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029
15,9,3-9,3,19.547126,6,4,12.513497,0.083606,0.120072,0.216151,0.09407,0.138252,0.197336,0.223761,0.312863,0.195645,0.246908,0.171335
20,10,3-10,3,28.5,6,4,16.6614,0.12541,0.180108,0.324226,0.141106,0.207378,0.296005,0.335641,0.469295,0.293468,0.370363,0.257002
24,11,3-11,3,98.925,6,4,22.531072,0.167213,0.240144,0.432301,0.188141,0.276504,0.394673,0.447521,0.625727,0.391291,0.493817,0.34267
29,13,3-13,3,54.867754,6,4,32.175743,0.543441,0.780467,1.404978,0.611458,0.898637,1.282687,1.454445,2.033612,1.271694,1.604904,1.113677
34,14,3-14,3,58.197917,6,4,21.898312,0.459835,0.660395,1.188828,0.517387,0.760385,1.08535,1.230684,1.720748,1.076049,1.357996,0.942342
39,15,3-15,3,20.229487,6,4,10.782094,0.836064,1.200719,2.161505,0.940704,1.382518,1.973364,2.237607,3.128633,1.956453,2.469084,1.71335
44,16,3-16,3,19.294928,6,4,12.154699,0.668851,0.960575,1.729204,0.752563,1.106014,1.578691,1.790086,2.502907,1.565162,1.975267,1.37068
49,17,3-17,3,50.945,6,4,16.9776,0.877867,1.260755,2.26958,0.987739,1.451644,2.072033,2.349487,3.285065,2.054275,2.592538,1.799017


In [48]:
mode_data.to_csv('final_allMode_with_2017wage_cleaned_update.csv')

## Modeling

### Nested Logit Model

In [6]:
def denoSum(T, nestList, wage, Lambda, dataset):
    '''
    Calculate the denomenator for P(y=Nk)
    T: the dict that contains Tk for each Nest, in our case T={1:T1, 2:T2, 3:T3}; T1, T2, T3 defined by us 
    TotK: the total number of nest this model has, in our case TotK = 3
    '''
    deno = 0
    dictIVk = {}
    dictVj = {}
    for Nk in nestList: #k is the k Nest, in our case k=1,2,3
        Tk = T[Nk] #get the tao for nest k 
        subsetNk = dataset[dataset['nest'] == Nk]
        modes = list(subsetNk['mode'].unique()) #what modes contained in this nest k
    #     print(modes)
        sumIV = 0
        for j in (modes):   
            subset = dataset[dataset['mode'] == j]
            vj = Lambda * (float(int(wage)/124800) * float(subset['duration']) + float(subset['price']))
#             vj = utility(j, wage, Lambda, subsetNk)
            dictVj[j] = -vj
            sumIV += np.exp((1/Tk)*vj)  
    #     print(sumIV,Nk)
        IVk = (sumIV)
#         dictVj, IVk = InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset)
        denok = np.exp(Tk*np.log(IVk))
        deno += denok
        dictIVk[Nk] = IVk
    return dictVj, dictIVk, deno


# def InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset):
#     '''
#     Get the inclusive value for nest K
#     Nk:nest k
#     T: the dict that contains Tao(dissmilarity parameter) for each Nest. for example, Tk meeas Tao for nest K
#     '''
#     subsetNk = dataset[dataset['nest'] == Nk]
#     modes = list(subsetNk['mode'].unique()) #what modes contained in this nest k
# #     print(modes)
#     sumIV = 0
#     for j in (modes):   
#         vj = utility(j, wage, Lambda, subsetNk)
#         dictVj[j] = vj
#         sumIV += np.exp((1/Tk)*vj)  
# #     print(sumIV,Nk)
#     IVk = (sumIV)
# #     print(IVk,Nk)
#     return dictVj, IVk
    

# def utility(mode, wage, Lambda, dataset):
#     '''
#     Get the utility for mode j under specific OD pair
    
#     Arguments:
#         mode:
#         wage:
#         Lambda: parameter that trade-off different transportation mode
#         dataset:
        
#     Returns: 
#         -vj: Negative utility for the given 
#     '''
#     subset = dataset[dataset['mode'] == mode]
#     vj = Lambda * (float(int(wage)/124800) * float(subset['duration']) + float(subset['price'])) #525600: convert wage scale 'year' to 'minutes'
#     return -vj #assign negative utility?


def probability (j, Nk, T, dictVj, dictIVk, deno):  
    '''
    Calculate the probability for the specific mode j and Nest Nk
    '''
    Tk = T[Nk]
    IVk = dictIVk[Nk]
    vj = dictVj[j]
    pjk = np.exp((1/Tk)*vj)/(IVk) #pjk: P(y=j, y belong to Nk)
    pk = np.exp(Tk*np.log(IVk))/deno #pk: P(y belong to Nk)
    pj = pjk*pk #pj: P(y=j)
    return pj

### Apply to our case

In [7]:
def apply_model(T, nestList, wageList, Lambda, dataAll):
    '''
    apply to our case
    '''
    start = timeit.default_timer()

    ODpair_list = list(dataAll.index.unique())
#     print(len(ODpair_list))
    p = []
    for j in tqdm(ODpair_list,position=0): #identify OD pair  
        dataOD = dataAll[dataAll.index==j] 
        modeList = list(dataOD['mode'])
        nestList = list(dataOD['nest'])
        pop_OD = [] #store the results under each OD pair
        for wage in (wageList):            
            dictVj, dictIVk, deno = denoSum(T, set(nestList), wage, Lambda, dataOD)
            pop_mode = [] #store the results under each OD pair and each wage
            for i in (range(1,6)):
                if i in modeList: #not all modes appear in every OD pair
                    pop = dataOD[wage].mean() * probability(i, nestList[modeList.index(i)], T, dictVj, dictIVk, deno)
                    if np.isnan(pop) == True: #if predicted population is nan, replace it as 0, means no people choose
                        pop = 0
                
                    pop_mode.append(pop)
                else: 
                    pop_mode.append(0)
            pop_OD.append(pop_mode)
        pop_OD_sum = [sum(x) for x in zip(*pop_OD)] #sum the weighted population under each wage scenario
        p.append(pop_OD_sum)

    df = pd.DataFrame(p, columns=['P(mode1)', 'P(mode2)', 'P(mode3)', 'P(mode4)', 'P(mode5)']
                      , index=ODpair_list)
    
    stop = timeit.default_timer()
    timeslot = stop - start
    return df, timeslot

def compare_with_ground_truth(predictdf, truedf):
    '''
    compare our predicted transportation choice with ground truth
    
    The header of the datafrme after merge (named 'data_compare') should be like:
    taxi_zone | P(mode1)_x | P(mode2)_x | P(mode3)_x | P(mode4)_x | P(mode5)_x | P(mode1)_y | P(mode2)_y | P(mode3)_y | P(mode4)_y | P(mode5)_y
    '''
    import numpy as np
    # makesure predictdf and truedf have the same formats
    for col in predictdf.columns:
        predictdf[col] = predictdf[col].astype(float) 
    predictdf = predictdf.fillna(0)
    predictdf = predictdf.replace([np.inf, -np.inf], np.nan)
    predictdf = predictdf.dropna()
    predictdf['taxi_zone'] = predictdf.index.map(lambda x: x.split('-')[0]) #get origin taxi zone from each OD pair
    predictdf = predictdf.groupby('taxi_zone').sum().reset_index() #group the popuation by taxi zone
    predictdf['taxi_zone'] = predictdf['taxi_zone'].astype(int)
    truedf['taxi_zone'] = truedf['taxi_zone'].astype(int)
    
    data_compare = pd.merge(predictdf, truedf, left_on='taxi_zone', right_on = 'taxi_zone', how = 'left')
    data_compare = data_compare.dropna() 
    
    rmse = 0
    for i in range(1,6):
        rmsei = np.sqrt(sum((data_compare[data_compare.columns[i]] - data_compare[data_compare.columns[i+5]])**2))
    rmse += rmsei
    return data_compare, rmse

In [8]:
#choose ODpair 3-1 to test the algrithm whether bug-free:
T1 = 10
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 2 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']
testdf = mode_data[mode_data.index.isin(list(mode_data.index[:10]))] #test OD pair 3-1,3-2,3-3,3-4

predict_choice_test, timeslot_test = apply_model(T, nestList, wagelist, Lambda, testdf)
results_test,rmse_test = compare_with_ground_truth(predict_choice_test, acs)
print('The time used to run the code:', timeslot_test)
print('The rmse of this model is:', rmse_test)
print()
print('The predict transportation choice is:')
predict_choice_test

100%|██████████| 5/5 [00:00<00:00, 20.73it/s]

The time used to run the code: 0.22991685901070014
The rmse of this model is: 949.3244680851063

The predict transportation choice is:





Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,2.407918e-12,3.676785e-12,6.341576000000001e-66,1.237151e-63
3-7,2.005013e-08,1.195894e-08,1.613853e-08,2.8162e-51,2.521473e-49
3-9,0.0,2.097967e-08,2.215934e-08,3.656066e-47,4.546049e-46
3-10,0.0,1.392308e-13,1.602258e-13,4.4932010000000004e-76,5.865468000000001e-73
3-11,0.0,1.6660310000000002e-17,0.0,6.8995e-89,9.013368e-88


In [35]:
np.logspace(-5,5,50)

array([1.00000000e-05, 1.59985872e-05, 2.55954792e-05, 4.09491506e-05,
       6.55128557e-05, 1.04811313e-04, 1.67683294e-04, 2.68269580e-04,
       4.29193426e-04, 6.86648845e-04, 1.09854114e-03, 1.75751062e-03,
       2.81176870e-03, 4.49843267e-03, 7.19685673e-03, 1.15139540e-02,
       1.84206997e-02, 2.94705170e-02, 4.71486636e-02, 7.54312006e-02,
       1.20679264e-01, 1.93069773e-01, 3.08884360e-01, 4.94171336e-01,
       7.90604321e-01, 1.26485522e+00, 2.02358965e+00, 3.23745754e+00,
       5.17947468e+00, 8.28642773e+00, 1.32571137e+01, 2.12095089e+01,
       3.39322177e+01, 5.42867544e+01, 8.68511374e+01, 1.38949549e+02,
       2.22299648e+02, 3.55648031e+02, 5.68986603e+02, 9.10298178e+02,
       1.45634848e+03, 2.32995181e+03, 3.72759372e+03, 5.96362332e+03,
       9.54095476e+03, 1.52641797e+04, 2.44205309e+04, 3.90693994e+04,
       6.25055193e+04, 1.00000000e+05])

In [33]:
#Run the model for the whole dataset
T1 = 0.05 #should set grid search for T1 
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 0.45 #should set grid search for Lambda 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data[:7000])
results,rmse = compare_with_ground_truth(mode_data[:7000], acs[:6])

100%|██████████| 1816/1816 [01:09<00:00, 28.01it/s]


In [9]:
rmse_best = np.inf

In [10]:
rmse_all = []

In [None]:
for Tau in np.logspace(-2,2,20):
    for lamb in np.logspace(-2,2,20):
        T1 = Tau #should set grid search for T1 
        T2 = 1
        T3 = 1
        T = {1:T1, 2:T2, 3:T3} #Tao for each nest
        Lambda = lamb #should set grid search for Lambda 
        nestList = [1, 2, 3]
        wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

        predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data[:6616])
        results,rmse = compare_with_ground_truth(predict_transportation_choice, acs[:12])
        rmse_all.append(rmse)
        if rmse < rmse_best:
            predictions_best = predict_transportation_choice.copy()
            T1_best = T1
            Lambda_best = Lambda
            rmse_best = rmse
            results_best = results.copy()
        

100%|██████████| 2782/2782 [00:57<00:00, 53.28it/s]
100%|██████████| 2782/2782 [00:53<00:00, 56.30it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.20it/s]
100%|██████████| 2782/2782 [00:55<00:00, 50.21it/s]
100%|██████████| 2782/2782 [00:59<00:00, 46.69it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.40it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.47it/s]
100%|██████████| 2782/2782 [00:57<00:00, 48.81it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.83it/s]
100%|██████████| 2782/2782 [00:56<00:00, 53.66it/s]
100%|██████████| 2782/2782 [00:57<00:00, 53.04it/s]
100%|██████████| 2782/2782 [00:59<00:00, 46.88it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.01it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.84it/s]
100%|██████████| 2782/2782 [00:58<00:00, 47.91it/s]
100%|██████████| 2782/2782 [00:57<00:00, 48.75it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.90it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.85it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.94it/s]
100%|███████

In [34]:
print('The time used to run the code:', timeslot)
print('The rmse of this model is:', rmse_best)
print()
print('The predict transportation choice is:')
predictions_best.head()

The time used to run the code: 52.08275239600334
The rmse of this model is: 6436.741757990545

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,1.621239e-06,1.957537e-06,0.0,0.0
3-7,0.000292,0.0002437485,0.000274276,0.0,0.0
3-9,0.0,0.0001235899,0.0001139474,0.0,0.0
3-10,0.0,2.869206e-07,2.579036e-07,0.0,0.0
3-11,0.0,3.350715e-09,0.0,0.0,0.0


In [35]:
T1_best

0.1

In [36]:
Lambda_best

0.01

In [30]:
print('The time used to run the code:', timeslot)
print('The rmse of this model is:', rmse_best)
print()
print('The predict transportation choice is:')
predictions_best.head()

The time used to run the code: 57.63289005699335
The rmse of this model is: 6436.741757990545

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,1.621239e-06,1.957537e-06,0.0,0.0
3-7,0.000292,0.0002437485,0.000274276,0.0,0.0
3-9,0.0,0.0001235899,0.0001139474,0.0,0.0
3-10,0.0,2.869206e-07,2.579036e-07,0.0,0.0
3-11,0.0,3.350715e-09,0.0,0.0,0.0


In [31]:
T1_best

0.1

In [32]:
Lambda_best

0.01

In [95]:
rmse_best_new = np.inf

In [96]:
rmse_all_new = []

In [97]:
acs[:7]

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
0,3.0,0.526074,81.318503,33.425463,7948.271384,915.458576
1,4.0,32.88845,108.134955,64.938615,7212.769464,3373.268515
2,5.0,0.245777,39.742128,6.404244,8270.173881,114.433969
3,6.0,0.453222,31.753306,7.808959,4919.798684,480.18583
4,7.0,27.457307,153.425489,40.431632,34826.940798,2987.744775
5,9.0,0.051277,8.292491,2.34583,6118.01365,397.296751
6,10.0,8.828095,113.555345,48.233656,10257.551077,647.831828


  8%|▊         | 140/1742 [00:23<01:02, 25.70it/s]

In [100]:
for Tau in np.logspace(-1,1,10):
    for lamb in np.logspace(-2,2,10):
        T1 = Tau #should set grid search for T1 
        T2 = 1
        T3 = 1
        T = {1:T1, 2:T2, 3:T3} #Tao for each nest
        Lambda = lamb #should set grid search for Lambda 
        nestList = [1, 2, 3]
        wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

        predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data[:7242])
        results,rmse = compare_with_ground_truth(predict_transportation_choice, acs[:7])
        rmse_all_new.append(rmse)
        if rmse < rmse_best_new:
            predictions_best_new = predict_transportation_choice.copy()
            T1_best_new = T1
            Lambda_best_new = Lambda
            rmse_best_new = rmse
            results_best_new = results.copy()
        

100%|██████████| 1742/1742 [01:07<00:00, 25.66it/s]
100%|██████████| 1742/1742 [01:04<00:00, 27.13it/s]
100%|██████████| 1742/1742 [29:50<00:00,  1.03s/it]
100%|██████████| 1742/1742 [01:05<00:00, 26.66it/s]
100%|██████████| 1742/1742 [01:08<00:00, 25.27it/s]
100%|██████████| 1742/1742 [01:08<00:00, 24.75it/s]
100%|██████████| 1742/1742 [01:05<00:00, 26.60it/s]
100%|██████████| 1742/1742 [01:06<00:00, 24.31it/s]
100%|██████████| 1742/1742 [01:08<00:00, 25.57it/s]
100%|██████████| 1742/1742 [01:07<00:00, 25.74it/s]
100%|██████████| 1742/1742 [01:03<00:00, 27.23it/s]
100%|██████████| 1742/1742 [01:04<00:00, 27.05it/s]
100%|██████████| 1742/1742 [07:42<00:00,  3.77it/s]
100%|██████████| 1742/1742 [01:11<00:00, 24.45it/s]
100%|██████████| 1742/1742 [01:12<00:00, 22.70it/s]
100%|██████████| 1742/1742 [01:09<00:00, 25.05it/s]
100%|██████████| 1742/1742 [01:04<00:00, 27.12it/s]
100%|██████████| 1742/1742 [01:03<00:00, 27.23it/s]
100%|██████████| 1742/1742 [01:09<00:00, 21.40it/s]
100%|███████

In [101]:
print('The time used to run the code:', timeslot)
print('The rmse of this model is:', rmse_best_new)
print()
print('The predict transportation choice is:')
predictions_best.head()

The time used to run the code: 66.04060706000018
The rmse of this model is: 2303.899468476459

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-3,1.600013e-09,0.0,0.0,0.0,0.0
3-4,0.0,3.026883e-43,2.046884e-42,0.024686,0.034683
3-7,8.317963e-30,4.598054e-31,2.096455e-30,0.528232,0.733263
3-9,0.0,5.444108e-27,1.1124769999999999e-26,0.040346,0.062675
3-10,0.0,3.446623e-47,2.934572e-46,0.005241,0.009984


In [102]:
T1_best_new

0.1

In [103]:
Lambda_best_new

0.0774263682681127

In [72]:
len(rmse_all)

385

In [74]:
np.min(np.array(rmse_all[250:]))

2114.54804310397

In [76]:
np.array(rmse_all[250:])

array([4635.28588546, 4634.99639662, 4634.03263556, 4630.8962934 ,
       4621.5652708 , 4604.45675757, 4614.39449745, 4633.1622889 ,
       4640.64603161, 4641.65298494, 4641.67265529, 4641.67443939,
       4641.67448129, 4641.67448129, 4641.67448129, 4641.67448129,
       4641.67448129, 4641.67448129, 4641.67448129, 4671.62722016,
       4671.52995778, 4671.20382924, 4670.11556122, 4666.54593084,
       4655.61976274, 4632.3888258 , 4626.55107618, 4634.39941346,
       4640.6562189 , 4641.65298506, 4641.67265529, 4641.67443939,
       4641.67448129, 4641.67448129, 4641.67448129, 7469.69799175,
       6210.75683358, 4412.70750061, 2430.80433813, 2303.87454569,
       3649.85965585, 4395.93669405, 4605.23565116, 4638.50870755,
       4641.43137962, 7180.98421512, 5951.08264099, 4209.53804258,
       2347.93149025, 2362.03710135, 3679.7094174 , 4402.56493235,
       4606.07954601, 4638.56579518, 4641.43270592, 6685.69471522,
       5500.10241295, 3856.14086899, 2225.27216469, 2480.08980

In [69]:
rmse_all[216]

2102.631062881024

In [70]:
T1_best

0.5455594781168515

In [71]:
Lambda_best

0.04832930238571752

In [52]:
np.argmin(np.array(rmse_all))

216

In [65]:
rmse_all[216]

2102.631062881024

In [53]:
T1_best

0.5455594781168515

In [54]:
Lambda_best

0.04832930238571752

### Getting data ready for plotting

In [135]:
results_2 = pd.read_csv('../results/original_predict_transportation_choice.csv',index_col=0)

In [136]:
results_2.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-3,273.0,0.0,0.0,0.0,0.0
3-4,0.0,0.73925,0.565102,3.753164,0.942485
3-7,2.514212,5.736936,3.988082,19.19231,4.568461
3-9,0.0,0.645932,0.321317,0.904404,0.128346
3-10,0.0,0.8963,0.111315,1.759122,0.233263


In [138]:
results_2_selected = pd.read_csv('../results/results_2_selected.csv',index_col=0)

In [139]:
results_2_selected.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.644911,0.595489,3.811452,0.948148
3-13,0.0,0.287853,1.780154,8.977651,1.954342
3-43,0.0,0.616507,0.495885,6.599487,1.288121
3-45,0.0,0.269604,0.685894,7.561866,1.482636
3-48,11.719155,1.373324,1.487324,41.788931,8.631266


In [148]:
results_2.loc[results_2_selected.index] = results_2_selected.loc[results_2_selected.index]

In [149]:
results_2.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-3,273.0,0.0,0.0,0.0,0.0
3-4,0.0,0.644911,0.595489,3.811452,0.948148
3-7,2.514212,5.736936,3.988082,19.19231,4.568461
3-9,0.0,0.645932,0.321317,0.904404,0.128346
3-10,0.0,0.8963,0.111315,1.759122,0.233263


In [152]:
results_2.rename_axis('ODpair',inplace=True)
results_2.reset_index(inplace=True)

In [153]:
results_2['O'] = results_2['ODpair'].apply(lambda x: str(x).split('-')[0]).astype('int')
results_2['D'] = results_2['ODpair'].apply(lambda x: str(x).split('-')[1]).astype('int')

In [156]:
results_2.set_index('ODpair',inplace=True)

In [157]:
results_2.head()

Unnamed: 0_level_0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),O,D
ODpair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3-3,273.0,0.0,0.0,0.0,0.0,3,3
3-4,0.0,0.644911,0.595489,3.811452,0.948148,3,4
3-7,2.514212,5.736936,3.988082,19.19231,4.568461,3,7
3-9,0.0,0.645932,0.321317,0.904404,0.128346,3,9
3-10,0.0,0.8963,0.111315,1.759122,0.233263,3,10


In [158]:
results_2.shape

(56108, 7)

In [159]:
results_2.drop(results_2[results_2.O==results_2.D].index,inplace=True)

In [160]:
results_2.shape

(55868, 7)

In [161]:
results_2.to_csv('results_scenario_2.csv')

## Daily Impacts

In [412]:
results_scene_1 = pd.read_csv('../results/Final_results/results_scenario1_SRM.csv',index_col=0,usecols=range(6))
results_scene_1.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.514361,2.955623e-09,5.216414,0.269225
3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [413]:
results_scene_2 = pd.read_csv('../results/Final_results/results_scenario2 -$2.75.csv',index_col=0,usecols=range(6))
results_scene_2.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.5139874,7.175188e-07,5.216787,0.269225
3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [414]:
results_scene_3 = pd.read_csv('../results/Final_results/results_scenario3 -$10.csv',index_col=0,usecols=range(6))
results_scene_3.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.5097572,1.991519e-11,5.221018,0.269225
3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [415]:
wages = pd.read_csv('final_allMode_with_wage_cleaned_update.csv')
wages.head(5)

Unnamed: 0,ODpair,12500,125000,17500,22500,225000,2500,30000,42500,62500,7500,87500,DOlocationID,PUlocationID,duration,mode,nest,price
0,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,39.695,2,1,64.0
1,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,45.216667,3,1,61.5
2,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,83.0,4,2,5.5
3,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,225.933333,5,3,0.0
4,3-7,3.890709,4.44435,1.693267,2.488532,3.084029,1.504915,3.552056,4.027693,5.63154,2.161294,3.521615,7,3,47.880952,1,1,43.157143


In [416]:
# agg_scene_1 = results_scene_1.drop('destination',axis=1).groupby('origin').sum()
# agg_scene_1.head()
# agg_scene_1.columns = [name + '_predicted' for name in agg_scene_1.columns]
# acs = pd.read_csv('final_acs_transportation_choice.csv',index_col=0)
# acs.head()
# acs.columns = [name + '_actual' for name in acs.columns]
# scene_1_combined = agg_scene_1.merge(acs,how='left',left_index=True,right_index=True)
# scene_1_combined.head()

In [417]:
# Renaming for convenience in calculation
results_scene_1.columns = [re.sub(r'[^\d.]+','',col) for col in results_scene_1.columns]
results_scene_2.columns = [re.sub(r'[^\d.]+','',col) for col in results_scene_2.columns]
results_scene_3.columns = [re.sub(r'[^\d.]+','',col) for col in results_scene_3.columns]

In [418]:
results_scene_1.reset_index(inplace=True)
results_scene_2.reset_index(inplace=True)
results_scene_3.reset_index(inplace=True)
results_scene_1.rename_axis({'index':'ODpair'},axis=1,inplace=True)
results_scene_2.rename_axis({'index':'ODpair'},axis=1,inplace=True)
results_scene_3.rename_axis({'index':'ODpair'},axis=1,inplace=True)

In [419]:
results_scene_1.head()

Unnamed: 0,ODpair,1,2,3,4,5
0,3-4,0.0,0.514361,2.955623e-09,5.216414,0.269225
1,3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
2,3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3,3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
4,3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [420]:
# Getting the origin taxi zone
results_scene_1['origin'] = results_scene_1.ODpair.apply(lambda x: x.split('-')[0]).astype('int')
results_scene_2['origin'] = results_scene_2.ODpair.apply(lambda x: x.split('-')[0]).astype('int')
results_scene_3['origin'] = results_scene_3.ODpair.apply(lambda x: x.split('-')[0]).astype('int')

In [421]:
results_scene_1.head()

Unnamed: 0,ODpair,1,2,3,4,5,origin
0,3-4,0.0,0.514361,2.955623e-09,5.216414,0.269225,3
1,3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677,3
2,3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765,3
3,3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173,3
4,3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316,3


In [422]:
# Aggregating the number of people in each ODpair by the origin zone
agg_scene_1 = results_scene_1.groupby('origin').sum()
agg_scene_2 = results_scene_2.groupby('origin').sum()
agg_scene_3 = results_scene_3.groupby('origin').sum()

In [425]:
agg_scene_1.columns = agg_scene_1.columns.astype('int')
agg_scene_2.columns = agg_scene_2.columns.astype('int')
agg_scene_3.columns = agg_scene_3.columns.astype('int')

In [426]:
agg_scene_1.head()

Unnamed: 0_level_0,1,2,3,4,5
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,685.603794,977.251252,75.745589,6180.570657,786.828708
4,3702.898406,143.502105,18.156156,4343.766378,2408.676955
5,20.785981,2194.47761,332.31474,5113.791586,566.630083
6,469.347879,833.12334,458.550032,3303.577898,204.400852
7,11600.290406,1758.881372,56.419983,21768.173576,1504.234664


In [427]:
pickup_list = wages.PUlocationID.unique()

In [428]:
origin_time_and_money_avg = wages.groupby(['PUlocationID','mode'],as_index=False)['duration','price'].mean()

In [429]:
origin_time_and_money_avg.head()

Unnamed: 0,PUlocationID,mode,duration,price
0,3,1,38.302359,37.428045
1,3,2,37.718869,56.582192
2,3,3,41.066196,50.717949
3,3,4,81.756356,5.948623
4,3,5,268.553355,0.0


In [443]:
pivoted_time = origin_time_and_money_avg.reset_index().pivot(index='PUlocationID',columns='mode',values='duration')

In [444]:
commute_time = pivoted_time.multiply(agg_scene_1).sum().sum()

In [445]:
avg_commute_time = commute_time/agg_scene_1.sum().sum()
avg_commute_time

78.74569460772672

In [447]:
pivoted_money = origin_time_and_money_avg.reset_index().pivot(index='PUlocationID',columns='mode',values='price')

In [449]:
commute_cost = pivoted_money.multiply(agg_scene_1).sum().sum()

In [450]:
avg_commute_cost = commute_cost/agg_scene_1.sum().sum()
avg_commute_cost

13.736955111569769

In [283]:
agg_scene_1.shape

(256, 5)

In [181]:
len(pickup_list)

256

In [289]:
agg_scene_1.head()

Unnamed: 0_level_0,1,2,3,4,5
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
3,685.603794,977.251252,75.745589,6180.570657,786.828708
4,3702.898406,143.502105,18.156156,4343.766378,2408.676955
5,20.785981,2194.47761,332.31474,5113.791586,566.630083
6,469.347879,833.12334,458.550032,3303.577898,204.400852
7,11600.290406,1758.881372,56.419983,21768.173576,1504.234664


In [301]:
agg_scene_1.loc[3,'3']

75.7455894490215

In [311]:
(agg_scene_1.loc[3,'3']-agg_scene_2.loc[3,'3'])

-3.3565324350312977

In [315]:
test = (origin_time_and_money_avg.loc[(origin_time_and_money_avg['PUlocationID']==3)\
                                                       & (origin_time_and_money_avg['mode']==3)])
int(test.price)

50

### Scenario 1 VS 2

In [350]:
time_diff = 0
money_diff = 0
for zone in tqdm(pickup_list,position=0):
    for col in agg_scene_1.columns:
        diff = (agg_scene_2.loc[zone,col]-agg_scene_1.loc[zone,col])
        time_and_money = (origin_time_and_money_avg.loc[(origin_time_and_money_avg['PUlocationID']==zone)\
                                                       & (origin_time_and_money_avg['mode']==int(col))])
        time_diff+= diff * (0 if time_and_money.empty else time_and_money['duration'].values[0])
        money_diff+= diff * (0 if time_and_money.empty else time_and_money['price'].values[0])

100%|██████████| 256/256 [00:01<00:00, 235.76it/s]

In [351]:
time_diff

1667692.829832572

In [352]:
money_diff

-1286908.224275543

In [353]:
population = agg_scene_1.sum().sum()

In [354]:
time_diff/population

0.5787885396030046

In [355]:
money_diff/population

-0.446633647640205

In [369]:
agg_scene_1.sum()[4]

247082.53426691794

In [370]:
agg_scene_2.sum()[4]

249534.08844091842

In [371]:
agg_scene_3.sum()[4]

251654.04940363334

### Scenario 1 VS 3

In [356]:
time_diff = 0
money_diff = 0
for zone in tqdm(pickup_list,position=0):
    for col in agg_scene_1.columns:
        diff = (agg_scene_3.loc[zone,col]-agg_scene_1.loc[zone,col])
        time_and_money = (origin_time_and_money_avg.loc[(origin_time_and_money_avg['PUlocationID']==zone)\
                                                       & (origin_time_and_money_avg['mode']==int(col))])
        time_diff+= diff * (0 if time_and_money.empty else time_and_money['duration'].values[0])
        money_diff+= diff * (0 if time_and_money.empty else time_and_money['price'].values[0])

100%|██████████| 256/256 [00:01<00:00, 207.93it/s]

In [357]:
time_diff

4214488.158604856

In [358]:
money_diff

-3453331.6291632275

In [359]:
population = agg_scene_2.sum().sum()

In [360]:
time_diff/population

1.4626779001849843

In [361]:
money_diff/population

-1.1985113413295143

### Scenario 2 VS 3

In [362]:
time_diff = 0
money_diff = 0
for zone in tqdm(pickup_list,position=0):
    for col in agg_scene_2.columns:
        diff = (agg_scene_3.loc[zone,col]-agg_scene_2.loc[zone,col])
        time_and_money = (origin_time_and_money_avg.loc[(origin_time_and_money_avg['PUlocationID']==zone)\
                                                       & (origin_time_and_money_avg['mode']==int(col))])
        time_diff+= diff * (0 if time_and_money.empty else time_and_money['duration'].values[0])
        money_diff+= diff * (0 if time_and_money.empty else time_and_money['price'].values[0])

100%|██████████| 256/256 [00:01<00:00, 230.74it/s]

In [363]:
time_diff

2546795.3287722874

In [364]:
money_diff

-2166423.404887686

In [365]:
population = agg_scene_2.sum().sum()

In [366]:
time_diff/population

0.883889360581981

In [367]:
money_diff/population

-0.7518776936893098

#### Use below code only if you want exact values for the difference in time/money for commuters as it takes a very long time to run (>3 hours). It can also be used to get the difference for each taxi zone (after some minor changes).

In [148]:
# time_total = 0
# money_total = 0
# for pair in tqdm(ODpair_list,position=0):
#     time_pair = 0
#     money_pair = 0
#     for col in results_scene_1.columns:
#         time_pair+=(results_scene_1.loc[results_scene_1.index==pair,col]-results_scene_2.loc[results_scene_2.index==pair,col])\
#         *wages.loc[(wages['ODpair']==pair) & wages['mode']==int(col),'duration']
#         money_pair+=(results_scene_1.loc[results_scene_1.index==pair,col]-results_scene_2.loc[results_scene_2.index==pair,col])\
#         *wages.loc[(wages['ODpair']==pair) & wages['mode']==int(col),'price']
#     time_total+=time_pair
#     money_total+=money_pair

array(['3-4', '3-7', '3-9', ..., '263-260', '263-261', '263-262'],
      dtype=object)

In [5]:
scene_1 = pd.read_csv('../results/scenario1_origin_with_shp.csv',index_col=0)
scene_1.head()

Unnamed: 0,zone,geometry,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),Top_choice_mode,Nested_percet
3,Allerton/Pelham Gardens,POLYGON ((-73.84792614099985 40.87134223399991...,919.220815,1032.030446,856.827889,4532.134189,1365.786661,4,0.322545
4,Alphabet City,POLYGON ((-73.97177410965318 40.72582128133706...,2478.643144,1244.857983,1137.066639,3614.908023,2141.524211,4,0.45781
5,Arden Heights,"POLYGON ((-74.17421738099989 40.5625680859999,...",49.629786,1716.445501,1186.672071,4558.662219,716.590423,4,0.358866
6,Arrochar/Fort Wadsworth,POLYGON ((-74.06367318899999 40.60219816599994...,310.881579,1042.891135,902.121017,2651.376865,361.729405,4,0.428145
7,Astoria,POLYGON ((-73.90413637799996 40.76752031699986...,10889.777667,4194.086863,3496.564102,15648.80536,2458.766008,4,0.506444


In [15]:
test_1 = pd.read_csv('results_scenario17_7_1.csv',index_col=0)