## Note: mode1-taxi, mode2-FHV, mode3-shared FHV, mode4-PT, mode5-walking

In [88]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import time
import timeit
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV
import re

## Data

In [2]:
acs = pd.read_csv('final_acs_transportation_choice.csv')
acs.head()

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
0,3.0,0.228957,35.391246,14.54735,7706.507979,949.324468
1,4.0,46.244797,152.049702,91.310873,7487.249289,2840.14534
2,5.0,0.349401,56.498123,9.104389,7989.174863,173.873224
3,6.0,0.263186,18.439186,4.534672,4767.467108,478.295847
4,7.0,29.894066,167.04157,44.019826,33307.536619,3139.50792


In [4]:
mode_data = pd.read_csv('final_allMode_with_wage_cleaned.csv', index_col=0)
mode_data.head(10)

Unnamed: 0_level_0,PUlocationID,DOlocationID,duration,price,2500,7500,12500,17500,22500,30000,42500,62500,87500,125000,225000,mode,nest
ODpair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3-4,3,4,39.695,64.0,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005,2,1
3-4,3,4,45.216667,61.5,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005,3,1
3-7,3,7,47.880952,43.157143,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029,1,1
3-7,3,7,30.521739,47.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029,2,1
3-7,3,7,37.159009,45.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029,3,1
3-9,3,9,19.547126,39.0,0.083606,0.120072,0.216151,0.09407,0.138252,0.197336,0.223761,0.312863,0.195645,0.246908,0.171335,2,1
3-9,3,9,31.3,38.0,0.083606,0.120072,0.216151,0.09407,0.138252,0.197336,0.223761,0.312863,0.195645,0.246908,0.171335,3,1
3-10,3,10,28.5,69.5,0.12541,0.180108,0.324226,0.141106,0.207378,0.296005,0.335641,0.469295,0.293468,0.370363,0.257002,2,1
3-10,3,10,69.55,66.5,0.12541,0.180108,0.324226,0.141106,0.207378,0.296005,0.335641,0.469295,0.293468,0.370363,0.257002,3,1
3-11,3,11,98.925,91.0,0.167213,0.240144,0.432301,0.188141,0.276504,0.394673,0.447521,0.625727,0.391291,0.493817,0.34267,2,1


In [5]:
mode_data.shape[0]/5

48403.2

## Modeling

### Nested Logit Model

In [6]:
def denoSum(T, nestList, wage, Lambda, dataset):
    '''
    Calculate the denomenator for P(y=Nk)
    T: the dict that contains Tk for each Nest, in our case T={1:T1, 2:T2, 3:T3}; T1, T2, T3 defined by us 
    TotK: the total number of nest this model has, in our case TotK = 3
    '''
    deno = 0
    dictIVk = {}
    dictVj = {}
    for Nk in nestList: #k is the k Nest, in our case k=1,2,3
        Tk = T[Nk] #get the tao for nest k 
        subsetNk = dataset[dataset['nest'] == Nk]
        modes = list(subsetNk['mode'].unique()) #what modes contained in this nest k
    #     print(modes)
        sumIV = 0
        for j in (modes):   
            subset = dataset[dataset['mode'] == j]
            vj = Lambda * (float(int(wage)/124800) * float(subset['duration']) + float(subset['price']))
#             vj = utility(j, wage, Lambda, subsetNk)
            dictVj[j] = -vj
            sumIV += np.exp((1/Tk)*vj)  
    #     print(sumIV,Nk)
        IVk = (sumIV)
#         dictVj, IVk = InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset)
        denok = np.exp(Tk*np.log(IVk))
        deno += denok
        dictIVk[Nk] = IVk
    return dictVj, dictIVk, deno


# def InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset):
#     '''
#     Get the inclusive value for nest K
#     Nk:nest k
#     T: the dict that contains Tao(dissmilarity parameter) for each Nest. for example, Tk meeas Tao for nest K
#     '''
#     subsetNk = dataset[dataset['nest'] == Nk]
#     modes = list(subsetNk['mode'].unique()) #what modes contained in this nest k
# #     print(modes)
#     sumIV = 0
#     for j in (modes):   
#         vj = utility(j, wage, Lambda, subsetNk)
#         dictVj[j] = vj
#         sumIV += np.exp((1/Tk)*vj)  
# #     print(sumIV,Nk)
#     IVk = (sumIV)
# #     print(IVk,Nk)
#     return dictVj, IVk
    

# def utility(mode, wage, Lambda, dataset):
#     '''
#     Get the utility for mode j under specific OD pair
    
#     Arguments:
#         mode:
#         wage:
#         Lambda: parameter that trade-off different transportation mode
#         dataset:
        
#     Returns: 
#         -vj: Negative utility for the given 
#     '''
#     subset = dataset[dataset['mode'] == mode]
#     vj = Lambda * (float(int(wage)/124800) * float(subset['duration']) + float(subset['price'])) #525600: convert wage scale 'year' to 'minutes'
#     return -vj #assign negative utility?


def probability (j, Nk, T, dictVj, dictIVk, deno):  
    '''
    Calculate the probability for the specific mode j and Nest Nk
    '''
    Tk = T[Nk]
    IVk = dictIVk[Nk]
    vj = dictVj[j]
    pjk = np.exp((1/Tk)*vj)/(IVk) #pjk: P(y=j, y belong to Nk)
    pk = np.exp(Tk*np.log(IVk))/deno #pk: P(y belong to Nk)
    pj = pjk*pk #pj: P(y=j)
    return pj

### Apply to our case

In [7]:
def apply_model(T, nestList, wageList, Lambda, dataAll):
    '''
    apply to our case
    '''
    start = timeit.default_timer()

    ODpair_list = list(dataAll.index.unique())
#     print(len(ODpair_list))
    p = []
    for j in tqdm(ODpair_list,position=0): #identify OD pair  
        dataOD = dataAll[dataAll.index==j] 
        modeList = list(dataOD['mode'])
        nestList = list(dataOD['nest'])
        pop_OD = [] #store the results under each OD pair
        for wage in (wageList):            
            dictVj, dictIVk, deno = denoSum(T, set(nestList), wage, Lambda, dataOD)
            pop_mode = [] #store the results under each OD pair and each wage
            for i in (range(1,6)):
                if i in modeList: #not all modes appear in every OD pair
                    pop = dataOD[wage].mean() * probability(i, nestList[modeList.index(i)], T, dictVj, dictIVk, deno)
                    if np.isnan(pop) == True: #if predicted population is nan, replace it as 0, means no people choose
                        pop = 0
                
                    pop_mode.append(pop)
                else: 
                    pop_mode.append(0)
            pop_OD.append(pop_mode)
        pop_OD_sum = [sum(x) for x in zip(*pop_OD)] #sum the weighted population under each wage scenario
        p.append(pop_OD_sum)

    df = pd.DataFrame(p, columns=['P(mode1)', 'P(mode2)', 'P(mode3)', 'P(mode4)', 'P(mode5)']
                      , index=ODpair_list)
    
    stop = timeit.default_timer()
    timeslot = stop - start
    return df, timeslot

def compare_with_ground_truth(predictdf, truedf):
    '''
    compare our predicted transportation choice with ground truth
    
    The header of the datafrme after merge (named 'data_compare') should be like:
    taxi_zone | P(mode1)_x | P(mode2)_x | P(mode3)_x | P(mode4)_x | P(mode5)_x | P(mode1)_y | P(mode2)_y | P(mode3)_y | P(mode4)_y | P(mode5)_y
    '''
    import numpy as np
    # makesure predictdf and truedf have the same formats
    for col in predictdf.columns:
        predictdf[col] = predictdf[col].astype(float) 
    predictdf = predictdf.fillna(0)
    predictdf = predictdf.replace([np.inf, -np.inf], np.nan)
    predictdf = predictdf.dropna()
    predictdf['taxi_zone'] = predictdf.index.map(lambda x: x.split('-')[0]) #get origin taxi zone from each OD pair
    predictdf = predictdf.groupby('taxi_zone').sum().reset_index() #group the popuation by taxi zone
    predictdf['taxi_zone'] = predictdf['taxi_zone'].astype(int)
    truedf['taxi_zone'] = truedf['taxi_zone'].astype(int)
    
    data_compare = pd.merge(predictdf, truedf, left_on='taxi_zone', right_on = 'taxi_zone', how = 'left')
    data_compare = data_compare.dropna() 
    
    rmse = 0
    for i in range(1,6):
        rmsei = np.sqrt(sum((data_compare[data_compare.columns[i]] - data_compare[data_compare.columns[i+5]])**2))
    rmse += rmsei
    return data_compare, rmse

In [8]:
#choose ODpair 3-1 to test the algrithm whether bug-free:
T1 = 10
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 2 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']
testdf = mode_data[mode_data.index.isin(list(mode_data.index[:10]))] #test OD pair 3-1,3-2,3-3,3-4

predict_choice_test, timeslot_test = apply_model(T, nestList, wagelist, Lambda, testdf)
results_test,rmse_test = compare_with_ground_truth(predict_choice_test, acs)
print('The time used to run the code:', timeslot_test)
print('The rmse of this model is:', rmse_test)
print()
print('The predict transportation choice is:')
predict_choice_test

100%|██████████| 5/5 [00:00<00:00, 20.73it/s]

The time used to run the code: 0.22991685901070014
The rmse of this model is: 949.3244680851063

The predict transportation choice is:





Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,2.407918e-12,3.676785e-12,6.341576000000001e-66,1.237151e-63
3-7,2.005013e-08,1.195894e-08,1.613853e-08,2.8162e-51,2.521473e-49
3-9,0.0,2.097967e-08,2.215934e-08,3.656066e-47,4.546049e-46
3-10,0.0,1.392308e-13,1.602258e-13,4.4932010000000004e-76,5.865468000000001e-73
3-11,0.0,1.6660310000000002e-17,0.0,6.8995e-89,9.013368e-88


In [35]:
np.logspace(-5,5,50)

array([1.00000000e-05, 1.59985872e-05, 2.55954792e-05, 4.09491506e-05,
       6.55128557e-05, 1.04811313e-04, 1.67683294e-04, 2.68269580e-04,
       4.29193426e-04, 6.86648845e-04, 1.09854114e-03, 1.75751062e-03,
       2.81176870e-03, 4.49843267e-03, 7.19685673e-03, 1.15139540e-02,
       1.84206997e-02, 2.94705170e-02, 4.71486636e-02, 7.54312006e-02,
       1.20679264e-01, 1.93069773e-01, 3.08884360e-01, 4.94171336e-01,
       7.90604321e-01, 1.26485522e+00, 2.02358965e+00, 3.23745754e+00,
       5.17947468e+00, 8.28642773e+00, 1.32571137e+01, 2.12095089e+01,
       3.39322177e+01, 5.42867544e+01, 8.68511374e+01, 1.38949549e+02,
       2.22299648e+02, 3.55648031e+02, 5.68986603e+02, 9.10298178e+02,
       1.45634848e+03, 2.32995181e+03, 3.72759372e+03, 5.96362332e+03,
       9.54095476e+03, 1.52641797e+04, 2.44205309e+04, 3.90693994e+04,
       6.25055193e+04, 1.00000000e+05])

In [33]:
#Run the model for the whole dataset
T1 = 0.05 #should set grid search for T1 
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 0.45 #should set grid search for Lambda 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data[:7000])
results,rmse = compare_with_ground_truth(mode_data[:7000], acs[:6])

100%|██████████| 1816/1816 [01:09<00:00, 28.01it/s]


In [9]:
rmse_best = np.inf

In [10]:
rmse_all = []

In [None]:
for Tau in np.logspace(-2,2,20):
    for lamb in np.logspace(-2,2,20):
        T1 = Tau #should set grid search for T1 
        T2 = 1
        T3 = 1
        T = {1:T1, 2:T2, 3:T3} #Tao for each nest
        Lambda = lamb #should set grid search for Lambda 
        nestList = [1, 2, 3]
        wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

        predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data[:6616])
        results,rmse = compare_with_ground_truth(predict_transportation_choice, acs[:12])
        rmse_all.append(rmse)
        if rmse < rmse_best:
            predictions_best = predict_transportation_choice.copy()
            T1_best = T1
            Lambda_best = Lambda
            rmse_best = rmse
            results_best = results.copy()
        

100%|██████████| 2782/2782 [00:57<00:00, 53.28it/s]
100%|██████████| 2782/2782 [00:53<00:00, 56.30it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.20it/s]
100%|██████████| 2782/2782 [00:55<00:00, 50.21it/s]
100%|██████████| 2782/2782 [00:59<00:00, 46.69it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.40it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.47it/s]
100%|██████████| 2782/2782 [00:57<00:00, 48.81it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.83it/s]
100%|██████████| 2782/2782 [00:56<00:00, 53.66it/s]
100%|██████████| 2782/2782 [00:57<00:00, 53.04it/s]
100%|██████████| 2782/2782 [00:59<00:00, 46.88it/s]
100%|██████████| 2782/2782 [00:56<00:00, 49.01it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.84it/s]
100%|██████████| 2782/2782 [00:58<00:00, 47.91it/s]
100%|██████████| 2782/2782 [00:57<00:00, 48.75it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.90it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.85it/s]
100%|██████████| 2782/2782 [00:56<00:00, 48.94it/s]
100%|███████

In [34]:
print('The time used to run the code:', timeslot)
print('The rmse of this model is:', rmse_best)
print()
print('The predict transportation choice is:')
predictions_best.head()

The time used to run the code: 52.08275239600334
The rmse of this model is: 6436.741757990545

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,1.621239e-06,1.957537e-06,0.0,0.0
3-7,0.000292,0.0002437485,0.000274276,0.0,0.0
3-9,0.0,0.0001235899,0.0001139474,0.0,0.0
3-10,0.0,2.869206e-07,2.579036e-07,0.0,0.0
3-11,0.0,3.350715e-09,0.0,0.0,0.0


In [35]:
T1_best

0.1

In [36]:
Lambda_best

0.01

In [30]:
print('The time used to run the code:', timeslot)
print('The rmse of this model is:', rmse_best)
print()
print('The predict transportation choice is:')
predictions_best.head()

The time used to run the code: 57.63289005699335
The rmse of this model is: 6436.741757990545

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,1.621239e-06,1.957537e-06,0.0,0.0
3-7,0.000292,0.0002437485,0.000274276,0.0,0.0
3-9,0.0,0.0001235899,0.0001139474,0.0,0.0
3-10,0.0,2.869206e-07,2.579036e-07,0.0,0.0
3-11,0.0,3.350715e-09,0.0,0.0,0.0


In [31]:
T1_best

0.1

In [32]:
Lambda_best

0.01

In [95]:
rmse_best_new = np.inf

In [96]:
rmse_all_new = []

In [97]:
acs[:7]

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
0,3.0,0.526074,81.318503,33.425463,7948.271384,915.458576
1,4.0,32.88845,108.134955,64.938615,7212.769464,3373.268515
2,5.0,0.245777,39.742128,6.404244,8270.173881,114.433969
3,6.0,0.453222,31.753306,7.808959,4919.798684,480.18583
4,7.0,27.457307,153.425489,40.431632,34826.940798,2987.744775
5,9.0,0.051277,8.292491,2.34583,6118.01365,397.296751
6,10.0,8.828095,113.555345,48.233656,10257.551077,647.831828


  8%|▊         | 140/1742 [00:23<01:02, 25.70it/s]

In [100]:
for Tau in np.logspace(-1,1,10):
    for lamb in np.logspace(-2,2,10):
        T1 = Tau #should set grid search for T1 
        T2 = 1
        T3 = 1
        T = {1:T1, 2:T2, 3:T3} #Tao for each nest
        Lambda = lamb #should set grid search for Lambda 
        nestList = [1, 2, 3]
        wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

        predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data[:7242])
        results,rmse = compare_with_ground_truth(predict_transportation_choice, acs[:7])
        rmse_all_new.append(rmse)
        if rmse < rmse_best_new:
            predictions_best_new = predict_transportation_choice.copy()
            T1_best_new = T1
            Lambda_best_new = Lambda
            rmse_best_new = rmse
            results_best_new = results.copy()
        

100%|██████████| 1742/1742 [01:07<00:00, 25.66it/s]
100%|██████████| 1742/1742 [01:04<00:00, 27.13it/s]
100%|██████████| 1742/1742 [29:50<00:00,  1.03s/it]
100%|██████████| 1742/1742 [01:05<00:00, 26.66it/s]
100%|██████████| 1742/1742 [01:08<00:00, 25.27it/s]
100%|██████████| 1742/1742 [01:08<00:00, 24.75it/s]
100%|██████████| 1742/1742 [01:05<00:00, 26.60it/s]
100%|██████████| 1742/1742 [01:06<00:00, 24.31it/s]
100%|██████████| 1742/1742 [01:08<00:00, 25.57it/s]
100%|██████████| 1742/1742 [01:07<00:00, 25.74it/s]
100%|██████████| 1742/1742 [01:03<00:00, 27.23it/s]
100%|██████████| 1742/1742 [01:04<00:00, 27.05it/s]
100%|██████████| 1742/1742 [07:42<00:00,  3.77it/s]
100%|██████████| 1742/1742 [01:11<00:00, 24.45it/s]
100%|██████████| 1742/1742 [01:12<00:00, 22.70it/s]
100%|██████████| 1742/1742 [01:09<00:00, 25.05it/s]
100%|██████████| 1742/1742 [01:04<00:00, 27.12it/s]
100%|██████████| 1742/1742 [01:03<00:00, 27.23it/s]
100%|██████████| 1742/1742 [01:09<00:00, 21.40it/s]
100%|███████

In [101]:
print('The time used to run the code:', timeslot)
print('The rmse of this model is:', rmse_best_new)
print()
print('The predict transportation choice is:')
predictions_best.head()

The time used to run the code: 66.04060706000018
The rmse of this model is: 2303.899468476459

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-3,1.600013e-09,0.0,0.0,0.0,0.0
3-4,0.0,3.026883e-43,2.046884e-42,0.024686,0.034683
3-7,8.317963e-30,4.598054e-31,2.096455e-30,0.528232,0.733263
3-9,0.0,5.444108e-27,1.1124769999999999e-26,0.040346,0.062675
3-10,0.0,3.446623e-47,2.934572e-46,0.005241,0.009984


In [102]:
T1_best_new

0.1

In [103]:
Lambda_best_new

0.0774263682681127

In [72]:
len(rmse_all)

385

In [74]:
np.min(np.array(rmse_all[250:]))

2114.54804310397

In [76]:
np.array(rmse_all[250:])

array([4635.28588546, 4634.99639662, 4634.03263556, 4630.8962934 ,
       4621.5652708 , 4604.45675757, 4614.39449745, 4633.1622889 ,
       4640.64603161, 4641.65298494, 4641.67265529, 4641.67443939,
       4641.67448129, 4641.67448129, 4641.67448129, 4641.67448129,
       4641.67448129, 4641.67448129, 4641.67448129, 4671.62722016,
       4671.52995778, 4671.20382924, 4670.11556122, 4666.54593084,
       4655.61976274, 4632.3888258 , 4626.55107618, 4634.39941346,
       4640.6562189 , 4641.65298506, 4641.67265529, 4641.67443939,
       4641.67448129, 4641.67448129, 4641.67448129, 7469.69799175,
       6210.75683358, 4412.70750061, 2430.80433813, 2303.87454569,
       3649.85965585, 4395.93669405, 4605.23565116, 4638.50870755,
       4641.43137962, 7180.98421512, 5951.08264099, 4209.53804258,
       2347.93149025, 2362.03710135, 3679.7094174 , 4402.56493235,
       4606.07954601, 4638.56579518, 4641.43270592, 6685.69471522,
       5500.10241295, 3856.14086899, 2225.27216469, 2480.08980

In [69]:
rmse_all[216]

2102.631062881024

In [70]:
T1_best

0.5455594781168515

In [71]:
Lambda_best

0.04832930238571752

In [52]:
np.argmin(np.array(rmse_all))

216

In [65]:
rmse_all[216]

2102.631062881024

In [53]:
T1_best

0.5455594781168515

In [54]:
Lambda_best

0.04832930238571752

### Getting data ready for plotting

In [135]:
results_2 = pd.read_csv('../results/original_predict_transportation_choice.csv',index_col=0)

In [136]:
results_2.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-3,273.0,0.0,0.0,0.0,0.0
3-4,0.0,0.73925,0.565102,3.753164,0.942485
3-7,2.514212,5.736936,3.988082,19.19231,4.568461
3-9,0.0,0.645932,0.321317,0.904404,0.128346
3-10,0.0,0.8963,0.111315,1.759122,0.233263


In [138]:
results_2_selected = pd.read_csv('../results/results_2_selected.csv',index_col=0)

In [139]:
results_2_selected.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.644911,0.595489,3.811452,0.948148
3-13,0.0,0.287853,1.780154,8.977651,1.954342
3-43,0.0,0.616507,0.495885,6.599487,1.288121
3-45,0.0,0.269604,0.685894,7.561866,1.482636
3-48,11.719155,1.373324,1.487324,41.788931,8.631266


In [148]:
results_2.loc[results_2_selected.index] = results_2_selected.loc[results_2_selected.index]

In [149]:
results_2.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-3,273.0,0.0,0.0,0.0,0.0
3-4,0.0,0.644911,0.595489,3.811452,0.948148
3-7,2.514212,5.736936,3.988082,19.19231,4.568461
3-9,0.0,0.645932,0.321317,0.904404,0.128346
3-10,0.0,0.8963,0.111315,1.759122,0.233263


In [152]:
results_2.rename_axis('ODpair',inplace=True)
results_2.reset_index(inplace=True)

In [153]:
results_2['O'] = results_2['ODpair'].apply(lambda x: str(x).split('-')[0]).astype('int')
results_2['D'] = results_2['ODpair'].apply(lambda x: str(x).split('-')[1]).astype('int')

In [156]:
results_2.set_index('ODpair',inplace=True)

In [157]:
results_2.head()

Unnamed: 0_level_0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),O,D
ODpair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3-3,273.0,0.0,0.0,0.0,0.0,3,3
3-4,0.0,0.644911,0.595489,3.811452,0.948148,3,4
3-7,2.514212,5.736936,3.988082,19.19231,4.568461,3,7
3-9,0.0,0.645932,0.321317,0.904404,0.128346,3,9
3-10,0.0,0.8963,0.111315,1.759122,0.233263,3,10


In [158]:
results_2.shape

(56108, 7)

In [159]:
results_2.drop(results_2[results_2.O==results_2.D].index,inplace=True)

In [160]:
results_2.shape

(55868, 7)

In [161]:
results_2.to_csv('results_scenario_2.csv')

## Checking results csv

In [190]:
results_scene_1 = pd.read_csv('../results/Final_results/results_scenario1_SRM.csv',index_col=0,usecols=range(6))
results_scene_1.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.514361,2.955623e-09,5.216414,0.269225
3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [191]:
results_scene_2 = pd.read_csv('../results/Final_results/results_scenario2 -$2.75.csv',index_col=0,usecols=range(6))
results_scene_2.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.5139874,7.175188e-07,5.216787,0.269225
3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [192]:
results_scene_3 = pd.read_csv('../results/Final_results/results_scenario3 -$10.csv',index_col=0,usecols=range(6))
results_scene_3.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.5097572,1.991519e-11,5.221018,0.269225
3-7,1.992068e-07,3.910663,1.179624e-06,30.705659,1.383677
3-9,0.0,0.6564801,4.014125e-07,1.288755,0.054765
3-10,0.0,0.6715375,4.977698e-13,2.220289,0.108173
3-11,0.0,2.1441130000000002e-18,0.0,3.883684,0.116316


In [193]:
wages = pd.read_csv('final_allMode_with_wage_cleaned_update.csv')
wages.head(5)

Unnamed: 0,ODpair,12500,125000,17500,22500,225000,2500,30000,42500,62500,7500,87500,DOlocationID,PUlocationID,duration,mode,nest,price
0,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,39.695,2,1,64.0
1,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,45.216667,3,1,61.5
2,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,83.0,4,2,5.5
3,3-4,0.648452,0.740725,0.282211,0.414755,0.514005,0.250819,0.592009,0.671282,0.93859,0.360216,0.586936,4,3,225.933333,5,3,0.0
4,3-7,3.890709,4.44435,1.693267,2.488532,3.084029,1.504915,3.552056,4.027693,5.63154,2.161294,3.521615,7,3,47.880952,1,1,43.157143


In [194]:
# agg_scene_1 = results_scene_1.drop('destination',axis=1).groupby('origin').sum()
# agg_scene_1.head()
# agg_scene_1.columns = [name + '_predicted' for name in agg_scene_1.columns]
# acs = pd.read_csv('final_acs_transportation_choice.csv',index_col=0)
# acs.head()
# acs.columns = [name + '_actual' for name in acs.columns]
# scene_1_combined = agg_scene_1.merge(acs,how='left',left_index=True,right_index=True)
# scene_1_combined.head()

In [195]:
# Renaming for convenience in calculation
results_scene_1.columns = [re.sub(r'[^\d.]+','',col) for col in results_scene_1.columns]
results_scene_2.columns = [re.sub(r'[^\d.]+','',col) for col in results_scene_2.columns]
results_scene_3.columns = [re.sub(r'[^\d.]+','',col) for col in results_scene_3.columns]

In [196]:
results_scene_1.reset_index(inplace=True)
results_scene_2.reset_index(inplace=True)
results_scene_3.reset_index(inplace=True)
results_scene_1.rename_axis({'index':'ODpair'},axis=1,inplace=True)
results_scene_2.rename_axis({'index':'ODpair'},axis=1,inplace=True)
results_scene_3.rename_axis({'index':'ODpair'},axis=1,inplace=True)

In [197]:
# Getting the origin taxi zone
results_scene_1['origin'] = results_scene_1.ODpair.apply(lambda x: x.split('-')[0])
results_scene_2['origin'] = results_scene_2.ODpair.apply(lambda x: x.split('-')[0])
results_scene_3['origin'] = results_scene_3.ODpair.apply(lambda x: x.split('-')[0])

In [198]:
# Aggregating the number of people in each ODpair by the origin zone
agg_scene_1 = results_scene_1.groupby('origin').sum()
agg_scene_2 = results_scene_2.groupby('origin').sum()
agg_scene_3 = results_scene_3.groupby('origin').sum()

In [199]:
agg_scene_1.head()

Unnamed: 0_level_0,1,2,3,4,5
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,696.22652,2210.845068,186.978763,6407.563514,1355.386135
100,131.163394,1.268864,3.497878,560.384303,5.685561
101,784.254402,953.617604,359.841589,3240.818084,798.468322
102,2813.388459,1512.98024,118.758824,6902.522214,2263.350262
106,655.970129,70.909008,16.412553,2468.909214,49.799097


In [200]:
pickup_list = wages.PUlocationID.unique()

In [201]:
origin_time_and_money_avg = wages.groupby(['PUlocationID','mode'],as_index=False)['duration','price'].mean()

In [202]:
origin_time_and_money_avg.head()

Unnamed: 0,PUlocationID,mode,duration,price
0,3,1,38.302359,37.428045
1,3,2,37.718869,56.582192
2,3,3,41.066196,50.717949
3,3,4,81.756356,5.948623
4,3,5,268.553355,0.0


In [212]:
agg_scene_1.index.astype(int).merge(origin_time_and_money_avg,how='left',left_index=True,right_on='PUlocationID')

AttributeError: 'Int64Index' object has no attribute 'merge'

In [180]:
origin_time_and_money_avg.shape

(1275, 4)

In [203]:
1275/5

255.0

In [204]:
agg_scene_1.shape

(256, 5)

In [181]:
len(pickup_list)

256

In [185]:
agg_scene_3.dropna()

Unnamed: 0_level_0,1,2,3,4,5,Unnamed: 6_level_0,Unnamed: 7_level_0
origin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10,687.155797,2.176601e+03,180.319747,6457.520792,1355.402380,2430,33067
100,17.650088,1.709375e-01,0.583985,677.722317,5.872673,9300,13730
101,726.778854,9.308637e+02,330.114006,3350.772208,798.471185,23937,32036
102,2658.990857,1.458862e+03,114.212729,7114.402184,2264.532478,24888,32946
106,512.262577,7.075395e+01,16.401419,2612.780200,49.801856,21518,27501
107,1526.002103,1.665616e+02,7.620313,10303.610549,348.205448,23647,30172
108,523.706887,1.132548e+03,73.892328,6480.130073,911.722625,25380,31972
109,239.157105,2.023250e+03,568.434078,7893.206814,2074.952088,26269,32357
11,972.940983,1.540836e+03,105.018795,6878.862737,355.341124,2607,32288
110,0.000000,8.387595e+00,0.773842,4.852381,3.986182,1430,1503


In [178]:
time_diff = 0
money_diff = 0
for zone in tqdm(pickup_list,position=0):
    for col in agg_scene_1.columns:
        time_diff+=(agg_scene_1.loc[agg_scene_1.index==zone,col]-agg_scene_2.loc[agg_scene_2.index==zone,col])\
        *origin_time_and_money_avg.loc[(origin_time_and_money_avg['PUlocationID']==zone) & origin_time_and_money_avg['mode']==int(col),'duration']
        money_diff+=(agg_scene_1.loc[agg_scene_1.index==zone,col]-agg_scene_2.loc[agg_scene_2.index==zone,col])\
        *origin_time_and_money_avg.loc[(origin_time_and_money_avg['PUlocationID']==zone) & origin_time_and_money_avg['mode']==int(col),'price']

100%|██████████| 256/256 [00:07<00:00, 36.64it/s]

In [179]:
time_diff

0   NaN
2   NaN
4   NaN
dtype: float64

#### Use below code only if you want exact values for the difference in time/money for commuters as it takes a very long time to run (>3 hours). It can also be used to get the difference for each taxi zone (after some minor changes).

In [148]:
# time_total = 0
# money_total = 0
# for pair in tqdm(ODpair_list,position=0):
#     time_pair = 0
#     money_pair = 0
#     for col in results_scene_1.columns:
#         time_pair+=(results_scene_1.loc[results_scene_1.index==pair,col]-results_scene_2.loc[results_scene_2.index==pair,col])\
#         *wages.loc[(wages['ODpair']==pair) & wages['mode']==int(col),'duration']
#         money_pair+=(results_scene_1.loc[results_scene_1.index==pair,col]-results_scene_2.loc[results_scene_2.index==pair,col])\
#         *wages.loc[(wages['ODpair']==pair) & wages['mode']==int(col),'price']
#     time_total+=time_pair
#     money_total+=money_pair

array(['3-4', '3-7', '3-9', ..., '263-260', '263-261', '263-262'],
      dtype=object)

In [5]:
scene_1 = pd.read_csv('../results/scenario1_origin_with_shp.csv',index_col=0)
scene_1.head()

Unnamed: 0,zone,geometry,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),Top_choice_mode,Nested_percet
3,Allerton/Pelham Gardens,POLYGON ((-73.84792614099985 40.87134223399991...,919.220815,1032.030446,856.827889,4532.134189,1365.786661,4,0.322545
4,Alphabet City,POLYGON ((-73.97177410965318 40.72582128133706...,2478.643144,1244.857983,1137.066639,3614.908023,2141.524211,4,0.45781
5,Arden Heights,"POLYGON ((-74.17421738099989 40.5625680859999,...",49.629786,1716.445501,1186.672071,4558.662219,716.590423,4,0.358866
6,Arrochar/Fort Wadsworth,POLYGON ((-74.06367318899999 40.60219816599994...,310.881579,1042.891135,902.121017,2651.376865,361.729405,4,0.428145
7,Astoria,POLYGON ((-73.90413637799996 40.76752031699986...,10889.777667,4194.086863,3496.564102,15648.80536,2458.766008,4,0.506444


In [15]:
test_1 = pd.read_csv('results_scenario17_7_1.csv',index_col=0)