In [1]:
import numpy as np
import pandas as pd
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

## Data

In [3]:
acs = pd.read_csv('../modeling/final_acs_transportation_choice.csv')#, index_col=0)
acs.head()

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),P(mode6)
0,3.0,0.115434,17.843262,7.334361,3885.402712,478.622467,4316.681764
1,4.0,42.851015,140.89118,84.609811,6937.780033,2631.714648,779.153313
2,5.0,0.081377,13.158607,2.120444,1860.706347,40.495673,6312.437553
3,6.0,0.109017,7.637848,1.878344,1974.772111,198.118892,3086.483787
4,7.0,25.522468,142.614028,37.582529,28436.765508,2680.39788,5365.117587


In [4]:
mode_data = pd.read_csv('../modeling/final_allMode_with_2017wage_cleaned_update.csv', index_col=0)
mode_data = mode_data.dropna()
mode_data.head()

Unnamed: 0,DOlocationID,ODpair,PUlocationID,duration,mode,nest,price,2500,7500,12500,17500,22500,30000,42500,62500,87500,125000,225000
0,4,3-4,3,39.695,2,1,64.0,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
1,4,3-4,3,45.216667,3,1,61.5,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
2,4,3-4,3,83.0,4,2,5.5,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
3,4,3-4,3,225.933333,5,3,0.0,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005
4,4,3-4,3,39.695,6,4,29.424,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005


### Apply to our case

In [23]:
def compare_with_ground_truth(predictdf, truedf):
    '''
    compare our predicted transportation choice with ground truth
    
    The header of the datafrme after merge (named 'data_compare') should be like:
    taxi_zone | P(mode1)_x | P(mode2)_x | P(mode3)_x | P(mode4)_x | P(mode5)_x | P(mode1)_y | P(mode2)_y | P(mode3)_y | P(mode4)_y | P(mode5)_y
    '''
    import numpy as np
    # makesure predictdf and truedf have the same formats
    for col in predictdf.columns:
        predictdf[col] = predictdf[col].astype(float) 
    predictdf = predictdf.fillna(0)
    predictdf = predictdf.replace([np.inf, -np.inf], np.nan)
    predictdf = predictdf.dropna()
    predictdf['taxi_zone'] = predictdf.index.map(lambda x: x.split('-')[0]) #get origin taxi zone from each OD pair
    predictdf = predictdf.groupby('taxi_zone').sum().reset_index() #group the popuation by taxi zone
    predictdf['taxi_zone'] = predictdf['taxi_zone'].astype(int)
    
    #do same thing as predictdf to loss_function_deno dataframe
#     for col in loss_function_deno.columns:
#         loss_function_deno[col] = loss_function_deno[col].astype(float) 
#     loss_function_deno = loss_function_deno.fillna(0)
#     loss_function_deno = loss_function_deno.replace([np.inf, -np.inf], np.nan)
#     loss_function_deno = loss_function_deno.dropna()
#     loss_function_deno['taxi_zone'] = loss_function_deno.index.map(lambda x: x.split('-')[0]) #get origin taxi zone from each OD pair
#     loss_function_deno = loss_function_deno.groupby('taxi_zone').sum().reset_index() #group the popuation by taxi zone
#     loss_function_deno['taxi_zone'] = loss_function_deno['taxi_zone'].astype(int)   
    
    truedf['taxi_zone'] = truedf['taxi_zone'].astype(int)
    
    data_compare = pd.merge(predictdf, truedf, left_on='taxi_zone', right_on = 'taxi_zone')

    data_compare = data_compare.dropna() 
    data_compare_zone = data_compare['taxi_zone'].unique()
#     loss_function_deno = loss_function_deno[loss_function_deno['taxi_zone'].isin(data_compare_zone)]

    rloss = 0
    rmse = 0
    for i in range(1,totmode+1):
        data_compare_deno = (data_compare[data_compare.columns[i+totmode]].replace(to_replace=0,value = 1)).copy()
        rlossi = (np.sqrt(sum(((data_compare[data_compare.columns[i]] - data_compare[data_compare.columns[i+totmode]])**2)\
                        /(data_compare_deno))))
        rmsei = (np.sqrt(sum((data_compare[data_compare.columns[i]] - data_compare[data_compare.columns[i+totmode]])**2)))
        if rlossi == np.nan: print('Nan encountered')
    rloss += rlossi
    rmse += rmsei
    tot_mse = np.sqrt(sum((np.asarray(predictdf.sum()[1:]) - np.asarray(truedf.sum()[1:]))**2))
    return rloss, rmse, tot_mse

In [24]:
totmode = 6

1

In [297]:
predict_choice_1 = pd.read_csv('Final_results/results_scenario1_.04_1.5_for_uncertainty.csv',\
                               index_col=0)

In [298]:
rloss1, rmse1, tot_mse1 = compare_with_ground_truth(predict_choice_1, acs)

In [299]:
predict_choice_1.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),P(mode6)
3-4,0.0,0.0,0.0,3.877147,0.24957,1.873283
3-7,0.0,0.0,0.0,12.012387,1.458571,22.529042
3-9,0.0,0.0,0.0,0.140546,0.072559,1.786895
3-10,0.0,0.0,0.0,0.157064,0.124834,2.718103
3-11,0.0,0.0,0.0,3.853464,0.145975,0.000561


2

In [300]:
predict_choice_2 = pd.read_csv('Final_results/results_scenario1_0.05_1.5.csv',index_col=0)

In [301]:
rloss2, rmse2, tot_mse2 = compare_with_ground_truth(predict_choice_2, acs)

3

In [302]:
predict_choice_3 = pd.read_csv('Final_results/results_scenario1_.06_1.5.csv',index_col=0)

In [303]:
rloss3, rmse3, tot_mse3 = compare_with_ground_truth(predict_choice_3, acs)

4

In [304]:
predict_choice_4 = pd.read_csv('Final_results/results_scenario1_.05_1.4_for_uncertainty.csv',index_col=0)

In [305]:
rloss4, rmse4, tot_mse4 = compare_with_ground_truth(predict_choice_4, acs)

5

In [306]:
predict_choice_5 = pd.read_csv('Final_results/results_scenario1_.05_1.6_for_uncertainty.csv',index_col=0)

In [307]:
rloss5, rmse5, tot_mse5 = compare_with_ground_truth(predict_choice_5, acs)

In [308]:
predict_choice_5.shape

(55798, 6)

In [309]:
predict_choice_1['rmse'] = rmse1
predict_choice_2['rmse'] = rmse2
predict_choice_3['rmse'] = rmse3
predict_choice_4['rmse'] = rmse4
predict_choice_5['rmse'] = rmse5

In [310]:
combined = predict_choice_1.append(predict_choice_2.append(predict_choice_3.append\
                                                           (predict_choice_4.append(predict_choice_5))))

In [311]:
# wm = lambda dfx: (dfx * np.exp(-dfx["rmse"])).sum() / np.exp(-dfx["rmse"]).sum()

In [312]:
wm = lambda x: pd.Series(np.abs(np.average(x, weights= (-x['rmse']),axis=0)))

In [313]:
mean_combined = combined.groupby(combined.index).apply(wm)

In [314]:
mean_combined.columns = predict_choice_1.columns

In [315]:
mean_combined.drop(['rmse'],axis=1,inplace=True)

In [316]:
mean_combined.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),P(mode6)
10-100,0.0,0.0,0.0,131.060007,4.939993,3.993261e-08
10-101,0.0,3.744052e-28,8.997463e-16,1.942515,18.87511,71.18238
10-102,8.390911e-09,3.573734e-65,7.722155999999999e-57,0.042102,5.670467,43.28743
10-106,0.0,0.0,0.0,5.24287,2.022791,20.73434
10-107,0.0,0.0,0.0,39.001556,1.200242,6.798201


In [317]:
mean_combined.shape

(55798, 6)

In [318]:
mean_combined.dropna().shape

(55798, 6)

In [319]:
def weighted_std(values):
    average = np.average(values, weights=(-values['rmse']),axis=0)
    # Fast and numerically precise:
    variance = np.average((values-average)**2, weights=(-values['rmse']),axis=0)
    return (pd.Series(np.sqrt(np.abs(variance))))

In [320]:
std_combined = combined.groupby(combined.index).apply(weighted_std)

In [321]:
std_combined.columns = predict_choice_1.columns

In [322]:
std_combined.drop(['rmse'],axis=1,inplace=True)

In [323]:
std_combined.head()

Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5),P(mode6)
10-100,0.0,0.0,0.0,0.055691,0.055691,3.468015e-08
10-101,0.0,7.546766e-28,1.81359e-15,0.110762,0.089119,0.02169638
10-102,1.070083e-08,7.203461e-65,1.55653e-56,0.007986,0.025508,0.033448
10-106,0.0,0.0,0.0,0.007417,0.004049,0.003367437
10-107,0.0,0.0,0.0,0.029124,0.000804,0.0299265


In [324]:
std_combined.shape

(55798, 6)

In [325]:
std_combined.dropna().shape

(55798, 6)

In [326]:
mean_combined.to_csv('Final_results/Uncertainty/mean_scenario_1.csv')
std_combined.to_csv('Final_results/Uncertainty/std_scenario_1.csv')

In [327]:
mean_combined.sum()

P(mode1)    5.606551e+04
P(mode2)    7.136258e+03
P(mode3)    2.463458e+03
P(mode4)    1.449029e+06
P(mode5)    2.282187e+05
P(mode6)    1.138438e+06
dtype: float64

In [328]:
predict_choice_2.sum()

P(mode1)    5.696786e+04
P(mode2)    6.993359e+03
P(mode3)    2.557339e+03
P(mode4)    1.448266e+06
P(mode5)    2.281302e+05
P(mode6)    1.138436e+06
rmse        2.030742e+09
dtype: float64