## Note: mode1-taxi, mode2-FHV, mode3-shared FHV, mode4-PT, mode5-walking

In [9]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [10]:
acs = pd.read_csv('final_acs_transportation_choice.csv')
acs.head()

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
0,3.0,0.228957,35.391246,14.54735,7706.507979,949.324468
1,4.0,46.244797,152.049702,91.310873,7487.249289,2840.14534
2,5.0,0.349401,56.498123,9.104389,7989.174863,173.873224
3,6.0,0.263186,18.439186,4.534672,4767.467108,478.295847
4,7.0,29.894066,167.04157,44.019826,33307.536619,3139.50792


In [11]:
mode_data = pd.read_csv('final_allMode_with_wage_cleaned.csv', index_col=0)
mode_data.head()

Unnamed: 0_level_0,PUlocationID,DOlocationID,duration,price,2500,7500,12500,17500,22500,30000,42500,62500,87500,125000,225000,mode,nest
ODpair,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
3-4,3,4,39.695,64.0,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005,2,1
3-4,3,4,45.216667,61.5,0.250819,0.360216,0.648452,0.282211,0.414755,0.592009,0.671282,0.93859,0.586936,0.740725,0.514005,3,1
3-7,3,7,47.880952,43.157143,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029,1,1
3-7,3,7,30.521739,47.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029,2,1
3-7,3,7,37.159009,45.0,1.504915,2.161294,3.890709,1.693267,2.488532,3.552056,4.027693,5.63154,3.521615,4.44435,3.084029,3,1


## Modeling

### Nested Logit Model

In [12]:
def utility(mode, wage, Lambda, dataset):
    '''
    Get the utility for mode j under specific OD pair
    Lambda: parameter that trade-off different transportation mode
    '''
    subset = dataset[dataset['mode'] == mode]
    vj = Lambda * (float(int(wage)/124800) * float(subset['duration']) + float(subset['price'])) #525600: convert wage scale 'year' to 'minitues'
    return -vj #assign negative utility?


def InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset):
    '''
    Get the inclusive value for nest K
    Nk:nest k
    T: the dict that contains Tao(dissmilarity parameter) for each Nest. for example, Tk meeas Tao for nest K
    '''
    subsetNk = dataset[dataset['nest'] == Nk]
    modes = list(subsetNk['mode'].unique()) #what modes contained in this nest k
    sumIV = 0
    for j in modes:   
        vj = utility(j, wage, Lambda, subsetNk)
        dictVj[j] = vj
        sumIV += np.exp((1/Tk)*vj)  
    IVk = np.log(sumIV)
    return dictVj, IVk
    

def denoSum(T, nestList, wage, Lambda, dataset):
    '''
    Calculate the denomenator for P(y=Nk)
    T: the dict that contains Tk for each Nest, in our case T={1:T1, 2:T2, 3:T3}; T1, T2, T3 defined by us 
    TotK: the total number of nest this model has, in our case TotK = 3
    '''
    deno = 0
    dictIVk = {}
    dictVj = {}
    for Nk in nestList: #k is the k Nest, in our case k=1,2,3
        Tk = T[Nk] #get the tao for nest k 
        dictVj, IVk = InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset)
        denok = np.exp(Tk*IVk)
        deno += denok
        dictIVk[Nk] = IVk
    return dictVj, dictIVk, deno


def probability (j, Nk, T, dictVj, dictIVk, deno):  
    '''
    Calculate the probability for the specific mode j and Nest Nk
    '''
    Tk = T[Nk]
    IVk = dictIVk[Nk]
    vj = dictVj[j]
    pjk = np.exp((1/Tk)*vj)/np.exp(IVk) #pjk: P(y=j, y belong to Nk)
    pk = np.exp(Tk*IVk)/deno #pk: P(y belong to Nk)
    pj = pjk*pk #pj: P(y=j)
    return pj

### Apply to our case

In [15]:
def apply_model(T, nestList, wageList, Lambda, dataAll):
    '''
    apply to our case
    '''
    import timeit
    start = timeit.default_timer()

    ODpair_list = list(dataAll.index.unique())
    p = []
    for i in tqdm(ODpair_list): #indentify OD pair  
        dataOD = dataAll[dataAll.index==i] 
        modeList = list(dataOD['mode'])
        nestList = list(dataOD['nest'])
        
        pop_OD = [] #store the results under each OD pair
        for wage in wageList:            
            dictVj, dictIVk, deno = denoSum(T, set(nestList), wage, Lambda, dataOD)
            pop_mode = [] #store the results under each OD pair and each wage
            for i in range(1,6):
                if i in modeList: #not all modes appear in every OD pair
                    pop = dataOD[wage].mean() * probability(i, nestList[modeList.index(i)], T, dictVj, dictIVk, deno)
#                     print(wage, i,nestList[modeList.index(i)], pop)
                    if np.isnan(pop) == True: #if predicted population is nan, replace it as 0, means no people choose
                        pop = 0
                
                    pop_mode.append(pop)
                else: 
                    pop_mode.append(0)
            pop_OD.append(pop_mode)
        pop_OD_sum = [sum(x) for x in zip(*pop_OD)] #sum the weighted population under each wage scenario
        p.append(pop_OD_sum)

    df = pd.DataFrame(p, columns=['P(mode1)', 'P(mode2)', 'P(mode3)', 'P(mode4)', 'P(mode5)']
                      , index=ODpair_list)
    
    stop = timeit.default_timer()
    timeslot = stop - start
    return df, timeslot

def compare_with_ground_truth(predictdf, truedf):
    '''
    compare our predicted transportation choice with ground truth
    
    The header of the datafrme after merge (named 'data_compare') should be like:
    taxi_zone | P(mode1)_x | P(mode2)_x | P(mode3)_x | P(mode4)_x | P(mode5)_x | P(mode1)_y | P(mode2)_y | P(mode3)_y | P(mode4)_y | P(mode5)_y
    '''
    import numpy as np
    # makesure predictdf and truedf have the same formats
    for col in predictdf.columns:
        predictdf[col] = predictdf[col].astype(float) 
    predictdf = predictdf.fillna(0)
    predictdf = predictdf.replace([np.inf, -np.inf], np.nan)
    predictdf = predictdf.dropna()
    predictdf['taxi_zone'] = predictdf.index.map(lambda x: x.split('-')[0]) #get origin taxi zone from each OD pair
    predictdf = predictdf.groupby('taxi_zone').sum().reset_index() #group the popuation by taxi zone
    predictdf['taxi_zone'] = predictdf['taxi_zone'].astype(int)
    truedf['taxi_zone'] = truedf['taxi_zone'].astype(int)
    
    data_compare = pd.merge(predictdf, truedf, left_on='taxi_zone', right_on = 'taxi_zone', how = 'left')
    data_compare = data_compare.dropna() 
    
    rmse = 0
    for i in range(1,6):
        rmsei = np.sqrt(sum((data_compare[data_compare.columns[i]] - data_compare[data_compare.columns[i+5]])**2))
    rmse += rmsei
    return data_compare, rmse

In [16]:
#choose ODpair 3-1 to test the algrithm whether bug-free:
T1 = 3
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 1 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']
# testdf = mode_data[mode_data.index.isin(list(mode_data.index[:20]))] #test OD pair 3-1,3-2,3-3,3-4
testdf = mode_data[mode_data['PUlocationID']==3]
# testdf = mode_data[mode_data.index == '3-2']

predict_choice_test, timeslot_test = apply_model(T, nestList, wagelist, Lambda, testdf)
combine_test, rmse_test = compare_with_ground_truth(predict_choice_test, acs)
print('The time used to run the code:', timeslot_test)
print('The rmse of this model is:', rmse_test)
print()
print('The predict transportation choice is:')
predict_choice_test.head()

100%|██████████| 237/237 [00:22<00:00, 10.75it/s]


The time used to run the code: 22.06189552694559
The rmse of this model is: 200.16818552

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-4,0.0,0.4744689,0.039536,5.236155,0.249839
3-7,0.003762,3.291113,0.174238,31.139544,1.391343
3-9,0.0,0.6040043,0.02547,1.305387,0.065139
3-10,0.0,0.6362928,3e-06,2.24167,0.122034
3-11,0.0,8.805902000000001e-33,0.0,3.868835,0.131165


In [17]:
combine_test

Unnamed: 0,taxi_zone,P(mode1)_x,P(mode2)_x,P(mode3)_x,P(mode4)_x,P(mode5)_x,P(mode1)_y,P(mode2)_y,P(mode3)_y,P(mode4)_y,P(mode5)_y
0,3,709.266929,867.421554,448.755073,5931.400161,749.156283,0.228957,35.391246,14.54735,7706.507979,949.324468


### Tune the parameters

## Apply to Scenario 1

In [None]:
#Run the model for the whole dataset
T1 = 0.1 #should set grid search for T1 
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 0.01 #should set grid search for Lambda 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data)
combine, rmse = compare_with_ground_truth(predict_transportation_choice, acs)

 22%|██▏       | 12184/55868 [24:09<1:42:25,  7.11it/s]

In [None]:
print('The time used to run the code:', timeslot)
print('The mse of this model is:', mse)
print()
print('The predict transportation choice is:')
predict_transportation_choice

In [None]:
# Save df to csv
predict_transportation_choice.to_csv('results_1.csv')
combine.to_csv('combined_results_1.csv')

### Apply to scenario 2 and 3
Only use effeccted taxi zone for Scenario2 (+$2.75) and scenario 3 (+$10):

**For Scenario 2**
Taxi-zones below 96th street: 140,141, 237, 236, 263, 262, 43, 238, 239, 143,142, 12, 88, 261, 13, 87, 209, 231, 45, 232, 148, 144, 211, 125, 158, 249, 114, 113, 79, 4, 224, 107, 234, 90, 68, 246, 186, 164, 100, 170, 137, 233, 162, 161, 230, 48, 50, 163, 229.

**For Scenario 3**
Taxi-zones under 60th street: 12, 88, 261, 13, 87, 209, 231, 45, 232, 148, 144, 211, 125, 158, 249, 114, 113, 79, 4, 224, 107, 234, 90, 68, 246, 186, 164, 100, 170, 137, 233, 162, 161, 230, 48, 50, 163, 229.