## Note: mode1-taxi, mode2-FHV, mode3-shared FHV, mode4-PT, mode5-walking

In [165]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Data

In [167]:
acs = pd.read_csv('final_acs_transportation_choice.csv', index_col=0)
acs.head()

Unnamed: 0,taxi_zone,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
0,3,177.762595,73.068204,1.150001,16668.822855,2001.196346
1,4,201.297419,120.885755,61.223127,13318.123341,6279.470358
2,5,49.601278,7.992997,0.306749,8847.276451,142.822526
3,6,164.390442,40.427858,2.34638,22929.859149,2485.97617
4,7,616.150103,162.371679,110.267353,138012.557541,11998.653324


In [None]:
mode_data = pd.read_csv('final_allMode_with_wage.csv', index_col=0).set_index('ODpair')
mode_data = mode_data.dropna()
mode_data.head()

## Modeling

### Nested Logit Model

In [97]:
def utility(mode, wage, Lambda, dataset):
    '''
    Get the utility for mode j under specific OD pair
    Lambda: parameter that trade-off different transportation mode
    '''
    subset = dataset[dataset['mode'] == mode]
    vj = Lambda * (float(int(wage)/525600) * float(subset['duration']) + float(subset['price'])) #525600: convert wage scale 'year' to 'minitues'
    return -vj #assign negative utility?


def InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset):
    '''
    Get the inclusive value for nest K
    Nk:nest k
    T: the dict that contains Tao(dissmilarity parameter) for each Nest. for example, Tk meeas Tao for nest K
    '''
    subsetNk = dataset[dataset['nest'] == Nk]
    modes = list(subsetNk['mode'].unique()) #what modes contained in this nest k
    sumIV = 0
    for j in modes:   
        vj = utility(j, wage, Lambda, subsetNk)
        dictVj[j] = vj
        sumIV += np.exp((1/Tk)*vj)  
    IVk = np.log(sumIV)
    return dictVj, IVk
    

def denoSum(T, nestList, wage, Lambda, dataset):
    '''
    Calculate the denomenator for P(y=Nk)
    T: the dict that contains Tk for each Nest, in our case T={1:T1, 2:T2, 3:T3}; T1, T2, T3 defined by us 
    TotK: the total number of nest this model has, in our case TotK = 3
    '''
    deno = 0
    dictIVk = {}
    dictVj = {}
    for Nk in nestList: #k is the k Nest, in our case k=1,2,3
        Tk = T[Nk] #get the tao for nest k 
        dictVj, IVk = InclusiveValue(Nk, Tk, wage, dictVj, Lambda, dataset)
        denok = np.exp(Tk*IVk)
        deno += denok
        dictIVk[Nk] = IVk
    return dictVj, dictIVk, deno


def probability (j, Nk, T, dictVj, dictIVk, deno):  
    '''
    Calculate the probability for the specific mode j and Nest Nk
    '''
    Tk = T[Nk]
    IVk = dictIVk[Nk]
    vj = dictVj[j]
    pjk = np.exp((1/Tk)*vj)/np.exp(IVk) #pjk: P(y=j, y belong to Nk)
    pk = np.exp(Tk*IVk)/deno #pk: P(y belong to Nk)
    pj = pjk*pk #pj: P(y=j)
    return pj

### Apply to our case

In [170]:
def apply_model(T, nestList, wageList, Lambda, dataAll):
    '''
    apply to our case
    '''
    import timeit
    start = timeit.default_timer()

    ODpair_list = list(dataAll.index.unique())
    p = []
    for i in ODpair_list: #indentify OD pair  
        dataOD = dataAll[dataAll.index==i] 
        modeList = list(dataOD['mode'])
        nestList = list(dataOD['nest'])
        
        pop_OD = [] #store the results under each OD pair
        for wage in wageList:            
            dictVj, dictIVk, deno = denoSum(T, set(nestList), wage, Lambda, dataOD)
            pop_mode = [] #store the results under each OD pair and each wage
            for i in range(1,6):
                if i in modeList: #not all modes appear in every OD pair
                    pop = dataOD[wage].mean() * probability(i, nestList[modeList.index(i)], T, dictVj, dictIVk, deno)
                    if np.isnan(pop) == True: #if predicted population is nan, replace it as 0, means no people choose
                        pop = 0
                
                    pop_mode.append(pop)
                else: 
                    pop_mode.append(0)
            pop_OD.append(pop_mode)
        pop_OD_sum = [sum(x) for x in zip(*pop_OD)] #sum the weighted population under each wage scenario
        p.append(pop_OD_sum)

    df = pd.DataFrame(p, columns=['P(mode1)', 'P(mode2)', 'P(mode3)', 'P(mode4)', 'P(mode5)']
                      , index=ODpair_list)
    
    stop = timeit.default_timer()
    timeslot = stop - start
    return df, timeslot

def compare_with_ground_truth(predictdf, truedf):
    '''
    compare our predicted transportation choice with ground truth
    
    The header of the datafrme after merge (named 'data_compare') should be like:
    taxi_zone | P(mode1)_x | P(mode2)_x | P(mode3)_x | P(mode4)_x | P(mode5)_x | P(mode1)_y | P(mode2)_y | P(mode3)_y | P(mode4)_y | P(mode5)_y
    '''
    import numpy as np
    # makesure predictdf and truedf have the same formats
    for col in predictdf.columns:
        predictdf[col] = predictdf[col].astype(float) 
    predictdf = predictdf.fillna(0)
    predictdf = predictdf.replace([np.inf, -np.inf], np.nan)
    predictdf = predictdf.dropna()
    predictdf['taxi_zone'] = predictdf.index.map(lambda x: x.split('-')[0])
    predictdf = predictdf.groupby('taxi_zone').mean().reset_index() #group the popuation by taxi zone
    predictdf['taxi_zone'] = predictdf['taxi_zone'].astype(int)
    truedf['taxi_zone'] = truedf['taxi_zone'].astype(int)
    
    data_compare = pd.merge(predictdf, truedf, left_on='taxi_zone', right_on = 'taxi_zone', how = 'left')
    data_compare = data_compare.dropna() 
    
    mse = 0
    for i in range(1,6):
        msei = (data_compare[data_compare.columns[i]] - data_compare[data_compare.columns[i+5]])**2
    mse += msei
    return float(mse)

In [172]:
#choose ODpair 3-1 to test the algrithm whether bug-free:
T1 = 2500
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 2 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']
testdf = mode_data[mode_data.index.isin(list(mode_data.index[:10]))] #test OD pair 3-1,3-2,3-3,3-4

predict_choice_test, timeslot_test = apply_model(T, nestList, wagelist, Lambda, testdf)
mse_test = compare_with_ground_truth(predict_choice_test, acs)
print('The time used to run the code:', timeslot_test)
print('The mse of this model is:', mse_test)
print()
print('The predict transportation choice is:')
predict_choice_test

The time used to run the code: 0.22920473299745936
The mse of this model is: 4004786.8137141904

The predict transportation choice is:


Unnamed: 0,P(mode1),P(mode2),P(mode3),P(mode4),P(mode5)
3-1,565.151604,561.849947,493.998449,0.0,0.0
3-2,0.0,0.0,0.0,6170.0,1.1647350000000001e-43
3-3,18922.0,0.0,0.0,0.0,0.0
3-4,0.0,2059.254069,1897.745931,0.0,0.0


In [None]:
#Run the model for the whole dataset
T1 = 3000 #should set grid search for T1 
T2 = 1
T3 = 1
T = {1:T1, 2:T2, 3:T3} #Tao for each nest
Lambda = 2 #should set grid search for Lambda 
nestList = [1, 2, 3]
wagelist = ['2500', '7500', '12500', '17500', '22500', '30000', '42500', '62500', '87500', '125000', '225000']

predict_transportation_choice, timeslot = apply_model(T, nestList, wagelist, Lambda, mode_data)
mse = compare_with_ground_truth(predict_choice_test, acs)

In [None]:
print('The time used to run the code:', timeslot)
print('The mse of this model is:', mse)
print()
print('The predict transportation choice is:')
predict_transportation_choice