In [16]:
import pandas as pd
import numpy as np
import time
import collections

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=FutureWarning)

In [17]:
import statsmodels.discrete.discrete_model as sm

In [18]:
class Classify:
    def __init__(self, col):
        # self.data = data
        self.data = None
        self.col = col
        
    def process_data(self, data):
        col = self.col
        xname = [# 'primary_role',
                'op_travel_wasted',
                'op_like_biking',
                'op_eco_concern',
                'op_like_driving',
                'op_need_car',
                'op_schedule_transit',
                'op_feel_safe',
                'op_like_transit',
                'op_need_own_car',
                'op_limit_driving',
                'op_smartphone',
                'op_dress_professional',
                'op_travel_stress']
        data[col].replace(['Personal bike', 'Bike share (e.g. JUMP)'], 'bike', inplace = True)
        data[col].replace(['Bus and/or shuttle', 'Train and/or light rail'], 'bus', inplace = True)
        data[col].replace(['Lyft, Uber, or other ride-hailing service', 'Carpool and/or vanpool with others', 'Drive alone in a car (or other vehicle)', 'Get dropped off by a friend of family'], 'drive', inplace = True)
        data[col].replace(['Walk (or wheelchair)', 'Skate, skateboard, or scooter', 'Other:'], 'other', inplace = True)
        if col == 'lastmile_bus':
            data[col].replace(['drive'], 'other', inplace = True)
        if col == 'lastmile_train':
            data[col].replace(['Get dropped off by a friend or family'], 'drive', inplace = True)
            # data[col].replace(['drive'], 'drive&bus', inplace = True)
            # data[col].replace(['bus'], 'drive&bus', inplace = True)
        if col == 'firstmile_train':
            data[col].replace(['bus'], 'other', inplace = True)
        data.replace(['Strongly agree'], 5, inplace = True)
        data.replace(['Somewhat agree'], 4, inplace = True)
        data.replace(['Neither agree nor disagree'], 3, inplace = True)
        data.replace(['Somewhat disagree'], 2, inplace = True)
        data.replace(['Strongly disagree'], 1, inplace = True)
        
        data['primary_role'].replace(['Undergraduate student (including Post-baccalaureate)'], 'undergra', inplace = True)
        data['primary_role'].replace(['Graduate student'], 'gra', inplace = True)
        data['primary_role'].replace(['Faculty'], 'fac', inplace = True)
        data['primary_role'].replace(["I'm no longer affiliated with UC Davis", 'Other:'], 'other', inplace = True)
        data['primary_role'].replace(['Visiting scholar', 'Staff', 'Post doc'], 'staff', inplace = True)
        
        data[col].replace(['bike'], 1, inplace = True)
        data[col].replace(['bus'], 2, inplace = True)
        data[col].replace(['drive'], 3, inplace = True)
        data[col].replace(['other'], 4, inplace = True)
        
        data = data.dropna(subset=[col] + xname + ['primary_role'])
        
        df1 = data[xname+[col]]
        self.data = pd.concat([df1, pd.get_dummies(data['primary_role'])], axis=1)
        # self.data = pd.get_dummies(data)
        return self.data
    
    def forward_selected(self):
        """Linear model designed by forward selection.

        Parameters:
        -----------
        data : pandas DataFrame with all possible predictors and response

        response: string, name of response column in data

        Returns:
        --------
        model: an "optimal" fitted statsmodels linear model
               with an intercept
               selected by forward selection
               evaluated by adjusted R-squared
        """
        response = self.col
        data = self.data
        remaining = set(data.columns)
        print(remaining)
        remaining.remove(response)
        selected = []
        current_score, best_new_score = float('Inf'), float('Inf')
        # print(remaining)
        while remaining and current_score == best_new_score:
            scores_with_candidates = []
            for candidate in remaining:
                l = sm.MNLogit(data[response].astype(int), data[selected+[candidate]].astype(int)).fit_regularized(penalty = 'l2')
                score = l.aic
                scores_with_candidates.append((score, candidate))
            scores_with_candidates.sort(reverse = True)
            best_new_score, best_candidate = scores_with_candidates.pop()
            # print(current_score, best_new_score)
            if current_score > best_new_score:
                remaining.remove(best_candidate)
                selected.append(best_candidate)
                current_score = best_new_score
        model = sm.MNLogit(data[response].astype(int), data[selected].astype(int)).fit_regularized(penalty = 'l2')
        return model



In [19]:
# Load the data
# data_dic = pd.read_csv("data/cts1819_data_dictionary.csv")
data = pd.read_csv("data/CTS_noID_1819.csv")

In [20]:
colname = 'firstmile_bus'
task = Classify(colname)
d = task.process_data(data)
print(d.shape)
firstmile_bus = task.forward_selected()

(898, 19)
{'op_need_car', 'op_limit_driving', 'gra', 'other', 'op_like_biking', 'op_schedule_transit', 'op_like_driving', 'op_need_own_car', 'op_travel_stress', 'firstmile_bus', 'staff', 'undergra', 'op_like_transit', 'op_feel_safe', 'op_dress_professional', 'op_travel_wasted', 'fac', 'op_smartphone', 'op_eco_concern'}
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.5672033301908667
            Iterations: 13
            Function evaluations: 14
            Gradient evaluations: 13
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.5617964909730151
            Iterations: 13
            Function evaluations: 14
            Gradient evaluations: 13
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.2857602718620051
            Iterations: 32
            Function evaluations: 32
            Gradient evaluations: 32
Optimization terminated successfully.    (Exit

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.4974571942128458
            Iterations: 50
            Function evaluations: 53
            Gradient evaluations: 50
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.504850617627999
            Iterations: 54
            Function evaluations: 56
            Gradient evaluations: 54
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.5018898796059584
            Iterations: 48
            Function evaluations: 51
            Gradient evaluations: 48
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.5055984886435726
            Iterations: 54
            Function evaluations: 57
            Gradient evaluations: 54
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.5047456977096609
            Iterations: 52
            Function

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.47103730410390504
            Iterations: 101
            Function evaluations: 104
            Gradient evaluations: 101
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.47091361022993916
            Iterations: 119
            Function evaluations: 123
            Gradient evaluations: 119
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.4710912500347194
            Iterations: 100
            Function evaluations: 103
            Gradient evaluations: 100
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.4694057762405865
            Iterations: 100
            Function evaluations: 104
            Gradient evaluations: 100
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.4718166797516325
            Iterations: 90
     

In [21]:
colname = 'firstmile_train'
task = Classify(colname)
d = task.process_data(data)
print(d.shape)
firstmile_train = task.forward_selected()

(74, 18)
{'op_need_car', 'op_limit_driving', 'gra', 'staff', 'op_like_biking', 'op_schedule_transit', 'op_like_driving', 'op_need_own_car', 'op_travel_stress', 'undergra', 'op_like_transit', 'op_feel_safe', 'op_dress_professional', 'op_travel_wasted', 'fac', 'op_smartphone', 'firstmile_train', 'op_eco_concern'}
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.0550238014880513
            Iterations: 6
            Function evaluations: 7
            Gradient evaluations: 6
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.0666031210197153
            Iterations: 5
            Function evaluations: 6
            Gradient evaluations: 5
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.0501097617969486
            Iterations: 13
            Function evaluations: 14
            Gradient evaluations: 13
Optimization terminated successfully.    (Exit mode 0)
     

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.8910392896588227
            Iterations: 64
            Function evaluations: 66
            Gradient evaluations: 64
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.8999466085871296
            Iterations: 65
            Function evaluations: 66
            Gradient evaluations: 65
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.8953903805286035
            Iterations: 64
            Function evaluations: 66
            Gradient evaluations: 64
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.9015317865668814
            Iterations: 65
            Function evaluations: 65
            Gradient evaluations: 65
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.9003615440684256
            Iterations: 67
            Functio

In [22]:
colname = 'lastmile_bus'
task = Classify(colname)
d = task.process_data(data)
print(d.shape)
lastmile_bus = task.forward_selected()

(898, 19)
{'op_need_car', 'op_limit_driving', 'gra', 'other', 'op_like_biking', 'op_schedule_transit', 'op_like_driving', 'op_need_own_car', 'op_travel_stress', 'lastmile_bus', 'staff', 'undergra', 'op_like_transit', 'op_feel_safe', 'op_dress_professional', 'op_travel_wasted', 'fac', 'op_smartphone', 'op_eco_concern'}
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.41393356941890697
            Iterations: 14
            Function evaluations: 14
            Gradient evaluations: 14
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.40825061470440094
            Iterations: 12
            Function evaluations: 12
            Gradient evaluations: 12
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.012464898661371
            Iterations: 20
            Function evaluations: 20
            Gradient evaluations: 20
Optimization terminated successfully.    (Exit

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3636989330322309
            Iterations: 33
            Function evaluations: 36
            Gradient evaluations: 33
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.37619034723919076
            Iterations: 35
            Function evaluations: 38
            Gradient evaluations: 35
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3756514165958916
            Iterations: 34
            Function evaluations: 37
            Gradient evaluations: 34
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3748171067933998
            Iterations: 32
            Function evaluations: 35
            Gradient evaluations: 32
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3766942294601016
            Iterations: 60
            Functi

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3470614177109677
            Iterations: 44
            Function evaluations: 47
            Gradient evaluations: 44
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.34448194644669966
            Iterations: 54
            Function evaluations: 59
            Gradient evaluations: 54
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.34523679085864367
            Iterations: 68
            Function evaluations: 71
            Gradient evaluations: 68
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.3449680626179527
            Iterations: 79
            Function evaluations: 83
            Gradient evaluations: 79
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.34530440036413906
            Iterations: 63
            Func

In [23]:
colname = 'lastmile_train'
task = Classify(colname)
d = task.process_data(data)
print(d.shape)
lastmile_train = task.forward_selected()

(74, 18)
{'op_need_car', 'op_limit_driving', 'gra', 'staff', 'op_like_biking', 'op_schedule_transit', 'op_like_driving', 'op_need_own_car', 'op_travel_stress', 'undergra', 'op_like_transit', 'op_feel_safe', 'op_dress_professional', 'op_travel_wasted', 'lastmile_train', 'fac', 'op_smartphone', 'op_eco_concern'}
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.232888411331522
            Iterations: 8
            Function evaluations: 8
            Gradient evaluations: 8
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.1775322537137303
            Iterations: 7
            Function evaluations: 8
            Gradient evaluations: 7
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.2270135035879837
            Iterations: 45
            Function evaluations: 45
            Gradient evaluations: 45
Optimization terminated successfully.    (Exit mode 0)
       

Optimization terminated successfully.    (Exit mode 0)
            Current function value: 1.0063093922059216
            Iterations: 38
            Function evaluations: 40
            Gradient evaluations: 38
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.8515039735437624
            Iterations: 55
            Function evaluations: 57
            Gradient evaluations: 55
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.9305177710441639
            Iterations: 55
            Function evaluations: 57
            Gradient evaluations: 55
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.9240560953479402
            Iterations: 53
            Function evaluations: 55
            Gradient evaluations: 53
Optimization terminated successfully.    (Exit mode 0)
            Current function value: 0.9332010317523769
            Iterations: 81
            Functio

In [24]:
firstmile_bus.summary()

0,1,2,3
Dep. Variable:,firstmile_bus,No. Observations:,898.0
Model:,MNLogit,Df Residuals:,880.0
Method:,MLE,Df Model:,15.0
Date:,"Wed, 18 Mar 2020",Pseudo R-squ.:,0.08031
Time:,15:09:58,Log-Likelihood:,-420.82
converged:,True,LL-Null:,-457.57
Covariance Type:,nonrobust,LLR p-value:,1.057e-09

firstmile_bus=2,coef,std err,z,P>|z|,[0.025,0.975]
op_smartphone,0.2032,0.184,1.107,0.268,-0.156,0.563
undergra,0.9477,0.595,1.594,0.111,-0.218,2.113
gra,-0.2016,0.793,-0.254,0.799,-1.756,1.353
op_like_biking,-0.5352,0.204,-2.625,0.009,-0.935,-0.136
op_like_transit,0.5256,0.177,2.962,0.003,0.178,0.873
op_feel_safe,-0.4196,0.207,-2.023,0.043,-0.826,-0.013
firstmile_bus=3,coef,std err,z,P>|z|,[0.025,0.975]
op_smartphone,0.3723,0.209,1.786,0.074,-0.036,0.781
undergra,0.0937,0.618,0.152,0.880,-1.118,1.306
gra,-1.1324,0.940,-1.205,0.228,-2.974,0.709


In [25]:
firstmile_train.summary()

0,1,2,3
Dep. Variable:,firstmile_train,No. Observations:,74.0
Model:,MNLogit,Df Residuals:,68.0
Method:,MLE,Df Model:,4.0
Date:,"Wed, 18 Mar 2020",Pseudo R-squ.:,0.1526
Time:,15:10:04,Log-Likelihood:,-66.714
converged:,True,LL-Null:,-78.725
Covariance Type:,nonrobust,LLR p-value:,7.905e-05

firstmile_train=3,coef,std err,z,P>|z|,[0.025,0.975]
undergra,1.8971,0.619,3.064,0.002,0.684,3.111
staff,1.6094,1.095,1.469,0.142,-0.538,3.756
gra,-1.3863,0.645,-2.148,0.032,-2.651,-0.121
firstmile_train=4,coef,std err,z,P>|z|,[0.025,0.975]
undergra,1.2040,0.658,1.829,0.067,-0.086,2.494
staff,-18.4045,9919.509,-0.002,0.999,-1.95e+04,1.94e+04
gra,-1.0986,0.577,-1.903,0.057,-2.230,0.033


In [26]:
lastmile_bus.summary()

0,1,2,3
Dep. Variable:,lastmile_bus,No. Observations:,898.0
Model:,MNLogit,Df Residuals:,886.0
Method:,MLE,Df Model:,10.0
Date:,"Wed, 18 Mar 2020",Pseudo R-squ.:,0.07813
Time:,15:10:11,Log-Likelihood:,-308.09
converged:,True,LL-Null:,-334.2
Covariance Type:,nonrobust,LLR p-value:,1.038e-07

lastmile_bus=2,coef,std err,z,P>|z|,[0.025,0.975]
op_smartphone,-0.0353,0.257,-0.138,0.891,-0.539,0.468
op_like_driving,0.1607,0.223,0.720,0.471,-0.277,0.598
op_like_biking,-0.8711,0.244,-3.563,0.000,-1.350,-0.392
op_like_transit,-0.0008,0.245,-0.003,0.997,-0.481,0.479
op_limit_driving,0.1378,0.246,0.559,0.576,-0.345,0.621
op_travel_wasted,0.1422,0.225,0.631,0.528,-0.300,0.584
lastmile_bus=4,coef,std err,z,P>|z|,[0.025,0.975]
op_smartphone,0.2740,0.111,2.475,0.013,0.057,0.491
op_like_driving,0.3458,0.093,3.728,0.000,0.164,0.528
op_like_biking,-0.7213,0.119,-6.073,0.000,-0.954,-0.489


In [27]:
lastmile_train.summary()

0,1,2,3
Dep. Variable:,lastmile_train,No. Observations:,74.0
Model:,MNLogit,Df Residuals:,62.0
Method:,MLE,Df Model:,9.0
Date:,"Wed, 18 Mar 2020",Pseudo R-squ.:,0.277
Time:,15:10:13,Log-Likelihood:,-63.011
converged:,True,LL-Null:,-87.153
Covariance Type:,nonrobust,LLR p-value:,2.261e-07

lastmile_train=2,coef,std err,z,P>|z|,[0.025,0.975]
op_like_transit,-0.5700,0.439,-1.300,0.194,-1.430,0.290
undergra,2.5068,0.981,2.556,0.011,0.585,4.429
op_like_biking,-0.4438,0.367,-1.210,0.226,-1.163,0.275
op_need_car,0.4064,0.346,1.176,0.240,-0.271,1.084
lastmile_train=3,coef,std err,z,P>|z|,[0.025,0.975]
op_like_transit,-1.6589,0.687,-2.415,0.016,-3.005,-0.313
undergra,4.5204,2.460,1.838,0.066,-0.301,9.341
op_like_biking,-1.8165,0.695,-2.612,0.009,-3.180,-0.453
op_need_car,1.8898,0.749,2.523,0.012,0.422,3.358
lastmile_train=4,coef,std err,z,P>|z|,[0.025,0.975]
