## 0 import packages

In [121]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report as clr
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import time
from  sklearn.metrics import log_loss
# from  sklearn.metrics import neg_log_loss
import warnings
import xgboost as xgb
from matplotlib import pyplot as plt
warnings.filterwarnings('ignore')

## 1 load data

In [122]:
def load_data(path_train, path_test):
    """Read the training and testing CSV files into DataFrames.

    Args:
        path_train (str): path of the training CSV file
        path_test (str): path of the testing CSV file

    Returns:
        tuple: ``(data, test_data)`` -- the training dataframe followed by
        the testing dataframe, both exactly as ``pd.read_csv`` parses them
    """
    # The generator is consumed left to right, so the train file is read first.
    data, test_data = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))
    return data, test_data

## 2 pre-process train

In [123]:
def Get_D(name,data):
    """One-hot encode a single categorical column.

    The indicator columns are named ``<name>_<level>`` and are appended to
    the right of the frame; the original column is removed. The frame passed
    in is left untouched -- a new frame is returned.

    Args:
        name (string): the column name of the categorical feature
        data (dataframe): the input dataset

    Returns:
        dataframe: a new dataframe with ``name`` replaced by its indicator columns
    """
    def _prefixed(level):
        # str() so that numeric levels (e.g. day-of-week integers) can be concatenated
        return name+'_' +str(level)

    indicator_columns = pd.get_dummies(data[name]).rename(columns=_prefixed)
    return data.join(indicator_columns).drop(columns=[name])

def transfrom_distance(x):
    """Bucket a trip distance into one of three ordinal classes.

    Args:
        x (float): the value of distance

    Returns:
        int: 0 when ``x < 3``, 1 when ``3 <= x < 15``, otherwise 2
        (values that compare false against both bounds, such as NaN, also give 2)
    """
    # Upper bounds are checked in ascending order; the first one x falls
    # below decides the class.
    for category, upper_bound in enumerate((3, 15)):
        if x < upper_bound:
            return category
    return 2

def pre_process_train(data, if_trip_class=False):
    """Preprocess the training data for the distance and mode models.

    Steps, in order:
      1. replace ``travel_date`` (parsed as day/month/year) by its day of week,
      2. one-hot encode every categorical column through ``Get_D``,
      3. drop the identifier columns ``id``, ``trip_n`` and ``person_id``,
      4. optionally bucket ``trip_distance`` into classes with
         ``transfrom_distance``.

    The frame passed in is not modified; all work happens on new frames.

    Args:
        data (dataframe): raw training data
        if_trip_class (bool): when True, ``trip_distance`` is converted to the
            class labels produced by ``transfrom_distance``

    Returns:
        tuple: ``(search_columns, data)`` -- the index of candidate feature
        columns (every column except ``trip_distance`` and ``mode``) and the
        preprocessed dataframe
    """
    # assign() returns a new frame, so the caller's raw travel_date strings
    # are preserved (the original code overwrote them in place).
    day_of_week = pd.to_datetime(data['travel_date'], format='%d/%m/%Y').dt.dayofweek
    data = data.assign(travel_date=day_of_week)

    # get dummies of categorical column
    names=['travel_date','survey_language','disability','o_location_type','d_location_type','res_type','rent_own','o_purpose_category','d_purpose_category','age','employment','student','planning_apps','industry','gender','education','income_aggregate']
    for name in names:
        data=Get_D(name,data)
    data = data.drop(columns=['id','trip_n','person_id'])

    # change the trip_distance
    if if_trip_class:
        data['trip_distance'] = data['trip_distance'].apply(transfrom_distance)

    # both targets are excluded from the candidate feature list
    search_columns = data.columns.drop(['trip_distance', 'mode'])

    return search_columns, data

## 3 Feature importance analysis

In [124]:
def dis_features(data, search_columns,num_features):
    """Select the features that matter most for predicting ``trip_distance``.

    A default ``RandomForestRegressor`` is fitted on ``data[search_columns]``
    and the columns are ranked by its ``feature_importances_``. The forest is
    not seeded here.

    Args:
        data (dataframe): preprocessed data containing ``trip_distance``
        search_columns (list-like): candidate feature columns of ``data``
        num_features (int): how many of the top-ranked features to keep

    Returns:
        pd.Index: names of the ``num_features`` most important columns,
        most important first
    """
    forest = RandomForestRegressor()
    forest.fit(data[search_columns], data['trip_distance'])

    importance_table = pd.DataFrame(
        list(zip(search_columns, forest.feature_importances_)),
        columns=['name', 'value'],
    )
    ranked = importance_table.sort_values('value', ascending=False)
    top_names = list(ranked[0:num_features]['name'].values)

    return pd.Index(top_names)

In [125]:
def mode_features(data, search_columns,num_features):
    """Select the features that matter most for predicting ``mode``.

    A default ``RandomForestClassifier`` is fitted on ``data[search_columns]``
    and the columns are ranked by its ``feature_importances_``. The forest is
    not seeded here.

    Args:
        data (dataframe): preprocessed data containing ``mode``
        search_columns (list-like): candidate feature columns of ``data``
        num_features (int): how many of the top-ranked features to keep

    Returns:
        pd.Index: names of the ``num_features`` most important columns,
        most important first
    """
    forest = RandomForestClassifier()
    forest.fit(data[search_columns], data['mode'])

    importance_table = pd.DataFrame(
        list(zip(search_columns, forest.feature_importances_)),
        columns=['name', 'value'],
    )
    ranked = importance_table.sort_values('value', ascending=False)
    top_names = list(ranked[0:num_features]['name'].values)

    return pd.Index(top_names)

## 4 predict travel distance

In [126]:
def score(test, predict):
    """Compute the root-mean-square error between two sequences.

    Args:
        test (array): the true value
        predict (array): the prediction value

    Returns:
        float: sqrt(sum((test - predict)^2) / len(test))
    """
    actual = np.array(test)
    estimate = np.array(predict)
    residual = actual - estimate
    mean_squared_error = (residual ** 2).sum() / len(actual)
    return np.sqrt(mean_squared_error)

def class_dis_predict_rf_grid(data, search_columns_feature, rf_params, if_trip_class=False):
    """Tune a random forest for ``trip_distance`` with an exhaustive grid search.

    The data is split 75/25 (``random_state=42``), the features are
    standardised with statistics from the training part, and a 5-fold
    ``GridSearchCV`` is run over ``rf_params``. The hold-out score is printed:
    log-loss when distance is treated as a class, RMSE otherwise.

    Args:
        data (dataframe): preprocessed data with ``trip_distance`` and ``mode``
        search_columns_feature (list-like): feature columns to train on
        rf_params (dict): parameter grid handed to ``GridSearchCV``
        if_trip_class (bool): True -> classifier scored by ``neg_log_loss``;
            False -> regressor with the search's default scoring

    Returns:
        tuple: ``(best_param, predict_dis_train, predict_dis)`` -- the best
        parameter combination and the tuned model's predictions on the
        training part and on the hold-out part of the split
    """
    features = data[search_columns_feature]
    targets = data[['trip_distance','mode']]
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.25, random_state=42)

    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    dis_train = y_train['trip_distance']
    dis_test = y_test['trip_distance']

    # Only the estimator and the scoring differ between the two settings.
    if if_trip_class:
        grid = GridSearchCV(RandomForestClassifier(random_state=0), rf_params, cv=5, scoring='neg_log_loss')
    else:
        grid = GridSearchCV(RandomForestRegressor(random_state=0), rf_params, cv=5)
    grid.fit(train_scaled, dis_train)

    predict_dis = grid.predict(test_scaled)
    predict_dis_train = grid.predict(train_scaled)

    if if_trip_class:
        holdout_score = log_loss(dis_test, grid.predict_proba(test_scaled))
    else:
        holdout_score = score(predict_dis, dis_test)
    print('score for dis is {}'.format(holdout_score))

    return grid.best_params_, predict_dis_train, predict_dis

def class_dis_predict_rf_random(data, search_columns_feature, rf_params, if_trip_class=False):
    """Tune a random forest for ``trip_distance`` with a randomized search.

    The data is split 75/25 (``random_state=42``), the features are
    standardised with statistics from the training part, and a 5-fold
    ``RandomizedSearchCV`` (``n_iter=20``) samples from ``rf_params``.
    Neither the forest nor the search is seeded here. The hold-out score is
    printed: log-loss when distance is treated as a class, RMSE otherwise.

    Args:
        data (dataframe): preprocessed data with ``trip_distance`` and ``mode``
        search_columns_feature (list-like): feature columns to train on
        rf_params (dict): parameter distributions handed to ``RandomizedSearchCV``
        if_trip_class (bool): True -> classifier scored by ``neg_log_loss``;
            False -> regressor with the search's default scoring

    Returns:
        tuple: ``(Random_best_param, predict_dis_train, predict_dis)`` -- the
        best sampled parameters and the tuned model's predictions on the
        training part and on the hold-out part of the split
    """
    features = data[search_columns_feature]
    targets = data[['trip_distance','mode']]
    X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.25, random_state=42)

    scaler = StandardScaler()
    scaler.fit(X_train)
    train_scaled = scaler.transform(X_train)
    test_scaled = scaler.transform(X_test)

    dis_train = y_train['trip_distance']
    dis_test = y_test['trip_distance']

    # Only the estimator and the scoring differ between the two settings.
    if if_trip_class:
        search = RandomizedSearchCV(RandomForestClassifier(), param_distributions=rf_params,cv=5,scoring='neg_log_loss',n_iter=20)
    else:
        search = RandomizedSearchCV(RandomForestRegressor(), param_distributions=rf_params,cv=5,n_iter=20)
    search.fit(train_scaled, dis_train)
    Random_best_param = search.best_params_

    predict_dis = search.predict(test_scaled)
    predict_dis_train = search.predict(train_scaled)

    if if_trip_class:
        holdout_score = log_loss(dis_test, search.predict_proba(test_scaled))
    else:
        holdout_score = score(predict_dis, dis_test)
    print('score for dis is {}'.format(holdout_score))

    return Random_best_param, predict_dis_train, predict_dis

## 5 predict mode

In [127]:
def mode_predict_rf_grid(data,search_columns_feature, rf_params, predict_train_dis, predict_dis):
    """Tune a random-forest mode classifier with an exhaustive grid search.

    The data is split 75/25 with ``random_state=42``. The predicted distance
    is appended to the features as column ``dis``, the features are
    standardised, and a 5-fold ``GridSearchCV`` scored by ``neg_log_loss`` is
    run. The hold-out log-loss is printed.

    Args:
        data (dataframe): preprocessed data with ``trip_distance`` and ``mode``
        search_columns_feature (list-like): feature columns to train on
        rf_params (dict): parameter grid handed to ``GridSearchCV``
        predict_train_dis (array): predicted distance for the training part of
            the split; needs one value per training row, so it must come from
            the same split settings used here
        predict_dis (array): predicted distance for the hold-out part of the
            split, one value per hold-out row

    Returns:
        dict: the best parameter combination found by the grid search
    """

    X = data[search_columns_feature]
    y = data[['trip_distance','mode']]
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

    # assign() builds new frames instead of writing into the split results,
    # which avoids chained-assignment problems on those slices.
    X_train = X_train.assign(dis=predict_train_dis)
    X_test = X_test.assign(dis=predict_dis)

    # The scaler is fitted once, after `dis` has been added; an earlier fit
    # on the features without `dis` was discarded and has been removed.
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    clf = RandomForestClassifier(random_state=42)
    grid = GridSearchCV(clf, rf_params, cv=5, scoring='neg_log_loss')
    grid.fit(X_train_std,y_train['mode'])
    best_params = grid.best_params_

    mode_predict_prob = grid.predict_proba(X_test_std)
    print(log_loss(y_test['mode'],mode_predict_prob))

    return best_params


def mode_predict_rf_random(data,search_columns_feature, rf_params, predict_train_dis, predict_dis):
    """Tune a random-forest mode classifier with a randomized search.

    The data is split 75/25 with ``random_state=42``. The predicted distance
    is appended to the features as column ``dis``, the features are
    standardised, and a 5-fold ``RandomizedSearchCV`` (``n_iter=20``) scored
    by ``neg_log_loss`` is run. Neither the forest nor the search is seeded
    here. The hold-out log-loss is printed.

    Args:
        data (dataframe): preprocessed data with ``trip_distance`` and ``mode``
        search_columns_feature (list-like): feature columns to train on
        rf_params (dict): parameter distributions handed to ``RandomizedSearchCV``
        predict_train_dis (array): predicted distance for the training part of
            the split; needs one value per training row, so it must come from
            the same split settings used here
        predict_dis (array): predicted distance for the hold-out part of the
            split, one value per hold-out row

    Returns:
        dict: the best parameter combination sampled by the search
    """

    X = data[search_columns_feature]
    y = data[['trip_distance','mode']]
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

    # assign() builds new frames instead of writing into the split results,
    # which avoids chained-assignment problems on those slices.
    X_train = X_train.assign(dis=predict_train_dis)
    X_test = X_test.assign(dis=predict_dis)

    # The scaler is fitted once, after `dis` has been added; an earlier fit
    # on the features without `dis` was discarded and has been removed.
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    clf = RandomForestClassifier()
    Random = RandomizedSearchCV(clf, param_distributions=rf_params,cv=5,scoring='neg_log_loss',n_iter=20)
    Random.fit(X_train_std,y_train['mode'])
    Random_best_param = Random.best_params_

    mode_predict_prob = Random.predict_proba(X_test_std)
    print(log_loss(y_test['mode'],mode_predict_prob))

    return Random_best_param


def mode_predict_boost_random(data,search_columns_feature, param_boost, predict_train_dis, predict_dis):
    """Tune an XGBoost mode classifier with a randomized search.

    The data is split 75/25 with ``random_state=42``. The predicted distance
    is appended to the features as column ``dis``, the features are
    standardised, and a 5-fold ``RandomizedSearchCV`` (``n_iter=10``,
    ``random_state=42``) scored by ``neg_log_loss`` is run. The search uses
    ``refit=False``, so a fresh classifier is trained with the best
    parameters before the hold-out log-loss is printed.

    Args:
        data (dataframe): preprocessed data with ``trip_distance`` and ``mode``
        search_columns_feature (list-like): feature columns to train on
        param_boost (dict): parameter distributions handed to ``RandomizedSearchCV``
        predict_train_dis (array): predicted distance for the training part of
            the split; needs one value per training row, so it must come from
            the same split settings used here
        predict_dis (array): predicted distance for the hold-out part of the
            split, one value per hold-out row

    Returns:
        dict: the best parameter combination sampled by the search
    """

    X = data[search_columns_feature]
    y = data[['trip_distance','mode']]
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

    # assign() builds new frames instead of writing into the split results,
    # which avoids chained-assignment problems on those slices.
    X_train = X_train.assign(dis=predict_train_dis)
    X_test = X_test.assign(dis=predict_dis)

    # The scaler is fitted once, after `dis` has been added; an earlier fit
    # on the features without `dis` was discarded and has been removed.
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_test_std = sc.transform(X_test)

    clf = xgb.XGBClassifier()

    rs_clf = RandomizedSearchCV(clf, param_boost, n_iter=10,
                                n_jobs=1, verbose=2, cv=5,
                                scoring='neg_log_loss', refit=False, random_state=42)

    rs_clf.fit(X_train_std,y_train['mode'])
    XGboost_best_params = rs_clf.best_params_

    # refit=False above means the search holds no fitted model, so train one here
    clf_boost =  xgb.XGBClassifier(**XGboost_best_params)
    clf_boost.fit(X_train_std, y_train['mode'])
    mode_predict_prob = clf_boost.predict_proba(X_test_std)
    print(log_loss(y_test['mode'],mode_predict_prob))

    return XGboost_best_params


## 6 run-all cross validation for different feature-selection strategies

In [128]:
"""
Choosing the hyperparameters
"""

rf_params_dis = {
    'n_estimators': [100],
    #'max_features': ['sqrt',0.5],
    'min_samples_split':[2],
    'max_depth': [4,8], # best
    'min_samples_leaf': [2],
    #"bootstrap":[True,False],
    # "criterion":['gini','entropy']
}

rf_params = {
    'n_estimators': [100],
    #'max_features': ['sqrt',0.5],
    'min_samples_split':[2],
    'max_depth': [4,8], # best
    'min_samples_leaf': [2],
    #"bootstrap":[True,False],
    # "criterion":['gini','entropy']
}




param_boost = {'subsample': [0.9],
 'silent': [False],
 'reg_lambda': [10.0],
 'n_estimators': [10,100],
 'min_child_weight': [0.2,0.5],
 'max_depth': [6,10,15],
 'learning_rate': [0.001,0.1,0.2],
 'gamma': [0.5,1],
 'colsample_bytree': [0.4,0.6,0.8],
 'colsample_bylevel': [0.4,0.6,0.8]}

# param_boost = {
#            'silent': [False],
#         'max_depth': [6, 10, 15, 20],
#         'learning_rate': [0.001, 0.01, 0.1, 0.2, 0,3],
#         'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#         'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#         'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
#         'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
#         'gamma': [0, 0.25, 0.5, 1.0],
#         'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
#         'n_estimators': [100]}


def run_all_function(path_train,path_test,rf_params,num_features_dis,num_features_dis_mode,if_trip_class):
    """Run the whole evaluation: load, preprocess, select features, tune models.

    The distance model is tuned first (grid search and randomized search).
    The grid-search distance predictions are then fed as an extra feature to
    the three mode classifiers (random forest with grid search, random forest
    with randomized search, XGBoost with randomized search). The XGBoost
    search space is read from the module-level ``param_boost``.

    Args:
        path_train (str): path of the training CSV file
        path_test (str): path of the testing CSV file (loaded, not used further here)
        rf_params (dict): random-forest search space, used for both targets
        num_features_dis (int): number of top features kept for the distance model
        num_features_dis_mode (int): number of top features kept for the mode model
        if_trip_class (bool): treat ``trip_distance`` as a class label instead
            of a continuous value

    Returns:
        tuple: ``(best_param, best_param2, mode_grid_best_params,
        mode_Random_best_param, XGboost_best_params, search_columns_feature,
        search_columns_feature_mode, data)`` -- the five best-parameter
        dictionaries, the two filtered feature indexes and the preprocessed
        training frame
    """

    def _without_rare_features(columns):
        # A column is discarded when its mean is below 0.05.
        keep_mask = np.array([not data[col].mean() < 0.05 for col in columns], dtype=bool)
        return columns[keep_mask]

    # get data and pre-process it
    data, test_data = load_data(path_train, path_test)
    search_columns, data = pre_process_train(data,if_trip_class)

    # feature-importance ranking, one ranking per prediction target
    search_columns_feature = _without_rare_features(
        dis_features(data, search_columns,num_features_dis)
    )
    search_columns_feature_mode = _without_rare_features(
        mode_features(data, search_columns,num_features_dis_mode)
    )

    # predict distance
    print("-------------Random Forest grid search for distance-----------------")
    best_param, predict_dis_train, predict_dis = class_dis_predict_rf_grid(
        data, search_columns_feature, rf_params,if_trip_class)

    print("-------------Random Forest random search for distance-----------------")
    best_param2, predict_dis_train2, predict_dis2 = class_dis_predict_rf_random(
        data, search_columns_feature, rf_params,if_trip_class)

    # predict mode; every mode model receives the grid-search distance predictions
    print(' **************** the number of features selected for predicting mode is {} ****************'.format(num_features_dis_mode))

    print('the result for grid search random forest')
    mode_grid_best_params = mode_predict_rf_grid(
        data, search_columns_feature_mode, rf_params, predict_dis_train, predict_dis)

    print('the result for Random search random forest')
    mode_Random_best_param = mode_predict_rf_random(
        data, search_columns_feature_mode, rf_params, predict_dis_train, predict_dis)

    print('the result for random XGBoost')
    XGboost_best_params = mode_predict_boost_random(
        data, search_columns_feature_mode, param_boost, predict_dis_train, predict_dis)

    return (
        best_param,
        best_param2,
        mode_grid_best_params,
        mode_Random_best_param,
        XGboost_best_params,
        search_columns_feature,
        search_columns_feature_mode,
        data,
    )




In [129]:
# Feature selection and distance prediction model choice.
# Every combination of the three lists below is evaluated by the driver loop.
# Whether trip_distance is treated as a class label (True) or a continuous value (False).
if_trip_class=[False]
# Number of top-ranked features kept for the distance model.
num_features_dis=[60]
# Number of top-ranked features kept for the mode model.
num_features_dis_mode=[60,80,100,136]


In [131]:
# Evaluate every configuration. The result names are rebound on each pass,
# so only the outputs of the last configuration remain available afterwards.
for itc in if_trip_class:
    for nfd in num_features_dis:
        for nfdm in num_features_dis_mode:
            (
                best_param,
                best_param2,
                mode_grid_best_params,
                mode_Random_best_param,
                XGboost_best_params,
                search_columns_feature,
                search_columns_feature_mode,
                data,
            ) = run_all_function(
                'nyc_train_validate.csv', 'nyc_test.csv',
                rf_params, nfd, nfdm, itc,
            )

-------------Random Forest grid search for distance-----------------
score for dis is 3.2979333336043135
-------------Random Forest random search for distance-----------------
score for dis is 3.2979954498376407
 **************** the number of features selected for predicting mode is 100 ****************
the result for grid search random forest
1.109297200456094
the result for Random search random forest
1.1118921775210997
the result for random XGBoost
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV] END colsample_bylevel=0.8, colsample_bytree=0.8, gamma=1, learning_rate=0.1, max_depth=15, min_child_weight=0.5, n_estimators=100, reg_lambda=10.0, silent=False, su

## 7 prediction

In [None]:
# Raw test set; it is passed to prediction() in a later cell.
test_data = pd.read_csv('nyc_test.csv')
def prediction(test_data, classifier_params_dis, classifier_params_mode_rf, search_columns_feature,search_columns_feature_mode, train_data=None, xgb_params=None):
    """Train the final models on all training data and predict mode probabilities.

    A ``RandomForestRegressor`` first predicts the trip distance. That
    prediction is appended as feature ``dis`` and fed, together with the
    selected mode features, to an XGBoost classifier and a random-forest
    classifier. The shape of the standardised test matrix is printed.

    The distance model is always a regressor, so this function matches
    parameters tuned with ``if_trip_class=False``.

    Args:
        test_data (dataframe): raw test data (same layout as the raw training
            data, without targets); it is not modified
        classifier_params_dis (dict): keyword arguments for the distance regressor
        classifier_params_mode_rf (dict): keyword arguments for the random-forest
            mode classifier
        search_columns_feature (list-like): feature columns for the distance model
        search_columns_feature_mode (list-like): feature columns for the mode models
        train_data (dataframe, optional): preprocessed training data with
            ``trip_distance`` and ``mode``; defaults to the module-level ``data``
        xgb_params (dict, optional): keyword arguments for the XGBoost
            classifier; defaults to the module-level ``XGboost_best_params``

    Returns:
        tuple: ``(mode_predict_rf, mode_predict_xgb)`` -- class-probability
        arrays from the random forest and from XGBoost for every test row
    """
    # Keep the original behaviour of reading the notebook globals when the
    # caller does not pass the training frame / XGBoost parameters explicitly.
    if train_data is None:
        train_data = data
    if xgb_params is None:
        xgb_params = XGboost_best_params

    # prepare the data on new frames so the caller's test_data stays raw
    day_of_week = pd.to_datetime(test_data['travel_date'], format='%d/%m/%Y').dt.dayofweek
    test_data = test_data.assign(travel_date=day_of_week)
    names=['travel_date','survey_language','disability','o_location_type','d_location_type','res_type','rent_own','o_purpose_category','d_purpose_category','age','employment','student','planning_apps','industry','gender','education','income_aggregate']
    for name in names:
        test_data=Get_D(name,test_data)
    test_data = test_data.drop(columns=['id','trip_n','person_id'])

    # predict the dis
    rf=RandomForestRegressor(**classifier_params_dis)
    x_train_dis = train_data[search_columns_feature]
    sc = StandardScaler()
    sc.fit(x_train_dis)
    x_train_dis = sc.transform(x_train_dis)
    # A category that never occurs in the test set yields no dummy column;
    # reindex() adds it as all zeros instead of failing with a KeyError.
    test_dis_features = test_data.reindex(columns=search_columns_feature, fill_value=0)
    test_data_std = sc.transform(test_dis_features)

    rf.fit(x_train_dis,train_data['trip_distance'])
    predict_dis = rf.predict(test_data_std)

    # predict the mode: selected features plus the predicted distance
    # (assign() returns new frames, so neither source frame is written to)
    x_train_mode = train_data[search_columns_feature_mode].assign(dis=rf.predict(x_train_dis))
    test_data1 = test_data.reindex(columns=search_columns_feature_mode, fill_value=0).assign(dis=predict_dis)
    sc = StandardScaler()
    sc.fit(x_train_mode)
    x_train_mode = sc.transform(x_train_mode)
    Test_data_std = sc.transform(test_data1)
    print(Test_data_std.shape)

    clf1 = xgb.XGBClassifier(**xgb_params)
    clf2=RandomForestClassifier(**classifier_params_mode_rf)
    clf1.fit(x_train_mode,train_data['mode'])
    clf2.fit(x_train_mode,train_data['mode'])
    mode_predict_rf = clf2.predict_proba(Test_data_std)
    mode_predict_xgb = clf1.predict_proba(Test_data_std)

    return mode_predict_rf, mode_predict_xgb




In [None]:
mode_predict_rf, mode_predict_xgb = prediction(
    test_data,
    best_param,
    mode_Random_best_param,
    search_columns_feature,
    search_columns_feature_mode,
)

# Save the XGBoost probabilities.
# NOTE(review): the column labels below assume the probability columns come
# back in alphabetical class order -- confirm against the fitted classifier.
test_data = pd.read_csv('nyc_test.csv')  # re-read to get the untouched id column
result = pd.DataFrame(
    data=mode_predict_xgb,
    columns=['bike', 'bus', 'drive', 'other', 'passenger', 'subway', 'walk'],
)
result['id'] = test_data['id']
# reorder the columns for the output file and index it by id
result = result[['id','drive','passenger','bus','subway','bike','walk','other']].set_index('id')
result.to_csv('test_xgb.csv')

(26294, 94)
Parameters: { "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




In [None]:
# save prediction

# Save the random-forest probabilities.
# NOTE(review): the column labels below assume the probability columns come
# back in alphabetical class order -- confirm against the fitted classifier.
test_data = pd.read_csv('nyc_test.csv')  # re-read to get the untouched id column
result = pd.DataFrame(
    data=mode_predict_rf,
    columns=['bike', 'bus', 'drive', 'other', 'passenger', 'subway', 'walk'],
)
result['id'] = test_data['id']
# reorder the columns for the output file and index it by id
result = result[['id','drive','passenger','bus','subway','bike','walk','other']].set_index('id')
result.to_csv('test_rf.csv')