# Assignment task - Find the best classification model for the travel mode choices prediction

The goal of this assignment is to find the best classification model to predict travel mode choices. The dataset contains the cost, access time, travel time and service of multiple travel modes. In this assignment I will look at car, bus, train and air travel. In this assignment multiple types of classification models are compared. 

### Package import

In [1]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import ConfusionMatrixDisplay as cmd
from sklearn.metrics import accuracy_score, precision_score, recall_score
import pandas as pd
import os
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

import datetime as dt

### Import the data

In [2]:
#url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise-3%20mode%20choice%20model/modeChoiceData.csv'
#data = pd.read_csv(url)

#data.head(10)

### Train and test all combinations of encoders and models

In [3]:
encoding_list = ["one-hot encoding", "label encoding", "dummy encoding"]
model_list = ["LR", "KNN", "RF", "XGBoost", "SVM"]
counter = 0
        
results_table = []


for i in encoding_list:
    url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise-3%20mode%20choice%20model/modeChoiceData.csv'
    data = pd.read_csv(url)
    if i == "one-hot encoding":
        df = pd.get_dummies(data, columns=['service_air', 'service_rail'])
    elif i == "label encoding":
        encoder = LabelEncoder()
        df=data.copy()
        df['service_air'] = encoder.fit_transform(df['service_air'])
        df['service_rail'] = encoder.fit_transform(df['service_rail'])
    elif i == "dummy encoding":
        df=data.copy()
        dummy_df = pd.get_dummies(df[['service_air', 'service_rail']], drop_first=True)
        df = pd.concat([df, dummy_df], axis=1)
        df = df.drop(['service_air', 'service_rail'],axis=1)
    else:
        print("Error: Encoding method not found!", i)
    
    x = df.drop(['choice','ID',], axis=1)
    y = df['choice']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=0)
    
    for j in model_list:
        results_list = []
        start_training_time = dt.datetime.now()
        if j == "LR":
            params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
            model = LogisticRegression(max_iter=1000, random_state=0)
        elif j == "KNN":
            params = {'n_neighbors': [3, 5, 7, 9], 
                      'weights': ['uniform', 'distance']}
            model=KNeighborsClassifier()
        elif j == "RF":
            params = {'n_estimators': [100, 200, 300],
                      'max_depth': [None, 10, 20, 30],
                      'min_samples_split': [2, 5, 10],
                      'min_samples_leaf': [1, 2, 4]}
            model = RandomForestClassifier(random_state=0)
        elif j == "XGBoost":
            params = {'learning_rate': [0.01, 0.1, 0.2],
                      'n_estimators': [100, 200, 300],
                      'max_depth': [3, 4, 5]}
            map = {'air': 0, 'bus': 1, 'car': 2,'rail':3}
            y_train = y_train.map(map)
            y_test = y_test.map(map)
            model =  XGBClassifier()
        elif j == "SVM":
            params = {'C': [0.1, 1, 10]}
            model= SVC(kernel="linear") 
        else:
            print("Error: Model not found!", j)
            
        # Initialize and fit a GridSearchCV object to perform hyperparameter tuning:
        grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')
        grid_search.fit(x_train, y_train)
        
        #Find best parameters and corresponding accuracy score:
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        
        end_training_time = dt.datetime.now()
        training_time = end_training_time - start_training_time
        
        start_predicting_time = dt.datetime.now()
        #Run the model-vectorizer combination on the test set for the best parameter combination
        model = grid_search.best_estimator_
        
        # Measures for test data
        accuracy_score_variable = accuracy_score(y_test,model.predict(x_test))
        precision_score_variable = precision_score(y_test,model.predict(x_test),average=None)
        recall_score_variable = recall_score(y_test,model.predict(x_test),average=None)
        
        end_predicting_time = dt.datetime.now()
        predicting_time = end_predicting_time - start_predicting_time

                            
        results_list.extend([i, j, best_params, best_score, accuracy_score_variable, precision_score_variable, recall_score_variable, training_time, predicting_time])
        results_table.append(results_list)        
        
        counter = counter + 1
        print("Progress: ", (i, j, (counter/(len(encoding_list)*len(model_list)))*100, "%"))
        

df_results = pd.DataFrame(results_table, columns = ["Encoding", "Model", "Optimal parameters", "Accuracy score training", 
                                     "Accuracy score test", "Precision score test", 
                                     "Recall_score", "Training time", "Prediction time"])
        
print(df_results)
print("Note: The training time is the training time for all models created in gridsearch framework of the given combination of encoder and model.")

Progress:  ('one-hot encoding', 'LR', 6.666666666666667, '%')
Progress:  ('one-hot encoding', 'KNN', 13.333333333333334, '%')
Progress:  ('one-hot encoding', 'RF', 20.0, '%')


  _warn_prf(average, modifier, msg_start, len(result))


Progress:  ('one-hot encoding', 'XGBoost', 26.666666666666668, '%')


  _warn_prf(average, modifier, msg_start, len(result))


Progress:  ('one-hot encoding', 'SVM', 33.33333333333333, '%')
Progress:  ('label encoding', 'LR', 40.0, '%')
Progress:  ('label encoding', 'KNN', 46.666666666666664, '%')
Progress:  ('label encoding', 'RF', 53.333333333333336, '%')


  _warn_prf(average, modifier, msg_start, len(result))


Progress:  ('label encoding', 'XGBoost', 60.0, '%')


  _warn_prf(average, modifier, msg_start, len(result))


Progress:  ('label encoding', 'SVM', 66.66666666666666, '%')
Progress:  ('dummy encoding', 'LR', 73.33333333333333, '%')
Progress:  ('dummy encoding', 'KNN', 80.0, '%')
Progress:  ('dummy encoding', 'RF', 86.66666666666667, '%')


  _warn_prf(average, modifier, msg_start, len(result))


Progress:  ('dummy encoding', 'XGBoost', 93.33333333333333, '%')
Progress:  ('dummy encoding', 'SVM', 100.0, '%')
            Encoding    Model  \
0   one-hot encoding       LR   
1   one-hot encoding      KNN   
2   one-hot encoding       RF   
3   one-hot encoding  XGBoost   
4   one-hot encoding      SVM   
5     label encoding       LR   
6     label encoding      KNN   
7     label encoding       RF   
8     label encoding  XGBoost   
9     label encoding      SVM   
10    dummy encoding       LR   
11    dummy encoding      KNN   
12    dummy encoding       RF   
13    dummy encoding  XGBoost   
14    dummy encoding      SVM   

                                   Optimal parameters  \
0                                         {'C': 0.01}   
1            {'n_neighbors': 9, 'weights': 'uniform'}   
2   {'max_depth': 10, 'min_samples_leaf': 2, 'min_...   
3   {'learning_rate': 0.01, 'max_depth': 3, 'n_est...   
4                                          {'C': 0.1}   
5              

  _warn_prf(average, modifier, msg_start, len(result))


### Train and test all combinations of encoders and models with extra column for total time

In [8]:
encoding_list = ["one-hot encoding", "label encoding", "dummy encoding"]
model_list = ["LR", "KNN", "RF", "XGBoost", "SVM"]
counter = 0
        
results_table = []

for i in encoding_list:
    url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise-3%20mode%20choice%20model/modeChoiceData.csv'
    data = pd.read_csv(url)
    data["total_time_bus"] = data["time_bus"] + data["access_bus"]
    data["total_time_air"] = data["time_air"] + data["access_air"]
    data["total_time_rail"] = data["time_rail"] + data["access_rail"]

    if i == "one-hot encoding":
        df = pd.get_dummies(data, columns=['service_air', 'service_rail'])
    elif i == "label encoding":
        encoder = LabelEncoder()
        df=data.copy()
        df['service_air'] = encoder.fit_transform(df['service_air'])
        df['service_rail'] = encoder.fit_transform(df['service_rail'])
    elif i == "dummy encoding":
        df=data.copy()
        dummy_df = pd.get_dummies(df[['service_air', 'service_rail']], drop_first=True)
        df = pd.concat([df, dummy_df], axis=1)
        df = df.drop(['service_air', 'service_rail'],axis=1)
    else:
        print("Error: Encoding method not found!", i)
    
    x = df.drop(['choice','ID',], axis=1)
    y = df['choice']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=0)
    
    for j in model_list:
        results_list = []
        start_training_time = dt.datetime.now()
        if j == "LR":
            params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
            model = LogisticRegression(max_iter=1000, random_state=0)
        elif j == "KNN":
            params = {'n_neighbors': [3, 5, 7, 9], 
                      'weights': ['uniform', 'distance']}
            model=KNeighborsClassifier()
        elif j == "RF":
            params = {'n_estimators': [100, 200, 300],
                      'max_depth': [None, 10, 20, 30],
                      'min_samples_split': [2, 5, 10],
                      'min_samples_leaf': [1, 2, 4]}
            model = RandomForestClassifier(random_state=0)
        elif j == "XGBoost":
            params = {'learning_rate': [0.01, 0.1, 0.2],
                      'n_estimators': [100, 200, 300],
                      'max_depth': [3, 4, 5]}
            map = {'air': 0, 'bus': 1, 'car': 2,'rail':3}
            y_train = y_train.map(map)
            y_test = y_test.map(map)
            model =  XGBClassifier()
        elif j == "SVM":
            params = {'C': [0.1, 1, 10]}
            model= SVC(kernel="linear") 
        else:
            print("Error: Model not found!", j)
            
        # Initialize and fit a GridSearchCV object to perform hyperparameter tuning:
        grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')
        grid_search.fit(x_train, y_train)
        
        #Find best parameters and corresponding accuracy score:
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        
        end_training_time = dt.datetime.now()
        training_time = end_training_time - start_training_time
        
        start_predicting_time = dt.datetime.now()
        #Run the model-vectorizer combination on the test set for the best parameter combination
        model = grid_search.best_estimator_
        
        # Measures for test data
        accuracy_score_variable = accuracy_score(y_test,model.predict(x_test))
        precision_score_variable = precision_score(y_test,model.predict(x_test),average=None)
        recall_score_variable = recall_score(y_test,model.predict(x_test),average=None)
        
        end_predicting_time = dt.datetime.now()
        predicting_time = end_predicting_time - start_predicting_time

                            
        results_list.extend([i, j, best_params, best_score, accuracy_score_variable, precision_score_variable, recall_score_variable, training_time, predicting_time])
        results_table.append(results_list)        
        
        counter = counter + 1
        print("Progress: ", (i, j, (counter/(len(encoding_list)*len(model_list)))*100, "%"))
        

df_results2 = pd.DataFrame(results_table, columns = ["Encoding", "Model", "Optimal parameters", "Accuracy score training", 
                                     "Accuracy score test", "Precision score test", 
                                     "Recall_score", "Training time", "Prediction time"])
        
print(df_results2)
print("Note: The training time is the training time for all models created in gridsearch framework of the given combination of encoder and model.")
 


   ID  time_car  cost_car  time_bus  cost_bus  access_bus  time_air  cost_air  \
0   1       275        50       330        35          20        80        65   
1   2       275        45       330        15           5        70       110   
2   3       275        45       390        15          25        70        80   
3   4       300        45       300        15          10        60       110   
4   5       390        35       390        35          10        60        95   

   access_air service_air  time_rail  cost_rail  access_rail service_rail  \
0          55        food        120         45            5    no-frills   
1          40        wifi        170         55           25         food   
2          55   no-frills        155         35            5    no-frills   
3          40        wifi        155         65           20    no-frills   
4          45        wifi        155         65           15         food   

  choice  total_time_bus  total_time_air  total_ti

### Train and test all combinations of encoders and models with extra columns for total time and cost per time unit

In [None]:
encoding_list = ["one-hot encoding", "label encoding", "dummy encoding"]
model_list = ["LR", "KNN", "RF", "XGBoost", "SVM"]
counter = 0
        
results_table = []

for i in encoding_list:
    url = 'https://raw.githubusercontent.com/zhenliangma/Applied-AI-in-Transportation/master/Exercise-3%20mode%20choice%20model/modeChoiceData.csv'
    data = pd.read_csv(url)
    data["total_time_bus"] = data["time_bus"] + data["access_bus"]
    data["total_time_air"] = data["time_air"] + data["access_air"]
    data["total_time_rail"] = data["time_rail"] + data["access_rail"]
    data["cost_time_car"] = data["time_car"]/data["cost_car"]
    data["cost_time_bus"] = data["total_time_bus"]/data["cost_bus"]
    data["cost_time_air"] = data["total_time_air"]/data["cost_air"]
    data["cost_time_rail"] = data["total_time_rail"]/data["cost_rail"]

    if i == "one-hot encoding":
        df = pd.get_dummies(data, columns=['service_air', 'service_rail'])
    elif i == "label encoding":
        encoder = LabelEncoder()
        df=data.copy()
        df['service_air'] = encoder.fit_transform(df['service_air'])
        df['service_rail'] = encoder.fit_transform(df['service_rail'])
    elif i == "dummy encoding":
        df=data.copy()
        dummy_df = pd.get_dummies(df[['service_air', 'service_rail']], drop_first=True)
        df = pd.concat([df, dummy_df], axis=1)
        df = df.drop(['service_air', 'service_rail'],axis=1)
    else:
        print("Error: Encoding method not found!", i)
    
    x = df.drop(['choice','ID',], axis=1)
    y = df['choice']
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,random_state=0)
    
    for j in model_list:
        results_list = []
        start_training_time = dt.datetime.now()
        if j == "LR":
            params = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
            model = LogisticRegression(max_iter=1000, random_state=0)
        elif j == "KNN":
            params = {'n_neighbors': [3, 5, 7, 9], 
                      'weights': ['uniform', 'distance']}
            model=KNeighborsClassifier()
        elif j == "RF":
            params = {'n_estimators': [100, 200, 300],
                      'max_depth': [None, 10, 20, 30],
                      'min_samples_split': [2, 5, 10],
                      'min_samples_leaf': [1, 2, 4]}
            model = RandomForestClassifier(random_state=0)
        elif j == "XGBoost":
            params = {'learning_rate': [0.01, 0.1, 0.2],
                      'n_estimators': [100, 200, 300],
                      'max_depth': [3, 4, 5]}
            map = {'air': 0, 'bus': 1, 'car': 2,'rail':3}
            y_train = y_train.map(map)
            y_test = y_test.map(map)
            model =  XGBClassifier()
        elif j == "SVM":
            params = {'C': [0.1, 1, 10]}
            model= SVC(kernel="linear") 
        else:
            print("Error: Model not found!", j)
            
        # Initialize and fit a GridSearchCV object to perform hyperparameter tuning:
        grid_search = GridSearchCV(estimator=model, param_grid=params, cv=5, scoring='accuracy')
        grid_search.fit(x_train, y_train)
        
        #Find best parameters and corresponding accuracy score:
        best_params = grid_search.best_params_
        best_score = grid_search.best_score_
        
        end_training_time = dt.datetime.now()
        training_time = end_training_time - start_training_time
        
        start_predicting_time = dt.datetime.now()
        #Run the model-vectorizer combination on the test set for the best parameter combination
        model = grid_search.best_estimator_
        
        # Measures for test data
        accuracy_score_variable = accuracy_score(y_test,model.predict(x_test))
        precision_score_variable = precision_score(y_test,model.predict(x_test),average=None)
        recall_score_variable = recall_score(y_test,model.predict(x_test),average=None)
        
        end_predicting_time = dt.datetime.now()
        predicting_time = end_predicting_time - start_predicting_time

                            
        results_list.extend([i, j, best_params, best_score, accuracy_score_variable, precision_score_variable, recall_score_variable, training_time, predicting_time])
        results_table.append(results_list)        
        
        counter = counter + 1
        print("Progress: ", (i, j, (counter/(len(encoding_list)*len(model_list)))*100, "%"))
        

df_results2 = pd.DataFrame(results_table, columns = ["Encoding", "Model", "Optimal parameters", "Accuracy score training", 
                                     "Accuracy score test", "Precision score test", 
                                     "Recall_score", "Training time", "Prediction time"])
        
print(df_results2)
print("Note: The training time is the training time for all models created in gridsearch framework of the given combination of encoder and model.")
 
