**Loaded packages**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler, MaxAbsScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, chi2 
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics

**Load Data**

In [None]:
# Read the preprocessed training and test tables from disk.
train_process3, test_process3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/preprocessed/train_process3.csv",
        "../data/preprocessed/test_process3.csv",
    )
)

In [None]:
# Preview the first five rows of the training table.
train_process3.head(n=5)

In [None]:
# Preview the first five rows of the test table.
test_process3.head(n=5)

**Train-Validation Split**

In [None]:
# Target column for training; the predictors exclude both the target and Trip_ID.
y = train_process3["Surge_Pricing_Type"]
X = train_process3.drop(columns=["Surge_Pricing_Type", "Trip_ID"])

# Test predictors, with the Trip_ID column kept aside.
Trip_ID = test_process3["Trip_ID"]
X_test = test_process3.drop(columns=["Trip_ID"])

In [None]:
# Hold out 20% of the rows for validation, stratified on the target, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

Check the chi-squared score of each feature using SelectKBest

In [None]:
# Score every feature against the target with chi2; k='all' keeps all of them.
bestfeatures = SelectKBest(k='all', score_func=chi2)
fit = bestfeatures.fit(X_train, y_train)

# Put the feature names and their scores side by side in one table.
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['feature', 'Score']

# Show the features from highest to lowest score.
print(featureScores.sort_values(by='Score', ascending=False))

As we can see, Gender has a very low score, so we drop this variable

In [None]:
# Rebuild the target and predictors, this time also leaving out the Gender column.
y = train_process3["Surge_Pricing_Type"]
X = train_process3.drop(columns=["Surge_Pricing_Type", "Trip_ID", 'Gender'])

# Test predictors without Gender, with the Trip_ID column kept aside.
Trip_ID = test_process3["Trip_ID"]
X_test = test_process3.drop(columns=["Trip_ID", 'Gender'])

In [None]:
# Redo the 80/20 stratified split on the reduced feature set, with the same seed as before.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

## Grid Search for Random Forest, KNN and SVM

### Random Forest

In [None]:
# Random Forest: a scaler step followed by the classifier, seeded for repeatability.
pipe_rf = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(random_state=0)),
    ]
)

In [None]:
# Search space for the Random Forest pipeline. The 'scaler' entry replaces the whole
# scaler step; the 'clf__' entries set parameters on the classifier step.
rf_params = {
    'scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
        MaxAbsScaler(),
    ],
    'clf__n_estimators': [10, 50, 100, 200, 500],
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [None, 2, 5, 10],
    'clf__min_samples_split': [2, 4, 8],
    'clf__min_samples_leaf': [1, 2, 5],
}

# Stratified 5-fold cross-validation, repeated twice, with a fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=1)

In [None]:
# Successive-halving grid search over the Random Forest pipeline, ranked by macro F1.
# random_state seeds the resampling the search performs between halving iterations, so
# the selected parameters can be reproduced on a fresh run.
gs = HalvingGridSearchCV(pipe_rf, rf_params, scoring="f1_macro", n_jobs=-1, min_resources="exhaust",
                         factor=3, verbose=2, cv=cv, random_state=0)
gs.fit(X_train, y_train)

# Predict the validation split with the best estimator found by the search.
y_pred = gs.predict(X_val)

In [None]:
# Validation-set scores for the current predictions, printed one per line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy", metrics.accuracy_score(y_val, y_pred)),
    ("f1 score macro", metrics.f1_score(y_val, y_pred, average='macro')),
    ("f1 score micro", metrics.f1_score(y_val, y_pred, average='micro')),
    ("precision score", metrics.precision_score(y_val, y_pred, average='macro')),
    ("recall score", metrics.recall_score(y_val, y_pred, average='macro')),
    ("hamming_loss", metrics.hamming_loss(y_val, y_pred)),
    ("classification_report", metrics.classification_report(y_val, y_pred)),
):
    print(metric_label, metric_value)

In [None]:
# Keep the winning parameter set from the Random Forest search and show it.
best_params_rf = gs.best_params_
print('Best Parameters: ', best_params_rf)

# Keep the best pipeline found by the search and show it.
best_pipe_rf = gs.best_estimator_
print(best_pipe_rf)

# Tabulate the cross-validation results and list the available columns.
results_df_rf = pd.DataFrame(gs.cv_results_)
print(results_df_rf.columns)

Train the Random Forest pipeline using the best parameters found by the grid search

In [None]:
# Random Forest pipeline with fixed hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from a grid-search run;
# confirm that they match best_params_rf.
tuned_rf_pipe = Pipeline(
    steps=[
        ('scaler', RobustScaler()),
        ('clf', RandomForestClassifier(
            n_estimators=500,
            criterion='gini',
            max_depth=None,
            min_samples_split=8,
            min_samples_leaf=2,
            random_state=0,
        )),
    ]
)

In [None]:
# Fit on the training split, then predict the validation split.
y_pred = tuned_rf_pipe.fit(X_train, y_train).predict(X_val)

Plot the confusion matrix on the validation set for the tuned Random Forest model

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels = ['Low', 'Medium', 'High'],
                          normalize=False, title=None, cmap=plt.cm.coolwarm):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as an annotated heat map.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted class labels, passed to ``metrics.confusion_matrix``.
    labels : list of str
        Tick labels used on both axes, in the row/column order of the matrix.
        The list is only read, never modified.
    normalize : bool
        If True, every row is divided by its total so each cell shows the fraction of
        that actual class, annotated with two decimals. If False, raw counts are shown.
    title : str or None
        Title of the axes.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    cm = metrics.confusion_matrix(y_true, y_pred)
    if normalize:
        # Row-normalise; a row with no samples stays at zero instead of becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    fig, ax = plt.subplots(figsize=(12,6))
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=labels, yticklabels=labels,
           title=title,
           ylabel='ACTUAL',
           xlabel='PREDICTED')
    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells above this value get the light text colour, the others the orange one.
    thresh = cm.max() / 1.5
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="snow" if cm[i, j] > thresh else "orange",
                    size=26)
    ax.grid(False)
    fig.tight_layout()
    return ax

In [None]:
np.set_printoptions(precision=1)

# Raw-count (non-normalized) confusion matrix for the tuned Random Forest on the validation split.
plot_confusion_matrix(y_val, y_pred, title="Confusion Matrix For Tuned Random Forest")
plt.show()

# Per-class report with readable class names.
print(metrics.classification_report(y_val, y_pred, target_names=['Low', 'Medium', 'High']))

Save the test predictions to upload on the competition site

In [None]:
# Predict the test rows with the tuned Random Forest pipeline.
test_pred = tuned_rf_pipe.predict(X_test)

# Pair each prediction with its Trip_ID as two aligned columns. This keeps exactly one
# output row per test row, in the original order; a dict keyed by Trip_ID would silently
# collapse rows that share an ID.
df = pd.DataFrame({'Trip_ID': Trip_ID.values, 'Surge_Pricing_Type': test_pred})
df.to_csv(r'../submissions/Preprocess3/tuned_RF_test_prediction.csv', index = False)

### KNN

In [None]:
# K Nearest Neighbors: a scaler step followed by the classifier with default settings.
pipe_knn = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', KNeighborsClassifier()),
    ]
)

In [None]:
# Search space for the KNN pipeline. The 'scaler' entry replaces the whole scaler step;
# the 'clf__' entries set parameters on the classifier step.
knn_params = {
    'scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
        MaxAbsScaler(),
    ],
    'clf__n_neighbors': [5, 10, 25, 50, 100, 200],
    'clf__weights': ['uniform', 'distance'],
    'clf__p': [1, 2],
    'clf__leaf_size': [1, 5, 10, 15],
}

# Stratified 5-fold cross-validation, repeated twice, with a fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=1)

In [None]:
# Successive-halving grid search over the KNN pipeline, ranked by macro F1.
# random_state seeds the resampling the search performs between halving iterations, so
# the selected parameters can be reproduced on a fresh run.
gs = HalvingGridSearchCV(pipe_knn, knn_params, scoring = "f1_macro", n_jobs = -1,
                         min_resources = "exhaust", factor = 3, verbose = 2, cv = cv,
                         random_state = 0)
gs.fit(X_train, y_train)

# Predict the validation split with the best estimator found by the search.
y_pred = gs.predict(X_val)

In [None]:
# Validation-set scores for the current predictions, printed one per line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy", metrics.accuracy_score(y_val, y_pred)),
    ("f1 score macro", metrics.f1_score(y_val, y_pred, average='macro')),
    ("f1 score micro", metrics.f1_score(y_val, y_pred, average='micro')),
    ("precision score", metrics.precision_score(y_val, y_pred, average='macro')),
    ("recall score", metrics.recall_score(y_val, y_pred, average='macro')),
    ("hamming_loss", metrics.hamming_loss(y_val, y_pred)),
    ("classification_report", metrics.classification_report(y_val, y_pred)),
):
    print(metric_label, metric_value)

In [None]:
# Keep the winning parameter set from the KNN search and show it.
best_params_knn = gs.best_params_
print('Best Parameters: ', best_params_knn)

# Keep the best pipeline found by the search and show it.
best_pipe_knn = gs.best_estimator_
print(best_pipe_knn)

# Tabulate the cross-validation results and list the available columns.
results_df_knn = pd.DataFrame(gs.cv_results_)
print(results_df_knn.columns)

Train the KNN pipeline using the best parameters found by the grid search

In [None]:
# KNN pipeline with fixed hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from a grid-search run;
# confirm that they match best_params_knn.
tuned_knn_pipe = Pipeline(
    steps=[
        ('scaler', RobustScaler()),
        ('clf', KNeighborsClassifier(
            n_neighbors=50,
            weights='distance',
            p=1,
            leaf_size=15,
        )),
    ]
)

In [None]:
# Fit on the training split, then predict the validation split.
y_pred = tuned_knn_pipe.fit(X_train, y_train).predict(X_val)

Plot the confusion matrix on the validation set for the tuned KNN model

In [None]:
np.set_printoptions(precision=1)

# Raw-count (non-normalized) confusion matrix for the tuned KNN on the validation split.
plot_confusion_matrix(y_val, y_pred, title="Confusion Matrix For Tuned KNN")
plt.show()

# Per-class report with readable class names.
print(metrics.classification_report(y_val, y_pred, target_names=['Low', 'Medium', 'High']))

Save the test predictions to upload on the competition site

In [None]:
# Predict the test rows with the tuned KNN pipeline.
test_pred = tuned_knn_pipe.predict(X_test)

# Pair each prediction with its Trip_ID as two aligned columns. This keeps exactly one
# output row per test row, in the original order; a dict keyed by Trip_ID would silently
# collapse rows that share an ID.
df = pd.DataFrame({'Trip_ID': Trip_ID.values, 'Surge_Pricing_Type': test_pred})
df.to_csv(r'../submissions/Preprocess3/tuned_KNN_test_prediction.csv', index = False)

### SVM

In [None]:
# Support Vector Machine: a scaler step followed by a seeded SVC.
pipe_svm = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('clf', SVC(random_state=0)),
    ]
)

In [None]:
# Search space for the SVM pipeline. The 'scaler' entry replaces the whole scaler step;
# the 'clf__' entries set parameters on the classifier step.
svm_params = {
    'scaler': [
        StandardScaler(),
        MinMaxScaler(),
        RobustScaler(),
        MaxAbsScaler(),
    ],
    'clf__C': [0.1, 1, 10, 100],
    'clf__gamma': [1, 0.1, 0.01, 0.001],
    'clf__kernel': ['rbf', 'poly', 'sigmoid', 'linear'],
}

# Stratified 5-fold cross-validation, repeated twice, with a fixed seed.
cv = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=1)

In [None]:
# Successive-halving grid search over the SVM pipeline, ranked by macro F1.
# random_state seeds the resampling the search performs between halving iterations, so
# the selected parameters can be reproduced on a fresh run.
gs = HalvingGridSearchCV(pipe_svm, svm_params, scoring = "f1_macro", n_jobs = -1,
                         min_resources = "exhaust", factor = 3, verbose = 2, cv = cv,
                         random_state = 0)
gs.fit(X_train, y_train)

# Predict the validation split with the best estimator found by the search.
y_pred = gs.predict(X_val)

In [None]:
# Validation-set scores for the current predictions, printed one per line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy", metrics.accuracy_score(y_val, y_pred)),
    ("f1 score macro", metrics.f1_score(y_val, y_pred, average='macro')),
    ("f1 score micro", metrics.f1_score(y_val, y_pred, average='micro')),
    ("precision score", metrics.precision_score(y_val, y_pred, average='macro')),
    ("recall score", metrics.recall_score(y_val, y_pred, average='macro')),
    ("hamming_loss", metrics.hamming_loss(y_val, y_pred)),
    ("classification_report", metrics.classification_report(y_val, y_pred)),
):
    print(metric_label, metric_value)

In [None]:
# Print the best parameters found by the SVM search
print('Best Parameters: ', gs.best_params_)

# Keep the best set of parameters
best_params_svm = gs.best_params_

# Keep the best pipeline and show it. The parameters were already printed above, so the
# pipeline itself is printed here, as in the Random Forest and KNN sections.
best_pipe_svm = gs.best_estimator_
print(best_pipe_svm)

# Tabulate the cross-validation results and list the available columns
results_df_svm = pd.DataFrame.from_dict(gs.cv_results_, orient='columns')
print(results_df_svm.columns)

Train the SVM pipeline using the best parameters found by the grid search

In [None]:
# SVM pipeline with fixed hyper-parameters.
# NOTE(review): these values are hard-coded, presumably copied from a grid-search run;
# confirm that they match best_params_svm.
tuned_svm_pipe = Pipeline(
    steps=[
        ('scaler', RobustScaler()),
        ('clf', SVC(
            kernel='rbf',
            C=100,
            gamma=0.01,
            random_state=0,
        )),
    ]
)

In [None]:
# Fit on the training split, then predict the validation split.
y_pred = tuned_svm_pipe.fit(X_train, y_train).predict(X_val)

In [None]:
# Validation-set scores for the current predictions, printed one per line as "<label> <value>".
for metric_label, metric_value in (
    ("accuracy", metrics.accuracy_score(y_val, y_pred)),
    ("f1 score macro", metrics.f1_score(y_val, y_pred, average='macro')),
    ("f1 score micro", metrics.f1_score(y_val, y_pred, average='micro')),
    ("precision score", metrics.precision_score(y_val, y_pred, average='macro')),
    ("recall score", metrics.recall_score(y_val, y_pred, average='macro')),
    ("hamming_loss", metrics.hamming_loss(y_val, y_pred)),
    ("classification_report", metrics.classification_report(y_val, y_pred)),
):
    print(metric_label, metric_value)

Plot the confusion matrix on the validation set for the tuned SVM model

In [None]:
np.set_printoptions(precision=1)

# Raw-count (non-normalized) confusion matrix for the tuned SVM on the validation split.
plot_confusion_matrix(y_val, y_pred, title="Confusion Matrix For Tuned SVM")
plt.show()

# Per-class report with readable class names.
print(metrics.classification_report(y_val, y_pred, target_names=['Low', 'Medium', 'High']))

Save the test predictions to upload on the competition site

In [None]:
# Predict the test rows with the tuned SVM pipeline.
test_pred = tuned_svm_pipe.predict(X_test)

# Pair each prediction with its Trip_ID as two aligned columns. This keeps exactly one
# output row per test row, in the original order; a dict keyed by Trip_ID would silently
# collapse rows that share an ID.
df = pd.DataFrame({'Trip_ID': Trip_ID.values, 'Surge_Pricing_Type': test_pred})
df.to_csv(r'../submissions/Preprocess3/tuned_SVM_test_prediction.csv', index = False)