**Loaded packages**

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2  
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics

**Load Data**

In [None]:
# Load the preprocessed train and test CSVs (paths are relative to the
# notebook's working directory).
train_process3 = pd.read_csv("../data/preprocessed/train_process3.csv")
test_process3 = pd.read_csv("../data/preprocessed/test_process3.csv")

In [None]:
# Preview the first rows of the training frame as a sanity check on the load
train_process3.head()

In [None]:
# Preview the first rows of the test frame as a sanity check on the load
test_process3.head()

**Train-Validation Split**

In [None]:
# Training features / target. Surge_Pricing_Type is the label being predicted;
# Trip_ID is dropped from the features and only used as the submission key.
X = train_process3.drop(["Surge_Pricing_Type", "Trip_ID"], axis=1)
y = train_process3.Surge_Pricing_Type

# Competition test set. Both objects must come from test_process3 (not the
# training frame) so that predictions are made for the rows being submitted.
# Selecting X.columns gives exactly the features the models are fitted on, in
# the same order; a column missing from the test file raises KeyError here
# instead of failing later inside predict().
X_test = test_process3[X.columns]
Trip_ID = test_process3.Trip_ID

In [None]:
# Hold out 20% of the labelled rows for validation. The split is stratified
# on y and seeded so it is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

Check the chi-squared score of each feature using SelectKBest

In [None]:
# Score every feature against the target with chi2 on the training split.
# Only the fitted scores_ are read here; no transform is applied to the data.
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X_train, y_train)

# Put the feature names and their scores side by side in one table
dfcolumns = pd.DataFrame(X.columns)
dfscores = pd.DataFrame(fit.scores_)
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['feature', 'Score']

# Highest-scoring features first
ranked_scores = featureScores.sort_values('Score', ascending=False)
print(ranked_scores)

### Pipelines

In [None]:
# Each pipeline has two steps: its own StandardScaler instance ('scaler')
# followed by the classifier ('clf').

# Random Forest
pipe_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(random_state=0)),
])

# Decision Tree
pipe_dt = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', DecisionTreeClassifier(random_state=0)),
])

# Dummy (Baseline)
pipe_dum = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', DummyClassifier(random_state=0)),
])

# K Nearest Neighbors
pipe_knn = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier()),
])

# Naive Bayes
pipe_nb = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', GaussianNB()),
])

# Support Vector Machine
pipe_svm = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', SVC(random_state=0)),
])

In [None]:
# Map each display name to its pipeline. Insertion order is the order in
# which the models are iterated when fitting.
model_pipelines = {
    'RandomForest': pipe_rf,
    'DecisionTree': pipe_dt,
    'Dummy(Baseline)': pipe_dum,
    'KNN': pipe_knn,
    'NaiveBayes': pipe_nb,
    'SupportVectorMachine': pipe_svm,
}

# Parallel lists of the names and the pipelines, in the same order as the dict
models = list(model_pipelines.keys())
pipelines = list(model_pipelines.values())

model_pipelines

In [None]:
# Per-model results, keyed by the model name used in `model_pipelines`
models_f1 = {}               # macro-averaged F1 on the validation split
classification_reports = {}  # full classification report as a nested dict
test_preds = {}              # predictions for the competition test set

for name, pipe in model_pipelines.items():
    print('\n' + name + ' Fitting')
    pipe.fit(X_train, y_train)

    # Evaluate on the held-out validation split
    val_pred = pipe.predict(X_val)
    report = metrics.classification_report(y_val, val_pred, output_dict=True)
    f1 = report['macro avg']['f1-score']

    # Predict on the test set given by the competition
    test_pred = pipe.predict(X_test)

    # Store everything under the model's name
    test_preds[name] = test_pred
    classification_reports[name] = report
    models_f1[name] = f1

    print(name + ' (Macro Avg - F1 Score):')
    print(f1)



Check the scores of each model

In [None]:
# List the models from best to worst validation macro-F1
ranked_models = sorted(models_f1.items(), key=lambda item: item[1], reverse=True)
for model_name, score in ranked_models:
    print(model_name, score)

Save the test predictions to upload to the competition site

In [None]:
# Write one submission CSV per model: the Trip_ID column paired row-by-row
# with that model's predicted Surge_Pricing_Type. Building the frame directly
# from the two columns keeps one output row per prediction and raises if the
# lengths differ, instead of silently truncating or collapsing duplicate IDs.
for model_name, preds in test_preds.items():
    submission = pd.DataFrame({'Trip_ID': Trip_ID.values, 'Surge_Pricing_Type': preds})
    submission.to_csv('../submissions/Preprocess3/Preprocess3_{methodname}_test_prediction.csv'.format(methodname = model_name), index = False)