**Loaded packages**

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2  
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics

**Load Data**

In [2]:
# Locations of the preprocessed train and test CSV files (relative to this notebook).
train_path = "../data/preprocessed/train_process2.csv"
test_path = "../data/preprocessed/test_process2.csv"

train_process2 = pd.read_csv(train_path)
test_process2 = pd.read_csv(test_path)

In [3]:
# Preview the first five training rows (includes the Surge_Pricing_Type target).
train_process2.head(n = 5)

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,1,1,2,0,0,3.905,0,40,46,60,0,2
1,T0005689461,29.47,1,10,2,1,0,3.45,0,38,56,78,1,2
2,T0005689464,41.58,1,10,2,1,4,3.50125,2,61,56,77,1,2
3,T0005689465,61.56,2,10,2,1,0,3.45375,0,61,52,74,1,3
4,T0005689467,54.95,2,10,3,1,0,3.4025,4,51,49,102,1,2


In [4]:
# Preview the first five test rows (same columns as train, minus the target).
test_process2.head(n = 5)

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,T0005689459,9.44,0,10,2,0,1,3.68,2,61,46,63,1
1,T0005689462,32.15,1,10,2,0,0,1.59375,0,65,49,80,0
2,T0005689463,10.38,2,4,2,0,3,4.505,0,61,47,74,1
3,T0005689466,14.94,1,6,2,2,4,4.53,0,63,43,54,1
4,T0005689468,32.03,1,7,2,0,10,4.60125,3,96,44,56,1


**Train-Validation Split**

In [5]:
# Features: every training column except the target and the trip identifier.
X = train_process2.drop(columns = ["Surge_Pricing_Type", "Trip_ID"])
# Target: the surge pricing class to predict.
y = train_process2["Surge_Pricing_Type"]

# The test set has no target column, so only the identifier is removed.
X_test = test_process2.drop(columns = ["Trip_ID"])
# Kept separately so predictions can be matched back to trips when saving.
Trip_ID = test_process2["Trip_ID"]

In [6]:
# Hold out 20% of the training data for validation; stratifying on y keeps the
# class proportions equal in both parts, and the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 0,
)

Check the chi-squared score of each feature with SelectKBest (fitted on the training split only)

In [7]:
# k = 'all' keeps every feature: the selector is used here only to obtain the
# chi-squared score of each feature against the target, not to drop anything.
bestfeatures = SelectKBest(score_func = chi2, k = 'all')
bestfeatures.fit(X_train, y_train)

# One row per feature, in the same order as the columns of X.
featureScores = pd.DataFrame({'feature': X.columns, 'Score': bestfeatures.scores_})
print(featureScores.sort_values('Score', ascending = False))

                        feature         Score
0                 Trip_Distance  37788.173363
1                   Type_of_Cab  23054.584147
7      Cancellation_Last_1Month   5183.168216
5              Destination_Type    945.223041
6               Customer_Rating    856.058342
4   Confidence_Life_Style_Index    751.830971
10                         Var3    498.593517
2         Customer_Since_Months    179.298575
8                          Var1     58.423428
3              Life_Style_Index     31.330939
9                          Var2     29.041903
11                       Gender      0.225639


### Pipelines

In [8]:
def _with_scaler(classifier):
    """Return a two-step pipeline: a StandardScaler ('scaler') followed by `classifier` ('clf')."""
    return Pipeline([('scaler', StandardScaler()), ('clf', classifier)])


# Every model shares the same preprocessing; random_state is fixed wherever the
# estimator accepts one so that results are repeatable.
pipe_rf  = _with_scaler(RandomForestClassifier(random_state = 0))  # Random Forest
pipe_dt  = _with_scaler(DecisionTreeClassifier(random_state = 0))  # Decision Tree
pipe_dum = _with_scaler(DummyClassifier(random_state = 0))         # Dummy (Baseline)
pipe_knn = _with_scaler(KNeighborsClassifier())                    # K Nearest Neighbors
pipe_nb  = _with_scaler(GaussianNB())                              # Naive Bayes
pipe_svm = _with_scaler(SVC(random_state = 0))                     # Support Vector Machine

In [9]:
# Display name -> pipeline. Insertion order is the order in which models are fitted.
model_pipelines = {
    'RandomForest': pipe_rf,
    'DecisionTree': pipe_dt,
    'Dummy(Baseline)': pipe_dum,
    'KNN': pipe_knn,
    'NaiveBayes': pipe_nb,
    'SupportVectorMachine': pipe_svm,
}

# Parallel lists of the same names and pipelines, in dictionary order.
models = list(model_pipelines.keys())
pipelines = list(model_pipelines.values())

model_pipelines

{'RandomForest': Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', RandomForestClassifier(random_state=0))]),
 'DecisionTree': Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', DecisionTreeClassifier(random_state=0))]),
 'Dummy(Baseline)': Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', DummyClassifier(random_state=0))]),
 'KNN': Pipeline(steps=[('scaler', StandardScaler()), ('clf', KNeighborsClassifier())]),
 'NaiveBayes': Pipeline(steps=[('scaler', StandardScaler()), ('clf', GaussianNB())]),
 'SupportVectorMachine': Pipeline(steps=[('scaler', StandardScaler()), ('clf', SVC(random_state=0))])}

In [10]:
# Results collected per model, keyed by the model's display name.
models_f1 = {}               # macro-averaged F1 on the validation split
classification_reports = {}  # full classification_report output (as a dict)
test_preds = {}              # predictions on the competition test set

for name, pipe in model_pipelines.items():
    print('\n' + name + ' Fitting')
    pipe.fit(X_train, y_train)
    print(name + ' (Macro Avg - F1 Score):')

    # Evaluate on the held-out validation split.
    # zero_division = 0 scores a class that is never predicted as 0 (the same
    # value the default produces) but without emitting an UndefinedMetricWarning
    # for every such class, which otherwise clutters the cell output.
    val_pred = pipe.predict(X_val)
    report = metrics.classification_report(y_val, val_pred, output_dict = True, zero_division = 0)
    f1 = report['macro avg']['f1-score']

    # Predict on the test set given by the competition.
    test_pred = pipe.predict(X_test)

    # Store everything under the model's name.
    test_preds[name] = test_pred
    classification_reports[name] = report
    models_f1[name] = f1

    print(f1)




RandomForest Fitting
RandomForest (Macro Avg - F1 Score):
0.6662916318376942

DecisionTree Fitting
DecisionTree (Macro Avg - F1 Score):
0.5488612609345586

Dummy(Baseline) Fitting
Dummy(Baseline) (Macro Avg - F1 Score):
0.20074842750603783

KNN Fitting


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KNN (Macro Avg - F1 Score):
0.5974079268491438

NaiveBayes Fitting
NaiveBayes (Macro Avg - F1 Score):
0.6216906302008356

SupportVectorMachine Fitting
SupportVectorMachine (Macro Avg - F1 Score):
0.6634614628262082


Check the scores of each model

In [11]:
# List the models from highest to lowest validation macro-F1.
ranked = sorted(models_f1.items(), key = lambda item: item[1], reverse = True)
for model_name, score in ranked:
    print(model_name, score)

RandomForest 0.6662916318376942
SupportVectorMachine 0.6634614628262082
NaiveBayes 0.6216906302008356
KNN 0.5974079268491438
DecisionTree 0.5488612609345586
Dummy(Baseline) 0.20074842750603783


In [12]:
# Full per-class precision / recall / F1 / support for every model, as nested dicts
# keyed by model name (the values are the output_dict results of classification_report).
classification_reports

{'RandomForest': {'1': {'precision': 0.7511520737327189,
   'recall': 0.5390409700532794,
   'f1-score': 0.6276607123756551,
   'support': 5443},
  '2': {'precision': 0.6378370922132561,
   'recall': 0.8150890181561784,
   'f1-score': 0.7156509963242406,
   'support': 11346},
  '3': {'precision': 0.7223763874873865,
   'recall': 0.6000628667225482,
   'f1-score': 0.6555631868131869,
   'support': 9544},
  'accuracy': 0.6800972164204611,
  'macro avg': {'precision': 0.7037885178111205,
   'recall': 0.6513976183106687,
   'f1-score': 0.6662916318376942,
   'support': 26333},
  'weighted avg': {'precision': 0.6918991618030004,
   'recall': 0.6800972164204611,
   'f1-score': 0.6756855852618608,
   'support': 26333}},
 'DecisionTree': {'1': {'precision': 0.49901662792776685,
   'recall': 0.5127686937350726,
   'f1-score': 0.5057992026096412,
   'support': 5443},
  '2': {'precision': 0.5756238176740834,
   'recall': 0.5631940772078265,
   'f1-score': 0.5693411146255625,
   'support': 11346},

Save each model's test predictions to a CSV file for upload to the competition site

In [13]:
# Write one submission file per model: a Trip_ID column next to that model's
# predicted Surge_Pricing_Type, in the original test-set row order.
for model_name, preds in test_preds.items():
    # Build the frame straight from the two columns. Going through a dict keyed
    # by Trip_ID would silently drop rows if an ID were ever duplicated, and
    # zip() would silently truncate on a length mismatch; the DataFrame
    # constructor raises instead.
    submission = pd.DataFrame({'Trip_ID': Trip_ID.values, 'Surge_Pricing_Type': preds})
    submission.to_csv('../submissions/Preprocess2/Preprocess2_{methodname}_test_prediction.csv'.format(methodname = model_name), index = False)