**Loaded packages**

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, chi2  
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics

**Load Data**

In [2]:
# Read the preprocessed train and test CSV files into DataFrames.
train_process3, test_process3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/preprocessed/train_process3.csv",
        "../data/preprocessed/test_process3.csv",
    )
)

In [3]:
# Preview the first rows of the training frame (includes the Surge_Pricing_Type target).
train_process3.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender,Surge_Pricing_Type
0,T0005689460,6.77,2.0,1.0,2.42769,1.0,0.0,3.905,0.0,40.0,46.0,60.0,0.0,2
1,T0005689461,29.47,2.0,10.0,2.78245,2.0,0.0,3.45,0.0,38.0,56.0,78.0,1.0,2
2,T0005689464,41.58,2.8,10.0,2.838005,1.7,4.0,3.50125,2.0,48.6,56.0,77.0,1.0,2
3,T0005689465,61.56,3.0,10.0,2.887663,2.4,0.0,3.45375,0.0,63.5,52.0,74.0,1.0,3
4,T0005689467,54.95,3.0,10.0,3.03453,2.0,0.0,3.4025,4.0,51.0,49.0,102.0,1.0,2


In [4]:
# Preview the first rows of the test frame (same columns as train, minus the target).
test_process3.head()

Unnamed: 0,Trip_ID,Trip_Distance,Type_of_Cab,Customer_Since_Months,Life_Style_Index,Confidence_Life_Style_Index,Destination_Type,Customer_Rating,Cancellation_Last_1Month,Var1,Var2,Var3,Gender
0,T0005689459,9.44,1.0,10.0,2.57438,1.0,1.0,3.68,2.0,68.3,46.0,63.0,1.0
1,T0005689462,32.15,2.0,10.0,2.85143,1.0,0.0,1.59375,0.0,65.0,49.0,80.0,0.0
2,T0005689463,10.38,3.0,4.0,2.7053,1.0,3.0,4.505,0.0,59.0,47.0,74.0,1.0
3,T0005689466,14.94,3.0,6.0,2.48159,3.0,4.0,4.53,0.0,63.0,43.0,54.0,1.0
4,T0005689468,32.03,2.0,7.0,2.81598,1.0,10.0,4.60125,3.0,96.0,44.0,56.0,1.0


**Train-Validation Split**

In [5]:
# Separate the target from the predictors; Trip_ID is excluded from the features.
y = train_process3["Surge_Pricing_Type"]
X = train_process3.drop(columns=["Surge_Pricing_Type", "Trip_ID"])

# The test frame has no target column; its Trip_ID values are kept aside
# so they can be written next to the predictions later.
Trip_ID = test_process3["Trip_ID"]
X_test = test_process3.drop(columns=["Trip_ID"])

In [6]:
# Hold out 20% of the labelled data for validation, stratified on the target,
# with a fixed seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=0,
)

Check the chi-squared score of each feature using SelectKBest

In [7]:
# Score every feature against the target with the chi-squared statistic;
# k='all' keeps all features, so this cell only ranks them.
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X_train, y_train)

# Pair each column name with its score and print them from highest to lowest.
featureScores = pd.DataFrame({'feature': X.columns, 'Score': fit.scores_})
print(featureScores.sort_values(by='Score', ascending=False))

                        feature         Score
0                 Trip_Distance  37788.173363
1                   Type_of_Cab  14197.177188
7      Cancellation_Last_1Month   5183.168216
5              Destination_Type    945.223041
6               Customer_Rating    856.058342
10                         Var3    498.593517
4   Confidence_Life_Style_Index    438.737026
8                          Var1    323.039875
2         Customer_Since_Months    180.231134
9                          Var2     29.041903
3              Life_Style_Index     15.666168
11                       Gender      0.225639


### Pipelines

In [8]:
# Random Forest
def _scaled(estimator):
    """Return a two-step Pipeline: a StandardScaler followed by `estimator` as step 'clf'."""
    return Pipeline([('scaler', StandardScaler()), ('clf', estimator)])


# Random Forest
pipe_rf = _scaled(RandomForestClassifier(random_state=0))

# Decision Tree
pipe_dt = _scaled(DecisionTreeClassifier(random_state=0))

# Dummy (Baseline)
pipe_dum = _scaled(DummyClassifier(random_state=0))

# K Nearest Neighbors
pipe_knn = _scaled(KNeighborsClassifier())

# Naive Bayes
pipe_nb = _scaled(GaussianNB())

# Support Vector Machine
pipe_svm = _scaled(SVC(random_state=0))

In [9]:
# Display names and pipelines, listed in matching order.
models = [
    'RandomForest',
    'DecisionTree',
    'Dummy(Baseline)',
    'KNN',
    'NaiveBayes',
    'SupportVectorMachine',
]
pipelines = [
    pipe_rf,
    pipe_dt,
    pipe_dum,
    pipe_knn,
    pipe_nb,
    pipe_svm,
]

# Map each display name to its pipeline.
model_pipelines = {label: pipeline for label, pipeline in zip(models, pipelines)}
model_pipelines

{'RandomForest': Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', RandomForestClassifier(random_state=0))]),
 'DecisionTree': Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', DecisionTreeClassifier(random_state=0))]),
 'Dummy(Baseline)': Pipeline(steps=[('scaler', StandardScaler()),
                 ('clf', DummyClassifier(random_state=0))]),
 'KNN': Pipeline(steps=[('scaler', StandardScaler()), ('clf', KNeighborsClassifier())]),
 'NaiveBayes': Pipeline(steps=[('scaler', StandardScaler()), ('clf', GaussianNB())]),
 'SupportVectorMachine': Pipeline(steps=[('scaler', StandardScaler()), ('clf', SVC(random_state=0))])}

In [10]:
# Dictionary containing the model names and their scores
# Per-model results, keyed by model name.
models_f1 = {}               # macro-averaged F1 on the validation split
classification_reports = {}  # full classification_report dict on the validation split
test_preds = {}              # predictions on the competition test set

for name, pipe in model_pipelines.items():
    print('\n' + name + ' Fitting')
    pipe.fit(X_train, y_train)
    print(name + ' (Macro Avg - F1 Score):')

    # Classification report on the validation split.
    # zero_division=0 keeps the score at 0 for a class the model never predicts
    # (the same value the default setting reports) but does not emit the
    # undefined-metric warnings that otherwise clutter the cell output.
    report = metrics.classification_report(
        y_val,
        pipe.predict(X_val),
        output_dict=True,
        zero_division=0,
    )

    f1 = report['macro avg']['f1-score']

    # Predict on the test set given by the competition.
    test_pred = pipe.predict(X_test)

    # Record the results for this model.
    test_preds[name] = test_pred
    classification_reports[name] = report
    models_f1[name] = f1

    print(f1)




RandomForest Fitting
RandomForest (Macro Avg - F1 Score):
0.6774983240256301

DecisionTree Fitting
DecisionTree (Macro Avg - F1 Score):
0.559973780554167

Dummy(Baseline) Fitting
Dummy(Baseline) (Macro Avg - F1 Score):
0.20074842750603783

KNN Fitting


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


KNN (Macro Avg - F1 Score):
0.6034851309430658

NaiveBayes Fitting
NaiveBayes (Macro Avg - F1 Score):
0.6343436170211314

SupportVectorMachine Fitting
SupportVectorMachine (Macro Avg - F1 Score):
0.669256309243632


Check the scores of each model

In [11]:
# Print the models ordered from highest to lowest macro F1.
for model_name, score in sorted(models_f1.items(), key=lambda item: item[1], reverse=True):
    print(model_name, score)

RandomForest 0.6774983240256301
SupportVectorMachine 0.669256309243632
NaiveBayes 0.6343436170211314
KNN 0.6034851309430658
DecisionTree 0.559973780554167
Dummy(Baseline) 0.20074842750603783


In [12]:
# Display the full per-class validation report stored for every model.
classification_reports

{'RandomForest': {'1': {'precision': 0.7503797468354431,
   'recall': 0.5445526364137424,
   'f1-score': 0.6311082721175343,
   'support': 5443},
  '2': {'precision': 0.6575077121744745,
   'recall': 0.8077736647276573,
   'f1-score': 0.7249357326478149,
   'support': 11346},
  '3': {'precision': 0.7205116058739933,
   'recall': 0.6374685666387259,
   'f1-score': 0.676450967311541,
   'support': 9544},
  'accuracy': 0.691641666350207,
  'macro avg': {'precision': 0.7094663549613035,
   'recall': 0.6632649559267085,
   'f1-score': 0.6774983240256301,
   'support': 26333},
  'weighted avg': {'precision': 0.6995390662217862,
   'recall': 0.691641666350207,
   'f1-score': 0.6879691330186152,
   'support': 26333}},
 'DecisionTree': {'1': {'precision': 0.49373069234962746,
   'recall': 0.49917325004593055,
   'f1-score': 0.49643705463182897,
   'support': 5443},
  '2': {'precision': 0.5940943146760688,
   'recall': 0.5940419531112286,
   'f1-score': 0.5940681327398528,
   'support': 11346},


Save the test predictions to upload on the competition site

In [13]:
# Write one submission CSV per model: each test Trip_ID next to its predicted class.
for k, v in test_preds.items():
    # Build the frame column-wise so there is exactly one output row per test row.
    # Going through a dict keyed by Trip_ID would silently collapse repeated IDs,
    # and zip() would silently truncate if the two lengths ever differed; the
    # DataFrame constructor raises on a length mismatch instead.
    df = pd.DataFrame({'Trip_ID': Trip_ID.values, 'Surge_Pricing_Type': v})
    df.to_csv('../submissions/Preprocess3/Preprocess3_{methodname}_test_prediction.csv'.format(methodname = k), index = False)