<h2>Importing Libraries and Modules</h2>

In [24]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier, VotingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


<h2>Loading the Datasets</h2>

In [25]:
## Loading the Datasets
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')


<h2>Basic Intuition on the Data</h2>

In [26]:
train.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [27]:
test.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
1,213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
2,213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
3,213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
4,213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17176 entries, 0 to 17175
Data columns (total 14 columns):
tripid                       17176 non-null int64
additional_fare              16974 non-null float64
duration                     16974 non-null float64
meter_waiting                16974 non-null float64
meter_waiting_fare           16974 non-null float64
meter_waiting_till_pickup    16974 non-null float64
pickup_time                  17176 non-null object
drop_time                    17176 non-null object
pick_lat                     17176 non-null float64
pick_lon                     17176 non-null float64
drop_lat                     17176 non-null float64
drop_lon                     17176 non-null float64
fare                         17039 non-null float64
label                        17176 non-null object
dtypes: float64(10), int64(1), object(3)
memory usage: 1.8+ MB


In [29]:
train.shape

(17176, 14)

In [30]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8576 entries, 0 to 8575
Data columns (total 13 columns):
tripid                       8576 non-null int64
additional_fare              8576 non-null float64
duration                     8576 non-null int64
meter_waiting                8576 non-null int64
meter_waiting_fare           8576 non-null float64
meter_waiting_till_pickup    8576 non-null int64
pickup_time                  8576 non-null object
drop_time                    8576 non-null object
pick_lat                     8576 non-null float64
pick_lon                     8576 non-null float64
drop_lat                     8576 non-null float64
drop_lon                     8576 non-null float64
fare                         8576 non-null float64
dtypes: float64(7), int64(4), object(2)
memory usage: 871.1+ KB


In [31]:
test.shape

(8576, 13)

<h2>Cleaning the Data</h2>

In [32]:
## check for missing values in the dataset

# train.isna().head()
train.isna().sum()

tripid                         0
additional_fare              202
duration                     202
meter_waiting                202
meter_waiting_fare           202
meter_waiting_till_pickup    202
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         137
label                          0
dtype: int64

In [33]:
test.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

In [34]:
## fill missing values by mean of the column
train.fillna(train.mean(), inplace=True)


In [35]:
train.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64

In [36]:
## Converting date columns

# train['pickup_time'] = pd.to_datetime(train['pickup_time'], format='%m/%d/%Y %H%M')
# test['pickup_time'] = pd.to_datetime(test['pickup_time'], format='%m/%d/%Y %H%M')

## Dropping date columns
train.drop(['pickup_time', 'drop_time'], axis=1, inplace=True)
test.drop(['pickup_time', 'drop_time'], axis=1, inplace=True)


In [37]:
## converting label values into 0,1 instead of correct, incorrect

train.label = train.label.map(dict(correct=1, incorrect=0))

<h2>Basic Feature Engineering</h2>

In [38]:
## Join train and test datasets in order to obtain the same number of features during categorical conversion
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


<h2>Advanced Feature Engineering</h2>

<h2>Feature Preprocessing</h2>

In [39]:
## Scaling values into 0-1 range

scaler = MinMaxScaler(feature_range=(0,1))
features = ['additional_fare', 'duration', 'meter_waiting', 'meter_waiting_fare', 'meter_waiting_till_pickup', 'pick_lat', 'pick_lon', 'drop_lat', 'drop_lon', 'fare']
dataset[features] = scaler.fit_transform(dataset[features])


In [40]:
## Split into training and testing again

train = dataset[:train_len]
test = dataset[train_len:]
test.drop(labels=['label'], axis=1, inplace=True)

train['label'] = train['label'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [41]:
## Drop unnecessary columns

train.drop(labels=['tripid'], axis=1, inplace=True)


<h2>Feature Importance</h2>

In [42]:
y = train['label']
X = train.drop(labels=['label'], axis=1)

# apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
print(featureScores.nlargest(30, 'Score'))  #print 10 best features


                       Specs      Score
6         meter_waiting_fare  42.959063
5              meter_waiting  40.890308
4                       fare  23.059641
3                   duration  16.979525
9                   pick_lon   6.718608
0            additional_fare   1.456858
7  meter_waiting_till_pickup   0.608440
8                   pick_lat   0.255063
1                   drop_lat   0.009238
2                   drop_lon   0.001652


In [43]:
## Dropping less important features
train = train.drop(labels=['pick_lon', 'pick_lat', 'drop_lat', 'drop_lon'], axis=1)


<h2>Training and Testing Different Algorithms</h2>

In [44]:
## Seperate train features and label

y_train = train['label']
X_train = train.drop(labels='label', axis=1)

In [45]:
kfold = StratifiedKFold(n_splits=20, random_state=42, shuffle=False)


In [46]:
## Test different algorithms
random_state = 42

classifiers = []
classifiers.append(SVC(random_state=random_state))
classifiers.append(DecisionTreeClassifier(random_state=random_state))
classifiers.append(AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state),random_state=random_state,learning_rate=0.1))
classifiers.append(RandomForestClassifier(random_state=random_state))
classifiers.append(ExtraTreesClassifier(random_state=random_state))
classifiers.append(GradientBoostingClassifier(random_state=random_state))
classifiers.append(MLPClassifier(random_state=random_state))
classifiers.append(KNeighborsClassifier())
classifiers.append(LogisticRegression(random_state = random_state))
classifiers.append(LinearDiscriminantAnalysis())

cv_results = []
for classifier in classifiers:
    cv_results.append(
        cross_val_score(classifier, X_train, y=y_train, scoring="f1", cv=kfold, n_jobs=4)
    )

cv_means = []
cv_std = []
for cv_result in cv_results:
    cv_means.append(cv_result.mean())
    cv_std.append(cv_result.std())

cv_res = pd.DataFrame({"CrossValMeans":cv_means,"CrossValerrors": cv_std,"Algorithm":["SVC","DecisionTree","AdaBoost",
"RandomForest","ExtraTrees","GradientBoosting","MultipleLayerPerceptron","KNeighboors","LogisticRegression","LinearDiscriminantAnalysis"]})

cv_res


Unnamed: 0,CrossValMeans,CrossValerrors,Algorithm
0,0.948661,0.000285,SVC
1,0.953232,0.007428,DecisionTree
2,0.952178,0.007748,AdaBoost
3,0.968015,0.008707,RandomForest
4,0.966397,0.007826,ExtraTrees
5,0.965585,0.008187,GradientBoosting
6,0.951167,0.002,MultipleLayerPerceptron
7,0.966317,0.005003,KNeighboors
8,0.948974,0.000625,LogisticRegression
9,0.949501,0.001768,LinearDiscriminantAnalysis


<h2>Train for the Selected Model</h2>

In [47]:
model = RandomForestClassifier()
model.fit(X_train, y_train)




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

<h2>Predicting and Preparing the Submission</h2>

In [48]:
trip_ids = test.tripid
test = test.drop(labels=['tripid'], axis=1)
test = test.drop(labels=['pick_lon', 'pick_lat', 'drop_lat', 'drop_lon'], axis=1)

predictions = model.predict(test)

output = pd.DataFrame({'tripid': trip_ids, 'prediction': predictions})
output.to_csv('../submissions/160253h_submission_02.csv', index=False)
print('Completed!')


Completed!


Initial model training.

without any feature engineering.

datetime and coordinates columns have dropped.

[model: Random Forest Classifier]

score: 0.97274