<h2>Importing Libraries and Modules</h2>

In [1]:
# !pip install geopy
# !pip install scikit-learn==0.22.1 --user
# !pip install imblearn --user
# !pip install xgboost

In [2]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

from geopy.distance import great_circle
import datetime
import math
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, RandomizedSearchCV


<h2>Loading the Datasets</h2>

In [3]:
## Loading the Datasets
train = pd.read_csv('../../data/weka_processed/train_pro.csv')
test = pd.read_csv('../../data/test.csv')


<h2>Basic Intuition on the Data</h2>

In [4]:
train.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834,56,0,64,'11/1/2019 0:20','11/1/2019 0:34',6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791,47,0,134,'11/1/2019 0:56','11/1/2019 1:09',6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087,80,0,61,'11/1/2019 1:08','11/1/2019 1:26',6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598,271,15.6638,68,'11/1/2019 2:27','11/1/2019 2:37',6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,?,?,?,?,?,'11/1/2019 3:34','11/1/2019 3:51',6.87441,79.8615,6.84478,79.929,358.39,correct


In [5]:
test.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
1,213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
2,213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
3,213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
4,213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [6]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15327 entries, 0 to 15326
Data columns (total 14 columns):
tripid                       15327 non-null int64
additional_fare              15327 non-null object
duration                     15327 non-null object
meter_waiting                15327 non-null object
meter_waiting_fare           15327 non-null object
meter_waiting_till_pickup    15327 non-null object
pickup_time                  15327 non-null object
drop_time                    15327 non-null object
pick_lat                     15327 non-null float64
pick_lon                     15327 non-null float64
drop_lat                     15327 non-null float64
drop_lon                     15327 non-null float64
fare                         15327 non-null object
label                        15327 non-null object
dtypes: float64(4), int64(1), object(9)
memory usage: 1.6+ MB


In [7]:
train.shape

(15327, 14)

In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8576 entries, 0 to 8575
Data columns (total 13 columns):
tripid                       8576 non-null int64
additional_fare              8576 non-null float64
duration                     8576 non-null int64
meter_waiting                8576 non-null int64
meter_waiting_fare           8576 non-null float64
meter_waiting_till_pickup    8576 non-null int64
pickup_time                  8576 non-null object
drop_time                    8576 non-null object
pick_lat                     8576 non-null float64
pick_lon                     8576 non-null float64
drop_lat                     8576 non-null float64
drop_lon                     8576 non-null float64
fare                         8576 non-null float64
dtypes: float64(7), int64(4), object(2)
memory usage: 871.1+ KB


In [9]:
test.shape

(8576, 13)

<h2>Cleaning the Data</h2>

In [10]:
## cleaning weka added unnecessary values

train = train.replace({'?': np.nan})


In [11]:
## converting label values into 0,1 instead of correct, incorrect

train.label = train.label.map(dict(correct=1, incorrect=0))

In [12]:
## check for missing values in the dataset

# train.isna().head()
train.isna().sum()

tripid                         0
additional_fare              196
duration                     196
meter_waiting                196
meter_waiting_fare           196
meter_waiting_till_pickup    196
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         133
label                          0
dtype: int64

In [13]:
test.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

In [14]:
## fill missing values by mean of the column

# train.fillna(train.mean(), inplace=True)

# imputer = IterativeImputer(max_iter=10, random_state=42)
# imputer.fit(train.iloc[:,[1,2,3,4,5,12]])
# train.iloc[:,[1,2,3,4,5,12]] = imputer.transform(train.iloc[:,[1,2,3,4,5,12]])


imputer = KNNImputer(n_neighbors=5, weights='uniform')
train.iloc[:,[1,2,3,4,5,12]] = imputer.fit_transform(train.iloc[:,[1,2,3,4,5,12]])


In [15]:
train.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64

<h2>Basic Feature Engineering</h2>

In [16]:
## Join train and test datasets in order to obtain the same number of features during categorical conversion
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)


<h2>Advanced Feature Engineering</h2>

In [17]:
## new feature columns
trip_fare = []
trip_duration = []
trip_distance = []

distance_to_duration = []
fare_to_duration = []
fare_to_distance = []
waiting_fare_to_waiting_duration = []

duration_from_time = []
duration_error = []

## j48 rules
duration_bucket = []    ## 50
fare_bucket = []        ## 50

## dataset inspection rules
fare_waiting_gap = []
additional_fare_greater_than_250 = []
is_duration_zero = []

In [18]:
## iterate through each row in the dataset
for row in dataset.itertuples():
    fare = float(row.fare)
    additional_fare = float(row.additional_fare)
    meter_waiting_fare = float(row.meter_waiting_fare)
    
    meter_waiting = row.meter_waiting
    meter_waiting_till_pickup = row.meter_waiting_till_pickup
    duration = row.duration
    
    pick_lat = row.pick_lat
    pick_lon = row.pick_lon
    drop_lat = row.drop_lat
    drop_lon = row.drop_lon
    
    pickup_time = row.pickup_time.replace('\'', '')
    drop_time = row.drop_time.replace('\'', '')
    pickup_time = pickup_time.split('/')
    drop_time = drop_time.split('/')
    
    # fare bucket
    fare_bucket.append(int(math.ceil(fare / 50)))
    
    # total fare for the trip
    cur_trip_fare = fare - (additional_fare + meter_waiting_fare)
    trip_fare.append(cur_trip_fare)
    
    # trip duration
    cur_trip_duration = duration - meter_waiting - meter_waiting_till_pickup
    trip_duration.append(cur_trip_duration)
    duration_bucket.append(int(math.ceil(cur_trip_duration / 50)))
    
    if(duration != 0):
        is_duration_zero.append(0)
    else:
        is_duration_zero.append(1)
    
    # trip distance
    cur_trip_distance = float(great_circle((drop_lat, drop_lon),(pick_lat, pick_lon)).kilometers)
    trip_distance.append(cur_trip_distance)
    
    # trip distance to duration
    try:
        cur_dist_to_dur = cur_trip_distance / cur_trip_duration
    except ZeroDivisionError:
        cur_dist_to_dur  = 0
    distance_to_duration.append(cur_dist_to_dur)
    
    # trip fare to trip duration
    try:
        cur_fare_to_duration = cur_trip_fare / cur_trip_duration
    except ZeroDivisionError:
        cur_fare_to_duration  = 0
    fare_to_duration.append(cur_fare_to_duration)
    
    # trip fare to trip distance
    try:
        cur_fare_to_distance = cur_trip_fare / cur_trip_distance
    except ZeroDivisionError:
        cur_fare_to_distance  = 0
    fare_to_distance.append(cur_fare_to_distance)
    
    # trip waiting fare to waiting duration
    try:
        cur_waiting_fare_to_waiting_dur = meter_waiting_fare / meter_waiting
    except ZeroDivisionError:
        cur_waiting_fare_to_waiting_dur  = 0
    waiting_fare_to_waiting_duration.append(cur_waiting_fare_to_waiting_dur)
    
    # trip fare to waiting fare
    try:
        cur_fare_waiting_gap = (fare - meter_waiting_fare) / fare
    except ZeroDivisionError:
        cur_fare_waiting_gap  = 0
    fare_waiting_gap.append(cur_fare_waiting_gap)
    
    # additional fare greater than 250
    if(additional_fare > 250):
        additional_fare_greater_than_250.append(1)
    else:
        additional_fare_greater_than_250.append(0)
    
    # time duration
    # pickup time
    month, day, y_hm = int(pickup_time[0]), int(pickup_time[1]), pickup_time[2].split(' ')
    year, h_m = int(y_hm[0]), y_hm[1].split(':')
    hour, minute = int(h_m[0]), int(h_m[1])
    pickup_time = datetime.datetime(year, month, day, hour, minute)
    
    # drop time
    month, day, y_hm = int(drop_time[0]), int(drop_time[1]), drop_time[2].split(' ')
    year, h_m = int(y_hm[0]), y_hm[1].split(':')
    hour, minute = int(h_m[0]), int(h_m[1])
    drop_time = datetime.datetime(year, month, day, hour, minute)
    
    # duration
    cur_duration_from_time = (drop_time - pickup_time).seconds
    duration_from_time.append(cur_duration_from_time)
    duration_error.append(cur_duration_from_time - duration)


In [19]:
## append new columns to dataset
dataset['trip_fare'] = trip_fare
dataset['trip_duration'] = trip_duration
dataset['trip_distance'] = trip_distance
dataset['distance_to_duration'] = distance_to_duration
dataset['fare_to_duration'] = fare_to_duration
dataset['fare_to_distance'] = fare_to_distance
dataset['waiting_fare_to_waiting_duration'] = waiting_fare_to_waiting_duration
dataset['duration_from_time'] = duration_from_time
dataset['duration_error'] = duration_error
dataset['duration_bucket'] = duration_bucket
dataset['fare_bucket'] = fare_bucket

dataset['fare_waiting_gap'] = fare_waiting_gap
dataset['additional_fare_greater_than_250'] = additional_fare_greater_than_250
dataset['is_duration_zero'] = is_duration_zero


In [20]:
## column average values
avg_fare = dataset['fare'].mean()
avg_trip_fare = dataset['trip_fare'].mean()
avg_waiting_fare = dataset['meter_waiting_fare'].mean()
avg_fare_to_distance = dataset['fare_to_distance'].mean()
avg_waiting_fare_to_duration = dataset['waiting_fare_to_waiting_duration'].mean()

In [21]:
## new columns
is_invalid_total_fare = []
div_from_avg_total_fare = []
div_from_avg_waiting_fare = []
div_from_avg_waiting_fare_to_waiting_duration = []

In [22]:
## iterate through each row in the dataset
for row in dataset.itertuples():
    fare = float(row.fare)
    trip_fare = float(row.trip_fare)
    meter_waiting_fare = float(row.meter_waiting_fare)
    additional_fare = float(row.additional_fare)
    
    fare_to_distance = float(row.fare_to_distance)
    waiting_fare_to_waiting_duration = float(row.waiting_fare_to_waiting_duration)
    
    # total fare less than or equal to zero
    if(fare-meter_waiting_fare-additional_fare <= 0):
        is_invalid_total_fare.append(1)
    else:
        is_invalid_total_fare.append(0)
        
    # diviation from average
    div_from_avg_total_fare.append(fare - avg_fare)
    div_from_avg_waiting_fare.append(meter_waiting_fare - avg_waiting_fare)
    div_from_avg_waiting_fare_to_waiting_duration.append(waiting_fare_to_waiting_duration - avg_waiting_fare_to_duration)


In [23]:
## append new columns to dataset
dataset['is_invalid_total_fare'] = is_invalid_total_fare
dataset['div_from_avg_total_fare'] = div_from_avg_total_fare
dataset['div_from_avg_waiting_fare'] = div_from_avg_waiting_fare
dataset['div_from_avg_waiting_fare_to_waiting_duration'] = div_from_avg_waiting_fare_to_waiting_duration


<h2>Feature Preprocessing</h2>

In [24]:
## Scaling values into 0-1 range

scaler = MinMaxScaler(feature_range=(0,1))
features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'pick_lat',
    'pick_lon',
    'drop_lat',
    'drop_lon',
    'fare',
    'trip_fare',
    'trip_duration',
    'trip_distance',
    'distance_to_duration',
    'fare_to_duration',
    'fare_to_distance',
    'waiting_fare_to_waiting_duration', 
    'duration_from_time',
    'duration_error',
    'duration_bucket',
    'fare_bucket'
]

dataset[features] = scaler.fit_transform(dataset[features])


In [25]:
## Split into training and testing again

train = dataset[:train_len]
test = dataset[train_len:]
test.drop(labels=['label'], axis=1, inplace=True)

train['label'] = train['label'].astype(int)


In [26]:
## Drop unnecessary columns
labels_to_drop = [
    'tripid',
#     'pick_lat',
#     'pick_lon',
#     'drop_lat',
#     'drop_lon',
    'pickup_time',
    'drop_time'
]

train.drop(labels=labels_to_drop, axis=1, inplace=True)


<h2>Feature Importance</h2>

In [27]:
# y = train['label']
# X = train.drop(labels=['label'], axis=1)

# # apply SelectKBest class to extract top 20 best features
# bestfeatures = SelectKBest(score_func=chi2, k='all')
# fit = bestfeatures.fit(X, y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)

# # concat two dataframes for better visualization
# featureScores = pd.concat([dfcolumns, dfscores], axis=1)
# featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
# print(featureScores.nlargest(40, 'Score'))  #print 40 best features


In [28]:
## Drop less important columns
labels_to_drop = [
#     'trip_distance',
#     'fare_to_duration',
#     'trip_duration',
#     'distance_to_duration',
#     'duration_error',
#     'meter_waiting_till_pickup',
#     'additional_fare'
]

# train.drop(labels=labels_to_drop, axis=1, inplace=True)


<h2>Training and Evaluating Basic XGBoost Setup</h2>

In [29]:
## Seperate train features and label

y_train = train['label']
X_train = train.drop(labels='label', axis=1)

In [30]:
## model definition
model = XGBClassifier(random_state=42)

## evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='f1', cv=cv, n_jobs=-1)

print('Mean F1 Score: %.4f' % np.mean(scores))

Mean F1 Score: 0.9767


<h2>Handle Class Imbalancy - Weighted XGBoost</h2>

In [31]:
counter = Counter(y_train)
estimate = counter[1] / counter[0]

print(counter)
print('Estimate: %.3f' % estimate)

estimate = round(estimate)
print('Approximate Estimate: %i' % estimate)


Counter({1: 13925, 0: 1402})
Estimate: 9.932
Approximate Estimate: 10


In [32]:
## model definition
model = XGBClassifier(random_state=42, scale_pos_weight=estimate)

## evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
scores = cross_val_score(model, X_train, y_train, scoring='f1', cv=cv, n_jobs=-1)

print('Mean F1 Score: %.4f' % np.mean(scores))


Mean F1 Score: 0.9759


<h2>Tune for the Class Weight - Weighted XGBoost GridSearchCV</h2>

In [33]:
weights = [1, 3, 5, 7, 9, 10, 11, 12, 14, 15, 20]
param_grid = dict(scale_pos_weight=weights)

In [34]:
model = XGBClassifier(random_state=42)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='f1')
grid_result = grid.fit(X_train, y_train)


In [35]:
## report
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# all configurations
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']

for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


Best: 0.976748 using {'scale_pos_weight': 1}
0.976748 (0.003030) with: {'scale_pos_weight': 1}
0.976566 (0.002406) with: {'scale_pos_weight': 3}
0.976319 (0.002207) with: {'scale_pos_weight': 5}
0.976116 (0.001935) with: {'scale_pos_weight': 7}
0.976227 (0.002208) with: {'scale_pos_weight': 9}
0.975875 (0.002089) with: {'scale_pos_weight': 10}
0.975870 (0.002225) with: {'scale_pos_weight': 11}
0.975714 (0.002176) with: {'scale_pos_weight': 12}
0.975775 (0.002045) with: {'scale_pos_weight': 14}
0.975893 (0.002294) with: {'scale_pos_weight': 15}
0.975973 (0.002272) with: {'scale_pos_weight': 20}


<h2>Cross Validation to Tune Hiper Parameters</h2>

In [97]:
n_estimators = [int(x) for x in np.linspace(start=100, stop=2600, num=10)]

max_depth = [int(x) for x in np.linspace(1, 15, num=1)]
max_depth.append(None)

learning_rate = [0.1, 0.01, 0.001, 0.0001]
gamma = [0, 0.1, 1, 2, 5]
reg_alpha = [0.01, 0.1, 1, 10]
reg_lambda = [0.01, 0.1, 1, 10]
booster = ['gbtree', 'gblinear', 'dart']
tree_method = ['auto']
num_parallel_tree = [1, 2, 5, 10]


In [98]:
## Creating the random grid

param_grid = {
    'n_estimators': n_estimators,
    'max_depth': max_depth,
    'learning_rate': learning_rate,
    'gamma': gamma,
    'reg_alpha': reg_alpha,
    'reg_lambda': reg_lambda,
    'booster': booster,
    'tree_method': tree_method,
    'num_parallel_tree': num_parallel_tree
}

In [None]:
model_random = XGBClassifier(random_state=42, scale_pos_weight=1, n_jobs=-1)
cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=42)

random_grid = GridSearchCV(estimator=model_random, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='f1')
random_grid.fit(X_train, y_train)


In [None]:
## Best parameters
random_grid.best_params_


In [80]:
# # ## GridSearch on best range
# param_grid = {
#     'n_estimators': n_estimators,
#     'max_depth': max_depth,
#     'learning_rate': learning_rate,
#     'gamma': gamma,
#     'reg_alpha': reg_alpha,
#     'reg_lambda': reg_lambda,
#     'booster': booster,
#     'tree_method': tree_method,
#     'num_parallel_tree': num_parallel_tree
# }

In [76]:
# model = XGBClassifier(random_state=42, scale_pos_weight=1, n_jobs=-1)
# cv = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=42)

# grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=cv, scoring='f1')
# grid.fit(X_train, y_train)


In [None]:
# grid.best_params_

<h2>Train for the Selected Setup</h2>

In [63]:
model = XGBClassifier(
    scale_pos_weight = 1,
    n_jobs = -1,
    random_state = 42,
    n_estimators = 800,
    max_depth = 15,
    learning_rate = 0.1,
    gamma = 0,
    reg_alpha = 0.1,
    reg_lambda = 0.1,
    booster = 'gbtree',
    tree_method = 'auto',
    num_parallel_tree = 1
)

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)

rfc_scores = []
score = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')
rfc_scores.append(score.mean())

print('% Accuracy :', round(score.mean()*100, 4))


% Accuracy : 97.725


In [64]:
model.fit(X_train, y_train)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.1, max_delta_step=0, max_depth=15,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=800, n_jobs=-1, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0.1,
              reg_lambda=0.1, scale_pos_weight=1, subsample=1,
              tree_method='auto', validate_parameters=False, verbosity=None)

In [None]:
# ## parameters and accuracies

# n_estimators      : 800     1000    600     800     800     800
# max_depth         : 2       2       2       5       10      15
# learning_rate     : 0.1     0.1     0.1     0.1     0.1     0.1
# gamma             : 0       0       0       0       0       0
# reg_alpha         : 0.01    0.01    0.01    0.01    0.01    0.01
# reg_lambda        : 0.01    0.01    0.01    0.01    0.01    0.01
# booster           : gbtree  gbtree  gbtree  gbtree  gbtree  gbtree
# num_parallel_tree : 1       1       1       1       1       1
# accuracy          : 97.70   97.69   97.68   97.67   97.69   97.72
    

# n_estimators      : 800     800     800     800     800     800
# max_depth         : 15      15      15      15      15      15
# learning_rate     : 0.01    0.001   0.1     0.1     0.1     0.1
# gamma             : 0       0       1       2       5       0.1
# reg_alpha         : 0.01    0.01    0.01    0.01    0.01    0.01
# reg_lambda        : 0.01    0.01    0.01    0.01    0.01    0.01
# booster           : gbtree  gbtree  gbtree  gbtree  gbtree  gbtree
# num_parallel_tree : 1       1       1       1       1       1
# accuracy          : 97.68   97.45   97.68   97.69   97.69   97.71
    

# n_estimators      : 800     800     800     800     800     800
# max_depth         : 15      15      15      15      15      15
# learning_rate     : 0.1     0.1     0.1     0.1     0.1     0.1
# gamma             : 0.01    0.001   0       0       0       0
# reg_alpha         : 0.01    0.01    0.1     1       0.1     0.1
# reg_lambda        : 0.01    0.01    0.1     1       0.1     0.1
# booster           : gbtree  gbtree  gbtree  gbtree  gbtree  gbtree
# num_parallel_tree : 1       1       1       1       5       8
# accuracy          : 97.71   97.70   97.725  97.70   97.72   97.71


<h2>Predicting and Preparing the Submission</h2>

In [65]:
trip_ids = test.tripid

labels_to_drop = [
    'tripid',
#     'pick_lat',
#     'pick_lon',
#     'drop_lat',
#     'drop_lon',
    'pickup_time',
    'drop_time',
#     'trip_distance',
#     'fare_to_duration',
#     'trip_duration',
#     'distance_to_duration',
#     'duration_error',
#     'meter_waiting_till_pickup',
#     'additional_fare'
]

test = test.drop(labels=labels_to_drop, axis=1)

predictions = model.predict(test)

output = pd.DataFrame({'tripid': trip_ids, 'prediction': predictions})
output.to_csv('../../submissions/160253h_submission_15.csv', index=False)
print('Completed!')


Completed!


model training.

KNNImputer to impute missing values.

with cross validation (to tune hiper-parameters).

with feature engineering (added 16 new features).

datetime columns have dropped (new features added).

[model: XGBoost Classifier]

score: 0.9787