<h2>Importing Libraries and Modules</h2>

In [40]:
# from google.colab import drive
# drive.mount('/content/drive')

In [1]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np

from geopy.distance import great_circle
import datetime
import math
from collections import Counter

from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer
from sklearn.impute import MissingIndicator

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import RepeatedStratifiedKFold

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, RandomizedSearchCV

from sklearn.cluster import DBSCAN
# import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


<h2>Loading the Datasets</h2>

In [2]:
## Loading the Datasets

train = pd.read_csv('../../data/weka_processed/train_pro.csv')
test = pd.read_csv('../../data/test.csv')

# train = pd.read_csv('drive/My Drive/ML/data/weka_processed/train_pro.csv')
# test = pd.read_csv('drive/My Drive/ML/data/test.csv')


<h2>Basic Intuition on the Data</h2>

In [3]:
train.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834,56,0,64,'11/1/2019 0:20','11/1/2019 0:34',6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791,47,0,134,'11/1/2019 0:56','11/1/2019 1:09',6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087,80,0,61,'11/1/2019 1:08','11/1/2019 1:26',6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598,271,15.6638,68,'11/1/2019 2:27','11/1/2019 2:37',6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,?,?,?,?,?,'11/1/2019 3:34','11/1/2019 3:51',6.87441,79.8615,6.84478,79.929,358.39,correct


In [4]:
test.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
1,213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
2,213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
3,213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
4,213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15327 entries, 0 to 15326
Data columns (total 14 columns):
tripid                       15327 non-null int64
additional_fare              15327 non-null object
duration                     15327 non-null object
meter_waiting                15327 non-null object
meter_waiting_fare           15327 non-null object
meter_waiting_till_pickup    15327 non-null object
pickup_time                  15327 non-null object
drop_time                    15327 non-null object
pick_lat                     15327 non-null float64
pick_lon                     15327 non-null float64
drop_lat                     15327 non-null float64
drop_lon                     15327 non-null float64
fare                         15327 non-null object
label                        15327 non-null object
dtypes: float64(4), int64(1), object(9)
memory usage: 1.6+ MB


In [6]:
train.shape

(15327, 14)

In [7]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8576 entries, 0 to 8575
Data columns (total 13 columns):
tripid                       8576 non-null int64
additional_fare              8576 non-null float64
duration                     8576 non-null int64
meter_waiting                8576 non-null int64
meter_waiting_fare           8576 non-null float64
meter_waiting_till_pickup    8576 non-null int64
pickup_time                  8576 non-null object
drop_time                    8576 non-null object
pick_lat                     8576 non-null float64
pick_lon                     8576 non-null float64
drop_lat                     8576 non-null float64
drop_lon                     8576 non-null float64
fare                         8576 non-null float64
dtypes: float64(7), int64(4), object(2)
memory usage: 871.1+ KB


In [8]:
test.shape

(8576, 13)

<h2>Cleaning the Data</h2>

In [9]:
## cleaning weka added unnecessary values

train = train.replace({'?': np.nan})


In [10]:
## converting label values into 0,1 instead of correct, incorrect

train.label = train.label.map(dict(correct=1, incorrect=0))

In [11]:
## check for missing values in the dataset

# train.isna().head()
train.isna().sum()

tripid                         0
additional_fare              196
duration                     196
meter_waiting                196
meter_waiting_fare           196
meter_waiting_till_pickup    196
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         133
label                          0
dtype: int64

In [12]:
test.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

In [13]:
## fill missing values

imputer = KNNImputer(n_neighbors=5, weights='uniform')
train.iloc[:,[1,2,3,4,5,12]] = imputer.fit_transform(train.iloc[:,[1,2,3,4,5,12]])

# ## drop missing values
# train = train.dropna(how='any', axis=0)


In [14]:
train.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64

<h2>Basic Feature Engineering</h2>

In [15]:
## Join train and test datasets in order to obtain the same number of features during categorical conversion
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)


<h4>Clustering Based Feature Column</h4>

In [16]:
dbscan = DBSCAN(eps=0.3, min_samples=5)

cluster_pred = dbscan.fit_predict(dataset.drop(labels=['tripid', 'label', 'pickup_time', 'drop_time'], axis=1))

# cluster_pred = np.where(cluster_pred == -1, 10, cluster_pred)


In [17]:
unique1, count1 = np.unique(cluster_pred, return_counts=True)
{k:v for (k,v) in zip(unique1, count1)}


{-1: 23774, 0: 17, 1: 112}

In [18]:
dataset['cluster'] = cluster_pred


<h4>Location Cluster Feature Column</h4>

In [19]:
pick_coord_dt = dataset.loc[:, ['pick_lat', 'pick_lon']]
drop_coord_dt = dataset.loc[:, ['drop_lat', 'drop_lon']]


In [20]:
dbscan = DBSCAN(eps=0.01, min_samples=5)
pick_labels = dbscan.fit_predict(pick_coord_dt)

dbscan = DBSCAN(eps=0.01, min_samples=5)
drop_labels = dbscan.fit_predict(drop_coord_dt)


In [21]:
# pick_labels = np.where(pick_labels == -1, 50, pick_labels)
# drop_labels = np.where(drop_labels == -1, 50, drop_labels)


In [22]:
unique1, count1 = np.unique(pick_labels, return_counts=True)
{k:v for (k,v) in zip(unique1, count1)}


{-1: 213,
 0: 20205,
 1: 2760,
 2: 88,
 3: 37,
 4: 76,
 5: 5,
 6: 22,
 7: 12,
 8: 8,
 9: 20,
 10: 12,
 11: 27,
 12: 7,
 13: 6,
 14: 28,
 15: 8,
 16: 14,
 17: 9,
 18: 5,
 19: 5,
 20: 6,
 21: 7,
 22: 5,
 23: 159,
 24: 149,
 25: 5,
 26: 5}

In [23]:
unique1, count1 = np.unique(drop_labels, return_counts=True)
{k:v for (k,v) in zip(unique1, count1)}


{-1: 351,
 0: 20147,
 1: 2738,
 2: 75,
 3: 25,
 4: 10,
 5: 62,
 6: 25,
 7: 8,
 8: 21,
 9: 17,
 10: 6,
 11: 10,
 12: 6,
 13: 17,
 14: 17,
 15: 13,
 16: 13,
 17: 5,
 18: 9,
 19: 5,
 20: 5,
 21: 5,
 22: 5,
 23: 6,
 24: 5,
 25: 6,
 26: 4,
 27: 3,
 28: 132,
 29: 146,
 30: 5,
 31: 1}

In [24]:
dataset['pick_cluster'] = pick_labels
dataset['drop_cluster'] = drop_labels


<h2>Advanced Feature Engineering</h2>

In [26]:
## new feature columns
trip_fare = []
trip_duration = []
trip_distance = []

distance_to_duration = []
distance_to_duration_min_meter_waiting = []
fare_to_duration = []
fare_to_distance = []
waiting_fare_to_waiting_duration = []
duration_min_waiting_to_fare = []
additional_fare_to_distance = []
fare_min_waiting_fare_to_duration_min_meter_waiting = []
waiting_duration_to_duration = []
additional_fare_to_fare = []

duration_from_time = []
duration_error = []

## j48 rules
duration_bucket = []    ## 50
fare_bucket = []        ## 50

## dataset inspection rules
fare_waiting_gap = []
additional_fare_greater_than_250 = []
is_duration_zero = []

# time based new features
fare_month = []
fare_hour = []
is_midnight = []
is_weekend = []


In [27]:
## iterate through each row in the dataset
for row in dataset.itertuples():
    fare = float(row.fare)
    additional_fare = float(row.additional_fare)
    meter_waiting_fare = float(row.meter_waiting_fare)
    
    meter_waiting = row.meter_waiting
    meter_waiting_till_pickup = row.meter_waiting_till_pickup
    duration = row.duration
    
    pick_lat = row.pick_lat
    pick_lon = row.pick_lon
    drop_lat = row.drop_lat
    drop_lon = row.drop_lon
    
    pickup_time = row.pickup_time.replace('\'', '')
    drop_time = row.drop_time.replace('\'', '')
    pickup_time = pickup_time.split('/')
    drop_time = drop_time.split('/')
    
    # fare bucket
    fare_bucket.append(int(math.ceil(fare / 50)))
    
    # total fare for the trip
    cur_trip_fare = fare - (additional_fare + meter_waiting_fare)
    trip_fare.append(cur_trip_fare)
    
    # trip duration
    cur_trip_duration = duration - meter_waiting - meter_waiting_till_pickup
    trip_duration.append(cur_trip_duration)
    duration_bucket.append(int(math.ceil(cur_trip_duration / 50)))
    
    if(duration != 0):
        is_duration_zero.append(0)
    else:
        is_duration_zero.append(1)
    
    # trip distance
    cur_trip_distance = float(great_circle((drop_lat, drop_lon),(pick_lat, pick_lon)).kilometers)
    trip_distance.append(cur_trip_distance)
    
    # trip distance to duration
    try:
        cur_dist_to_dur = cur_trip_distance / cur_trip_duration
    except ZeroDivisionError:
        cur_dist_to_dur  = 0
    distance_to_duration.append(cur_dist_to_dur)
    
    # trip distance to (duration - meter_waiting_duration)
    try:
        cur_temp = cur_trip_distance / (duration - meter_waiting)
    except ZeroDivisionError:
        cur_temp = 0
    distance_to_duration_min_meter_waiting.append(cur_temp)
    
    # trip fare to trip duration
    try:
        cur_fare_to_duration = cur_trip_fare / cur_trip_duration
    except ZeroDivisionError:
        cur_fare_to_duration  = 0
    fare_to_duration.append(cur_fare_to_duration)
    
    # trip fare to trip distance
    try:
        cur_fare_to_distance = cur_trip_fare / cur_trip_distance
    except ZeroDivisionError:
        cur_fare_to_distance  = 0
    fare_to_distance.append(cur_fare_to_distance)
    
    # trip waiting fare to waiting duration
    try:
        cur_waiting_fare_to_waiting_dur = meter_waiting_fare / meter_waiting
    except ZeroDivisionError:
        cur_waiting_fare_to_waiting_dur  = 0
    waiting_fare_to_waiting_duration.append(cur_waiting_fare_to_waiting_dur)
    
    # (trip_duration - waiting_duration) to trip_fare
    try:
        cur_temp = (duration - meter_waiting) / fare
    except ZeroDivisionError:
        cur_temp = 0
    duration_min_waiting_to_fare.append(cur_temp)
    
    # additional_fare to distance
    try:
        cur_additional_fare_to_distance = additional_fare / cur_trip_distance
    except ZeroDivisionError:
        cur_additional_fare_to_distance = 0
    additional_fare_to_distance.append(cur_additional_fare_to_distance)
    
    # (fare - waiting_fare) to (duration - meter_waiting_duration)
    try:
        cur_temp = (fare - meter_waiting_fare) / (duration - meter_waiting)
    except ZeroDivisionError:
        cur_temp = 0
    fare_min_waiting_fare_to_duration_min_meter_waiting.append(cur_temp)
    
    # waiting_duration to duration
    try:
        cur_waiting_duration_to_duration = meter_waiting / duration
    except ZeroDivisionError:
        cur_waiting_duration_to_duration = 0
    waiting_duration_to_duration.append(cur_waiting_duration_to_duration)
    
    # additional_fare to fare
    try:
        cur_additional_fare_to_fare = additional_fare / fare
    except ZeroDivisionError:
        cur_additional_fare_to_fare = 0
    additional_fare_to_fare.append(cur_additional_fare_to_fare)
    
    # trip fare to waiting fare
    try:
        cur_fare_waiting_gap = (fare - meter_waiting_fare) / fare
    except ZeroDivisionError:
        cur_fare_waiting_gap  = 0
    fare_waiting_gap.append(cur_fare_waiting_gap)
    
    # additional fare greater than 250
    if(additional_fare > 250):
        additional_fare_greater_than_250.append(1)
    else:
        additional_fare_greater_than_250.append(0)
    
    # time duration
    # pickup time
    month, day, y_hm = int(pickup_time[0]), int(pickup_time[1]), pickup_time[2].split(' ')
    year, h_m = int(y_hm[0]), y_hm[1].split(':')
    hour, minute = int(h_m[0]), int(h_m[1])
    pickup_time = datetime.datetime(year, month, day, hour, minute)

    fare_month.append(month)
    fare_hour.append(hour)

    if 0 <= hour <= 5:
        is_midnight.append(1)
    else:
        is_midnight.append(0)
    
    weekday = pickup_time.weekday()
    if 0 <= weekday <= 4:
        is_weekend.append(1)
    else:
        is_weekend.append(0)
    
    # drop time
    month, day, y_hm = int(drop_time[0]), int(drop_time[1]), drop_time[2].split(' ')
    year, h_m = int(y_hm[0]), y_hm[1].split(':')
    hour, minute = int(h_m[0]), int(h_m[1])
    drop_time = datetime.datetime(year, month, day, hour, minute)
    
    # duration
    cur_duration_from_time = (drop_time - pickup_time).seconds
    duration_from_time.append(cur_duration_from_time)
    duration_error.append(cur_duration_from_time - duration)


In [28]:
## append new columns to dataset
dataset['trip_fare'] = trip_fare
dataset['distance_to_duration_min_meter_waiting'] = distance_to_duration_min_meter_waiting
dataset['trip_duration'] = trip_duration
dataset['trip_distance'] = trip_distance
dataset['distance_to_duration'] = distance_to_duration
dataset['fare_to_duration'] = fare_to_duration
dataset['fare_to_distance'] = fare_to_distance
dataset['waiting_fare_to_waiting_duration'] = waiting_fare_to_waiting_duration
dataset['duration_min_waiting_to_fare'] = duration_min_waiting_to_fare
dataset['additional_fare_to_distance'] = additional_fare_to_distance
dataset['fare_min_waiting_fare_to_duration_min_meter_waiting'] = fare_min_waiting_fare_to_duration_min_meter_waiting
dataset['waiting_duration_to_duration'] = waiting_duration_to_duration
dataset['additional_fare_to_fare'] = additional_fare_to_fare
dataset['duration_from_time'] = duration_from_time
dataset['duration_error'] = duration_error
dataset['duration_bucket'] = duration_bucket
dataset['fare_bucket'] = fare_bucket

dataset['fare_waiting_gap'] = fare_waiting_gap
dataset['additional_fare_greater_than_250'] = additional_fare_greater_than_250
dataset['is_duration_zero'] = is_duration_zero

dataset['fare_month'] = fare_month
dataset['fare_hour'] = fare_hour
dataset['is_midnight'] = is_midnight
dataset['is_weekend'] = is_weekend


In [29]:
# ## column average values
# avg_fare = dataset['fare'].mean()
# avg_trip_fare = dataset['trip_fare'].mean()
# avg_waiting_fare = dataset['meter_waiting_fare'].mean()
# avg_fare_to_distance = dataset['fare_to_distance'].mean()
# avg_waiting_fare_to_duration = dataset['waiting_fare_to_waiting_duration'].mean()


In [30]:
## new columns
is_invalid_total_fare = []
# div_from_avg_total_fare = []
# div_from_avg_waiting_fare = []
# div_from_avg_waiting_fare_to_waiting_duration = []


In [31]:
## iterate through each row in the dataset
for row in dataset.itertuples():
    fare = float(row.fare)
    trip_fare = float(row.trip_fare)
    meter_waiting_fare = float(row.meter_waiting_fare)
    additional_fare = float(row.additional_fare)
    
    fare_to_distance = float(row.fare_to_distance)
    waiting_fare_to_waiting_duration = float(row.waiting_fare_to_waiting_duration)
    
    # total fare less than or equal to zero
    if(fare-meter_waiting_fare-additional_fare <= 0):
        is_invalid_total_fare.append(1)
    else:
        is_invalid_total_fare.append(0)
        
#     # diviation from average
#     div_from_avg_total_fare.append(fare - avg_fare)
#     div_from_avg_waiting_fare.append(meter_waiting_fare - avg_waiting_fare)
#     div_from_avg_waiting_fare_to_waiting_duration.append(waiting_fare_to_waiting_duration - avg_waiting_fare_to_duration)


In [32]:
## append new columns to dataset

dataset['is_invalid_total_fare'] = is_invalid_total_fare
# dataset['div_from_avg_total_fare'] = div_from_avg_total_fare
# dataset['div_from_avg_waiting_fare'] = div_from_avg_waiting_fare
# dataset['div_from_avg_waiting_fare_to_waiting_duration'] = div_from_avg_waiting_fare_to_waiting_duration


<h2>Feature Preprocessing</h2>

In [33]:
## Scaling values into 0-10 range

scaler = MinMaxScaler(feature_range=(0,1))
features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'pick_lat',
    'pick_lon',
    'drop_lat',
    'drop_lon',
    'fare',
    'trip_fare',
    'trip_duration',
    'trip_distance',
    'distance_to_duration',
    'fare_to_duration',
    'fare_to_distance',
    'waiting_fare_to_waiting_duration', 
    'duration_from_time',
    'duration_error',
    'duration_bucket',
    'fare_bucket',
    'distance_to_duration_min_meter_waiting',
    'duration_min_waiting_to_fare',
    'additional_fare_to_distance',
    'fare_min_waiting_fare_to_duration_min_meter_waiting',
    'waiting_duration_to_duration',
    'additional_fare_to_fare',
    'pick_cluster',
    'drop_cluster',
    # 'fare_hour',
    # 'fare_month'
]

dataset[features] = scaler.fit_transform(dataset[features])


In [34]:
## Split into training and testing again

train = dataset[:train_len]
test = dataset[train_len:]
test.drop(labels=['label'], axis=1, inplace=True)

train['label'] = train['label'].astype(int)


In [35]:
## Drop unnecessary columns
labels_to_drop = [
    'tripid',
    'pickup_time',
    'drop_time'
]

train.drop(labels=labels_to_drop, axis=1, inplace=True)


<h2>Feature Importance</h2>

In [36]:
# y = train['label']
# X = train.drop(labels=['label'], axis=1)

# # apply SelectKBest class to extract top 20 best features
# bestfeatures = SelectKBest(score_func=chi2, k='all')
# fit = bestfeatures.fit(X, y)
# dfscores = pd.DataFrame(fit.scores_)
# dfcolumns = pd.DataFrame(X.columns)

# # concat two dataframes for better visualization
# featureScores = pd.concat([dfcolumns, dfscores], axis=1)
# featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
# print(featureScores.nlargest(40, 'Score'))  #print 40 best features


In [37]:
## Drop less important columns
labels_to_drop = [
    'pick_lat',
    'pick_lon',
    'drop_lat',
    'drop_lon',

    # 'trip_duration',
    # 'duration_bucket',
    # 'fare_to_duration',
    # 'distance_to_duration',
#     'fare_hour',
#     'is_weekend'
]

train.drop(labels=labels_to_drop, axis=1, inplace=True)


<h2>Training and Evaluating Basic XGBoost Setup</h2>

In [38]:
## Seperate train features and label

y_train = train['label']
X_train = train.drop(labels='label', axis=1)

X2_train, X2_test, y2_train, y2_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [39]:
def XGBmodel(X_train, X_test, y_train, y_test, params):
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    matrix_test = xgb.DMatrix(X_test, label=y_test)
    
    model=xgb.train(
        params=params,
        dtrain=matrix_train,
        num_boost_round=5000, 
        early_stopping_rounds=10,
        evals=[(matrix_test, 'test')]
    )
    
    return model


In [42]:
params = {
    'max_depth': 10,
    'gamma': 0,
    'eta': .03,
    'subsample': 1,             # use high value
    'colsample_bytree': 0.8,    # use low value
    'eval_metric':'rmse',
    'silent': 0,
    
    'booster': 'dart',
    'learning_rate': 0.01,
    'objective': 'binary:logistic',
    'sample_type': 'weighted',
    'normalize_type': 'forest',
    'rate_drop': 0.2,
    'skip_drop': 0.5,
    'alpha': 1
}


In [43]:
model = XGBmodel(X2_train, X2_test, y2_train, y2_test, params)


Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	test-rmse:0.49571
Will train until test-rmse hasn't improved in 10 rounds.
[1]	test-rmse:0.49147
[2]	test-rmse:0.48725
[3]	test-rmse:0.48311
[4]	test-rmse:0.47899
[5]	test-rmse:0.47492
[6]	test-rmse:0.47096
[7]	test-rmse:0.46705
[8]	test-rmse:0.46316
[9]	test-rmse:0.45929
[10]	test-rmse:0.45547
[11]	test-rmse:0.45176
[12]	test-rmse:0.44806
[13]	test-rmse:0.44443
[14]	test-rmse:0.44084
[15]	test-rmse:0.43722
[16]	test-rmse:0.43363
[17]	test-rmse:0.43013
[18]	test-rmse:0.42664
[19]	test-rmse:0.42323
[20]	test-rmse:0.41979
[21]	test-rmse:0.41649
[22]	test-rmse:0.41322
[23]	test-rmse:0.40997
[24]	test-rmse:0.40673
[25]	test-rmse:0.40359
[26]	test-rmse:0.40041
[27]	test-rmse:0.39727
[28]	test-rmse:0.39416
[29]

[330]	test-rmse:0.18049
[331]	test-rmse:0.18043
[332]	test-rmse:0.18042
[333]	test-rmse:0.18038
[334]	test-rmse:0.18033
[335]	test-rmse:0.18032
[336]	test-rmse:0.18029
[337]	test-rmse:0.18026
[338]	test-rmse:0.18026
[339]	test-rmse:0.18025
[340]	test-rmse:0.18021
[341]	test-rmse:0.18015
[342]	test-rmse:0.18010
[343]	test-rmse:0.18011
[344]	test-rmse:0.18009
[345]	test-rmse:0.18004
[346]	test-rmse:0.18002
[347]	test-rmse:0.18000
[348]	test-rmse:0.17995
[349]	test-rmse:0.17991
[350]	test-rmse:0.17989
[351]	test-rmse:0.17983
[352]	test-rmse:0.17978
[353]	test-rmse:0.17972
[354]	test-rmse:0.17969
[355]	test-rmse:0.17969
[356]	test-rmse:0.17966
[357]	test-rmse:0.17959
[358]	test-rmse:0.17955
[359]	test-rmse:0.17954
[360]	test-rmse:0.17951
[361]	test-rmse:0.17944
[362]	test-rmse:0.17944
[363]	test-rmse:0.17943
[364]	test-rmse:0.17939
[365]	test-rmse:0.17939
[366]	test-rmse:0.17937
[367]	test-rmse:0.17933
[368]	test-rmse:0.17930
[369]	test-rmse:0.17927
[370]	test-rmse:0.17925
[371]	test-rmse:

<h2>Predicting and Preparing Submission</h2>

In [44]:
trip_ids = test.tripid

labels_to_drop = [
    'tripid',
    'pick_lat',
    'pick_lon',
    'drop_lat',
    'drop_lon',
    'pickup_time',
    'drop_time',

#     'fare_hour',
#     'is_weekend'
    
#     'trip_distance',
#     'fare_to_duration',
#     'trip_duration',
#     'distance_to_duration',
#     'duration_error',
#     'meter_waiting_till_pickup',
#     'additional_fare'
]

test = test.drop(labels=labels_to_drop, axis=1)


In [47]:
predictions = model.predict(xgb.DMatrix(test), ntree_limit = model.best_ntree_limit).tolist()

# round predictions
pred_rounded = []

for pred in predictions:
    if pred > 0.5:
        pred_rounded.append(1)
    else:
        pred_rounded.append(0)

#     pred_rounded.append(round(pred))


In [48]:
unique, count = np.unique(pred_rounded, return_counts=True)
{k:v for (k,v) in zip(unique, count)}


{0: 458, 1: 8118}

In [49]:
output = pd.DataFrame({'tripid': trip_ids, 'prediction': pred_rounded})
output.to_csv('../../submissions/160253h_submission_37.csv', index=False)
print('Completed!')

# # output = pd.DataFrame({'tripid': trip_ids, 'prediction': predictions})
# # output.to_csv('160253h_submission_37.csv', index=False)

# # !cp '160253h_submission_37.csv' "drive/My Drive/ML/submissions/"

# # print('Completed!')


Completed!


model training.

with feature engineering (added more new features).

datetime columns and coordinates have dropped (new features added).

clustering features.

submissions:

===============================================================================

(imputing missing values)

[model: XGBoost (regression)]

'objective': 'reg:logistic' | colsample_bytree: 0.8 | alpha: 1 | weighted | forest | threshold: 0.56

160253h_submission_35.csv

score: 0.97915
