In [0]:
from google.colab import drive
drive.mount('/content/drive')

## Importing Libraries and Modules

In [0]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from geopy.distance import great_circle
import datetime
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, KNNImputer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split

from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
import keras.backend as K
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from keras.utils import to_categorical
from keras.utils import plot_model
from keras.models import Model
from keras.layers import Input
from keras.layers.merge import concatenate
from numpy import argmax
from numpy import dstack

from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from sklearn.utils import class_weight
from imblearn.over_sampling import SMOTE


## Loading the Datasets

In [0]:
## Loading the Datasets
train = pd.read_csv('drive/My Drive/ML/data/weka_processed/train_pro.csv')
test = pd.read_csv('drive/My Drive/ML/data/test.csv')


## Basic Intuition on the Data

In [0]:
train.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
0,189123628,10.5,834,56,0,64,'11/1/2019 0:20','11/1/2019 0:34',6.86252,79.8993,6.9033,79.8783,270.32,correct
1,189125358,10.5,791,47,0,134,'11/1/2019 0:56','11/1/2019 1:09',6.88589,79.8984,6.91373,79.8923,197.85,correct
2,189125719,10.5,1087,80,0,61,'11/1/2019 1:08','11/1/2019 1:26',6.90839,79.8651,6.93669,79.9146,301.64,correct
3,189127273,10.5,598,271,15.6638,68,'11/1/2019 2:27','11/1/2019 2:37',6.9257,79.8895,6.92748,79.8971,82.3,correct
4,189128020,?,?,?,?,?,'11/1/2019 3:34','11/1/2019 3:51',6.87441,79.8615,6.84478,79.929,358.39,correct


In [0]:
test.head()

Unnamed: 0,tripid,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare
0,213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,6.83454,79.875,6.7749,79.884,289.27
1,213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,6.91168,79.8723,6.55091,79.9706,1912.7
2,213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,6.92145,79.8478,6.90539,79.8989,394.0
3,213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,6.77433,79.9416,6.80401,79.9407,154.32
4,213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,6.97968,79.913,6.98875,79.8914,147.47


In [0]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15327 entries, 0 to 15326
Data columns (total 14 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   tripid                     15327 non-null  int64  
 1   additional_fare            15327 non-null  object 
 2   duration                   15327 non-null  object 
 3   meter_waiting              15327 non-null  object 
 4   meter_waiting_fare         15327 non-null  object 
 5   meter_waiting_till_pickup  15327 non-null  object 
 6   pickup_time                15327 non-null  object 
 7   drop_time                  15327 non-null  object 
 8   pick_lat                   15327 non-null  float64
 9   pick_lon                   15327 non-null  float64
 10  drop_lat                   15327 non-null  float64
 11  drop_lon                   15327 non-null  float64
 12  fare                       15327 non-null  object 
 13  label                      15327 non-null  obj

In [0]:
train.shape

(15327, 14)

In [0]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8576 entries, 0 to 8575
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   tripid                     8576 non-null   int64  
 1   additional_fare            8576 non-null   float64
 2   duration                   8576 non-null   int64  
 3   meter_waiting              8576 non-null   int64  
 4   meter_waiting_fare         8576 non-null   float64
 5   meter_waiting_till_pickup  8576 non-null   int64  
 6   pickup_time                8576 non-null   object 
 7   drop_time                  8576 non-null   object 
 8   pick_lat                   8576 non-null   float64
 9   pick_lon                   8576 non-null   float64
 10  drop_lat                   8576 non-null   float64
 11  drop_lon                   8576 non-null   float64
 12  fare                       8576 non-null   float64
dtypes: float64(7), int64(4), object(2)
memory usage:

In [0]:
test.shape

(8576, 13)

## Cleaning the Data

In [0]:
## cleaning weka added unnecessary values

train = train.replace({'?': np.nan})


In [0]:
## converting label values into 0,1 instead of correct, incorrect

train.label = train.label.map(dict(correct=1, incorrect=0))

In [0]:
## check for missing values in the dataset

# train.isna().head()
train.isna().sum()

tripid                         0
additional_fare              196
duration                     196
meter_waiting                196
meter_waiting_fare           196
meter_waiting_till_pickup    196
pickup_time                    0
drop_time                      0
pick_lat                       0
pick_lon                       0
drop_lat                       0
drop_lon                       0
fare                         133
label                          0
dtype: int64

In [0]:
test.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
dtype: int64

In [0]:
## fill missing values

imputer = KNNImputer(n_neighbors=5, weights='uniform')
train.iloc[:,[1,2,3,4,5,12]] = imputer.fit_transform(train.iloc[:,[1,2,3,4,5,12]])


In [0]:
train.isna().sum()

tripid                       0
additional_fare              0
duration                     0
meter_waiting                0
meter_waiting_fare           0
meter_waiting_till_pickup    0
pickup_time                  0
drop_time                    0
pick_lat                     0
pick_lon                     0
drop_lat                     0
drop_lon                     0
fare                         0
label                        0
dtype: int64

## Basic Feature Engineering

In [0]:
## Join train and test datasets in order to obtain the same number of features during categorical conversion
train_len = len(train)
dataset = pd.concat(objs=[train, test], axis=0).reset_index(drop=True)


## Advanced Feature Engineering

In [0]:
## new feature columns
trip_fare = []
trip_duration = []
trip_distance = []

distance_to_duration = []
fare_to_duration = []
fare_to_distance = []
waiting_fare_to_waiting_duration = []

duration_from_time = []
duration_error = []


In [0]:
## iterate through each row in the dataset
for row in dataset.itertuples():
    fare = float(row.fare)
    additional_fare = float(row.additional_fare)
    meter_waiting_fare = float(row.meter_waiting_fare)
    
    meter_waiting = row.meter_waiting
    meter_waiting_till_pickup = row.meter_waiting_till_pickup
    duration = row.duration
    
    pick_lat = row.pick_lat
    pick_lon = row.pick_lon
    drop_lat = row.drop_lat
    drop_lon = row.drop_lon
    
    pickup_time = row.pickup_time.replace('\'', '')
    drop_time = row.drop_time.replace('\'', '')
    pickup_time = pickup_time.split('/')
    drop_time = drop_time.split('/')
    
    # total fare for the trip
    cur_trip_fare = fare - (additional_fare + meter_waiting_fare)
    trip_fare.append(cur_trip_fare)
    
    # trip duration
    cur_trip_duration = duration - meter_waiting - meter_waiting_till_pickup
    trip_duration.append(cur_trip_duration)
    
    # trip distance
    cur_trip_distance = float(great_circle((drop_lat, drop_lon),(pick_lat, pick_lon)).kilometers)
    trip_distance.append(cur_trip_distance)
    
    # trip distance to duration
    try:
        cur_dist_to_dur = cur_trip_distance / cur_trip_duration
    except ZeroDivisionError:
        cur_dist_to_dur  = 0
    distance_to_duration.append(cur_dist_to_dur)
    
    # trip fare to trip duration
    try:
        cur_fare_to_duration = cur_trip_fare / cur_trip_duration
    except ZeroDivisionError:
        cur_fare_to_duration  = 0
    fare_to_duration.append(cur_fare_to_duration)
    
    # trip fare to trip distance
    try:
        cur_fare_to_distance = cur_trip_fare / cur_trip_distance
    except ZeroDivisionError:
        cur_fare_to_distance  = 0
    fare_to_distance.append(cur_fare_to_distance)
    
    # trip waiting fare to waiting duration
    try:
        cur_waiting_fare_to_waiting_dur = meter_waiting_fare / meter_waiting
    except ZeroDivisionError:
        cur_waiting_fare_to_waiting_dur  = 0
    waiting_fare_to_waiting_duration.append(cur_waiting_fare_to_waiting_dur)
    
    # time duration
    # pickup time
    month, day, y_hm = int(pickup_time[0]), int(pickup_time[1]), pickup_time[2].split(' ')
    year, h_m = int(y_hm[0]), y_hm[1].split(':')
    hour, minute = int(h_m[0]), int(h_m[1])
    pickup_time = datetime.datetime(year, month, day, hour, minute)
    
    # drop time
    month, day, y_hm = int(drop_time[0]), int(drop_time[1]), drop_time[2].split(' ')
    year, h_m = int(y_hm[0]), y_hm[1].split(':')
    hour, minute = int(h_m[0]), int(h_m[1])
    drop_time = datetime.datetime(year, month, day, hour, minute)
    
    # duration
    cur_duration_from_time = (drop_time - pickup_time).seconds
    duration_from_time.append(cur_duration_from_time)
    duration_error.append(cur_duration_from_time - duration)


In [0]:
## append new columns to dataset
dataset['trip_fare'] = trip_fare
dataset['trip_duration'] = trip_duration
dataset['trip_distance'] = trip_distance
dataset['distance_to_duration'] = distance_to_duration
dataset['fare_to_duration'] = fare_to_duration
dataset['fare_to_distance'] = fare_to_distance
dataset['waiting_fare_to_waiting_duration'] = waiting_fare_to_waiting_duration
dataset['duration_from_time'] = duration_from_time
dataset['duration_error'] = duration_error


In [0]:
## few more new features

In [0]:
## column average values
avg_fare = dataset['fare'].mean()
avg_trip_fare = dataset['trip_fare'].mean()
avg_waiting_fare = dataset['meter_waiting_fare'].mean()
avg_fare_to_distance = dataset['fare_to_distance'].mean()
avg_waiting_fare_to_duration = dataset['waiting_fare_to_waiting_duration'].mean()

In [0]:
## new columns
is_invalid_total_fare = []

div_from_avg_total_fare = []
div_from_avg_trip_fare = []
div_from_avg_waiting_fare = []

div_from_avg_fare_to_distance = []
div_from_avg_waiting_fare_to_waiting_duration = []

In [0]:
## iterate through each row in the dataset
for row in dataset.itertuples():
    fare = float(row.fare)
    trip_fare = float(row.trip_fare)
    meter_waiting_fare = float(row.meter_waiting_fare)
    additional_fare = float(row.additional_fare)
    
    fare_to_distance = float(row.fare_to_distance)
    waiting_fare_to_waiting_duration = float(row.waiting_fare_to_waiting_duration)
    
    # total fare less than or equal to zero
    if(fare-meter_waiting_fare-additional_fare <= 0):
        is_invalid_total_fare.append(1)
    else:
        is_invalid_total_fare.append(0)
        
    # diviation from average
    div_from_avg_total_fare.append(fare - avg_fare)
    div_from_avg_trip_fare.append(trip_fare - avg_trip_fare)
    div_from_avg_waiting_fare.append(meter_waiting_fare - avg_waiting_fare)
    div_from_avg_fare_to_distance.append(fare_to_distance - avg_fare_to_distance)
    div_from_avg_waiting_fare_to_waiting_duration.append(waiting_fare_to_waiting_duration - avg_waiting_fare_to_duration)


In [0]:
## append new columns to dataset
dataset['is_invalid_total_fare'] = is_invalid_total_fare
dataset['div_from_avg_total_fare'] = div_from_avg_total_fare
dataset['div_from_avg_trip_fare'] = div_from_avg_trip_fare
dataset['div_from_avg_waiting_fare'] = div_from_avg_waiting_fare
dataset['div_from_avg_fare_to_distance'] = div_from_avg_fare_to_distance
dataset['div_from_avg_waiting_fare_to_waiting_duration'] = div_from_avg_waiting_fare_to_waiting_duration


## Feature Preprocessing

In [0]:
## Scaling values into 0-1 range

scaler = MinMaxScaler(feature_range=(0,1))
features = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'pick_lat',
    'pick_lon',
    'drop_lat',
    'drop_lon',
    'fare',
    
    'trip_fare',
    'trip_duration',
    'trip_distance',
    'distance_to_duration',
    'fare_to_duration',
    'fare_to_distance',
    'waiting_fare_to_waiting_duration', 
    'duration_from_time',
    'duration_error',
    
    'div_from_avg_total_fare',
    'div_from_avg_trip_fare',
    'div_from_avg_waiting_fare',
    'div_from_avg_fare_to_distance',
    'div_from_avg_waiting_fare_to_waiting_duration'
]

dataset[features] = scaler.fit_transform(dataset[features])


In [0]:
## Split into training and testing again

train = dataset[:train_len]
test = dataset[train_len:]
test.drop(labels=['label'], axis=1, inplace=True)

train['label'] = train['label'].astype(int)


In [0]:
## Drop unnecessary columns
labels_to_drop = [
    'tripid',
#     'pick_lat',
#     'pick_lon',
#     'drop_lat',
#     'drop_lon',
    'pickup_time',
    'drop_time'
]

train.drop(labels=labels_to_drop, axis=1, inplace=True)


## Feature Importance

In [0]:
y = train['label']
X = train.drop(labels=['label'], axis=1)

# apply SelectKBest class to extract top 20 best features
bestfeatures = SelectKBest(score_func=chi2, k='all')
fit = bestfeatures.fit(X, y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Specs', 'Score']  #naming the dataframe columns
print(featureScores.nlargest(30, 'Score'))  #print 10 best features


                                            Specs       Score
19                          is_invalid_total_fare  108.180246
3                              meter_waiting_fare   45.763162
22                      div_from_avg_waiting_fare   45.763162
2                                   meter_waiting   41.708268
17                             duration_from_time   26.766455
9                                            fare   16.321160
20                        div_from_avg_total_fare   16.321160
1                                        duration   15.022474
24  div_from_avg_waiting_fare_to_waiting_duration    9.070288
16               waiting_fare_to_waiting_duration    9.070288
6                                        pick_lon    7.465128
21                         div_from_avg_trip_fare    6.433426
10                                      trip_fare    6.433426
0                                 additional_fare    1.522073
15                               fare_to_distance    1.288787
23      

In [0]:
## Drop less important columns
labels_to_drop = [
#     'trip_duration',
#     'fare_to_duration',
#     'distance_to_duration',
    'drop_lon',
    'drop_lat',
#     'trip_distance',
#     'duration_error',
#     'pick_lat',
#     'pick_lon',
#     'meter_waiting_till_pickup',
#     'div_from_avg_fare_to_distance',
#     'fare_to_distance',
#     'additional_fare'
]

train.drop(labels=labels_to_drop, axis=1, inplace=True)


## Train and Save Sub Models

In [0]:
## function to calculate F1 score
def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true*y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall) / (precision + recall + K.epsilon())
    return f1_val


In [0]:
## Seperate train features and label

y = train['label']
X = train.drop(labels='label', axis=1)

## Spliting into training and testing
X_train, X_eval, y_train, y_eval = train_test_split(X, y, test_size=0.2, random_state=42)


In [0]:
# fit and save models
n_models = 5

# # sub model 1
# model1 = Sequential()
# model1.add(Dense(36, input_dim=23, activation='relu'))
# model1.add(Dense(18, activation='relu'))
# model1.add(Dense(8, activation='relu'))
# model1.add(Dense(1, activation='sigmoid'))
# model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])
# model1.fit(X_train, y_train, epochs=500)

# yhat1 = model1.predict_classes(X_eval, verbose=0)
# acc = accuracy_score(y_eval, yhat1)
# print('Model 1 Neural Net Accuracy: %.4f' % acc)

yhat1 = model1.predict(X_eval, verbose=0)


# sub model 2
model2 = RandomForestClassifier(
    bootstrap=True,
    max_depth=30,
    max_features='auto',
    min_samples_leaf=2,
    min_samples_split=12,
    n_estimators=1600
)
model2.fit(X_train, y_train)

# yhat2 = model2.predict(X_eval)
# acc = accuracy_score(y_eval, yhat2)
# print('Model 2 Random Forest Accuracy: %.4f' % acc)

yhat2 = model2.predict_proba(X_eval)


# sub model 3
model3 = XGBClassifier(
    scale_pos_weight = 1,
    n_jobs = -1,
    random_state = 42,
    n_estimators = 1000,
    max_depth = 15,
    # learning_rate = 0.1,
    # gamma = 0,
    # reg_alpha = 0.1,
    # reg_lambda = 0.1,
    # booster = 'gbtree',
    # tree_method = 'auto',
    # num_parallel_tree = 1
)
model3.fit(X_train, y_train)

# yhat3 = model3.predict(X_eval)
# acc = accuracy_score(y_eval, yhat3)
# print('Model 3 XGBoost Accuracy: %.4f' % acc)

yhat3 = model3.predict_proba(X_eval)


## Seperate Stacking Model (Meta Learner)

In [0]:
yhat1_list = []
for pred in yhat1:
    yhat1_list.append(pred[0])

yhat2_list = []
for pred in yhat2:
    yhat2_list.append(pred[1])

yhat3_list = []
for pred in yhat3:
    yhat3_list.append(pred[1])

# result_df = pd.DataFrame({'pred1': yhat1_list, 'pred2': yhat2, 'pred3': yhat3})

result_df = pd.DataFrame({'pred1': yhat1_list, 'pred2': yhat2_list, 'pred3': yhat3_list})


In [0]:
# X_eval['model1_pred'] = yhat1_list
# X_eval['model2_pred'] = yhat2
# X_eval['model3_pred'] = yhat3

X_eval['model1_pred'] = yhat1_list
X_eval['model2_pred'] = yhat2_list
X_eval['model3_pred'] = yhat3_list


In [0]:
# creating a meta learner

metal = RandomForestClassifier()
metal.fit(X_eval, y_eval)

# meta_l = Sequential()
# meta_l.add(Dense(3, input_dim=3, activation='relu'))
# #meta_l.add(Dense(2, activation='relu'))
# meta_l.add(Dense(1, activation='sigmoid'))
# meta_l.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1_score])
# meta_l.fit(result_df, y_eval, epochs=500)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
# evaluating meta learner

yhat_meta = metal.predict(X_eval)
acc = accuracy_score(y_eval, yhat_meta)
print('Meta Learner Accuracy: %.4f' % acc)
# Meta Learner Accuracy: 0.9618

# yhat_meta = meta_l.predict_classes(result_df, verbose=0)
# acc = accuracy_score(y_eval, yhat_meta)
# print('Meta Learner Accuracy: %.4f' % acc)
# # Meta Learner Accuracy: 0.9599


Meta Learner Accuracy: 1.0000


In [0]:
## cross val score for the complete train set

kfold = StratifiedKFold(n_splits=4, random_state=42, shuffle=False)

metal_cv = RandomForestClassifier()

rfc_scores = []
score = cross_val_score(metal_cv, X_eval, y_eval, cv=kfold, scoring='f1')
rfc_scores.append(score.mean())

print('% Accuracy :', round(score.mean()*100, 4))


% Accuracy : 97.8903


## Predicting and Preparing the Submission

In [0]:
trip_ids = test.tripid

labels_to_drop = [
    'tripid',
#     'pick_lat',
#     'pick_lon',
    'drop_lat',
    'drop_lon',
    'pickup_time',
    'drop_time',
#     'trip_distance',
#     'fare_to_duration',
#     'trip_duration',
#     'distance_to_duration',
#     'duration_error',
#     'meter_waiting_till_pickup',
#     'additional_fare'
]

test = test.drop(labels=labels_to_drop, axis=1)


In [0]:
# # predictions
# pred1 = model1.predict_classes(test, verbose=0)
# pred2 = model2.predict(test)
# pred3 = model3.predict(test)

# pred1_list = []
# for pred in pred1:
#     pred1_list.append(pred[0])

# # result_df = pd.DataFrame({'pred1': pred1_list, 'pred2': pred2, 'pred3': pred3})

# test['model1_pred'] = pred1_list
# test['model2_pred'] = pred2
# test['model3_pred'] = pred3

# # meta learner
# predictions = metal.predict(test)



# predictions
pred1 = model1.predict(test, verbose=0)
pred2 = model2.predict_proba(test)
pred3 = model3.predict_proba(test)

pred1_list = []
for pred in pred1:
    pred1_list.append(pred[0])

pred2_list = []
for pred in pred2:
    pred2_list.append(pred[1])

pred3_list = []
for pred in pred3:
    pred3_list.append(pred[1])

test['model1_pred'] = pred1_list
test['model2_pred'] = pred2_list
test['model3_pred'] = pred3_list

# meta learner
predictions = metal.predict(test)


In [0]:
output = pd.DataFrame({'tripid': trip_ids, 'prediction': predictions})
output.to_csv('160253h_submission_20.csv', index=False)

!cp '160253h_submission_20.csv' "drive/My Drive/ML/submissions/"

print('Completed!')


Completed!


model training.

remove outliers only (-Weka).

KNNImputer to impute missing values.

with feature engineering (added 15 new features)

datetime columns have dropped.

======================================

[Neural Net + RF + XGB]
[Meta Learner: RF]

======================================

Append output classes of level 1 learners back to dataset; -> input for the meta learner

submission: 160253h_submission_19.csv

score: 0.9781

======================================

Append output class probabilities of level 1 learners back to dataset; -> input for the meta learner

submission: 160253h_submission_20.csv

score: 0.9779
