## 0. Setup


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
#from sklearn.naive_bayes import GaussianNB

from google.colab import drive  
# Make Google Drive visible to the Colab runtime under /content/gdrive.
drive.mount('/content/gdrive')

# Training data, one row per trip, indexed by the `tripid` column.
train_csv_path = '/content/gdrive/My Drive/Ride Fare/train.csv'
train_df = pd.read_csv(train_csv_path, index_col="tripid")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 2. Feature Preprocessing

In [None]:
# Integer codes for the two known label strings; any other value maps to 2.
label_codes = {'incorrect': 0, 'correct': 1}
train_df['label_encod'] = train_df['label'].map(lambda raw: label_codes.get(raw, 2))
# Not the last expression of the cell, so this result is computed but not displayed.
train_df['label_encod'].unique()
# The textual label is replaced by its encoded counterpart.
train_df.drop(columns=['label'], inplace=True)

def haversine_distance(row):
    """Great-circle (haversine) distance in kilometres between pickup and drop-off.

    Parameters
    ----------
    row : mapping
        Any object indexable by the keys ``'pick_lat'``, ``'pick_lon'``,
        ``'drop_lat'`` and ``'drop_lon'`` that yields coordinates in decimal
        degrees. A single DataFrame row (as passed by
        ``DataFrame.apply(axis=1)``) or a plain dict gives a scalar; a whole
        DataFrame gives an element-wise result because only NumPy ufuncs are
        applied to the looked-up values.

    Returns
    -------
    float or array-like
        Distance in km on a sphere of radius 6371 km. NaN coordinates
        propagate to a NaN distance.
    """
    lat_p, lon_p = row['pick_lat'], row['pick_lon']
    lat_d, lon_d = row['drop_lat'], row['drop_lon']
    radius = 6371  # km

    dlat = np.radians(lat_d - lat_p)
    dlon = np.radians(lon_d - lon_p)

    half_dlat_sin = np.sin(dlat / 2)
    half_dlon_sin = np.sin(dlon / 2)
    a = (half_dlat_sin * half_dlat_sin
         + np.cos(np.radians(lat_p)) * np.cos(np.radians(lat_d)) * half_dlon_sin * half_dlon_sin)

    # Mathematically 0 <= a <= 1, but rounding can leave it a hair outside that
    # range (e.g. for antipodal points), which would turn the square roots
    # below into NaN. np.minimum/np.maximum keep genuine NaN inputs as NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))

    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    distance = radius * c

    return distance

# haversine_distance only looks columns up by name and applies NumPy ufuncs,
# so handing it the whole frame computes every trip in one vectorised pass
# instead of one Python call per row through DataFrame.apply(axis=1).
train_df['distance'] = haversine_distance(train_df)

# The raw coordinates are not among the model's numeric columns, so they are
# removed once the distance has been derived from them.
train_df = train_df.drop(columns=['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon'])

# NOTE(review): `dt` is not referenced in the visible cells (`.dt` below is the
# pandas datetime accessor, not this module); kept in case other cells use it.
import datetime as dt

train_df['pickup_time'] = pd.to_datetime(train_df['pickup_time'])
train_df['drop_time'] = pd.to_datetime(train_df['drop_time'])

# Elapsed time between pickup and drop-off, in seconds. It is used only to
# back-fill trips whose recorded `duration` is missing.
train_elapsed_seconds = (train_df['drop_time'] - train_df['pickup_time']).dt.total_seconds()
train_df['duration'] = train_df['duration'].fillna(train_elapsed_seconds)

# The timestamps have served their purpose and are not model inputs.
train_df = train_df.drop(columns=['pickup_time', 'drop_time'])
# Separate the encoded target from the model inputs; the target is kept as a
# one-column DataFrame sharing the `tripid` index.
target_col = 'label_encod'
features_df = train_df.drop(columns=[target_col])
labels_df = train_df[target_col].to_frame()

## 3. Model Creation and Training


In [None]:
# Columns fed to the model; every other column is discarded by the
# ColumnTransformer below (remainder="drop").
numeric_cols = [
    'additional_fare',
    'duration',
    'meter_waiting',
    'meter_waiting_fare',
    'meter_waiting_till_pickup',
    'fare',
    'distance',
]

# NOTE(review): scaling runs before imputation, so missing values reach the
# scaler; this presumably relies on StandardScaler passing NaNs through —
# confirm against the installed scikit-learn version.
scale_then_impute = [
    ('standard_scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy='mean')),
]
numeric_preprocessing_steps = Pipeline(steps=scale_then_impute)

numeric_branch = ('num', numeric_preprocessing_steps, numeric_cols)
preprocessor = ColumnTransformer(transformers=[numeric_branch], remainder="drop")

In [None]:
from sklearn.ensemble import StackingClassifier

# Fixed seed for the stochastic base learners so the hold-out score and the
# submission can be reproduced after a kernel restart. The value mirrors the
# random_state passed to train_test_split in this notebook.
MODEL_SEED = 6

# Level-0 (base) learners; their cross-validated predictions are the inputs of
# the meta learner. The key 'dt' names a random forest; it is left unchanged
# because it is the estimator's identifier inside the stack.
level0 = [
    ('xg', XGBClassifier(n_estimators=500, subsample=0.14, random_state=MODEL_SEED)),
    ('mlp', MLPClassifier(hidden_layer_sizes=(50, 100, 50), max_iter=1000, random_state=MODEL_SEED)),
    ('dt', RandomForestClassifier(n_estimators=100, max_features='log2', random_state=MODEL_SEED)),
]

# Level-1 (meta) learner that combines the base learners' outputs.
level1 = LogisticRegression(penalty="l2", C=3)

estimator = StackingClassifier(estimators=level0, final_estimator=level1, cv=10)

# Preprocessing and the stacked model travel together so that the same
# transformations are applied at fit and predict time.
fullPipe = Pipeline([
    ('preprocessor', preprocessor),
    ('model', estimator),
])

In [None]:
from sklearn.metrics import f1_score

# Stratified hold-out: 33% of the labelled trips are set aside for evaluation.
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=6,
)

# Train model (the target is flattened from a one-column frame to a 1-D array).
fullPipe.fit(X_train, y_train.to_numpy().ravel())

# Predictions for the held-out trips, re-attached to their trip ids.
preds = fullPipe.predict(X_eval)
y_preds = pd.DataFrame(preds, index=y_eval.index, columns=["label"])

holdout_f1 = f1_score(y_eval, y_preds)
print(holdout_f1)

0.9741362671298978


In [None]:
# Refit the whole pipeline on every labelled trip (no hold-out); the fitted
# pipeline is the cell's last expression, so its repr is displayed.
fullPipe.fit(features_df, labels_df.to_numpy().ravel())

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                            

## 4. Test Data Preprocessing


In [None]:
# Unlabelled trips to score, indexed by `tripid` like the training data.
test_df = pd.read_csv('/content/gdrive/My Drive/Ride Fare/test.csv', index_col="tripid")

# Same distance feature as for training, computed in one vectorised call
# (haversine_distance only indexes by column name and applies NumPy ufuncs).
test_df['distance'] = haversine_distance(test_df)
test_df = test_df.drop(columns=['pick_lat', 'pick_lon', 'drop_lat', 'drop_lon'])

test_df['pickup_time'] = pd.to_datetime(test_df['pickup_time'])
test_df['drop_time'] = pd.to_datetime(test_df['drop_time'])

# Back-fill missing durations with the elapsed pickup -> drop-off time in seconds.
test_elapsed_seconds = (test_df['drop_time'] - test_df['pickup_time']).dt.total_seconds()
test_df['duration'] = test_df['duration'].fillna(test_elapsed_seconds)

# Drop the timestamps as the training cell does, so the test frame carries the
# same columns the pipeline was fitted on.
test_df = test_df.drop(columns=['pickup_time', 'drop_time'])

## 5. Make Predictions


In [None]:
# Score the test trips with the fitted pipeline.
test_preds = fullPipe.predict(test_df)

sample_submission_path = '/content/gdrive/My Drive/Ride Fare/sample_submission.csv'
submission_df = pd.read_csv(sample_submission_path, index_col="tripid")

# Predictions are assigned positionally below, so the trip ids of both frames
# must line up exactly; fail loudly if they do not.
np.testing.assert_array_equal(test_df.index.to_numpy(), submission_df.index.to_numpy())

# Save predictions to submission data frame
submission_df['prediction'] = test_preds

submission_path = '/content/gdrive/My Drive/Ride Fare/my_submission_stack3.csv'
submission_df.to_csv(submission_path, index=True)