In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime as dt
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve, roc_auc_score


In [2]:
#load traing dataset
data=pd.read_csv("train.csv",index_col="tripid")
#load testing dataset
test=pd.read_csv("test.csv",index_col="tripid")

In [3]:
data.head()

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,pick_lat,pick_lon,drop_lat,drop_lon,fare,label
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
189123628,10.5,834.0,56.0,0.0,64.0,11/1/2019 0:20,11/1/2019 0:34,6.86252,79.8993,6.9033,79.8783,270.32,correct
189125358,10.5,791.0,47.0,0.0,134.0,11/1/2019 0:56,11/1/2019 1:09,6.88589,79.8984,6.91373,79.8923,197.85,correct
189125719,10.5,1087.0,80.0,0.0,61.0,11/1/2019 1:08,11/1/2019 1:26,6.90839,79.8651,6.93669,79.9146,301.64,correct
189127273,10.5,598.0,271.0,15.6638,68.0,11/1/2019 2:27,11/1/2019 2:37,6.9257,79.8895,6.92748,79.8971,82.3,correct
189128020,,,,,,11/1/2019 3:34,11/1/2019 3:51,6.87441,79.8615,6.84478,79.929,358.39,correct


In [4]:
#fill NaN values in duration column-traindata
data['CalculatedDuration'] = (pd.to_datetime(data['drop_time'])-pd.to_datetime(data['pickup_time'])).dt.total_seconds()
data['duration'] = data.apply(lambda row: row['CalculatedDuration'] if np.isnan(row['duration']) else row['duration'], axis=1)
data=data.drop(data[['CalculatedDuration','pickup_time','drop_time']], axis=1)

In [5]:
#fill NaN values in duration column-testdata
test['CalculatedDuration'] = (pd.to_datetime(test['drop_time'])-pd.to_datetime(test['pickup_time'])).dt.total_seconds()
test['duration'] = test.apply(lambda row: row['CalculatedDuration'] if np.isnan(row['duration']) else row['duration'],axis=1)
test =  test.drop('CalculatedDuration', axis=1)

In [6]:
#define function for calculate harversine distance
def haversine_distance_calc(row):
    lat_p, lon_p, lat_d, lon_d = row['pick_lat'], row['pick_lon'], row['drop_lat'], row['drop_lon'] 
    dlat = np.radians(lat_d - lat_p)
    dlon = np.radians(lon_d - lon_p)
    a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat_p)) * np.cos(np.radians(lat_d)) * np.sin(dlon/2) * np.sin(dlon/2)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    r = 6371
    return r*c 
#add harversine distance column,drop 'pick_lat','pick_lon','drop_lat','drop_lon' columns
data['distance'] = data.apply(haversine_distance_calc, axis = 1)
data = data.drop(data[['pick_lat','pick_lon','drop_lat','drop_lon']], axis=1)

test['distance'] = test.apply(haversine_distance_calc, axis = 1)
test=  test.drop(test[['pick_lat','pick_lon','drop_lat','drop_lon']], axis=1)



In [7]:
#define feature_df,Labels_df,test_df
features_df=data.drop('label',axis=1)
labels_df=data['label'].replace(['correct','incorrect'],[1,0])
test_features_df=test

In [8]:
features_df.head(10)

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
189123628,10.5,834.0,56.0,0.0,64.0,270.32,5.09277
189125358,10.5,791.0,47.0,0.0,134.0,197.85,3.168058
189125719,10.5,1087.0,80.0,0.0,61.0,301.64,6.305395
189127273,10.5,598.0,271.0,15.6638,68.0,82.3,0.861946
189128020,,1020.0,,,,358.39,8.147782
189129552,10.5,3407.0,182.0,0.0,112.0,1065.02,24.207039
189132829,10.5,1246.0,487.0,0.0,133.0,266.62,4.777624
189135103,10.5,1333.0,295.0,17.1985,212.0,318.05,5.322544
189139296,10.5,360.0,80.0,4.664,3.0,100.32,1.035302
189138671,10.5,1539.0,588.0,33.9864,43.0,257.89,2.930715


In [9]:
labels_df.head(10)

tripid
189123628    1
189125358    1
189125719    1
189127273    1
189128020    1
189129552    1
189132829    1
189135103    1
189139296    1
189138671    1
Name: label, dtype: int64

In [10]:
test_features_df.head(10)

Unnamed: 0_level_0,additional_fare,duration,meter_waiting,meter_waiting_fare,meter_waiting_till_pickup,pickup_time,drop_time,fare,distance
tripid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
213284604,10.5,924,42,2.4486,148,2/1/2020 0:38,2/1/2020 0:53,289.27,6.705702
213286352,10.5,4249,20,0.0,91,2/1/2020 1:02,2/1/2020 2:13,1912.7,41.558513
213293973,10.5,1552,255,2.6588,23,2/1/2020 5:02,2/1/2020 5:28,394.0,5.916678
213294622,10.5,462,16,0.0,198,2/1/2020 5:30,2/1/2020 5:38,154.32,3.301761
213298687,10.5,814,392,12.3692,69,2/1/2020 7:00,2/1/2020 7:14,147.47,2.588542
213299545,10.5,2495,351,16.5308,9,2/1/2020 7:13,2/1/2020 7:55,1156.97,17.247478
213302332,10.5,1108,454,23.9292,43,2/1/2020 7:47,2/1/2020 8:05,196.81,3.132721
213302671,10.5,2737,320,18.496,17,2/1/2020 7:48,2/1/2020 8:33,688.43,11.556896
213305594,10.5,1154,29,0.0,130,2/1/2020 8:11,2/1/2020 8:30,288.77,6.45878
213305134,10.5,1372,277,16.046498,63,2/1/2020 8:12,2/1/2020 8:35,199.57,4.218669


In [11]:
#separate numerical value columns and none numerical columns
numeric_cols = features_df.columns[features_df.dtypes != "object"].values

not_numeric_cols =features_df.columns[features_df.dtypes == "object"].values

In [12]:
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean'))
])

categorical_features = not_numeric_cols

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])


# create the preprocessor stage of final pipeline
# each entry in the transformer list is a tuple of
preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols), 
        ('cat', categorical_transformer, categorical_features)
    ]
)

In [13]:
X_train, X_eval, y_train, y_eval = train_test_split(features_df, labels_df, test_size=0.33, shuffle=True, stratify=labels_df, random_state=6)

# define evaluate model
evaluate_models = (('xg',XGBClassifier(n_estimators=550,subsample=0.14)),
                    ('mlp',MLPClassifier(hidden_layer_sizes=(50,100,50), max_iter=1000)))

# define meta learner model
meta_model = LogisticRegression(penalty="l2", C=3)
#define stack model 
stack = StackingClassifier(estimators=evaluate_models, final_estimator=meta_model, cv=10)

full_Pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', stack)])


# Train model
full_Pipeline.fit(X_train, np.ravel(y_train))



Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                 

In [14]:
preds = full_Pipeline.predict(X_eval)
y_preds = pd.DataFrame({"label": preds},index = y_eval.index)

In [15]:
print(f1_score(y_eval, y_preds))

0.9733759318423856


In [16]:
roc_auc_score(y_eval, y_preds)

0.822124568839469

In [17]:
test_probas = full_Pipeline.predict(test_features_df)


submission_df = pd.read_csv("sample_submission.csv", index_col="tripid")

# check rows in the same order
np.testing.assert_array_equal(test_features_df.index.values, submission_df.index.values)





In [18]:
# Save predictions to submission data frame
submission_df["prediction"] = test_probas

submission_df.to_csv('ml_submissionfinal.csv', index=True)

In [19]:
submission_df.head()

Unnamed: 0_level_0,prediction
tripid,Unnamed: 1_level_1
213284604,1
213286352,0
213293973,1
213294622,1
213298687,1
