In [0]:
from google.colab import drive
drive.mount('/content/drive/')

In [22]:
cd "/content/drive/My Drive/ML"

/content/drive/My Drive/ML


In [0]:
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.utils import resample
from sklearn.model_selection import cross_val_score

RANDOM_SEED = 8    # Set a random seed for reproducibility!
pd.set_option("display.max_columns", 100)

In [0]:
DATA_PATH = Path.cwd().parent / "data" / "final" / "public"

train_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/ML/train.csv", 
    index_col="tripid"
)
test_features_df = pd.read_csv(
    DATA_PATH / "/content/drive/My Drive/ML/test.csv", 
    index_col="tripid"
)

In [0]:
labels_df = train_df[['label']]
features_df = train_df.drop('label', axis=1)

In [0]:
labels_df=labels_df.replace("correct", 1)
labels_df=labels_df.replace("incorrect", 0)

In [0]:
features_df = features_df.drop([ 'pickup_time', 'drop_time'], axis = 1)
test_features_df = test_features_df.drop([ 'pickup_time', 'drop_time'], axis = 1)

In [0]:
numeric_cols = features_df.columns[features_df.dtypes != "object"].values

In [0]:
numeric_preprocessing_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('standard_scaler', StandardScaler())])

preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)

In [0]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    labels_df,
    test_size=0.33,
    shuffle=True,
    stratify=labels_df,
    random_state=RANDOM_SEED
)

In [0]:
features_df_preprocess = pd.DataFrame(preprocessor.fit_transform(features_df))
features_df_preprocess.columns = features_df.columns

test_features_df_preprocess = pd.DataFrame(preprocessor.transform(test_features_df))
test_features_df_preprocess.columns = test_features_df.columns

In [0]:
X_train_preprocess = pd.DataFrame(preprocessor.transform(X_train))
X_train_preprocess.columns = X_train.columns

X_eval_preprocess = pd.DataFrame(preprocessor.transform(X_eval))
X_eval_preprocess.columns = X_eval.columns

In [0]:
clf = XGBClassifier(learning_rate =0.1, n_estimators=400, max_depth=10,
 min_child_weight=2, gamma=0.3, subsample=0.8, colsample_bytree=0.8, reg_alpha=0.000005, reg_lambda=1,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

In [48]:
%%time

# Train model
clf.fit(X_train_preprocess, y_train.values.ravel())

None   # don't print out the whole pipeline representation

CPU times: user 13.1 s, sys: 932 ms, total: 14 s
Wall time: 8.37 s


In [0]:
# Predict on evaluation set
preds = clf.predict(X_eval_preprocess)

In [50]:
y_preds = pd.DataFrame(
    {
        "label": preds,
    },
    index = y_eval.index
)
print("y_preds.shape:", y_preds.shape)
y_preds.head()

y_preds.shape: (5669, 1)


Unnamed: 0_level_0,label
tripid,Unnamed: 1_level_1
200328361,1
191127057,1
200136070,1
195614910,1
191488838,1


In [51]:
f1_score(y_eval, y_preds, average='macro')

0.8170736090722465

In [52]:
# retrain on full dataset
%%time 

clf.fit(features_df_preprocess, labels_df.values.ravel())

None   # So we don't print out the whole pipeline representation

CPU times: user 19.8 s, sys: 1.06 s, total: 20.9 s
Wall time: 12.4 s


In [0]:
preds1 = clf.predict(test_features_df_preprocess)

In [0]:
submission_df = pd.read_csv(DATA_PATH / "/content/drive/My Drive/ML/sample_submission.csv", 
                            index_col="tripid")

In [0]:
# Make sure we have the rows in the same order
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

In [0]:
# Save predictions to submission data frame
submission_df["prediction"] = preds1

In [0]:
submission_df.to_csv('my_submission.csv', index=True)