In [0]:
# Mount Google Drive into the Colab filesystem under /content/drive/ so the
# files stored there can be reached through ordinary paths.
from google.colab import drive
drive.mount('/content/drive/')

In [2]:
cd "/content/drive/My Drive/ML"

/content/drive/My Drive/ML


In [0]:
from pathlib import Path

import numpy as np
import pandas as pd

# Notebook-wide settings.
RANDOM_SEED = 8  # fixed seed so repeated runs are reproducible
pd.options.display.max_columns = 100  # show up to 100 columns when displaying frames

In [0]:
!pip3 install catboost
from catboost import CatBoostClassifier

In [0]:
# Folder on the mounted Drive that holds the CSV files.
# Joining an absolute path onto a Path discards the left-hand side, so the
# previous DATA_PATH value was never actually used; DATA_PATH now names the
# real directory and the file names are joined onto it.
DATA_PATH = Path("/content/drive/My Drive/ML")

# Both files are indexed by their "tripid" column.
train_df = pd.read_csv(DATA_PATH / "train.csv", index_col="tripid")
test_features_df = pd.read_csv(DATA_PATH / "test.csv", index_col="tripid")

In [0]:
# Separate the training frame into model inputs and the target column.
# labels_df stays a one-column DataFrame (not a Series).
features_df = train_df.drop(columns=["label"])
labels_df = train_df[["label"]]

In [0]:
# Encode the string labels as integers: "correct" -> 1, "incorrect" -> 0.
labels_df = (
    labels_df
    .replace("correct", 1)
    .replace("incorrect", 0)
)

In [0]:
# Fill missing values with the per-column mean of the same frame.
# numeric_only=True restricts the mean to numeric columns: at this point the
# frames still contain the raw pickup_time / drop_time columns, and averaging
# non-numeric columns raises a TypeError on pandas >= 2.0. Columns without a
# mean are simply left unfilled, as before.
features_df = features_df.fillna(features_df.mean(numeric_only=True))
test_features_df = test_features_df.fillna(test_features_df.mean(numeric_only=True))

In [0]:
def distance(lat1, lon1, lat2, lon2):
    """Haversine-style great-circle distance between two coordinate pairs.

    Inputs are latitudes/longitudes in degrees; scalars and NumPy/pandas
    array-likes are both accepted because only NumPy ufuncs and arithmetic
    are applied. The result is scaled by 12742 (Earth's diameter in km) and
    0.6213712 (km -> miles), i.e. it is expressed in miles.
    """
    deg_to_rad = 0.017453292519943295  # Pi/180
    miles_per_km = 0.6213712
    earth_diameter_km = 12742

    lat_term = np.cos((lat2 - lat1) * deg_to_rad) / 2
    lon_term = (
        np.cos(lat1 * deg_to_rad)
        * np.cos(lat2 * deg_to_rad)
        * (1 - np.cos((lon2 - lon1) * deg_to_rad))
        / 2
    )
    hav = 0.5 - lat_term + lon_term
    return miles_per_km * earth_diameter_km * np.arcsin(np.sqrt(hav))

In [0]:
# Add the pickup -> drop-off distance as a feature to both the training and
# the test frame.
for frame in (features_df, test_features_df):
    frame["distance"] = distance(
        frame["pick_lat"],
        frame["pick_lon"],
        frame["drop_lat"],
        frame["drop_lon"],
    )

In [0]:
# Parse the trip timestamps of the training frame and derive calendar features.
# pd.to_datetime replaces the astype('datetime64[m]') cast, which asks for a
# resolution that recent pandas releases do not support; flooring to the
# minute mirrors the minute resolution that cast requested.
for prefix in ("pickup", "drop"):
    stamps = pd.to_datetime(features_df[f"{prefix}_time"]).dt.floor("min")
    features_df[f"{prefix}_time"] = stamps
    # Vectorised .dt accessors instead of per-row Python list comprehensions.
    features_df[f"{prefix}_hour"] = stamps.dt.hour
    features_df[f"{prefix}_minute"] = stamps.dt.minute
    features_df[f"{prefix}_day"] = stamps.dt.day

# Trip duration with the metered waiting time subtracted.
features_df["effective_time"] = features_df["duration"] - features_df["meter_waiting"]

In [0]:
# Parse the trip timestamps of the test frame and derive calendar features.
# pd.to_datetime replaces the astype('datetime64[m]') cast, which asks for a
# resolution that recent pandas releases do not support; flooring to the
# minute mirrors the minute resolution that cast requested.
for prefix in ("pickup", "drop"):
    stamps = pd.to_datetime(test_features_df[f"{prefix}_time"]).dt.floor("min")
    test_features_df[f"{prefix}_time"] = stamps
    # Vectorised .dt accessors instead of per-row Python list comprehensions.
    test_features_df[f"{prefix}_hour"] = stamps.dt.hour
    test_features_df[f"{prefix}_minute"] = stamps.dt.minute
    test_features_df[f"{prefix}_day"] = stamps.dt.day

# Trip duration with the metered waiting time subtracted.
test_features_df["effective_time"] = test_features_df["duration"] - test_features_df["meter_waiting"]

In [13]:
# List the dtype of each column in features_df.
features_df.dtypes

additional_fare                     float64
duration                            float64
meter_waiting                       float64
meter_waiting_fare                  float64
meter_waiting_till_pickup           float64
pickup_time                  datetime64[ns]
drop_time                    datetime64[ns]
pick_lat                            float64
pick_lon                            float64
drop_lat                            float64
drop_lon                            float64
fare                                float64
distance                            float64
pickup_hour                           int64
pickup_minute                         int64
pickup_day                            int64
drop_hour                             int64
drop_minute                           int64
drop_day                              int64
effective_time                      float64
dtype: object

In [14]:
# Display the (rows, columns) dimensions of features_df.
features_df.shape

(17176, 20)

In [0]:
# CatBoost classifier configured for 500000 boosting iterations.
# RANDOM_SEED is passed explicitly: the constant is declared for
# reproducibility, but it has no effect unless it is handed to the model.
clf = CatBoostClassifier(iterations=500000, random_seed=RANDOM_SEED)

In [0]:
%%time 

# train on full dataset
clf.fit(features_df, labels_df.values.ravel())

None   # So we don't print out the whole pipeline representation

In [0]:
# Run the fitted classifier on the test features.
preds1 = clf.predict(test_features_df)

In [0]:
# Load the submission template, indexed by "tripid".
# The location is given directly: joining an absolute path onto DATA_PATH
# makes pathlib discard DATA_PATH, so that prefix never contributed anything.
submission_df = pd.read_csv(
    Path("/content/drive/My Drive/ML") / "sample_submission.csv",
    index_col="tripid",
)

In [0]:
# Make sure we have the rows in the same order
np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)

In [0]:
# Store the predictions in the submission frame's "prediction" column.
# The values are assigned by position, which is why the row order of the test
# frame and the submission frame has to agree.
submission_df["prediction"] = preds1

In [0]:
# Write the submission to my_submission.csv, relative to the current working
# directory; index=True keeps the frame's index as a column in the file.
submission_df.to_csv('my_submission.csv', index=True)