In [3]:
!pip install -q catboost

[K     |████████████████████████████████| 64.8MB 45kB/s 
[?25h

In [16]:
import pandas as pd
from sklearn.cluster import MiniBatchKMeans
import numpy as np
import os
from catboost import CatBoostRegressor
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error
from math import sqrt

IN_COLLAB = True
SUBMIT = True

if IN_COLLAB:
    files_directory = '/content/drive/My Drive/'
else:
    files_directory = ''


def pre_process(df):    
    StartTime = pd.to_datetime(df['Timestamp'], infer_datetime_format=True)
    
    df['Day_in_week'] = StartTime.dt.dayofweek
    # df['Day_in_year'] = StartTime.dt.dayofyear
    # df['Month'] = StartTime.dt.month
    df['Hour_in_Day'] = StartTime.dt.hour
    df = df.drop('Timestamp', axis=1)
    
    return df


def add_weather(trips_df, weather_df):
    trips_df['Timestamp'] = pd.to_datetime(trips_df['Timestamp'], infer_datetime_format=True)
    weather_df['date'] = pd.to_datetime(weather_df['date'], infer_datetime_format=True)
    
    trips_df['date'] = trips_df['Timestamp'].dt.date
    weather_df['date'] = weather_df['date'].dt.date
    
    df = pd.merge(trips_df, weather_df, how='left', on='date').drop('date', axis=1)
    return df


def clean_training_set(trips_df):
    return trips_df[(trips_df['Trip_distance'] / 1000) / (trips_df['ETA'] / (60 * 60)) <= 200]


def split_X_y(df):
    return df.drop(['ETA', 'ID'], axis=1), df['ETA']


train = pd.read_csv(os.path.join(files_directory, 'Train.csv'))
submission_test_set = pd.read_csv(os.path.join(files_directory, 'Test.csv'))
weather = pd.read_csv(os.path.join(files_directory, 'Weather.csv'))

train = train.sort_values('Timestamp', ascending=False)
train = clean_training_set(train)

submission_test_set = add_weather(submission_test_set, weather)

train = pre_process(train)
submission_test_set = pre_process(submission_test_set)

full_train = train.copy()

print('splitting into test, validation and training sets')
test = train.iloc[:8000]
train = train.iloc[8000:]

val = train.iloc[:8000]
train = train.iloc[8000:]

X_train, y_train = split_X_y(train)
X_val, y_val = split_X_y(val)
X_test, y_test = split_X_y(test)



splitting into test, validation and training sets


In [21]:
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=7000,
    grow_policy='Lossguide',
    bootstrap_type='Bayesian',
    max_leaves=120,
    task_type='GPU' if IN_COLLAB else 'CPU'
)

if not SUBMIT:
    print('training catboost model')
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        verbose=200
    )
    
    rms = sqrt(mean_squared_error(y_test, model.predict(X_test)))
    print('test score: ', rms, 'over', X_test.shape[0], 'test samples')
    print('\nWARNING: NO SUBMISSION CSV WRITTEN')

else:
    print('training catboost model on all data')
    model.fit(
        full_train.drop(['ID', 'ETA'], axis=1), full_train['ETA'],
        verbose=200
    )
    
    submission = pd.DataFrame({'ID': submission_test_set['ID'], 'ETA': model.predict(submission_test_set.drop('ID', axis=1))})
    submission.to_csv('submission.csv', index=False)
    print('\nSubmission CSV file written')


training catboost model on all data
Learning rate set to 0.039221
0:	learn: 544.8409169	total: 37.6ms	remaining: 4m 23s
200:	learn: 163.3989470	total: 6.77s	remaining: 3m 48s
400:	learn: 145.8252614	total: 13.6s	remaining: 3m 43s
600:	learn: 136.9249295	total: 20.4s	remaining: 3m 37s
800:	learn: 130.8659360	total: 27.1s	remaining: 3m 29s
1000:	learn: 125.9746321	total: 33.9s	remaining: 3m 23s
1200:	learn: 122.0813149	total: 40.7s	remaining: 3m 16s
1400:	learn: 118.7021283	total: 47.3s	remaining: 3m 9s
1600:	learn: 115.8589387	total: 53.9s	remaining: 3m 1s
1800:	learn: 113.3657736	total: 1m	remaining: 2m 54s
2000:	learn: 111.0563865	total: 1m 7s	remaining: 2m 47s
2200:	learn: 109.0081068	total: 1m 13s	remaining: 2m 40s
2400:	learn: 107.1460896	total: 1m 20s	remaining: 2m 34s
2600:	learn: 105.4255009	total: 1m 27s	remaining: 2m 27s
2800:	learn: 103.9383288	total: 1m 34s	remaining: 2m 20s
3000:	learn: 102.4489380	total: 1m 40s	remaining: 2m 14s
3200:	learn: 101.0285170	total: 1m 47s	remai