<a href="https://colab.research.google.com/github/TaahirBhorat/Zindi-Hack-Yassir/blob/master/CatBoost_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Get everything set up

In [None]:
# Install CatBoost into the kernel's own environment and fetch the dataset.
%pip install -q catboost
# -N: on re-runs, only download again if the remote file is newer
# (avoids piling up data.zip.1, data.zip.2, ... copies).
!wget -q -N https://people.cs.uct.ac.za/~mshstu001/data.zip
# -o: overwrite already-extracted files instead of prompting on re-runs.
!unzip -q -o data.zip

[K     |████████████████████████████████| 64.8MB 46kB/s 
[?25h

Train a CatBoost model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor

# Load the trip data, derive calendar features from the trip start time,
# split into disjoint train / validation / test sets and fit a CatBoost
# regressor that predicts `trip_seconds`.
SEED = 42  # fixed seed so the data splits are reproducible across runs

print('reading data')
df = pd.read_csv('data.csv')

print('parsing timestamps')
# NOTE: `infer_datetime_format` is deprecated (and has no effect) in
# pandas >= 2.0; it is kept so older pandas versions still parse quickly.
timestamps = pd.to_datetime(df['trip_start_timestamp'], infer_datetime_format=True)

# Replace the raw timestamp columns with numeric components of the start time.
# NOTE(review): the end timestamp is presumably dropped because, together with
# the start time, it would reveal the target directly - confirm.
df = df.drop(columns=['trip_start_timestamp', 'trip_end_timestamp'])

df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.day
df['hour'] = timestamps.dt.hour
df['minute'] = timestamps.dt.minute

X = df.drop('trip_seconds', axis=1)
y = df['trip_seconds']

print('splitting into test, validation and training sets')
# First carve off a 1% hold-out, then halve *that hold-out* into validation
# and test sets. Splitting the full X, y a second time (as the code did
# before) makes validation/test overlap the training rows, so the reported
# scores would not be measured on unseen data.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, shuffle=True, test_size=0.01, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, shuffle=True, test_size=0.5, random_state=SEED
)
# Every row must land in exactly one of the three sets.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

# NOTE(review): learning_rate=1.0 is very aggressive for 20000 boosting
# iterations - confirm this is intentional.
model = CatBoostRegressor(
    loss_function='RMSE',
    iterations=20000,
    learning_rate=1.0,
    task_type='GPU'
)

print('training catboost model')
# The validation set is passed as eval_set; progress is logged every 200 iterations.
model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    verbose=200
)


reading data
parsing timestamps
splitting into test, validation and training sets
training catboost model
0:	learn: 891.2491754	test: 896.1737397	best: 896.1737397 (0)	total: 45.6ms	remaining: 15m 12s
200:	learn: 821.1139965	test: 822.9830972	best: 822.9830972 (200)	total: 6.2s	remaining: 10m 10s
400:	learn: 800.9464997	test: 803.1282032	best: 803.1282032 (400)	total: 12.2s	remaining: 9m 58s
600:	learn: 788.0645827	test: 789.1016697	best: 789.1016697 (600)	total: 18.3s	remaining: 9m 51s
800:	learn: 775.6870099	test: 775.8546697	best: 775.8546697 (800)	total: 24.5s	remaining: 9m 46s
1000:	learn: 766.2186385	test: 765.8766659	best: 765.8517242 (997)	total: 30.4s	remaining: 9m 37s
1200:	learn: 758.5750821	test: 757.2863277	best: 757.2863277 (1200)	total: 36.5s	remaining: 9m 31s
1400:	learn: 751.1297076	test: 750.0699356	best: 750.0506947 (1398)	total: 42.6s	remaining: 9m 25s
1600:	learn: 744.3639596	test: 742.3923875	best: 742.3923875 (1600)	total: 48.7s	remaining: 9m 19s
1800:	learn: 737

<catboost.core.CatBoostRegressor at 0x7fa1330cb710>

Test it on unseen data

In [None]:
# Side-by-side table of the true targets and the model's predictions for X_test.
pd.DataFrame(dict(actual=y_test, predicted=model.predict(X_test)))

Unnamed: 0,actual,predicted
2551013,540,480.925072
1104464,300,278.403658
1690628,2940,2820.261840
2156558,205,219.720484
1352615,300,268.913382
...,...,...
3766110,960,707.343314
881600,1045,843.027822
629818,420,356.245655
3589934,420,677.439508


In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

# Root-mean-squared error of the model's predictions on the test split.
test_mse = mean_squared_error(y_test, model.predict(X_test))
rms = sqrt(test_mse)
print(rms)

859.0637229441936


Train a default XGB regressor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb

# Load the trip data, derive calendar features from the trip start time,
# split into disjoint train / validation / test sets and fit an XGBoost
# regressor (default hyper-parameters, GPU histogram trees) on `trip_seconds`.
SEED = 42  # fixed seed so the data splits are reproducible across runs

print('reading data')
df = pd.read_csv('data.csv')

print('parsing timestamps')
# NOTE: `infer_datetime_format` is deprecated (and has no effect) in
# pandas >= 2.0; it is kept so older pandas versions still parse quickly.
timestamps = pd.to_datetime(df['trip_start_timestamp'], infer_datetime_format=True)

# Replace the raw timestamp columns with numeric components of the start time.
df = df.drop(columns=['trip_start_timestamp', 'trip_end_timestamp'])

df['year'] = timestamps.dt.year
df['month'] = timestamps.dt.month
df['day'] = timestamps.dt.day
df['hour'] = timestamps.dt.hour
df['minute'] = timestamps.dt.minute

X = df.drop('trip_seconds', axis=1)
y = df['trip_seconds']

print('splitting into test, validation and training sets')
# First carve off a 1% hold-out, then halve *that hold-out* into validation
# and test sets. Splitting the full X, y a second time (as the code did
# before) makes validation/test overlap the training rows.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, shuffle=True, test_size=0.01, random_state=SEED
)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, shuffle=True, test_size=0.5, random_state=SEED
)
# Every row must land in exactly one of the three sets.
assert len(X_train) + len(X_val) + len(X_test) == len(X)

print('training xgb model')

# NOTE(review): newer XGBoost releases deprecate `gpu_hist` / `gpu_id` in
# favour of `device='cuda'` - confirm against the installed version.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    tree_method='gpu_hist',
    gpu_id=0
)

# Fitted on the training split only; the validation split is not used here.
model.fit(
    X_train, y_train,
)

reading data
parsing timestamps
splitting into test, validation and training sets
training xgb model


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, tree_method='gpu_hist',
             verbosity=1)