In [1]:
import numpy as np
import pandas as pd
import catboost
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score

In [2]:
df_train = pd.read_csv('1_data/train_data.csv')
df_test = pd.read_csv('1_data/test_data.csv')
df_train_target = pd.read_csv('1_data/train_target.csv')

In [3]:
def creare_features(df_list):
    df_tmp = []
    for el in df_list:
        df_tmp.append(el.copy())
   
    for df in df_tmp:
        
        # dummy feats
        df['is_dist'] = (df.dist != -1.0).astype(int)
        
        # time feat
        df['due_datetime'] = pd.to_datetime(df.due)
        df['day_of_week'] = df.due_datetime.dt.dayofweek
        df['hour'] = df.due_datetime.dt.hour
        
        df['time_as_str'] = df.due.apply(lambda x: x[x.find(' ') + 1:-4])
        df['due_timedelta'] = pd.to_timedelta(df.time_as_str)
        df['total_seconds'] = df.due_timedelta.dt.total_seconds()
        df['total_minutes'] = df.total_seconds // 60
        
        df.fillna({
            'f_class': 'nan',
            's_class': 'nan',
            't_class': 'nan'
        }, inplace=True)
        
        df.drop(['time_as_str', 'due_timedelta', 'due_datetime', 'due'], inplace=True, axis=1)
    
    one_hot_columns = ['f_class', 's_class', 't_class', 'day_of_week']
    for column in one_hot_columns:
        df_tmp[0][column] = df_tmp[0][column].astype('category')
        df_tmp[1][column] = df_tmp[1][column].astype('category')
        df_tmp[1][column] = (
            df_tmp[1][column]
            .cat
            .set_categories(df_tmp[0][column].cat.categories)
        )
    dfs = []
    for df in df_tmp:
        df = pd.get_dummies(df)
        dfs.append(df)
    
    return dfs

In [4]:
df_train_feat, df_test_feat = creare_features([df_train, df_test])

In [5]:
df_train_feat.shape

(1187461, 26)

In [6]:
df_train_feat.columns

Index(['dist', 'lat', 'lon', 'is_dist', 'hour', 'total_seconds',
       'total_minutes', 'f_class_business', 'f_class_econom', 'f_class_nan',
       'f_class_vip', 's_class_business', 's_class_econom', 's_class_nan',
       's_class_vip', 't_class_business', 't_class_econom', 't_class_nan',
       't_class_vip', 'day_of_week_0', 'day_of_week_1', 'day_of_week_2',
       'day_of_week_3', 'day_of_week_4', 'day_of_week_5', 'day_of_week_6'],
      dtype='object')

In [7]:
params = {
    'iterations': 2000,
    'loss_function': "Logloss",
    'thread_count': 8,
    'depth': 5, 
    'learning_rate': 0.1, 
    'random_state': 0,
    'eval_metric': 'AUC',
}

In [8]:
train_cols = list(df_train_feat.columns)
cat_features = []

In [9]:
train_pool = catboost.Pool(
    df_train_feat.loc[:, train_cols],
    label=df_train_target.target,
    cat_features=cat_features
)

In [10]:
model = catboost.CatBoost(params)
model.fit(
    train_pool, 
    eval_set=train_pool,
    verbose=100,
)

print(model.learning_rate_)
model.get_feature_importance(train_pool, prettified=True).head(50)

0:	test: 0.5996243	best: 0.5996243 (0)	total: 157ms	remaining: 5m 14s
100:	test: 0.6446788	best: 0.6446788 (100)	total: 11.1s	remaining: 3m 28s
200:	test: 0.6492137	best: 0.6492137 (200)	total: 22s	remaining: 3m 17s
300:	test: 0.6517526	best: 0.6517526 (300)	total: 33.5s	remaining: 3m 9s
400:	test: 0.6536391	best: 0.6536391 (400)	total: 44.8s	remaining: 2m 58s
500:	test: 0.6550691	best: 0.6550691 (500)	total: 56.4s	remaining: 2m 48s
600:	test: 0.6564732	best: 0.6564732 (600)	total: 1m 10s	remaining: 2m 43s
700:	test: 0.6578976	best: 0.6578976 (700)	total: 1m 23s	remaining: 2m 34s
800:	test: 0.6590189	best: 0.6590189 (800)	total: 1m 37s	remaining: 2m 25s
900:	test: 0.6600594	best: 0.6600594 (900)	total: 1m 50s	remaining: 2m 14s
1000:	test: 0.6610927	best: 0.6610927 (1000)	total: 2m 2s	remaining: 2m 2s
1100:	test: 0.6621146	best: 0.6621146 (1100)	total: 2m 15s	remaining: 1m 50s
1200:	test: 0.6631005	best: 0.6631005 (1200)	total: 2m 29s	remaining: 1m 39s
1300:	test: 0.6641254	best: 0.6641

Unnamed: 0,Feature Id,Importances
0,dist,28.871311
1,lon,22.37242
2,lat,21.261146
3,total_minutes,6.40401
4,total_seconds,5.729841
5,is_dist,3.446398
6,hour,2.063617
7,day_of_week_0,1.227531
8,s_class_nan,0.844484
9,f_class_vip,0.794046


In [11]:
test_pool = catboost.Pool(
    df_test_feat.loc[:, train_cols],
    cat_features=cat_features
)

In [12]:
sub = pd.read_csv('1_data/sample_submission.csv')

In [13]:
sub.target = model.predict(test_pool, prediction_type='Probability')[:, 1]

In [14]:
sub.to_csv('sub_1.csv', index=False)