In [1]:
# !pip install catboost

In [1]:
from catboost import CatBoostRanker, Pool, MetricVisualizer, cv

import pandas as pd

# Read data

In [2]:
# Data loading configuration.
USE_FULL = True          # False -> read only a random subset of rows (faster experiments)
SAMPLE_FRACTION = 0.1    # approximate share of rows kept when USE_FULL is False
SAMPLE_SEED = 42         # fixed seed so the subsample is identical on every run

filename = 'data/training_set_VU_DM.csv'

if USE_FULL:
    df = pd.read_csv(filename)
else:
    import random

    # A dedicated, seeded generator keeps the subsample reproducible under
    # "Restart & Run All" and independent of any other use of `random`.
    sampler = random.Random(SAMPLE_SEED)
    p = SAMPLE_FRACTION
    # Row 0 is the header and is always kept; every other row is kept with
    # probability p.
    # NOTE(review): rows are sampled independently of each other, so a search
    # (srch_id) can end up with only part of its rows in the sample.
    df = pd.read_csv(filename,
                     header=0,
                     skiprows=lambda i: i > 0 and sampler.random() > p)

# Parse timestamps so date_time can be used for chronological sorting later on.
df['date_time'] = pd.to_datetime(df['date_time'])
df.shape

(4958347, 54)

# Simple preprocessing + train/test split

In [3]:
# Columns passed to CatBoost as categorical features (see `cat_features=`
# in the Pool constructors).
CAT_FEATURES = [
    'site_id',
    'visitor_location_country_id',
    'prop_country_id',
    'prop_id',
    'srch_destination_id',
]

# Presumably the 0/1 flag columns of the dataset; this list is not referenced
# by any other visible cell - TODO confirm it is still needed.
bool_cols = [
    'prop_brand_bool',
    'promotion_flag',
    'srch_saturday_night_bool',
    'random_bool',
]

# Column whose values are used as the ranking group id (one group per search).
group_col = 'srch_id'


def get_target(row):
    """
    Map one row to its relevance grade.

    The row must expose ``booking_bool`` and ``click_bool`` attributes.

    Returns:
        5 if the row was booked (booking_bool > 0),
        1 if it was clicked but not booked (click_bool > 0),
        0 otherwise.
    """
    # Booking takes precedence; click_bool is only inspected when not booked.
    return 5 if row.booking_bool > 0 else (1 if row.click_bool > 0 else 0)

# Name of the relevance-label column added to df.
ranking_target = 'target'

# Vectorised equivalent of applying get_target to every row:
#   start with 1 where click_bool > 0 (else 0), then overwrite with 5 wherever
#   booking_bool > 0. A row-wise df.apply(..., axis=1) makes one Python call
#   per row, which is very slow on a frame of several million rows.
df[ranking_target] = (
    df['click_bool'].gt(0)
    .astype('int64')
    .mask(df['booking_bool'].gt(0), 5)
)

In [8]:
from utils import train_test_group_split

# Order rows by search id, then by timestamp, before splitting.
df.sort_values([group_col, 'date_time'], inplace=True)

# X is the full frame at this point; non-feature columns are removed after
# the split.
X = df
y = df[ranking_target]
groups = X[group_col]

# Group-aware split: 90% train, the remainder test.
(X_train, X_test,
 y_train, y_test,
 groups_train, groups_test) = train_test_group_split(
    X, y, groups,
    group_array=groups,
    train_size=0.9,
)
# Alternative considered: sklearn.model_selection.GroupShuffleSplit, but it
# ignores time.

# Columns removed from the model inputs: the label itself, the two flags it
# is derived from, plus position and gross_bookings_usd (presumably not
# available at prediction time - TODO confirm).
non_feature_cols = ['position', 'click_bool', 'gross_bookings_usd', 'booking_bool', ranking_target]

X_train = X_train.drop(columns=non_feature_cols)

# X_test itself keeps every column; only the copy handed to the pool is stripped.
test_pool = Pool(
    data=X_test.drop(columns=non_feature_cols),
    label=y_test,
    group_id=groups_test,
    cat_features=CAT_FEATURES,
)

In [9]:
# Shapes of every piece produced by the split, in the order
# (X_train, X_test, y_train, y_test, groups_train, groups_test).
tuple(part.shape for part in (X_train, X_test, y_train, y_test, groups_train, groups_test))

((4462491, 50), (495856, 55), (4462491,), (495856,), (4462491,), (495856,))

In [11]:
# Leak check: no search id may occur on both sides of the split.
# Asserting (instead of only displaying) makes a broken split stop a
# "Run All" execution rather than pass unnoticed.
shared_groups = set(X_train[group_col]).intersection(X_test[group_col])
assert not shared_groups, (
    f'{len(shared_groups)} {group_col} value(s) appear in both train and test'
)
shared_groups

set()

In [12]:
# Training pool: features, relevance labels and the per-row search id that
# groups rows into ranking queries.
train_pool = Pool(
    data=X_train,
    label=y_train,
    group_id=groups_train,
    cat_features=CAT_FEATURES,
)

# Cross-validation

In [42]:
# %%time

# params = {
#     "iterations": 1000,
#     #           'loss_function': 'NDCG:top=5',
#     'loss_function': 'QueryRMSE',
#     'custom_metric': [
#         'NDCG:top=5;type=Base;denominator=LogPosition'
#         #                       , 'PFound', 'AverageGain:top=10'
#     ],
#     "verbose": False,
#     'early_stopping_rounds': 50,
#     #     'logging_level': 'Silent',
# }

# cv_results, fitted_models = cv(train_pool, params, nfold=3, type='TimeSeries', plot=True,
#                                return_models=True)

In [43]:
# cv_results

In [46]:
# model = fitted_models[-1]

In [44]:
# import matplotlib.pyplot as plt

# for method in ['LossFunctionChange', 'PredictionValuesChange']:
#     fi = model.get_feature_importance(test_pool, type=method)
#     feature_score_raw = pd.DataFrame(list(zip(X_test.columns, fi)),
#                                      columns=['Feature', 'Score'])

#     feature_score = feature_score_raw.sort_values(
#         by='Score', ascending=False)

#     plt.rcParams["figure.figsize"] = (7, len(feature_score) / 4)
#     ax = feature_score.plot('Feature', 'Score', kind='barh', color='c')
#     ax.set_title("Feature Importance using {}".format(method), fontsize=14)
#     ax.set_xlabel("features")
#     ax.invert_yaxis()
#     plt.show()

In [45]:
# model.get_all_params()

# Fit final model

In [15]:
# Settings for the final CatBoostRanker.
# Alternatives kept for reference from earlier experiments:
#   loss_function='NDCG:top=5'
#   extra custom metrics 'PFound', 'AverageGain:top=10'
#   logging_level='Silent'
params = dict(
    iterations=500,
    loss_function='QueryRMSE',
    custom_metric=['NDCG:top=5;type=Base;denominator=LogPosition'],
    verbose=False,
    early_stopping_rounds=50,
)

In [16]:
# Display the column names that are passed to the pools as categorical features.
CAT_FEATURES

['site_id',
 'visitor_location_country_id',
 'prop_country_id',
 'prop_id',
 'srch_destination_id']

In [17]:
# Build the ranker from `params` and train it on train_pool; test_pool is
# passed as the evaluation set and plot=True requests the interactive
# metric plot.
# NOTE(review): test_pool is also the data used for the final NDCG below, and
# `params` sets early_stopping_rounds - so the stopping point is presumably
# chosen on the same data that is later reported. Consider a separate
# validation split; confirm against the CatBoost docs.
model = CatBoostRanker(**params)
model.fit(train_pool, eval_set=test_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostRanker at 0x7fb3a87abe80>

# Eval

In [35]:
# Score every row of the held-out pool.
test_preds = model.predict(test_pool)

pred_col = f'{ranking_target}_pred'
gt_col = f'{ranking_target}_gt'

# One row per (search, property) with the predicted score and the true label,
# ordered within each search from highest to lowest predicted score.
test_results_df = pd.DataFrame({
    group_col: groups_test,
    'prop_id': X_test['prop_id'],
    pred_col: test_preds,
    gt_col: y_test,
}).sort_values(by=[group_col, pred_col], ascending=[True, False])

test_results_df.head()

Unnamed: 0,srch_id,prop_id,target_pred,target_gt
4462515,299424,109781,0.207417,0
4462508,299424,67489,0.151551,0
4462505,299424,58030,0.131362,0
4462506,299424,59976,0.089973,5
4462494,299424,18873,0.077576,0


In [38]:
from metrics import ndcg

print('NDCG which is used for competition evaluation')
# Mean NDCG@5 over searches. Each group's ground-truth labels are passed to
# `ndcg` in the row order of test_results_df, so this relies on the frame
# already being sorted by predicted score within each search.
# Grouping uses group_col (not a hard-coded column name) to stay consistent
# with the rest of the notebook.
test_results_df.groupby(group_col)[ranking_target+'_gt'].apply(ndcg, at=5).mean()

NDCG which is used for competition evaluation


0.37258898953021224

# Make predictions

In [39]:
# # subm_df = pd.read_csv('data/test_set_VU_DM.csv')
# subm_df['date_time'] = pd.to_datetime(subm_df['date_time'])

# subm_pool = Pool(
#     data=subm_df,
#     group_id=subm_df[group_col],
#     cat_features=CAT_FEATURES,
# )

In [40]:
# pred = model.predict(subm_pool)

# output_df = pd.DataFrame({group_col: subm_df[group_col],
#                         'prop_id': subm_df['prop_id'],
#                         ranking_target: pred,
# #                         'gt': y_test
#                          })

# output_df.sort_values([group_col, 'position'], inplace=True)
# output_df[[group_col, 'prop_id']].to_csv('sumbission.csv', index=False)
# output_df.head(10)

In [41]:
# pd.read_csv('sumbission.csv', nrows=10)