In [2]:
import xgboost as xgb
import numpy as np
import pandas as pd

from src_clean.dataloader import DataLoader
from src_clean.preprocessor.prepro_new import PreprocessorTwo
from src_clean.ranker.ranker import Ranker
from src_clean.score.ndcg import calculate_ndcg

In [3]:
train_df = pd.read_parquet(f'../final_model/train_df_basic_statistic_position.parquet')
test_df = pd.read_parquet(f'../final_model/test_df_basic_statistic_position.parquet')

train_df = PreprocessorTwo.drop_cols(train_df, exclude_cols_to_drop=['srch_id', 'prop_id'])
test_df = PreprocessorTwo.drop_cols(test_df)

In [5]:
X_train, X_valid, y_train, y_valid = DataLoader.split_df_into_train_and_val_batches(train_df, validation_size=0.1)
group_train = [group.shape[0] for group in X_train]
group_val = [group.shape[0] for group in X_valid]

In [7]:
params = {'objective': 'rank:ndcg', 'learning_rate': 0.1}
model = xgb.sklearn.XGBRanker(**params)
model.fit(np.vstack(X_train), np.hstack(y_train), group_train, verbose=True,
          eval_set=[(np.vstack(X_valid), np.hstack(y_valid))], eval_group=[group_val])

[0]	validation_0-map:0.31747
[1]	validation_0-map:0.32036
[2]	validation_0-map:0.32093
[3]	validation_0-map:0.32127
[4]	validation_0-map:0.32270
[5]	validation_0-map:0.32376
[6]	validation_0-map:0.32615
[7]	validation_0-map:0.32864
[8]	validation_0-map:0.33168
[9]	validation_0-map:0.33271
[10]	validation_0-map:0.33520
[11]	validation_0-map:0.33707
[12]	validation_0-map:0.33730
[13]	validation_0-map:0.33755
[14]	validation_0-map:0.33928
[15]	validation_0-map:0.34108
[16]	validation_0-map:0.34212
[17]	validation_0-map:0.34346
[18]	validation_0-map:0.34482
[19]	validation_0-map:0.34612
[20]	validation_0-map:0.34706
[21]	validation_0-map:0.34836
[22]	validation_0-map:0.34919
[23]	validation_0-map:0.35015
[24]	validation_0-map:0.35108
[25]	validation_0-map:0.35158
[26]	validation_0-map:0.35224
[27]	validation_0-map:0.35326
[28]	validation_0-map:0.35444
[29]	validation_0-map:0.35493
[30]	validation_0-map:0.35591
[31]	validation_0-map:0.35685
[32]	validation_0-map:0.35698
[33]	validation_0-ma

KeyboardInterrupt: 

In [6]:
train_matrix = xgb.DMatrix(data=np.vstack(X_train), label=np.hstack(y_train))
valid_matrix = xgb.DMatrix(data=np.vstack(X_valid), label=np.hstack(y_valid))

In [7]:
train_matrix.set_group(group_train)
valid_matrix.set_group(group_val)

In [8]:
params = {
    'objective': 'rank:ndcg',
}
num_round = 50
model = xgb.sklearn.XGBRanker(**params)
model.fit(np.vstack(X_train),  np.hstack(y_train), group = group_train, verbose=False,
          eval_set=[(np.vstack(X_valid), np.hstack(y_valid))], eval_group=[group_val])

In [None]:
pred = model.predict(test_df)
# make prediction
#test_matrix = xgb.DMatrix(data=test_df)
#preds = bst.predict(test_matrix)
ranker = Ranker()
ranking_file_path = ranker.make_ranking_from_prediction(
    pred, model_name="XGBoost"
)
print(f"NDCG@5: {calculate_ndcg(ranking_file_path)}")
print(f"File path: {ranking_file_path}")

## Hyperparameter tuning

In [26]:
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import make_scorer, ndcg_score

In [54]:
param = {
    'max_depth': 9,
    'eval_metric': 'ndcg@5',
    'min_child_weight': 0.1,
    'eta': 0.1,
    'gamma': 1.0,
}
num_round = 10
xgb_model = xgb.XGBRanker(
    params = param,
    dtrain = train_matrix,
    objective='rank:ndcg',
    num_boost_round = num_round,
    evals=[(valid_matrix, 'validation')]
)

In [55]:
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2, 4, 6],
                    'n_estimators': [50, 100, 200]}, verbose=1, n_jobs=1, scoring='accuracy')
clf.fit(X_train, y_train)

ValueError: ctypes objects containing pointers cannot be pickled

In [33]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',
 'roc_auc',
 'roc_auc_ovo',
 'roc_auc_ovo_weighted',
 'roc_auc_