In [9]:
import numpy as np

from sklearn.model_selection import GridSearchCV
from rank_sklearn import XGBRanker
from rank_GroupKFold import Rank_GroupKFold
from ndcg_scorer import ndcg_at_5_scoring, ndcg_at_3_scoring, ndcg_at_10_scoring

# data load

In [10]:
from sklearn.datasets import load_svmlight_file

# Read the svmlight-format training file. With query_id=True the loader's
# result is indexed below as: [0] sparse feature matrix, [1] labels,
# [2] per-row query ids.
data = load_svmlight_file('../data/train.txt', query_id=True)

# Densify the features; keep labels and query ids under their short names.
X = data[0].toarray()
y = data[1]
g = data[2]

# Prepend the query id as column 0 so a single 2-D array carries both the
# group information and the features.
# NOTE(review): presumably XGBRanker / Rank_GroupKFold read the group from
# column 0 -- confirm in rank_sklearn / rank_GroupKFold.
group_X = np.column_stack((g, X))

# training and prediction

In [11]:
# Fit a ranker on the whole training set and inspect it on that same data.
# group_X is the feature matrix with the query ids prepended as column 0
# (built in the data-load cell); y holds the labels.
ranker = XGBRanker(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    reg_lambda=1.4,
    reg_alpha=0.6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    scale_pos_weight=1,
    silent=True,
    seed=27,
)
ranker.fit(group_X, y)

# print(...) with a single argument behaves the same on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.

# Scored on the data the model was fitted on, so this is an in-sample figure.
print(ranker.score(group_X, y))

# One predicted value per input row.
print(ranker.predict(group_X))

# 2-D integer array, one row per input row.
# NOTE(review): presumably the leaf index reached in each tree, as in
# xgboost's sklearn API -- confirm in rank_sklearn.
print(ranker.apply(group_X))

0.692242516498
[ 0.59940886 -2.70062757  0.19634044 ..., -1.8180337  -2.48530817
 -2.66612768]
[[305 283 217 ..., 130 110 141]
 [249 225 182 ..., 128 111 160]
 [290 295 228 ..., 135 135 194]
 ..., 
 [248 254 207 ..., 135 122 152]
 [226 232 186 ..., 135  33 152]
 [226 225 182 ..., 135 118 135]]


# GridSearchCV

In [12]:
# Cross-validation splitter from the project-local rank_GroupKFold module.
# NOTE(review): presumably keeps all rows of one query in the same fold, using
# the query id stored in column 0 of group_X -- confirm in rank_GroupKFold.
cv = Rank_GroupKFold(n_splits=5)

# Hyper-parameter grid. Only min_child_weight has more than one value, so the
# search evaluates 2 candidates x 5 folds = 10 fits.
param_grid = {
    'n_estimators': [150],  # range(140, 150),
    'max_depth': [6],
    'min_child_weight': [0, 1],   # In linear regression task, this simply corresponds to minimum number of instances needed to be in each node.
    'learning_rate': [0.1],
    'gamma': [0],  # Minimum loss reduction required to make a further partition on a leaf node of the tree.
    'reg_lambda': [1.46],  # (xgb's lambda) L2 regularization term on weights
    'reg_alpha': [0.4],  # (xgb's alpha) L1 regularization term on weights
    'subsample': [0.9],  # Subsample ratio of the training instance.
    'colsample_bytree': [0.9],  # Subsample ratio of columns when constructing each tree.
    'colsample_bylevel': [0.9]  # Subsample ratio of columns for each split, in each level.
    # 'scale_pos_weight': [],  # Balancing of positive and negative weights.
}

# Base estimator handed to the search. Every hyper-parameter that also appears
# in param_grid is overwritten for each candidate; only n_jobs,
# scale_pos_weight, silent and seed keep the values given here.
# NOTE(review): the best_estimator_ repr printed by this cell shows
# random_state=0 and no `seed`, so seed=27 may not reach the fitted
# estimators -- confirm how rank_sklearn.XGBRanker handles `seed`.
ranker = XGBRanker(
    learning_rate=0.1,
    n_estimators=140,
    max_depth=8,
    reg_lambda=1.4,
    reg_alpha=0.6,
    min_child_weight=1,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    scale_pos_weight=1,
    silent=True,
    seed=27,
)

# NOTE(review): n_jobs=23 is hard-coded for the original machine, and each
# estimator additionally asks for n_jobs=-1, so worker processes and xgboost
# threads can oversubscribe the CPU; only 10 fits are scheduled in total.
# NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
# drop the argument when moving to a newer scikit-learn.
# NOTE(review): ndcg_at_5_scoring is project-local (ndcg_scorer); presumably
# NDCG@5 -- confirm there.
gsearch = GridSearchCV(
    estimator=ranker,
    param_grid=param_grid,
    scoring=ndcg_at_5_scoring,
    n_jobs=23,
    iid=False,
    cv=cv,
    verbose=3,
    refit=True,
)

gsearch.fit(group_X, y)

# print(...) with a single argument behaves the same on Python 2 and
# Python 3, whereas the bare `print x` statement is a SyntaxError on Python 3.
print(gsearch.best_estimator_)
print(gsearch.best_params_)
print(gsearch.best_score_)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV] start reg_alpha=0.4, colsample_bytree=0.9, colsample_bylevel=0.9, learning_rate=0.1, min_child_weight=0, n_estimators=150, subsample=0.9, reg_lambda=1.46, max_depth=6, gamma=0 
[CV] start reg_alpha=0.4, colsample_bytree=0.9, colsample_bylevel=0.9, learning_rate=0.1, min_child_weight=0, n_estimators=150, subsample=0.9, reg_lambda=1.46, max_depth=6, gamma=0 
[CV] start reg_alpha=0.4, colsample_bytree=0.9, colsample_bylevel=0.9, learning_rate=0.1, min_child_weight=0, n_estimators=150, subsample=0.9, reg_lambda=1.46, max_depth=6, gamma=0 
[CV] start reg_alpha=0.4, colsample_bytree=0.9, colsample_bylevel=0.9, learning_rate=0.1, min_child_weight=0, n_estimators=150, subsample=0.9, reg_lambda=1.46, max_depth=6, gamma=0 
[CV] start reg_alpha=0.4, colsample_bytree=0.9, colsample_bylevel=0.9, learning_rate=0.1, min_child_weight=0, n_estimators=150, subsample=0.9, reg_lambda=1.46, max_depth=6, gamma=0 
[CV] start reg_alpha=0.4, cols

[Parallel(n_jobs=23)]: Done   5 out of  10 | elapsed:   15.7s remaining:   15.7s
[Parallel(n_jobs=23)]: Done  10 out of  10 | elapsed:   15.8s finished


XGBRanker(base_score=0.5, colsample_bylevel=0.9, colsample_bytree=0.9,
     gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
     min_child_weight=1, missing=None, n_estimators=150, n_jobs=-1,
     random_state=0, reg_alpha=0.4, reg_lambda=1.46, scale_pos_weight=1,
     silent=True, subsample=0.9)
{'reg_alpha': 0.4, 'colsample_bytree': 0.9, 'colsample_bylevel': 0.9, 'learning_rate': 0.1, 'min_child_weight': 1, 'n_estimators': 150, 'subsample': 0.9, 'reg_lambda': 1.46, 'max_depth': 6, 'gamma': 0}
0.452016582128
