In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import xgboost as xgb
import time

from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split

In [2]:
# Yahoo LtR challenge
def parser(filename, length):
    with open(filename, 'r') as f:
        rel_scores = []
        qids = []
        features = {}
        counter = 0
        for line in f:
            data = line.split()
            rel_scores.append(int(data[0]))
            qids.append(int(data[1].split(':')[1]))

            for pair in data[2:]:
                col, val = pair.split(':')
                col = int(col)
                val = float(val)

                if col not in features:
                    features[col] = [0.0] * (len(rel_scores)-1)
                features[col].append(val)
            
            for i in range(max(features.keys()) + 1):
                if i not in features.keys():
                    features[i] = [0.0] * len(rel_scores)
                if len(features[i]) < len(rel_scores):
                    features[i].extend([0.0] * (len(rel_scores) - len(features[i])))

            counter += 1
            if counter == length:
                break
    
    features['score'] = rel_scores
    features['qid'] = qids

    return pd.DataFrame(data=features)

In [3]:
yahoo_train = parser('../datasets/Yahoo/ltrc_yahoo/set2.train.txt', 100000)
yahoo_test = parser('../datasets/Yahoo/ltrc_yahoo/set2.test.txt', 10000)

In [4]:
X_tr, y_tr = yahoo_train.drop(columns=['score']), yahoo_train['score']
X_ts, y_ts = yahoo_test.drop(columns=['score']), yahoo_test['score']

quids_tr = X_tr['qid']
quids_ts = X_ts['qid']

X_tr.drop(columns=['qid'], inplace=True)
X_ts.drop(columns=['qid'], inplace=True)

In [5]:
X_tr = X_tr.reindex(sorted(X_tr.columns), axis=1)
X_ts = X_ts.reindex(sorted(X_ts.columns), axis=1)

In [6]:
ranker = xgb.XGBRanker(n_estimators=500,
                       n_jobs=-1,
                       tree_method='exact',
                       max_depth=8,
                       learning_rate=0.1,
                       objective='rank:ndcg')

start = time.time()
ranker.fit(X_tr, y_tr, qid=quids_tr)
end = time.time()

print((end - start) / 500)
print(ndcg_score(y_ts.values.reshape(1, -1), ranker.predict(X_ts).reshape(1, -1), k=10))

0.47455237579345705
0.9092782724164457


In [7]:
ranker = xgb.XGBRanker(n_estimators=500,
                       n_jobs=-1,
                       tree_method='exact',
                       max_depth=8,
                       learning_rate=0.1,
                       colsample_bytree=0.5,
                       objective='rank:ndcg')

start = time.time()
ranker.fit(X_tr, y_tr, qid=yahoo_train['qid'][:100000])
end = time.time()

print((end - start) / 500)
print(ndcg_score(y_ts.values.reshape(1, -1), ranker.predict(X_ts).reshape(1, -1), k=10))

0.25059660816192625
0.9250237420756263
