In [61]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupShuffleSplit
from xgboost import XGBRegressor
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler
from optuna.visualization import plot_optimization_history
from sklearn.metrics import ndcg_score
import plotly
import joblib

In [126]:
traindf = pd.read_csv("train.tsv",sep='\t')
testdf = pd.read_csv("test.tsv",sep='\t')

# ORIGINAL CODE
seed=8
gss = GroupShuffleSplit(test_size=.4, n_splits=1,random_state=seed).split(traindf,groups=traindf['#QueryID'])

X_train_inds, X_test_inds = next(gss)

train_data = traindf.iloc[X_train_inds]
X_train = train_data.loc[:,~train_data.columns.isin(['#QueryID','Docid','Label'])]
y_train = train_data.loc[:, train_data.columns.isin(['Label'])]

groups = train_data.groupby('#QueryID').size().to_frame('size')['size'].to_numpy()

test_data = traindf.iloc[X_test_inds]

X_test = test_data.loc[:, ~test_data.columns.isin(['Label'])] #drop #QueryId and Docid too?
y_test = test_data.loc[:, test_data.columns.isin(['#QueryID','Docid','Label'])]
# ==================================================================================

# X_train = train_data.loc[:,~train_data.columns.isin(['Label'])]

groups2 = traindf.groupby('#QueryID').size().to_frame('size')['size'].to_numpy()

X = traindf.loc[:, ~traindf.columns.isin(['Label','#QueryID','Docid'])]
y = traindf.loc[:, traindf.columns.isin(['Label'])]

X2 =  traindf.loc[:, ~traindf.columns.isin(['Label'])]
# X = traindf.loc[:, ~traindf.columns.isin(['Label','Docid'])]
# y = traindf.loc[:, traindf.columns.isin(['Label','#QueryID'])]


In [5]:
# Hyperparameter tuning

def objective(trial: Trial, X,y,groups,X2) -> float:
    
    param = {
                "n_estimators" : trial.suggest_int('n_estimators', 0, 1000),
                'tree_method': 'hist',
                'booster':trial.suggest_categorical('booster',['gbtree','gblinear']),
                'objective':'rank:ndcg',
                'max_depth':trial.suggest_int('max_depth', 2, 25),
                'eval_metric': 'ndcg',
                'reg_alpha':trial.suggest_int('reg_alpha', 0, 5),
                'reg_lambda':trial.suggest_int('reg_lambda', 0, 5),
                'min_child_weight':trial.suggest_int('min_child_weight', 0, 5),
                'gamma':trial.suggest_int('gamma', 0, 5),
                'learning_rate':trial.suggest_loguniform('learning_rate',0.005,0.5),
                'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.1,1,0.01),
                'nthread' : -1
            }
    
    model = xgb.XGBRanker(**param,use_label_encoder=False)
    model.fit(X,y,group=groups)
    
    pred = X2.groupby(['#QueryID','Docid']).apply(lambda x:predict(model,X2))
    y_pred = pred.reset_index()[0].apply(lambda x : x[0])
    y_true = y['Label']
    
    return ndcg_score([np.asarray(y_true)],[np.asarray(y_pred)])
#     return cross_val_score(model, X,y,cv=10,scoring='r2',groups=groups).mean()

In [127]:
# model = xgb.XGBRanker(  
#     tree_method='auto',
#     booster='gbtree',
#     objective='rank:ndcg',
#     eval_metric='ndcg',
#     reg_alpha= 0,
#     reg_lambda= 4,
#     random_state=42,
#     min_child_weight=0,
#     gamma=1,
#     learning_rate=0.11449,
#     colsample_bytree=0.67, 
# #     eta=0.05, 
#     max_depth=21, 
#     n_estimators=740, 
# #     subsample=0.75 
#     )

model = xgb.XGBRanker(  
    tree_method='hist',
    booster='gbtree',
    objective='rank:ndcg',
    random_state=42, 
    learning_rate=0.1,
    colsample_bytree=0.9, 
    eta=0.05, 
    max_depth=6, 
    n_estimators=110, 
    subsample=0.75 
    )

# model = xgb.XGBRanker(  
#     tree_method='hist',
#     booster='gbtree',
#     objective='rank:ndcg',
#     eval_metric='ndcg',
#     random_state=42,
#     learning_rate=0.11449,
#     colsample_bytree=0.67, 
#     max_depth=21, 
#     )

model.fit(X_train, y_train, group=groups, verbose=True)
# model.fit(X, y, group=groups2, verbose=True)

XGBRanker(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bynode=1, colsample_bytree=0.9, eta=0.05, gamma=0,
          gpu_id=-1, importance_type='gain', interaction_constraints='',
          learning_rate=0.1, max_delta_step=0, max_depth=6, min_child_weight=1,
          missing=nan, monotone_constraints='()', n_estimators=110, n_jobs=8,
          num_parallel_tree=1, objective='rank:ndcg', random_state=42,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=0.75,
          tree_method='hist', validate_parameters=1, verbosity=None)

In [57]:
def predict(model, df):
    return model.predict(df.loc[:, ~df.columns.isin(['#QueryID','Docid'])])

In [114]:
# # FULL SET PREDICTIONS
# predictions = (X2.groupby(['#QueryID','Docid'])
#                .apply(lambda x: predict(model, x)))
# yp1 = predictions.reset_index()[0].apply(lambda x : x[0])
# yt1= y['Label']

In [121]:
X_test

Unnamed: 0,#QueryID,BodyTerms,AnchorTerms,TitleTerms,URLTerms,TermsWholeDocument,IDFBody,IDFAnchor,IDFTitle,IDFURL,...,LMIRIMTitle,LMIRIMURL,LMIRIMWholeDocument,PageRank,InlinkNum,OutlinkNum,NumSlashURL,LenURL,NumChildPages,Docid
0,77d4aadf,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.014820,0.076923,0.000000,0.500000,0.275862,0.000000,f0123013322f
1,77d4aadf,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.017241,0.000000,0.000000,0.000000,93cefec12b2c
2,77d4aadf,0.000272,0.0,0.0,0.0,0.000272,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.005416,0.076923,0.000000,0.833333,1.000000,0.000000,b963e5cb7764
3,77d4aadf,0.000545,0.0,0.0,0.0,0.000544,0.0,0.0,0.0,0.0,...,0.579345,0.217697,0.556259,0.202903,0.076923,0.000000,0.333333,0.252874,0.000000,65cbeb4dba8a
4,77d4aadf,0.001634,0.0,0.5,0.0,0.002177,0.0,0.0,0.0,0.0,...,0.577392,0.512464,0.627241,0.000000,0.000000,0.000000,0.833333,0.908046,0.000000,bba7ca2b7b70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74204,d513a525,0.083333,0.0,1.0,0.5,0.147059,0.0,0.0,0.0,0.0,...,0.926792,0.400905,0.911823,0.666667,0.387500,0.000000,0.000000,0.000000,0.090909,5cbc12264e78
74205,d513a525,0.333333,0.0,0.0,0.0,0.294118,0.0,0.0,0.0,0.0,...,0.280928,0.227387,0.337392,0.333333,0.050000,0.000000,0.061669,0.100000,0.181818,f0ff696db96a
74206,d513a525,0.555556,0.0,1.0,0.0,0.588235,0.0,0.0,0.0,0.0,...,0.899914,0.669448,0.883048,1.000000,0.262500,0.000000,0.000000,0.000000,0.000000,327184da2d43
74207,d513a525,0.916667,0.0,0.5,0.0,0.941176,0.0,0.0,0.0,0.0,...,0.675018,1.000000,0.618304,0.333333,0.062500,0.000000,0.201170,0.800000,0.272727,3542a13fb0fa


In [128]:
# SPLIT SET PREDICITONS
predictions = (X_test.groupby(['#QueryID','Docid'])
               .apply(lambda x: predict(model, x)))
yp1 = predictions.reset_index()[0].apply(lambda x : x[0])
yt1= y_test['Label']

In [129]:
ndcg_score(y_true=[np.asarray(yt1)],y_score=[np.asarray(yp1)])

0.7961500618512523

In [132]:
print("X_train:",len(X_train))
print("X_test:",len(X_test))


X_train: 44090
X_test: 30126


In [125]:
len(predictions)

67120

X_train: 7096
X_test: 67120


100/100 (Optimised) : 0.824
100/100 (Original)  : 0.823
99/100  (Original)  : 0.823
01/100  (Original)  : 0.825

   train/test %             NDCG
 
Split 99/01 (Optimised) : 0.647
Split 90/10 (Optimised) : 0.772
Split 60/40 (Optimised) : 0.797
Split 50/50 (Optimised) : 0.805
Split 30/70 (Optimised) : 0.789
Split 10/90 (Optimised) : 0.820
Split 01/99 (Optimised) : 0.826

Split 99/01 (Original)  : 0.657
Split 90/10 (Original)  : 0.772
Split 60/40 (Original)  : 0.796
Split 50/50 (Original)  : 0.806
Split 30/70 (Original)  : 0.815
Split 10/90 (Original)  : 0.824
Split 01/99 (Original)  : 0.825

Split 0.1/99.9 (Original): 0.772

Cross validaton
weight optimiser


NOTE: CHECK PREDICITON FUNCTION IS USING TEST DATA PROPERLY