# Path setup & import packages

In [1]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
import os
root_path = '../../' # path to project root
sys.path.append('{}/code'.format(root_path))
sys.path.append('{}/code/core'.format(root_path))
sys.path.append('{}/code/datasets/'.format(root_path))
sys.path.insert(0,'{}/code/ptranking'.format(root_path))

from core.ranking_utils import *
from core.mallows import *
from core.ws_ranking import *
from core.ws_real_workflow import * 
from datasets.imdb_tmdb_dataset import * 
from datasets.basic_clmn_dataset import * 
from core.labelling.feature_lf import *
from ptranking_wrapper import PtrankingWrapper
import datasets_factory 
import numpy as np 
import yaml
import matplotlib.pyplot as plt
import pickle
import random
import torch

# Read config & basic setup

In [2]:
# Experiment configuration file, located relative to the project root.
config_file_path = '{}/configs/modified-mslr-web10k_ranking_experiment.yaml'.format(root_path)

# Parse the YAML experiment description.
# NOTE(review): yaml.full_load resolves more tags than yaml.safe_load; only use
# it on config files that are trusted.
with open(config_file_path, 'r') as conf_file:
    conf = yaml.full_load(conf_file)

# Record the project root on the top-level config and on the data section.
conf['project_root'] = root_path
data_conf = conf['data_conf']
data_conf['project_root'] = root_path

# Remaining sections that are handed to PtrankingWrapper later on.
# NOTE(review): the original comment next to weak_sup_conf was cut off
# ("For partial ranking experiments, we should give ...") — TODO complete it.
weak_sup_conf = conf['weak_sup_conf']
l2r_training_conf = conf['l2r_training_conf']

# Training and evaluation - mainly with PtrankingWrapper

In [3]:
# --- Disabled run, kept for reference -----------------------------------
# This commented-out cell is the weak-supervision training loop for seeds
# 0-4 (`range(5)`); it writes results/BM25Only_{seed}.pickle for each seed.
# The active cell below covers seeds 5-9. Un-comment to regenerate the
# seed 0-4 result files.
# -------------------------------------------------------------------------
# for seed in range(5):
#     kt = None
#     save_path = os.path.join(root_path, 'data/MSLR-WEB10K/True_5')
#     file_train = 'train.npz'
#     file_test = 'test.npz'

#     train = np.load(os.path.join(save_path, file_train))
#     test = np.load(os.path.join(save_path, file_test))
#     X_train, Y_train, qid_train = train['X'], train['Y'], train['qid']
#     X_test, Y_test, qid_test = test['X'], test['Y'], test['qid']

#     # Weak supervision
#     d = X_train.shape[1]
#     r_utils = RankingUtils(d)
#     dummy_lf = FeatureRankingLF(rank_on_feature=0, d=d, highest_first=False)
#     true_ranking = dummy_lf.apply_mat(np.concatenate((np.expand_dims(Y_train, axis=-1), np.expand_dims(Y_test, axis=-1))))
#     lf = FeatureRankingLF(rank_on_feature=106, d=d, highest_first=True)
#     wl = lf.apply_mat(np.concatenate((X_train, X_test)))
#     kt = r_utils.mean_kt_distance(true_ranking, wl)
#     wl_score = ranking_to_score(wl, d=d, highest_first=False)
#     print('kt distance: ', kt)

#     Y_train = wl_score[:len(X_train)]
#     ptwrapper = PtrankingWrapper(data_conf=data_conf, weak_sup_conf=weak_sup_conf,
#                                  l2r_training_conf=l2r_training_conf, result_path=conf['results_path'],
#                                  wl_kt_distance = kt)
#     ptwrapper.set_data(X_train=X_train, X_test=X_test,
#                       Y_train=Y_train, Y_test=Y_test)
#     model = ptwrapper.get_model()
#     result = ptwrapper.train_model(model, IR=True, verbose=1)
    
#     result_path = f'results/BM25Only_{seed}.pickle'
#     with open(result_path, 'wb') as f:
#         pickle.dump(result, f)

In [4]:
# Weak-supervision experiment, repeated once per seed (5-9).
#
# Each run loads the train/test arrays, replaces the training labels with
# scores derived from a single feature-ranking labelling function, trains a
# model through PtrankingWrapper, and pickles the training result to
# results/BM25Only_{seed}.pickle.

# Loop-invariant dataset location.
save_path = os.path.join(root_path, 'data/MSLR-WEB10K/True_5')
file_train = 'train.npz'
file_test = 'test.npz'

# Feature column the weak labelling function ranks on.
# NOTE(review): presumably a BM25 feature, going by the 'BM25Only' result
# file name — confirm against the dataset's feature list.
WEAK_LABEL_FEATURE = 106

for seed in range(5, 10):
    # Seed every RNG so each run is reproducible; previously `seed` was only
    # used in the output file name.
    # NOTE(review): if PtrankingWrapper seeds itself from the config, that
    # later seeding takes precedence — confirm.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Create the output directory before training, so a missing directory
    # cannot cause a failure only after the (long) training run finishes.
    result_path = f'results/BM25Only_{seed}.pickle'
    os.makedirs(os.path.dirname(result_path), exist_ok=True)

    # Arrays are re-read from disk on every run so each run starts from the
    # stored data. The context managers close the .npz archives once the
    # arrays have been read into memory.
    with np.load(os.path.join(save_path, file_train)) as train:
        X_train, Y_train, qid_train = train['X'], train['Y'], train['qid']
    with np.load(os.path.join(save_path, file_test)) as test:
        X_test, Y_test, qid_test = test['X'], test['Y'], test['qid']

    # Weak supervision
    d = X_train.shape[1]
    r_utils = RankingUtils(d)

    # Reference ranking: the labels (train followed by test) are given a
    # trailing axis and ranked on column 0 with the same labelling-function
    # machinery used for the weak labels.
    dummy_lf = FeatureRankingLF(rank_on_feature=0, d=d, highest_first=False)
    true_ranking = dummy_lf.apply_mat(
        np.concatenate((np.expand_dims(Y_train, axis=-1),
                        np.expand_dims(Y_test, axis=-1))))

    # Weak ranking from the single chosen feature, over train + test.
    lf = FeatureRankingLF(rank_on_feature=WEAK_LABEL_FEATURE, d=d, highest_first=True)
    wl = lf.apply_mat(np.concatenate((X_train, X_test)))

    # Mean Kendall-tau distance between the reference and weak rankings.
    kt = r_utils.mean_kt_distance(true_ranking, wl)
    wl_score = ranking_to_score(wl, d=d, highest_first=False)
    print('kt distance: ', kt)

    # Train on the weak scores (train portion only); Y_test stays as loaded.
    Y_train = wl_score[:len(X_train)]
    ptwrapper = PtrankingWrapper(data_conf=data_conf, weak_sup_conf=weak_sup_conf,
                                 l2r_training_conf=l2r_training_conf, result_path=conf['results_path'],
                                 wl_kt_distance=kt)
    ptwrapper.set_data(X_train=X_train, X_test=X_test,
                       Y_train=Y_train, Y_test=Y_test)
    model = ptwrapper.get_model()
    result = ptwrapper.train_model(model, IR=True, verbose=1)

    with open(result_path, 'wb') as f:
        pickle.dump(result, f)

kt distance:  0.13176914778856524
Training data shape, X_train.shape (1375, 5, 136) Y_train.shape torch.Size([1375, 5])
set_and_load_data in LTREvaluator
(1375, 5, 136) torch.Size([1375, 5]) (1375,)
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'valid

data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
Sequential(
  (FeatureBN): BatchNorm1d(136, eps=1e-05, momentum=1.0, affine=True, track_running_stats=False)
  (L_1): Linear(in_features=136, out_features=30, bias=True)
  (BN_1): BatchNorm1d(30, eps=1e-05, momentum=1.0, affine=True, track_running_stats=False)
  (ACT_1): ReLU()
  (DR_2): Dropout(p=0.01, inplace=False)
  (L_2): Linear(in_features=30, out_features=30, bias=True)
  (BN_2): BatchNorm1d(30, eps=1e-05, momentum=1.0, affine=True, track_running_stats=False)
  (ACT_2): ReLU()
  

epoch 0, loss [1056.3091], train tau 0.24174556136131287, test_tau 0.39394575357437134,train_ndcg@1 tensor([0.7738]), test_ndcg@1 tensor([0.7046])train_p@1 tensor([0.9709]), test_p@1, tensor([0.7704])
epoch 1, loss [972.7486], train tau 0.22538158297538757, test_tau 0.3837161064147949,train_ndcg@1 tensor([0.7976]), test_ndcg@1 tensor([0.7390])train_p@1 tensor([0.9775]), test_p@1, tensor([0.7537])
epoch 2, loss [943.30255], train tau 0.21890825033187866, test_tau 0.38496869802474976,train_ndcg@1 tensor([0.8036]), test_ndcg@1 tensor([0.7208])train_p@1 tensor([0.9760]), test_p@1, tensor([0.7474])
epoch 3, loss [918.7407], train tau 0.21127235889434814, test_tau 0.3828810155391693,train_ndcg@1 tensor([0.8136]), test_ndcg@1 tensor([0.7312])train_p@1 tensor([0.9745]), test_p@1, tensor([0.7349])
epoch 4, loss [902.7591], train tau 0.20283561944961548, test_tau 0.37954071164131165,train_ndcg@1 tensor([0.8211]), test_ndcg@1 tensor([0.7307])train_p@1 tensor([0.9804]), test_p@1, tensor([0.7390])
