# Path setup & import packages

In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys

root_path = '../../' # path to project root
sys.path.append('{}/code'.format(root_path))
sys.path.append('{}/code/core'.format(root_path))
sys.path.append('{}/code/datasets/'.format(root_path))
sys.path.insert(0,'{}/code/ptranking'.format(root_path))

from core.ranking_utils import *
from core.mallows import *
from core.ws_ranking import *
from core.ws_real_workflow import *
from datasets.imdb_tmdb_dataset import *
from datasets.basic_clmn_dataset import *
from core.labelling.feature_lf import *
from ptranking_wrapper import PtrankingWrapper
import datasets_factory

# Standard-library and third-party modules are imported after the star imports
# above so that none of these module names can be shadowed by them.
import pickle
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import yaml

# Read config & basic setup

In [2]:
# Experiment configuration file, addressed relative to the project root.
config_file_path = '{}/configs/modified-mslr-web10k_ranking_experiment.yaml'.format(root_path)

# Parse the YAML file into a plain dictionary.
with open(config_file_path, 'r') as conf_file:
    conf = yaml.full_load(conf_file)

# Pull out the three sections used by the rest of the notebook.
# NOTE(review): the original comment on `weak_sup_conf` was cut off mid-sentence
# ("For partial ranking experiments, we should give ...") -- confirm what has to
# be supplied there for partial-ranking experiments.
data_conf, weak_sup_conf, l2r_training_conf = (
    conf[section] for section in ('data_conf', 'weak_sup_conf', 'l2r_training_conf')
)

# Record the project root on both the top-level config and the data section.
conf['project_root'] = root_path
data_conf['project_root'] = root_path

In [3]:
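# Archived run for seeds 0-4, kept commented out for reference; the active loop in the next cell covers seeds 5-9.
# for seed in range(5):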
    
#     save_path = os.path.join(root_path, 'data/MSLR-WEB10K/True_5')
#     file_train = 'train.npz'
#     file_test = 'test.npz'

#     train = np.load(os.path.join(save_path, file_train))
#     test = np.load(os.path.join(save_path, file_test))
#     X_train, Y_train, qid_train = train['X'], train['Y'], train['qid']
#     X_test, Y_test, qid_test = test['X'], test['Y'], test['qid']

#     # Weak supervision
#     d = X_train.shape[1]
#     r_utils = RankingUtils(d)
#     dummy_lf = FeatureRankingLF(rank_on_feature=0, d=d, highest_first=False)
#     true_ranking = dummy_lf.apply_mat(np.concatenate((np.expand_dims(Y_train, axis=-1), np.expand_dims(Y_test, axis=-1))))

#     df = pd.DataFrame()
#     feature_list = [0, 5, 95, 101, 106, 111, 116, 121, 133]
#     L_ = []
#     for feature in feature_list:
#         lf = FeatureRankingLF(rank_on_feature=feature, d=d, highest_first=True)
#         wl = lf.apply_mat(np.concatenate((X_train, X_test)))
#         kt = r_utils.mean_kt_distance(true_ranking, wl)
#         df = df.append(
#         {
#             'feature': feature,
#             'kt': kt
#         }, ignore_index=True)
#         L_.append(wl)

#     display(df)

#     L = []
#     for data_idx in range(len(L_[0])):
#         L_row = []
#         for wl_idx in range(len(L_)):
#             L_row.append(L_[wl_idx][data_idx])
#         L.append(L_row)

#     wsr = WeakSupRanking(r_utils)
#     wsr.train(weak_sup_conf, L)

#     m = len(L[0])
#     lst_pi_hat = wsr.infer_ranking(weak_sup_conf, L, numLFs=m)
#     kt = r_utils.mean_kt_distance(true_ranking, lst_pi_hat)
#     print('inference kt:', kt)
    
#     ptwrapper = PtrankingWrapper(data_conf=data_conf, weak_sup_conf=weak_sup_conf,
#                                  l2r_training_conf=l2r_training_conf, result_path=conf['results_path'],
#                                  wl_kt_distance = kt)

#     Y_train = ranking_to_score(lst_pi_hat, d=d, highest_first=False)[:X_train.shape[0]]
#     ptwrapper.set_data(X_train=X_train, X_test=X_test,
#                       Y_train=Y_train, Y_test=Y_test)
#     model = ptwrapper.get_model()
#     result = ptwrapper.train_model(model, IR=True, verbose=1)
#     result_path = f'results/Selected_{seed}.pickle'
#     with open(result_path, 'wb') as f:
#         pickle.dump(result, f)

In [4]:
# Weak-supervision ranking experiment, repeated once per seed.
#
# For every seed: load the pre-split data, apply one feature-based labelling
# function (LF) per selected feature, aggregate the LF outputs with
# WeakSupRanking, train the learning-to-rank model on the inferred rankings and
# pickle the training result to results/Selected_<seed>.pickle.

# Seed-independent constants.
save_path = os.path.join(root_path, 'data/MSLR-WEB10K/True_5')
file_train = 'train.npz'
file_test = 'test.npz'
feature_list = [0, 5, 95, 101, 106, 111, 116, 121, 133]

# Create the output directory up front so a missing folder fails immediately
# instead of after a complete training run.
os.makedirs('results', exist_ok=True)

for seed in range(5, 10):

    # Seed every RNG that may be involved so that each run is reproducible and
    # actually tied to its seed (previously `seed` was only used in the output
    # file name and no generator was ever seeded).
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # The data is re-read for every seed on purpose: each run then starts from
    # untouched arrays regardless of what the previous run did with them.
    # np.load on an .npz archive keeps a file handle open; the context manager
    # closes it once the arrays have been read.
    with np.load(os.path.join(save_path, file_train)) as train:
        X_train, Y_train_true, qid_train = train['X'], train['Y'], train['qid']
    with np.load(os.path.join(save_path, file_test)) as test:
        X_test, Y_test, qid_test = test['X'], test['Y'], test['qid']

    # Weak supervision
    d = X_train.shape[1]  # size of X_train's second axis, passed as `d` to the ranking helpers
    r_utils = RankingUtils(d)

    # Reference rankings for train+test: the labels get a trailing axis of
    # size 1 and are ranked on that single "feature" (index 0).
    dummy_lf = FeatureRankingLF(rank_on_feature=0, d=d, highest_first=False)
    Y_all = np.concatenate((np.expand_dims(Y_train_true, axis=-1), np.expand_dims(Y_test, axis=-1)))
    true_ranking = dummy_lf.apply_mat(Y_all)

    # NOTE(review): the LFs are applied to train AND test features, so the
    # label model below is fitted on both splits -- confirm this is intended.
    X_all = np.concatenate((X_train, X_test))  # built once instead of once per feature

    L_ = []       # L_[lf_idx][data_idx]: output of each LF on every sample
    kt_rows = []  # one {'feature', 'kt'} record per LF
    for feature in feature_list:
        lf = FeatureRankingLF(rank_on_feature=feature, d=d, highest_first=True)
        wl = lf.apply_mat(X_all)
        kt_rows.append({'feature': feature, 'kt': r_utils.mean_kt_distance(true_ranking, wl)})
        L_.append(wl)

    # DataFrame.append was removed in pandas 2.0 (and is quadratic when used in
    # a loop); build the frame from the collected records in one go instead.
    df = pd.DataFrame(kt_rows, columns=['feature', 'kt'])
    display(df)

    # Transpose to L[data_idx][lf_idx], the layout handed to WeakSupRanking.
    L = [list(per_sample) for per_sample in zip(*L_)]

    wsr = WeakSupRanking(r_utils)
    wsr.train(weak_sup_conf, L)

    m = len(L[0])  # number of labelling functions
    lst_pi_hat = wsr.infer_ranking(weak_sup_conf, L, numLFs=m)
    kt = r_utils.mean_kt_distance(true_ranking, lst_pi_hat)
    print('inference kt:', kt)

    ptwrapper = PtrankingWrapper(data_conf=data_conf, weak_sup_conf=weak_sup_conf,
                                 l2r_training_conf=l2r_training_conf, result_path=conf['results_path'],
                                 wl_kt_distance=kt)

    # The inferred rankings cover train followed by test (concatenation order
    # above); only the first len(X_train) entries become training targets.
    # The test labels passed to the model stay the true ones.
    Y_train = ranking_to_score(lst_pi_hat, d=d, highest_first=False)[:X_train.shape[0]]
    ptwrapper.set_data(X_train=X_train, X_test=X_test,
                       Y_train=Y_train, Y_test=Y_test)
    model = ptwrapper.get_model()
    result = ptwrapper.train_model(model, IR=True, verbose=1)

    result_path = f'results/Selected_{seed}.pickle'
    with open(result_path, 'wb') as f:
        pickle.dump(result, f)

Unnamed: 0,feature,kt
0,0.0,0.135275
1,5.0,0.135275
2,95.0,0.121683
3,101.0,0.126483
4,106.0,0.131769
5,111.0,0.124649
6,116.0,0.123894
7,121.0,0.123409
8,133.0,0.153398


inference kt: 0.11699029126213592
Training data shape, X_train.shape (1375, 5, 136) Y_train.shape torch.Size([1375, 5])
set_and_load_data in LTREvaluator
(1375, 5, 136) torch.Size([1375, 5]) (1375,)
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'valid

Unnamed: 0,feature,kt
0,0.0,0.135275
1,5.0,0.135275
2,95.0,0.121683
3,101.0,0.126483
4,106.0,0.131769
5,111.0,0.124649
6,116.0,0.123894
7,121.0,0.123409
8,133.0,0.153398


inference kt: 0.11699029126213592
Training data shape, X_train.shape (1375, 5, 136) Y_train.shape torch.Size([1375, 5])
set_and_load_data in LTREvaluator
(1375, 5, 136) torch.Size([1375, 5]) (1375,)
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'valid

Unnamed: 0,feature,kt
0,0.0,0.135275
1,5.0,0.135275
2,95.0,0.121683
3,101.0,0.126483
4,106.0,0.131769
5,111.0,0.124649
6,116.0,0.123894
7,121.0,0.123409
8,133.0,0.153398


inference kt: 0.11699029126213592
Training data shape, X_train.shape (1375, 5, 136) Y_train.shape torch.Size([1375, 5])
set_and_load_data in LTREvaluator
(1375, 5, 136) torch.Size([1375, 5]) (1375,)
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'valid

Unnamed: 0,feature,kt
0,0.0,0.135275
1,5.0,0.135275
2,95.0,0.121683
3,101.0,0.126483
4,106.0,0.131769
5,111.0,0.124649
6,116.0,0.123894
7,121.0,0.123409
8,133.0,0.153398


inference kt: 0.11699029126213592
Training data shape, X_train.shape (1375, 5, 136) Y_train.shape torch.Size([1375, 5])
set_and_load_data in LTREvaluator
(1375, 5, 136) torch.Size([1375, 5]) (1375,)
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'valid

Unnamed: 0,feature,kt
0,0.0,0.135275
1,5.0,0.135275
2,95.0,0.121683
3,101.0,0.126483
4,106.0,0.131769
5,111.0,0.124649
6,116.0,0.123894
7,121.0,0.123409
8,133.0,0.153398


inference kt: 0.11699029126213592
Training data shape, X_train.shape (1375, 5, 136) Y_train.shape torch.Size([1375, 5])
set_and_load_data in LTREvaluator
(1375, 5, 136) torch.Size([1375, 5]) (1375,)
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 136, 'has_comment': False, 'label_type': <LABEL_TYPE.Permutation: 2>, 'max_rele_level': None, 'fold_num': 1}
data_dict {'data_id': 'MODIFIED_MSLR_WEB10K', 'dir_data': 'data/MSLR-WEB10K/processed/default', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'valid