# Path setup & import packages

In [1]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
import os
import random
root_path = '../../../' # path to project root
sys.path.append('{}/code'.format(root_path))
sys.path.append('{}/code/core'.format(root_path))
sys.path.append('{}/code/datasets/'.format(root_path))
sys.path.insert(0,'{}/code/ptranking'.format(root_path))

from core.ranking_utils import *
from core.mallows import *
from core.ws_ranking import *
from core.ws_real_workflow import * 
from datasets.imdb_tmdb_dataset import * 
from datasets.basic_clmn_dataset import * 
from core.labelling.feature_lf import *
from ptranking_wrapper import PtrankingWrapper
import datasets_factory 
import numpy as np 
import torch
import yaml
import matplotlib.pyplot as plt
import pickle

# Read config & basic setup

In [2]:
# YAML file describing this experiment, resolved relative to the project root.
config_file_path = f'{root_path}/configs/imdb-tmdb_ranking_experiment_play2.yaml'

# Parse the whole YAML document into `conf`; the file handle is only needed
# for the duration of the parse.
with open(config_file_path,'r') as conf_file:
    conf = yaml.full_load(conf_file)

# Record the project root on the top-level config.
conf['project_root'] = root_path

# Pull out the three sub-configurations the remaining cells work with.
data_conf = conf['data_conf']
# NOTE(review): the original comment on this line was cut off mid-sentence
# ("For partial ranking experiments, we should give ...") -- confirm which
# extra settings partial-ranking experiments need in weak_sup_conf.
weak_sup_conf = conf['weak_sup_conf']
l2r_training_conf = conf['l2r_training_conf']

# The data configuration carries its own copy of the project root.
data_conf['project_root'] = root_path

In [3]:
# Switch the weak-supervision config's 'synthetic' flag off.
# NOTE(review): presumably this selects real labelling functions instead of
# synthetically generated weak labels -- confirm against get_weak_labels.
weak_sup_conf.update({'synthetic': False})

# Training and evaluation - mainly with PtrankingWrapper

In [5]:
# Directory (relative to the notebook's working directory) that receives one
# result pickle per seed. It is created up front: on a fresh checkout it may
# not exist, and open(..., 'wb') would then fail with FileNotFoundError only
# after a complete training run.
per_seed_results_dir = 'results'
os.makedirs(per_seed_results_dir, exist_ok=True)

for seed in range(5):
    # Actually seed the global RNGs with the loop's seed, so that the seed in
    # the output file name identifies a repeatable run. Previously `seed` was
    # only used to name the pickle.
    # NOTE(review): this assumes the project code draws from the global
    # random / numpy / torch generators -- confirm; GPU kernels may still be
    # non-deterministic.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Build a fresh dataset and draw its samples for this run.
    dataset = datasets_factory.create_dataset(data_conf)
    dataset.create_samples()

    if l2r_training_conf['use_weak_labels']:
        # Produce weak labels, report their mean KT distance to dataset.Y,
        # and attach them to the dataset.
        Y_tilde, thetas = get_weak_labels(dataset, weak_sup_conf, root_path=root_path)
        r_utils = RankingUtils(data_conf['dimension'])
        kt = r_utils.mean_kt_distance(Y_tilde, dataset.Y)
        print('kt distance: ', kt)
        dataset.set_Y_tilde(Y_tilde)
    else:
        # No weak labels in use, so there is no distance to report.
        kt = None

    ptwrapper = PtrankingWrapper(data_conf=data_conf, weak_sup_conf=weak_sup_conf,
                                 l2r_training_conf=l2r_training_conf, result_path=conf['results_path'],
                                 wl_kt_distance=kt)
    X_train, X_test, Y_train, Y_test = dataset.get_train_test_torch(
        use_weak_labels=l2r_training_conf['use_weak_labels'])
    ptwrapper.set_data(X_train=X_train, X_test=X_test,
                       Y_train=Y_train, Y_test=Y_test)
    model = ptwrapper.get_model()
    result = ptwrapper.train_model(model, verbose=1)

    # Persist this run's training result under a seed-specific file name.
    result_file = os.path.join(per_seed_results_dir, f'with_imdbscore_seed_{seed}.pickle')
    with open(result_file, 'wb') as f:
        pickle.dump(result, f)

    # Report the largest value recorded under 'test_tau' for this run.
    print(seed, max(result['test_tau']))

Generate samples...
Weak labels generated and saved in ../../../data/imdb-tmdb/processed/210513_dim-10_ntrain-500_ntest-1000_model-ListMLE_weaklabel-False/LFs/weak_labels.pkl
Use our weak supervision...train_method: triplet_opt,inference_rule: weighted_kemeny
kt distance:  0.34995000000000004
use_weak_labels:True, we will use weak labels
Training data shape, X_train.shape torch.Size([5000, 5, 255]) Y_train.shape torch.Size([5000, 5])
set_and_load_data in LTREvaluator
(5000, 5, 255) (5000, 5) (5000,)
data_dict {'data_id': 'imdb_tmdb', 'dir_data': 'data/imdb-tmdb/processed/210513_dim-10_ntrain-500_ntest-1000_model-ListMLE_weaklabel-False', 'min_docs': 10, 'min_rele': 1, 'scale_data': False, 'scaler_id': None, 'scaler_level': None, 'train_presort': True, 'validation_presort': True, 'test_presort': True, 'train_batch_size': 64, 'validation_batch_size': 1, 'test_batch_size': 1, 'unknown_as_zero': False, 'binary_rele': False, 'num_features': 255, 'has_comment': False, 'label_type': <LABEL_TY

epoch 0, loss [1168.231], train tau 0.021727770566940308, test_tau 0.3475000262260437,train_ndcg@1 tensor([0.9762]), test_ndcg@1 tensor([0.6607])
epoch 1, loss [561.65137], train tau 0.013404875993728638, test_tau 0.3499000072479248,train_ndcg@1 tensor([0.9837]), test_ndcg@1 tensor([0.6530])
epoch 2, loss [433.47894], train tau 0.011124253273010254, test_tau 0.35110002756118774,train_ndcg@1 tensor([0.9859]), test_ndcg@1 tensor([0.6555])
epoch 3, loss [367.28027], train tau 0.009203463792800903, test_tau 0.35100001096725464,train_ndcg@1 tensor([0.9876]), test_ndcg@1 tensor([0.6525])
epoch 4, loss [323.3191], train tau 0.007823050022125244, test_tau 0.3516000211238861,train_ndcg@1 tensor([0.9897]), test_ndcg@1 tensor([0.6532])
The experiment result is saved in ../../../tmp/results/210513_dim-10_ntrain-500_ntest-1000_model-ListMLE_weaklabel-False/result_summary.pkl
2 [0.35160002]
Generate samples...
Weak labels generated and saved in ../../../data/imdb-tmdb/processed/210513_dim-10_ntrain-