# Path setup & import packages

In [14]:
import pandas as pd
%load_ext autoreload
%autoreload 2
import sys
import os
# Project root, given relative to the directory the notebook is run from.
root_path = '../../../'
# Make the project's code directories importable. The ptranking directory is
# inserted at the front of sys.path so modules located there are found first.
sys.path.extend(
    '{}/{}'.format(root_path, sub_dir)
    for sub_dir in ('code', 'code/core', 'code/datasets/')
)
sys.path.insert(0, '{}/code/ptranking'.format(root_path))

from core.ranking_utils import *
from core.mallows import *
from core.ws_ranking import *
from core.ws_real_workflow import * 
from datasets.imdb_tmdb_dataset import * 
from datasets.basic_clmn_dataset import * 
from core.labelling.feature_lf import *
from ptranking_wrapper import PtrankingWrapper
import datasets_factory 
import numpy as np 
import yaml
import matplotlib.pyplot as plt
import pickle
# Column that is excluded from the candidate labeling-function features further down
# (presumably the ground-truth target used to build the true rankings — confirm in the dataset code).
label_feature = 'vote_average'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Read config & basic setup

In [15]:
# Load the experiment configuration (YAML) and split it into its three sections.
config_file_path = f'{root_path}/configs/imdb-tmdb_ranking_experiment_play.yaml'

with open(config_file_path, 'r') as conf_file:
    # full_load can construct more than plain YAML types; only point this at trusted config files.
    conf = yaml.full_load(conf_file)
# Record the project root on the top-level config.
conf['project_root'] = root_path

# NOTE(review): the original comment next to weak_sup_conf was cut off
# ("For partial ranking experiments, we should give ...") — complete or remove it.
data_conf, weak_sup_conf, l2r_training_conf = (
    conf[section] for section in ('data_conf', 'weak_sup_conf', 'l2r_training_conf')
)
# The data section gets its own copy of the project root as well.
data_conf['project_root'] = root_path
# Ranking dimension from the data config; passed to the LFs and RankingUtils below.
d = data_conf['dimension']

In [16]:
# Set the 'synthetic' flag to False for this run, regardless of what the YAML config specified
# (presumably selects real labeling functions instead of synthetic weak labels — confirm in get_weak_labels).
weak_sup_conf['synthetic'] = False

# Dataset sampling

In [17]:
# Build the dataset object described by data_conf through the project's dataset factory,
# then have it create its samples (the saved output of this cell is "Generate samples...").
dataset= datasets_factory.create_dataset(data_conf)
dataset.create_samples()

Generate samples...


In [34]:
# Candidate features come from the data config.
feature_list = data_conf['features']

# Merged IMDB/TMDB table; missing values are replaced with 0 before counting distinct values.
csv_path = os.path.join(root_path, 'data', 'imdb-tmdb',
                        'merged_imdb_tmdb_with_additional_features.csv')
df = pd.read_csv(csv_path).fillna(0)

# Keep every feature with more than three distinct values, except the label column.
# NOTE(review): the cut-off is 3 distinct values although the name says "nonbinary" — confirm intent.
nonbinary_features = [
    feature
    for feature in feature_list
    if (df[feature].nunique(dropna=False) > 3) and (feature != label_feature)
]
nonbinary_features

['count_production_countries',
 'year',
 'count_spoken_langueges',
 'num_critic_for_reviews',
 'duration',
 'director_facebook_likes',
 'gross',
 'num_voted_users',
 'cast_total_facebook_likes',
 'facenumber_in_poster',
 'num_user_for_reviews',
 'budget_imdb',
 'title_year',
 'aspect_ratio',
 'movie_facebook_likes',
 'budget_tmdb',
 'popularity',
 'revenue',
 'runtime',
 'vote_count',
 'actor_facebook_likes']

In [39]:
# Screen each non-binary feature as a ranking labeling function (LF): rank every sample
# by that feature (highest value first) and measure the mean Kendall-tau distance between
# the resulting rankings and dataset.Y. Features below the cut-off are collected.

# Create the ranking utilities here so this cell works on a fresh kernel; it previously
# relied on `r_utils` being left over from the label-model cell further down.
r_utils = RankingUtils(d)

# Maximum mean KT distance for a feature to be kept as a weak-label source.
KT_THRESHOLD = 0.375

wl_feature_list = []
for feature in nonbinary_features:
    lf = FeatureRankingLF(feature, d=d, highest_first=True)
    # One weak ranking per entry of the dataset's feature map.
    wl = [lf.apply(row) for row in dataset.lst_feature_map]
    kt = r_utils.mean_kt_distance(wl, dataset.Y)
    print(feature, 'kt distance: ', kt)
    if kt < KT_THRESHOLD:
        wl_feature_list.append(feature)
    

count_production_countries kt distance:  0.4843333333333333
year kt distance:  0.5539999999999999
count_spoken_langueges kt distance:  0.4633333333333333
num_critic_for_reviews kt distance:  0.4021666666666666
duration kt distance:  0.3703333333333333
director_facebook_likes kt distance:  0.4471666666666666
gross kt distance:  0.44966666666666666
num_voted_users kt distance:  0.3535
cast_total_facebook_likes kt distance:  0.4766666666666667
facenumber_in_poster kt distance:  0.5265
num_user_for_reviews kt distance:  0.3926666666666667
budget_imdb kt distance:  0.5001666666666668
title_year kt distance:  0.5529999999999999
aspect_ratio kt distance:  0.48849999999999993
movie_facebook_likes kt distance:  0.4506666666666666
budget_tmdb kt distance:  0.4848333333333334
popularity kt distance:  0.39633333333333337
revenue kt distance:  0.42799999999999994
runtime kt distance:  0.358
vote_count kt distance:  0.385
actor_facebook_likes kt distance:  0.4751666666666667


In [42]:
# Hand-picked feature list; this assignment replaces whatever the KT-distance
# screening cell stored in wl_feature_list.
# NOTE(review): per the saved KT outputs, only 'runtime' (0.358) is below the 0.375
# cut-off used by that cell; the other four entries are above it — confirm this
# selection is intentional.
# NOTE(review): wl_feature_list is not passed to any call in the visible cells;
# presumably it is unused or consumed elsewhere — verify.
wl_feature_list = [
 'num_critic_for_reviews',
 'budget_tmdb',
 'popularity',
 'revenue',
 'runtime']

# Label model learning & inference

In [43]:
# Produce weak labels (when enabled), report how far they are from dataset.Y,
# and attach them to the dataset. `kt` is later handed to the training wrapper.
if not l2r_training_conf['use_weak_labels']:
    # Weak labels disabled: there is no weak-label distance to report.
    kt = None
else:
    Y_tilde, thetas = get_weak_labels(dataset, weak_sup_conf, root_path=root_path)
    r_utils = RankingUtils(data_conf['dimension'])
    # Mean Kendall-tau distance between the weak labels and dataset.Y.
    kt = r_utils.mean_kt_distance(Y_tilde, dataset.Y)
    print('kt distance: ', kt)
    dataset.set_Y_tilde(Y_tilde)

Weak labels generated and saved in ../../../data/imdb-tmdb/processed/210513_dim-10_ntrain-500_ntest-1000_model-ListMLE_weaklabel-False/LFs/weak_labels.pkl
Use our weak supervision...train_method: triplet_opt,inference_rule: weighted_kemeny
kt distance:  0.4026666666666667


# Training and evaluation - mainly with PtrankingWrapper

In [7]:
# Configure the ptranking wrapper, feed it the train/test tensors and train the model.
# NOTE(review): the saved output of this cell (execution count 7) ends in
# "NameError: name 'd' is not defined". This cell does not reference `d` itself, so the
# error presumably comes from code it calls or from a stale run — confirm with
# Restart Kernel -> Run All.
ptwrapper = PtrankingWrapper(
    data_conf=data_conf,
    weak_sup_conf=weak_sup_conf,
    l2r_training_conf=l2r_training_conf,
    result_path=conf['results_path'],
    wl_kt_distance=kt,
)

# Train/test split as torch data; training targets are the weak labels when enabled.
X_train, X_test, Y_train, Y_test = dataset.get_train_test_torch(
    use_weak_labels=l2r_training_conf['use_weak_labels']
)
ptwrapper.set_data(X_train=X_train, X_test=X_test, Y_train=Y_train, Y_test=Y_test)

model = ptwrapper.get_model()
# `result` holds the per-metric training logs plotted in the next section.
result = ptwrapper.train_model(model, verbose=1)

use_weak_labels:True, we will use weak labels


NameError: name 'd' is not defined

# Training log visualization

In [None]:
# Plot the training logs returned by train_model: loss and Kendall tau in the first
# figure, NDCG@1/3/5 for train and test in the second.
plt.rcParams['font.size'] = '24'

# Pull the logged series out of the result dict.
losses = result['loss']
train_tau, test_tau = result['train_tau'], result['test_tau']
train_ndcg1, train_ndcg3, train_ndcg5 = (
    result['train_ndcg1'], result['train_ndcg3'], result['train_ndcg5'])
test_ndcg1, test_ndcg3, test_ndcg5 = (
    result['test_ndcg1'], result['test_ndcg3'], result['test_ndcg5'])

# Figure 1: training loss (left) and train/test Kendall tau (right).
fig, axes = plt.subplots(ncols=2, figsize=(16, 9))
loss_ax, tau_ax = axes
loss_ax.plot(losses)
loss_ax.set_title('Training loss', fontsize=22)
for series, label in ((train_tau, 'train_mean_kt'), (test_tau, 'test_mean_kt')):
    tau_ax.plot(series, label=label)
tau_ax.set_ylim(0, 1)
tau_ax.legend(fontsize=18)
tau_ax.set_title('Kendall Tau', fontsize=22)
plt.show()

# Figure 2: NDCG curves, one panel per split.
fig, axes = plt.subplots(ncols=2, figsize=(16, 9))
ndcg_labels = ('NDCG@1', 'NDCG@3', 'NDCG@5')
ndcg_panels = (
    (axes[0], 'Train NDCG', (train_ndcg1, train_ndcg3, train_ndcg5)),
    (axes[1], 'Test NDCG', (test_ndcg1, test_ndcg3, test_ndcg5)),
)
for panel, title, curves in ndcg_panels:
    for label, curve in zip(ndcg_labels, curves):
        panel.plot(curve, label=label)
    panel.set_title(title, fontsize=22)
    panel.legend()
plt.show()