# Run bias comparison

#### Load libs

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
%matplotlib inline
import scipy.sparse as sps

In [2]:
from Data_manager.DataReader_ImportAll import *
import multiprocessing, traceback, os
from functools import  partial
# TODO ADDED
import skopt
import datetime, time
# TODO /ADDED

from ParameterTuning.SearchAbstractClass import SearchInputRecommenderParameters
from ParameterTuning.run_parameter_search import runParameterSearch_Content, runParameterSearch_Collaborative
from Base.Evaluation.Evaluator import EvaluatorHoldout
from Data_manager.DataSplitter_leave_k_out import DataSplitter_leave_k_out

from KNN.ItemKNNCBFRecommender import ItemKNNCBFRecommender


In [3]:
dataset_class = Movielens1MReader # Movielens10MReader # CiteULike_aReader
feature_weighting = "none" # "TF-IDF" # "BM25" # 
allow_bias_ICM = False
ICM_bias = None # None or float

In [4]:
similarity_type_list = ["cosine"]

output_folder = "result_experiments/{}".format(dataset_class.DATASET_SUBFOLDER)
if allow_bias_ICM:
    output_folder += "ICM_bias/"
else:
    output_folder += "ICM_original/"
output_folder += "feature_weighting_"+feature_weighting+"/"

dataSplitter = DataSplitter_leave_k_out(dataset_class(), k_value=1, validation_set=True)
dataSplitter.load_data()

all_available_ICM_names = dataSplitter.get_loaded_ICM_names()

print("Available ICM: ", all_available_ICM_names)

ICM_name = all_available_ICM_names[0]


DataSplitter_leave_k_out: Cold users not allowed
DataSplitter_k_fold for DataReader: Movielens1M
	 Num items: 3882
	 Num users: 6039
 	 Train interactions: 986002, density: 4.21E-02
 	 Test interactions: 6039, density: 2.58E-04
	 Validation interactions: 6039, density: 2.58E-04



	 Statistics for ICM_genres: n_features 18, feature occurrences 6405, density: 9.17E-02


DataSplitter_k_fold: Done.
Available ICM:  ['ICM_genres']


In [5]:
print("Processing ICM: '{}'".format(ICM_name))

dataset_object = dataset_class(ICM_to_load_list = [ICM_name])

dataSplitter = DataSplitter_leave_k_out(dataset_object, k_value=1, validation_set=True)
dataSplitter.load_data()

ICM_object = dataSplitter.get_ICM_from_name(ICM_name)

Processing ICM: 'ICM_genres'
DataSplitter_leave_k_out: Cold users not allowed
DataSplitter_k_fold for DataReader: Movielens1M
	 Num items: 3882
	 Num users: 6039
 	 Train interactions: 986002, density: 4.21E-02
 	 Test interactions: 6039, density: 2.58E-04
	 Validation interactions: 6039, density: 2.58E-04



	 Statistics for ICM_genres: n_features 18, feature occurrences 6405, density: 9.17E-02


DataSplitter_k_fold: Done.


In [6]:
ICM_object
np.unique(ICM_object.data) # NB: 0 always included, cause ICM is sparse

array([1.])

In [7]:
URM_train, URM_validation, URM_test = dataSplitter.get_holdout_split()

In [8]:
evaluator_validation = EvaluatorHoldout(URM_validation, cutoff_list=[10])
evaluator_test = EvaluatorHoldout(URM_test, cutoff_list=[5, 10, 15, 20, 25, 30])

In [9]:
recommender_class = ItemKNNCBFRecommender
recommender_parameters = SearchInputRecommenderParameters(
    CONSTRUCTOR_POSITIONAL_ARGS = [ICM_object, URM_train],
    CONSTRUCTOR_KEYWORD_ARGS = {},
    FIT_POSITIONAL_ARGS = [],
    FIT_KEYWORD_ARGS = {
        'topK': 50,
        'shrink': 100, 
        'similarity': 'cosine',
        'normalize': True,
        'feature_weighting': feature_weighting,
        'ICM_bias': ICM_bias,
        #**similarity_args
    }
)

In [10]:
start_time = time.time()

# Construct a new recommender instance
recommender_instance = recommender_class(*recommender_parameters.CONSTRUCTOR_POSITIONAL_ARGS,
                                         **recommender_parameters.CONSTRUCTOR_KEYWORD_ARGS)

recommender_instance.fit(*recommender_parameters.FIT_POSITIONAL_ARGS,
                         **recommender_parameters.FIT_KEYWORD_ARGS)
#                          **current_fit_parameters,
#                          **self.hyperparams_single_value)

train_time = time.time() - start_time
start_time = time.time()

Similarity column 3882 ( 100 % ), 4144.27 column/sec, elapsed time 0.02 min


In [11]:
# Evaluate recommender and get results for the first cutoff
evaluation_result_dict, evaluation_result_string = evaluator_validation.evaluateRecommender(recommender_instance)
# evaluation_result_dict = evaluation_result_dict[list(evaluation_result_dict.keys())[0]]

evaluation_time = time.time() - start_time

EvaluatorHoldout: Processed 6039 ( 100.00% ) in 7.72 seconds. Users per second: 782


In [12]:
# Evaluate recommender and get results for the first cutoff
test_result_dict, test_result_string = evaluator_test.evaluateRecommender(recommender_instance)
# test_result_dict = test_result_dict[list(test_result_dict.keys())[0]]

test_time = time.time() - start_time

EvaluatorHoldout: Processed 5333 ( 88.31% ) in 30.00 seconds. Users per second: 178
EvaluatorHoldout: Processed 6039 ( 100.00% ) in 33.77 seconds. Users per second: 179


In [13]:
print('Train time: ', train_time)
print()
print('Validation time: ', evaluation_time)
print('Validation result:', evaluation_result_string)
print('Test time: ', test_time)
print('Test result:', test_result_string)

Train time:  1.0371918678283691

Validation time:  7.747517824172974
Validation result: CUTOFF: 10 - ROC_AUC: 0.0170926, PRECISION: 0.0032290, PRECISION_TEST_LEN: 0.0322901, RECALL: 0.0322901, RECALL_TEST_LEN: 0.0322901, MAP: 0.0103483, MRR: 0.0103483, NDCG: 0.0153831, F1: 0.0058709, HIT_RATE: 0.0322901, ARHR: 0.0103483, RMSE: 3.3697977, NOVELTY: 0.0300290, DIVERSITY_MEAN_INTER_LIST: 0.9498848, DIVERSITY_HERFINDAHL: 0.9949728, COVERAGE_ITEM: 0.4528594, COVERAGE_USER: 1.0000000, DIVERSITY_GINI: 0.2044271, SHANNON_ENTROPY: 8.7069995, 

Test time:  41.55809783935547
Test result: CUTOFF: 5 - ROC_AUC: 0.0084865, PRECISION: 0.0034112, PRECISION_TEST_LEN: 0.0170558, RECALL: 0.0170558, RECALL_TEST_LEN: 0.0170558, MAP: 0.0077800, MRR: 0.0077800, NDCG: 0.0100495, F1: 0.0056853, HIT_RATE: 0.0170558, ARHR: 0.0077800, RMSE: 3.3767858, NOVELTY: 0.0148808, DIVERSITY_MEAN_INTER_LIST: 0.9669467, DIVERSITY_HERFINDAHL: 0.9933573, COVERAGE_ITEM: 0.3588357, COVERAGE_USER: 1.0000000, DIVERSITY_GINI: 0.20478