In [1]:
# Set the thread-related environment variables for this kernel through the
# IPython %env magic (each assignment is echoed in the cell output).
# NOTE(review): presumably these must be in place before the numerical
# libraries are first imported to have any effect — confirm.
# The same keys/values are assigned again via os.environ in the next cell.
%env MKL_THREADING_LAYER=tbb
%env OPENBLAS_NUM_THREADS=24
%env NUMBA_NUM_THREADS=96
%env MKL_NUM_THREADS=96
%env OMP_NUM_THREADS=1

env: MKL_THREADING_LAYER=tbb
env: OPENBLAS_NUM_THREADS=24
env: NUMBA_NUM_THREADS=96
env: MKL_NUM_THREADS=96
env: OMP_NUM_THREADS=1


In [2]:
import os

# Thread configuration for the numerical back-ends, applied in one update
# (same keys and values as the %env magics in the first cell).
os.environ.update(
    {
        "MKL_THREADING_LAYER": "tbb",
        "OPENBLAS_NUM_THREADS": "24",
        "NUMBA_NUM_THREADS": "96",
        "MKL_NUM_THREADS": "96",
        "OMP_NUM_THREADS": "1",
    }
)
# os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
# for random generation


# basic functions
import pandas as pd

# Show up to 100 rows when a DataFrame/Series is displayed.
pd.set_option("display.max_rows", 100)
import pickle
import scipy

# custom-made functions
import modelling_mf
from optimize_hp import optimize_lkpy, optimize_cornac

# lenskit RS library
from lenskit.algorithms import als


# cornac RS library
from cornac.models import MF


# cornac RS library
import cornac
from cornac.eval_methods import BaseMethod
from cornac.metrics import RMSE


# lenskit RS library
from lenskit import crossfold as xf
from lenskit.algorithms import Recommender, basic
from lenskit import util, batch, topn
from lenskit.metrics.predict import rmse

In [3]:
data = "ml1m"
# User-item interactions: a header-less file with "::" as separator, read with
# the python parser engine.
ml1m_ratings = pd.read_csv(
    "data/{}_events.dat".format(data), sep="::", header=None, engine="python"
)
# Discard the column labelled 3 (presumably the timestamp — confirm) and name
# the three remaining columns.
ml1m_ratings = ml1m_ratings.drop(columns=3)
ml1m_ratings.columns = ["user", "item", "rating"]

In [4]:
data_strategy = "ml1m"
# Work on a copy of the raw frame and keep only the last row of every
# (user, item) pair.
ratings = ml1m_ratings.copy().drop_duplicates(
    subset=["user", "item"], keep="last"
)

In [5]:
# Names of the columns holding, in order: the users, the items and the
# interaction value (the rating).
user_col, item_col, predict_col = "user", "item", "rating"

In [6]:
# --- data splitting ---
# partition_way picks user-based vs row-based partitioning in the fold-creation
# cell; sampling_strategy picks the per-user test sampler ("frac" or "N").
partition_way = "user"
sampling_strategy = "frac"

# --- recommendation ---
nr_recs = 10  # length of the recommendation lists requested further down

# --- reporting ---
verbose = True  # forwarded to the Cornac evaluation method and experiment
plot = True
save_plot = True  # save the plots

# --- flags not read by the cells shown in this notebook section ---
evaluation_way = "cross_validation"
fallback = False

In [7]:
# Map each item id to a consecutive integer (0, 1, 2, ... in order of first
# appearance) - necessary for Cornac.
mapping_dict = {
    item: idx for idx, item in enumerate(ratings[item_col].unique())
}
# Every value in the column is a key of mapping_dict by construction, so the
# dictionary can be applied directly without a fallback.
ratings[item_col] = ratings[item_col].map(mapping_dict)

In [8]:
# Keyword-argument variants to evaluate per algorithm name.
algo_versions = {
    "CornacMF": [
        {"bias": True},
        {"bias": False},
    ],
}

In [9]:
# Algorithm under test: its name (key into algo_versions), the variants
# registered under that name, and the Cornac model class to instantiate.
algo_name = "CornacMF"
versions = algo_versions[algo_name]
algorithm_cornac = MF

In [10]:
# Variant of the algorithm to evaluate in this run.
args = versions[0]
print(data_strategy, args)

# Load the previously tuned hyperparameters for this algorithm / data set /
# variant. Fail loudly when the file is missing: continuing would otherwise
# raise a NameError below or silently reuse a stale `best_params` left in the
# kernel by an earlier run.
p = "best_parameters/" + algo_name + "/" + data_strategy + "_" + str(args) + ".pkl"
if not os.path.isfile(p):
    raise FileNotFoundError(
        "No stored hyperparameters at {!r}; run the hyperparameter "
        "optimisation first.".format(p)
    )
print("We got them already")
# NOTE(security): pickle.load can execute arbitrary code; only load parameter
# files that were produced by this project.
with open(p, "rb") as f:
    best_params = pickle.load(f)

optimal_k = best_params["k"]
optimal_reg = best_params["lambda_reg"]
optimal_lr = best_params["learning_rate"]
algorithm = algorithm_cornac(
    k=optimal_k,
    use_bias=args["bias"],
    lambda_reg=optimal_reg,
    learning_rate=optimal_lr,
)
n = nr_recs


# MODELING

# All distinct item ids in the (re-indexed) ratings.
all_items = set(ratings[item_col].unique())

# Build 5 train/test splits, stored as (fold index, split) tuples. All random
# choices are seeded with rng_spec=0.
if partition_way == "user":
    if sampling_strategy == "frac":
        sample = xf.SampleFrac(0.2, rng_spec=0)
    elif sampling_strategy == "N":
        sample = xf.SampleN(5, rng_spec=0)
    else:
        raise ValueError(
            "Unknown sampling_strategy {!r}; expected 'frac' or 'N'.".format(
                sampling_strategy
            )
        )
    sets = list(enumerate(xf.partition_users(ratings, 5, sample, rng_spec=0)))
elif partition_way == "row":
    sets = list(enumerate(xf.partition_rows(ratings, 5, rng_spec=0)))
else:
    raise ValueError(
        "Unknown partition_way {!r}; expected 'user' or 'row'.".format(
            partition_way
        )
    )

ml1m {'bias': True}
We got them already


# Test!

In [11]:
# `sets` holds (fold index, train/test pair) tuples produced by enumerate() in
# the fold-creation cell; take the entry at position 2 for this single-fold run.
i, tp = sets[2]

In [12]:
# Train and test frames of the selected fold.
train_df = tp[0]
test_df = tp[1]

# Use the configured column names rather than repeating the literals, so this
# cell stays consistent with user_col / item_col / predict_col.
rating_cols = [user_col, item_col, predict_col]

# Hand both splits to Cornac as lists of (user, item, rating) records.
# exclude_unknowns=False is passed so that test interactions with users/items
# absent from the training split are not dropped.
eval_method = BaseMethod.from_splits(
    train_data=list(train_df[rating_cols].to_records(index=False)),
    test_data=list(test_df[rating_cols].to_records(index=False)),
    exclude_unknowns=False,
    verbose=verbose,
)

rating_threshold = 1.0
exclude_unknowns = False
---
Training data:
Number of users = 6040
Number of items = 3703
Number of ratings = 961956
Max rating = 5.0
Min rating = 1.0
Global mean = 3.6
---
Test data:
Number of users = 1208
Number of items = 2994
Number of ratings = 38253
Number of unknown users = 0
Number of unknown items = 3
---
Total users = 6040
Total items = 3706


In [13]:
# One model (the tuned MF instance) scored with one metric (RMSE).
metrics = [RMSE()]
models = [algorithm]

# NOTE(review): per the Cornac docs, user_based=False averages rating metrics
# over all ratings instead of per user — confirm for the installed version.
exp = cornac.Experiment(
    models=models,
    metrics=metrics,
    eval_method=eval_method,
    user_based=False,
    verbose=verbose,
    save_dir="cornacLogs",
)

In [14]:
# Run the experiment; the cell output shows the MF model being trained and then
# evaluated, and the results are read back from exp.result in the next cell.
exp.run()


[MF] Training started!

[MF] Evaluation started!


Rating:   0%|          | 0/38253 [00:00<?, ?it/s]


TEST:
...
   |   RMSE | Train (s) | Test (s)
-- + ------ + --------- + --------
MF | 0.8617 |    0.9496 |   0.7394



In [15]:
# RMSE of the first (and only) model in the experiment, taken from its averaged
# metric results.
loss = exp.result[0].metric_avg_results["RMSE"]

In [16]:
# Distinct users that occur in the test split.
test_users = test_df["user"].unique()

# Recommendation lists of length n from the trained experiment.
# NOTE(review): the meaning of the second return value (stdev_20) is defined in
# modelling_mf and is not used in the cells shown here.
recs, stdev_20 = modelling_mf.recommend_cornac(
    exp=exp,
    all_items=all_items,
    user_col=user_col,
    item_col=item_col,
    n=n,
)

# One list of recommended items per user.
recs_grouped = recs.groupby([user_col])[item_col].apply(list)


In [17]:
# Top-n quality of the recommendations measured against the test split.
precision, recall, ndcg = modelling_mf.calculate_topn_metrics(recs, test_df)

# Per-item popularity statistics derived from the training split and the
# recommendations (exact contents are defined in modelling_mf).
pop_bias = modelling_mf.calculate_pop_bias_per_item(
    all_items,
    item_col,
    user_col,
    predict_col,
    train_df,
    recs,
)

In [18]:
# Average-popularity comparison per test user (computed by modelling_mf).
GAP_vs_GAP = modelling_mf.calculate_ave_pop_per_user(
    test_users,
    item_col,
    user_col,
    pop_bias,
    train_df,
    recs_grouped,
)

In [19]:
# Aggregate popularity-bias metrics for the recommendations; the helper also
# prints diagnostic values (see the cell output).
ARP, ave_PL, ACLT = modelling_mf.calculate_all_pb_metrics(
    pop_bias,
    test_users,
    item_col,
    user_col,
    train_df,
    recs_grouped,
    recs,
)

nr of longtail 2717
2.2491721854304636
