<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

# TF-IDF Content-Based Recommendation


In [1]:
import sys
import logging
import scipy
import numpy as np
import pandas as pd
import dataset_loader

from recommenders.models.tfidf.tfidf_utils import TfidfRecommender
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.datasets import movielens
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, mae, rmse, novelty, historical_item_novelty, user_item_serendipity, user_serendipity, serendipity, catalog_coverage, distributional_coverage

# Print version
print(f"System version: {sys.version}")

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


System version: 3.9.21 (main, Dec 11 2024, 16:24:11) 
[GCC 11.2.0]


### 1. Load the dataset into a dataframe
Let's begin by loading the metadata file for the dataset into a Pandas dataframe. This file contains metadata about each of the scientific articles included in the full dataset.

In [2]:
# Top k items to recommend
TOP_K = 10

In [None]:
dataset = 'movielens'
want_col=["userID", "itemID", "rating", "timestamp", 'title', 'genres']
num_rows = 10000
ratio = 0.75
seed = 42

params = {
    "dataset": dataset,
    "want_col": want_col,
    "num_rows": num_rows,
    "ratio": ratio,
    "seed": seed,
}

In [4]:
# set log level to INFO
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [5]:
data = dataset_loader.loader(dataset, want_col, num_rows)
# print(data.shape)
# print(data['userID'].nunique())

# Convert the float precision to 32-bit in order to reduce memory consumption
data["rating"] = data["rating"].astype(np.float32)
data.head()


Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### 4. Instantiate the recommender
All functions for data preparation and recommendation are contained within the **TfidfRecommender** class we have imported. Prior to running these functions, we must create an object of this class.

Select one of the following tokenization methods to use in the model:

| tokenization_method | Description                                                                                                                      |
|:--------------------|:---------------------------------------------------------------------------------------------------------------------------------|
| 'none'              | No tokenization is applied. Each word is considered a token.                                                                     |
| 'nltk'              | Simple stemming is applied using NLTK.                                                                                           |
| 'bert'              | HuggingFace BERT word tokenization ('bert-base-cased') is applied.                                                               |
| 'scibert'           | SciBERT word tokenization ('allenai/scibert_scivocab_cased') is applied.<br>This is recommended for scientific journal articles. |

In [6]:
# Create the recommender object
recommender = TfidfRecommender(id_col='itemID', tokenization_method='bert')

### 5. Prepare text for use in the TF-IDF recommender

In [7]:
data['genres'] = data['genres'].str.replace('|', ' ', regex=False)
data.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure Children Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime Mystery Thriller


In [8]:
df_clean = data.drop(columns=['userID', 'rating', 'timestamp'])
df_clean = df_clean.drop_duplicates(subset=['itemID'])
cols_to_clean = ['title','genres']
clean_col = 'cleaned_text'
df_clean = recommender.clean_dataframe(df_clean, cols_to_clean, clean_col)
df_clean.head()

Unnamed: 0,itemID,title,genres,cleaned_text
0,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi,City of Lost Children The Cité des enfants per...
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,Twelve Monkeys aka 12 Monkeys 1995 Mystery Sci...
3,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Seven aka Se7en 1995 Mystery Thriller
4,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,Usual Suspects The 1995 Crime Mystery Thriller


In [9]:
len(df_clean)

4919

In [10]:
df_clean = df_clean.reset_index(drop=True)
df_clean.head()

Unnamed: 0,itemID,title,genres,cleaned_text
0,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi,City of Lost Children The Cité des enfants per...
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,Twelve Monkeys aka 12 Monkeys 1995 Mystery Sci...
3,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Seven aka Se7en 1995 Mystery Thriller
4,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,Usual Suspects The 1995 Crime Mystery Thriller


In [11]:
header = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_timestamp": "timestamp",
    "col_title": "title",
    "col_genres": "genres",
    "col_year": "year",
    "col_prediction": "prediction",
}

In [12]:
train, test = python_stratified_split(
    data, ratio=ratio, col_user=header["col_user"], col_item=header["col_item"], seed=seed
)

In [13]:
train = recommender.clean_dataframe(train, cols_to_clean, clean_col)
train.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres,cleaned_text
72,1,2140,4.0,2005-04-02 23:48:25,"Dark Crystal, The (1982)",Adventure Fantasy,Dark Crystal The 1982 Adventure Fantasy
128,1,4915,3.0,2005-04-02 23:54:36,"Beastmaster, The (1982)",Action Adventure Fantasy,Beastmaster The 1982 Action Adventure Fantasy
171,1,8636,4.5,2005-04-02 23:44:53,Spider-Man 2 (2004),Action Adventure Sci-Fi IMAX,SpiderMan 2 2004 Action Adventure SciFi IMAX
10,1,293,4.0,2005-04-02 23:31:43,Léon: The Professional (a.k.a. The Professiona...,Action Crime Drama Thriller,Léon The Professional aka The Professional Léo...
98,1,3000,3.5,2005-04-02 23:29:29,Princess Mononoke (Mononoke-hime) (1997),Action Adventure Animation Drama Fantasy,Princess Mononoke Mononokehime 1997 Action Adv...


Let's also tokenize the cleaned text for use in the TF-IDF model. The tokens are stored within our TfidfRecommender object.

In [14]:
# Tokenize text with tokenization_method specified in class instantiation
tf, vectors_tokenized = recommender.tokenize_text(df_clean, text_col="cleaned_text")

2025-02-23 15:29:29 DEBUG    Starting new HTTPS connection (1): huggingface.co:443
2025-02-23 15:29:29 DEBUG    https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


### 6. Recommend articles using TF-IDF
Let's now fit the recommender model to the processed data (tokens) and retrieve the top k recommended articles.

When creating our object, we specified k=5 so the `recommend_top_k_items` function will return the top 5 recommendations for each public domain article.

In [15]:
# Fit the TF-IDF vectorizer
recommender.fit(tf, vectors_tokenized)
tokens = recommender.get_tokens()
print(list(tokens.keys())[:10])


['ju', 'man', 'ji', '1995', 'adventure', 'children', 'fantasy', 'ju man', 'man ji', 'ji 1995']


In [16]:
len(tokens)

31854

In [17]:
top_k_items = recommender.recommend_top_k_items(df_clean, k=5)
top_k_items.head()

Unnamed: 0,itemID,rec_rank,rec_score,rec_itemID
0,2,1,0.285302,2103
1,2,2,0.274415,158
2,2,3,0.274145,60
3,2,4,0.207562,6557
4,2,5,0.195682,8


In [18]:
merged_df = data.merge(top_k_items, on='itemID', how='inner')
merged_df['prediction'] = merged_df['rating'] * merged_df['rec_score']
top_k = merged_df[['userID', 'rec_itemID', 'prediction']]
top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)
top_k.sort_values(["userID", "prediction"], ascending=[True, False]).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)


Unnamed: 0,userID,itemID,prediction
240,1,6461,2.134401
595,1,4886,2.031263
520,1,3440,2.024283
365,1,3479,1.990644
530,1,2143,1.990644


In [19]:
filtered_top_k = top_k.merge(train, on=["userID", "itemID"], how="left", indicator=True)
filtered_top_k = filtered_top_k[filtered_top_k["_merge"] == "left_only"].drop(columns=["_merge"])
filtered_top_k = filtered_top_k[["userID", "itemID", "prediction"]]
filtered_top_k.head()

Unnamed: 0,userID,itemID,prediction
0,1,2103,0.998557
1,1,158,0.960454
2,1,60,0.959509
3,1,6557,0.726467
4,1,8,0.684886


In [20]:
idx = filtered_top_k.groupby("userID")["prediction"].idxmax()
top = filtered_top_k.loc[idx]
top.head()

Unnamed: 0,userID,itemID,prediction
240,1,6461,2.134401
920,2,6078,2.353111
2070,3,519,2.779691
2250,4,836,2.324594
2530,5,635,3.059744


In [21]:
args = [test, top_k]
kwargs = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=TOP_K,
)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg_at_k = ndcg_at_k(*args, **kwargs)
eval_precision_at_k = precision_at_k(*args, **kwargs)
eval_recall_at_k = recall_at_k(*args, **kwargs)

In [22]:
args1 = [test, top]
kwargs1 = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=1,
)
eval_ndcg = ndcg_at_k(*args1, **kwargs1)
eval_precision = precision_at_k(*args1, **kwargs1)
eval_recall = recall_at_k(*args1, **kwargs1)

In [23]:
eval_mae = mae(test, top_k)
eval_rmse = rmse(test, top_k)

In [24]:
eval_novelty = novelty(train, top)
eval_historical_item_novelty = historical_item_novelty(train, top)
eval_user_item_serendipity = user_item_serendipity(train, top)
eval_user_serendipity = user_serendipity(train, top)
eval_serendipity = serendipity(train, top)
eval_catalog_coverage = catalog_coverage(train, top)
eval_distributional_coverage = distributional_coverage(train, top)

  avg_novelty = reco_item_novelty.agg({"product": "sum"})[0] / n_recommendations
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
The behavior will change in pandas 

In [25]:
print("Precision:\t%f" % eval_precision,
      "Precision@K:\t%f" % eval_precision_at_k,
      "Recall:\t%f" % eval_recall,
      "Recall@K:\t%f" % eval_recall_at_k,
      "MAE:\t%f" % eval_mae,
      "RMSE:\t%f" % eval_rmse,
      "NDCG:\t%f" % eval_ndcg,
      "Novelty:\t%f" % eval_novelty,
      "Serendipity:\t%f" % eval_serendipity,
      "Catalog coverage:\t%f" % eval_catalog_coverage,
      "Distributional coverage:\t%f" % eval_distributional_coverage,
      sep='\n')

Precision:	0.060606
Precision@K:	0.033333
Recall:	0.002081
Recall@K:	0.014224
MAE:	2.505839
RMSE:	2.731569
NDCG:	0.060606
Novelty:	11.535177
Serendipity:	0.896041
Catalog coverage:	0.019729
Distributional coverage:	5.837740


In [26]:
eval_historical_item_novelty

Unnamed: 0,itemID,item_novelty
0,1,8.439408
1,2,10.106832
2,3,10.194295
3,4,12.194295
4,5,10.609333
...,...,...
4354,115617,14.194295
4355,116797,14.194295
4356,117511,14.194295
4357,117590,14.194295


In [27]:
eval_user_item_serendipity

Unnamed: 0,userID,itemID,user_item_serendipity
0,1,6461,0.879732
1,2,6078,0.809789
2,3,519,0.844367
3,4,836,0.852143
4,5,635,0.850388
...,...,...,...
193,194,2580,0.828745
194,195,2587,0.827840
195,196,1996,1.000000
196,197,2587,0.965304


In [28]:
eval_user_serendipity

Unnamed: 0,userID,user_serendipity
0,1,0.879732
1,2,0.809789
2,3,0.844367
3,4,0.852143
4,5,0.850388
...,...,...
193,194,0.828745
194,195,0.827840
195,196,1.000000
196,197,0.965304


# mlflow implementation

remember to run this: mlflow server --host 127.0.0.1 --port 8080

In [None]:
#metrics
metrics = {
        "precision_at_K": eval_precision,
        "recall_at_K": eval_recall,
        "NDCG_at_K": eval_ndcg,
        "RMSE": eval_rmse,
        "MAE": eval_mae,
        "novelty": eval_novelty,
        "serendipity": eval_serendipity,
        "catalog_coverage": eval_catalog_coverage,
        "distributional_coverage": eval_distributional_coverage
    }

: 

In [None]:
import mlflow
from mlflow.models import infer_signature


# Set our tracking server uri for logging
mlflow.set_tracking_uri(uri="http://127.0.0.1:8080")

# Create a new MLflow Experiment
mlflow.set_experiment("MLflow Content Based Filtering")

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the loss metric
    mlflow.log_metrics(metrics)

    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("Metrics Info", "CBF model for movielens dataset")

    # Infer the model signature
    signature = infer_signature(train, recommender.fit(tf, vectors_tokenized))

    # Log the model
    model_info = mlflow.sklearn.log_model(
        sk_model=recommender,
        artifact_path="CBF-model",
        signature=signature,
        input_example=train,
        registered_model_name="CBF-model test",
    )


2025-02-23 15:29:47 DEBUG    Starting new HTTP connection (1): 127.0.0.1:8080
2025-02-23 15:29:47 DEBUG    http://127.0.0.1:8080 "GET /api/2.0/mlflow/experiments/get-by-name?experiment_name=MLflow+Content+Based+Filtering HTTP/1.1" 200 286
2025-02-23 15:29:47 DEBUG    Popen(['git', 'version'], cwd=/home/rs/Desktop/personalization-privacy-and-explainability-of-recommendation-algorithms/notebooks, stdin=None, shell=False, universal_newlines=False)
2025-02-23 15:29:47 DEBUG    Popen(['git', 'version'], cwd=/home/rs/Desktop/personalization-privacy-and-explainability-of-recommendation-algorithms/notebooks, stdin=None, shell=False, universal_newlines=False)
2025-02-23 15:29:47 DEBUG    Resetting dropped connection: 127.0.0.1
2025-02-23 15:29:47 DEBUG    http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/create HTTP/1.1" 200 939
2025-02-23 15:29:47 DEBUG    Resetting dropped connection: 127.0.0.1
2025-02-23 15:29:47 DEBUG    http://127.0.0.1:8080 "POST /api/2.0/mlflow/runs/log-batch HTTP/1.1" 20