<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

# TF-IDF Content-Based Recommendation


In [84]:
import sys
import logging
import scipy
import numpy as np
import pandas as pd
import dataset_loader

from recommenders.models.tfidf.tfidf_utils import TfidfRecommender
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.datasets import movielens
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, mae, rmse, novelty, historical_item_novelty, user_item_serendipity, user_serendipity, serendipity, catalog_coverage, distributional_coverage

# Print version
print(f"System version: {sys.version}")

System version: 3.9.21 (main, Dec 11 2024, 16:24:11) 
[GCC 11.2.0]


### 1. Load the dataset into a dataframe
Let's begin by loading the metadata file for the dataset into a Pandas dataframe. This file contains metadata about each of the scientific articles included in the full dataset.

In [85]:
# Top k items to recommend
TOP_K = 10

In [86]:
# set log level to INFO
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [None]:
data = dataset_loader.loader("movielens", want_col=["userID", "itemID", "rating", "timestamp", 'title', 'genres'], num_rows=100000)
print(data.shape)
print(data['userID'].nunique())

# Convert the float precision to 32-bit in order to reduce memory consumption
data["rating"] = data["rating"].astype(np.float32)
data.head()


(100000, 6)
702


Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### 4. Instantiate the recommender
All functions for data preparation and recommendation are contained within the **TfidfRecommender** class we have imported. Prior to running these functions, we must create an object of this class.

Select one of the following tokenization methods to use in the model:

| tokenization_method | Description                                                                                                                      |
|:--------------------|:---------------------------------------------------------------------------------------------------------------------------------|
| 'none'              | No tokenization is applied. Each word is considered a token.                                                                     |
| 'nltk'              | Simple stemming is applied using NLTK.                                                                                           |
| 'bert'              | HuggingFace BERT word tokenization ('bert-base-cased') is applied.                                                               |
| 'scibert'           | SciBERT word tokenization ('allenai/scibert_scivocab_cased') is applied.<br>This is recommended for scientific journal articles. |

In [88]:
# Create the recommender object
recommender = TfidfRecommender(id_col='itemID', tokenization_method='bert')

### 5. Prepare text for use in the TF-IDF recommender

In [89]:
data['genres'] = data['genres'].str.replace('|', ' ', regex=False)
data.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure Children Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime Mystery Thriller


In [90]:
df_clean = data.drop(columns=['userID', 'rating', 'timestamp'])
df_clean = df_clean.drop_duplicates(subset=['itemID'])
cols_to_clean = ['title','genres']
clean_col = 'cleaned_text'
df_clean = recommender.clean_dataframe(df_clean, cols_to_clean, clean_col)
df_clean.head()

Unnamed: 0,itemID,title,genres,cleaned_text
0,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi,City of Lost Children The Cité des enfants per...
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,Twelve Monkeys aka 12 Monkeys 1995 Mystery Sci...
3,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Seven aka Se7en 1995 Mystery Thriller
4,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,Usual Suspects The 1995 Crime Mystery Thriller


In [91]:
len(df_clean)

8227

In [92]:
df_clean = df_clean.reset_index(drop=True)
df_clean.head()

Unnamed: 0,itemID,title,genres,cleaned_text
0,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi,City of Lost Children The Cité des enfants per...
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,Twelve Monkeys aka 12 Monkeys 1995 Mystery Sci...
3,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Seven aka Se7en 1995 Mystery Thriller
4,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,Usual Suspects The 1995 Crime Mystery Thriller


In [93]:
header = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_timestamp": "timestamp",
    "col_title": "title",
    "col_genres": "genres",
    "col_year": "year",
    "col_prediction": "prediction",
}

In [94]:
train, test = python_stratified_split(
    data, ratio=0.75, col_user=header["col_user"], col_item=header["col_item"], seed=42
)

In [95]:
train = recommender.clean_dataframe(train, cols_to_clean, clean_col)
train.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres,cleaned_text
72,1,2140,4.0,2005-04-02 23:48:25,"Dark Crystal, The (1982)",Adventure Fantasy,Dark Crystal The 1982 Adventure Fantasy
128,1,4915,3.0,2005-04-02 23:54:36,"Beastmaster, The (1982)",Action Adventure Fantasy,Beastmaster The 1982 Action Adventure Fantasy
171,1,8636,4.5,2005-04-02 23:44:53,Spider-Man 2 (2004),Action Adventure Sci-Fi IMAX,SpiderMan 2 2004 Action Adventure SciFi IMAX
10,1,293,4.0,2005-04-02 23:31:43,Léon: The Professional (a.k.a. The Professiona...,Action Crime Drama Thriller,Léon The Professional aka The Professional Léo...
98,1,3000,3.5,2005-04-02 23:29:29,Princess Mononoke (Mononoke-hime) (1997),Action Adventure Animation Drama Fantasy,Princess Mononoke Mononokehime 1997 Action Adv...


Let's also tokenize the cleaned text for use in the TF-IDF model. The tokens are stored within our TfidfRecommender object.

In [96]:
# Tokenize text with tokenization_method specified in class instantiation
tf, vectors_tokenized = recommender.tokenize_text(df_clean, text_col="cleaned_text")

2025-02-22 15:52:44 DEBUG    https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


### 6. Recommend articles using TF-IDF
Let's now fit the recommender model to the processed data (tokens) and retrieve the top k recommended articles.

When creating our object, we specified k=5 so the `recommend_top_k_items` function will return the top 5 recommendations for each public domain article.

In [97]:
# Fit the TF-IDF vectorizer
recommender.fit(tf, vectors_tokenized)
tokens = recommender.get_tokens()
print(list(tokens.keys())[:10])


['ju', 'man', 'ji', '1995', 'adventure', 'children', 'fantasy', 'ju man', 'man ji', 'ji 1995']


In [98]:
len(tokens)

52894

In [99]:
top_k_items = recommender.recommend_top_k_items(df_clean, k=5)
top_k_items.head()

Unnamed: 0,itemID,rec_rank,rec_score,rec_itemID
0,2,1,0.293833,2103
1,2,2,0.282491,60
2,2,3,0.2819,158
3,2,4,0.217104,6557
4,2,5,0.20367,8


In [100]:
merged_df = data.merge(top_k_items, on='itemID', how='inner')
merged_df['prediction'] = merged_df['rating'] * merged_df['rec_score']
top_k = merged_df[['userID', 'rec_itemID', 'prediction']]
top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)
top_k.sort_values(["userID", "prediction"], ascending=[True, False]).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)


Unnamed: 0,userID,itemID,prediction
450,1,3795,2.468874
345,1,8610,2.445297
240,1,6461,2.142163
595,1,4886,2.038848
365,1,3479,1.998442


In [101]:
filtered_top_k = top_k.merge(train, on=["userID", "itemID"], how="left", indicator=True)
filtered_top_k = filtered_top_k[filtered_top_k["_merge"] == "left_only"].drop(columns=["_merge"])
filtered_top_k = filtered_top_k[["userID", "itemID", "prediction"]]
filtered_top_k.head()

Unnamed: 0,userID,itemID,prediction
0,1,2103,1.028416
1,1,60,0.988719
2,1,158,0.986651
3,1,6557,0.759864
4,1,8,0.712846


In [102]:
idx = filtered_top_k.groupby("userID")["prediction"].idxmax()
top = filtered_top_k.loc[idx]
top.head()

Unnamed: 0,userID,itemID,prediction
450,1,3795,2.468874
920,2,6078,2.357777
2070,3,519,2.741739
2250,4,836,2.388728
2530,5,635,3.054812


In [103]:
args = [test, top_k]
kwargs = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=TOP_K,
)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg_at_k = ndcg_at_k(*args, **kwargs)
eval_precision_at_k = precision_at_k(*args, **kwargs)
eval_recall_at_k = recall_at_k(*args, **kwargs)

In [104]:
args1 = [test, top]
kwargs1 = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=1,
)
eval_ndcg = ndcg_at_k(*args1, **kwargs1)
eval_precision = precision_at_k(*args1, **kwargs1)
eval_recall = recall_at_k(*args1, **kwargs1)

In [105]:
eval_mae = mae(test, top_k)
eval_rmse = rmse(test, top_k)

In [106]:
eval_novelty = novelty(train, top)
eval_historical_item_novelty = historical_item_novelty(train, top)
eval_user_item_serendipity = user_item_serendipity(train, top)
eval_user_serendipity = user_serendipity(train, top)
eval_serendipity = serendipity(train, top)
eval_catalog_coverage = catalog_coverage(train, top)
eval_distributional_coverage = distributional_coverage(train, top)

  avg_novelty = reco_item_novelty.agg({"product": "sum"})[0] / n_recommendations
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
The behavior will change in pandas 

In [107]:
print("Precision:\t%f" % eval_precision,
      "Precision@K:\t%f" % eval_precision_at_k,
      "Recall:\t%f" % eval_recall,
      "Recall@K:\t%f" % eval_recall_at_k,
      "MAE:\t%f" % eval_mae,
      "RMSE:\t%f" % eval_rmse,
      "NDCG:\t%f" % eval_ndcg,
      "Novelty:\t%f" % eval_novelty,
      "Serendipity:\t%f" % eval_serendipity,
      "Catalog coverage:\t%f" % eval_catalog_coverage,
      "Distributional coverage:\t%f" % eval_distributional_coverage,
      sep='\n')

Precision:	0.028490
Precision@K:	0.025071
Recall:	0.000822
Recall@K:	0.009740
MAE:	2.506718
RMSE:	2.706387
NDCG:	0.028490
Novelty:	11.134221
Serendipity:	0.923676
Catalog coverage:	0.025130
Distributional coverage:	6.468497


In [108]:
eval_historical_item_novelty

Unnamed: 0,itemID,item_novelty
0,1,8.726882
1,2,9.909085
2,3,10.522062
3,4,12.735056
4,5,10.522062
...,...,...
7516,118696,15.194488
7517,118997,16.194488
7518,119141,16.194488
7519,125916,16.194488


In [109]:
eval_user_item_serendipity

Unnamed: 0,userID,itemID,user_item_serendipity
0,1,3795,1.000000
1,2,6078,0.898759
2,3,519,0.869362
3,4,836,0.864917
4,5,635,0.922434
...,...,...,...
697,698,27,0.895552
698,699,635,0.935996
699,700,185,0.776952
700,701,7669,0.949348


In [110]:
eval_user_serendipity

Unnamed: 0,userID,user_serendipity
0,1,1.000000
1,2,0.898759
2,3,0.869362
3,4,0.864917
4,5,0.922434
...,...,...
697,698,0.895552
698,699,0.935996
699,700,0.776952
700,701,0.949348
