<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

# TF-IDF Content-Based Recommendation


In [93]:
import sys
import logging
import scipy
import numpy as np
import pandas as pd

from recommenders.models.tfidf.tfidf_utils import TfidfRecommender
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.datasets import movielens
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, mae, rmse, novelty, historical_item_novelty, user_item_serendipity, user_serendipity, serendipity, catalog_coverage, distributional_coverage

# Print version
print(f"System version: {sys.version}")

System version: 3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]


### 1. Load the dataset into a dataframe
Let's begin by loading the metadata file for the dataset into a Pandas dataframe. This file contains metadata about each of the scientific articles included in the full dataset.

In [94]:
# Top k items to recommend
TOP_K = 10

# Select MovieLens data size: 100k, 1m, 10m, or 20m
MOVIELENS_DATA_SIZE = "100k"

In [95]:
# set log level to INFO
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [96]:
data = movielens.load_pandas_df(
    size=MOVIELENS_DATA_SIZE,
    header=["userID", "itemID", "rating", "timestamp"],
    title_col="title",
    genres_col='genres',
    year_col='year'
)
# Convert the float precision to 32-bit in order to reduce memory consumption
data["rating"] = data["rating"].astype(np.float32)
data.head()


2025-02-21 20:46:57 DEBUG    Starting new HTTPS connection (1): files.grouplens.org:443
2025-02-21 20:46:57 DEBUG    https://files.grouplens.org:443 "GET /datasets/movielens/ml-100k.zip HTTP/1.1" 200 4924029
2025-02-21 20:46:57 INFO     Downloading https://files.grouplens.org/datasets/movielens/ml-100k.zip
100%|██████████| 4.81k/4.81k [00:01<00:00, 3.01kKB/s]


Unnamed: 0,userID,itemID,rating,timestamp,title,genres,year
0,196,242,3.0,881250949,Kolya (1996),Comedy,1996
1,186,302,3.0,891717742,L.A. Confidential (1997),Crime|Film-Noir|Mystery|Thriller,1997
2,22,377,1.0,878887116,Heavyweights (1994),Children's|Comedy,1994
3,244,51,2.0,880606923,Legends of the Fall (1994),Drama|Romance|War|Western,1994
4,166,346,1.0,886397596,Jackie Brown (1997),Crime|Drama,1997


### 4. Instantiate the recommender
All functions for data preparation and recommendation are contained within the **TfidfRecommender** class we have imported. Prior to running these functions, we must create an object of this class.

Select one of the following tokenization methods to use in the model:

| tokenization_method | Description                                                                                                                      |
|:--------------------|:---------------------------------------------------------------------------------------------------------------------------------|
| 'none'              | No tokenization is applied. Each word is considered a token.                                                                     |
| 'nltk'              | Simple stemming is applied using NLTK.                                                                                           |
| 'bert'              | HuggingFace BERT word tokenization ('bert-base-cased') is applied.                                                               |
| 'scibert'           | SciBERT word tokenization ('allenai/scibert_scivocab_cased') is applied.<br>This is recommended for scientific journal articles. |

In [97]:
# Create the recommender object
recommender = TfidfRecommender(id_col='itemID', tokenization_method='bert')

### 5. Prepare text for use in the TF-IDF recommender

In [98]:
data['genres'] = data['genres'].str.replace('|', ' ', regex=False)
data.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres,year
0,196,242,3.0,881250949,Kolya (1996),Comedy,1996
1,186,302,3.0,891717742,L.A. Confidential (1997),Crime Film-Noir Mystery Thriller,1997
2,22,377,1.0,878887116,Heavyweights (1994),Children's Comedy,1994
3,244,51,2.0,880606923,Legends of the Fall (1994),Drama Romance War Western,1994
4,166,346,1.0,886397596,Jackie Brown (1997),Crime Drama,1997


In [99]:
df_clean = data.drop(columns=['userID', 'rating', 'timestamp'])
df_clean = df_clean.drop_duplicates(subset=['itemID'])
cols_to_clean = ['title','genres']
clean_col = 'cleaned_text'
df_clean = recommender.clean_dataframe(df_clean, cols_to_clean, clean_col)
df_clean.head()

Unnamed: 0,itemID,title,genres,year,cleaned_text
0,242,Kolya (1996),Comedy,1996,Kolya 1996 Comedy
1,302,L.A. Confidential (1997),Crime Film-Noir Mystery Thriller,1997,LA Confidential 1997 Crime FilmNoir Mystery Th...
2,377,Heavyweights (1994),Children's Comedy,1994,Heavyweights 1994 Childrens Comedy
3,51,Legends of the Fall (1994),Drama Romance War Western,1994,Legends of the Fall 1994 Drama Romance War Wes...
4,346,Jackie Brown (1997),Crime Drama,1997,Jackie Brown 1997 Crime Drama


In [100]:
len(df_clean)

1682

In [101]:
df_clean = df_clean.reset_index(drop=True)
df_clean.head()

Unnamed: 0,itemID,title,genres,year,cleaned_text
0,242,Kolya (1996),Comedy,1996,Kolya 1996 Comedy
1,302,L.A. Confidential (1997),Crime Film-Noir Mystery Thriller,1997,LA Confidential 1997 Crime FilmNoir Mystery Th...
2,377,Heavyweights (1994),Children's Comedy,1994,Heavyweights 1994 Childrens Comedy
3,51,Legends of the Fall (1994),Drama Romance War Western,1994,Legends of the Fall 1994 Drama Romance War Wes...
4,346,Jackie Brown (1997),Crime Drama,1997,Jackie Brown 1997 Crime Drama


In [102]:
header = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_timestamp": "timestamp",
    "col_title": "title",
    "col_genres": "genres",
    "col_year": "year",
    "col_prediction": "prediction",
}

In [103]:
train, test = python_stratified_split(
    data, ratio=0.75, col_user=header["col_user"], col_item=header["col_item"], seed=42
)

In [104]:
train = recommender.clean_dataframe(train, cols_to_clean, clean_col)
train.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres,year,cleaned_text
15270,1,54,3.0,878543308,Outbreak (1995),Action Drama Thriller,1995,Outbreak 1995 Action Drama Thriller
4411,1,80,4.0,876893008,Hot Shots! Part Deux (1993),Action Comedy War,1993,Hot Shots Part Deux 1993 Action Comedy War
23028,1,138,1.0,878543006,D3: The Mighty Ducks (1996),Children's Comedy,1996,D3 The Mighty Ducks 1996 Childrens Comedy
3710,1,84,4.0,875072923,Robert A. Heinlein's The Puppet Masters (1994),Horror Sci-Fi,1994,Robert A Heinleins The Puppet Masters 1994 Hor...
37820,1,37,2.0,878543030,Nadja (1994),Drama,1994,Nadja 1994 Drama


Let's also tokenize the cleaned text for use in the TF-IDF model. The tokens are stored within our TfidfRecommender object.

In [105]:
# Tokenize text with tokenization_method specified in class instantiation
tf, vectors_tokenized = recommender.tokenize_text(df_clean, text_col="cleaned_text")

2025-02-21 20:47:01 DEBUG    https://huggingface.co:443 "HEAD /bert-base-cased/resolve/main/tokenizer_config.json HTTP/1.1" 200 0


### 6. Recommend articles using TF-IDF
Let's now fit the recommender model to the processed data (tokens) and retrieve the top k recommended articles.

When creating our object, we specified k=5 so the `recommend_top_k_items` function will return the top 5 recommendations for each public domain article.

In [106]:
# Fit the TF-IDF vectorizer
recommender.fit(tf, vectors_tokenized)
tokens = recommender.get_tokens()
print(list(tokens.keys())[:10])


['ko', 'lya', '1996', 'comedy', 'ko lya', 'lya 1996', '1996 comedy', 'ko lya 1996', 'lya 1996 comedy', 'la']


In [107]:
len(tokens)

11390

In [108]:
top_k_items = recommender.recommend_top_k_items(df_clean, k=5)
top_k_items.head()

Unnamed: 0,itemID,rec_rank,rec_score,rec_itemID
0,242,1,0.155171,105
1,242,2,0.153158,1048
2,242,3,0.153158,1102
3,242,4,0.115768,1356
4,242,5,0.115768,1287


In [109]:
merged_df = data.merge(top_k_items, on='itemID', how='inner')
merged_df['prediction'] = merged_df['rating'] * merged_df['rec_score']
top_k = merged_df[['userID', 'rec_itemID', 'prediction']]
top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)
top_k.sort_values(["userID", "prediction"], ascending=[True, False]).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)


Unnamed: 0,userID,itemID,prediction
102350,1,268,5.0
172750,1,268,5.0
28410,1,738,3.0
28411,1,785,3.0
51035,1,665,2.781441


In [110]:
filtered_top_k = top_k.merge(train, on=["userID", "itemID"], how="left", indicator=True)
filtered_top_k = filtered_top_k[filtered_top_k["_merge"] == "left_only"].drop(columns=["_merge"])
filtered_top_k = filtered_top_k[["userID", "itemID", "prediction"]]
filtered_top_k.head()

Unnamed: 0,userID,itemID,prediction
0,196,105,0.465513
1,196,1048,0.459475
2,196,1102,0.459475
3,196,1356,0.347303
4,196,1287,0.347303


In [111]:
idx = filtered_top_k.groupby("userID")["prediction"].idxmax()
top = filtered_top_k.loc[idx]
top.head()

Unnamed: 0,userID,itemID,prediction
28410,1,738,3.0
366410,2,500,4.0
185940,3,50,1.649034
69465,4,406,2.210119
128275,5,665,2.225152


In [112]:
args = [test, top_k]
kwargs = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=TOP_K,
)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg_at_k = ndcg_at_k(*args, **kwargs)
eval_precision_at_k = precision_at_k(*args, **kwargs)
eval_recall_at_k = recall_at_k(*args, **kwargs)

In [113]:
args1 = [test, top]
kwargs1 = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=1,
)
eval_ndcg = ndcg_at_k(*args1, **kwargs1)
eval_precision = precision_at_k(*args1, **kwargs1)
eval_recall = recall_at_k(*args1, **kwargs1)

In [114]:
eval_mae = mae(test, top_k)
eval_rmse = rmse(test, top_k)

In [115]:
eval_novelty = novelty(train, top)
eval_historical_item_novelty = historical_item_novelty(train, top)
eval_user_item_serendipity = user_item_serendipity(train, top)
eval_user_serendipity = user_serendipity(train, top)
eval_serendipity = serendipity(train, top)
eval_catalog_coverage = catalog_coverage(train, top)
eval_distributional_coverage = distributional_coverage(train, top)

  avg_novelty = reco_item_novelty.agg({"product": "sum"})[0] / n_recommendations
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  reco_train_user_item_sim[col_sim].fillna(0, inplace=True)
The behavior will change in pandas 

In [116]:
print("Precision:\t%f" % eval_precision,
      "Precision@K:\t%f" % eval_precision_at_k,
      "Recall:\t%f" % eval_recall,
      "Recall@K:\t%f" % eval_recall_at_k,
      "MAE:\t%f" % eval_mae,
      "RMSE:\t%f" % eval_rmse,
      "NDCG:\t%f" % eval_ndcg,
      "Novelty:\t%f" % eval_novelty,
      "Serendipity:\t%f" % eval_serendipity,
      "Catalog coverage:\t%f" % eval_catalog_coverage,
      "Distributional coverage:\t%f" % eval_distributional_coverage,
      sep='\n')

Precision:	0.191941
Precision@K:	0.077837
Recall:	0.011412
Recall@K:	0.042721
MAE:	2.642405
RMSE:	2.891421
NDCG:	0.191941
Novelty:	10.976871
Serendipity:	0.849694
Catalog coverage:	0.057471
Distributional coverage:	5.303474


In [117]:
eval_historical_item_novelty

Unnamed: 0,itemID,item_novelty
0,1,7.793570
1,2,9.655290
2,3,10.044702
3,4,8.890668
4,5,10.172081
...,...,...
1648,1678,16.194449
1649,1679,16.194449
1650,1680,16.194449
1651,1681,16.194449


In [118]:
eval_user_item_serendipity

Unnamed: 0,userID,itemID,user_item_serendipity
0,1,738,0.891589
1,2,500,0.914873
2,3,50,0.801334
3,4,406,0.884681
4,5,665,0.774015
...,...,...,...
938,939,117,0.711782
939,940,195,0.676544
940,941,1284,0.875141
941,942,1284,0.927612


In [119]:
eval_user_serendipity

Unnamed: 0,userID,user_serendipity
0,1,0.891589
1,2,0.914873
2,3,0.801334
3,4,0.884681
4,5,0.774015
...,...,...
938,939,0.711782
939,940,0.676544
940,941,0.875141
941,942,0.927612
