<i>Copyright (c) Recommenders contributors.</i>

<i>Licensed under the MIT License.</i>

# TF-IDF Content-Based Recommendation


In [16]:
import sys
import logging
import scipy
import numpy as np
import pandas as pd
import dataset_loader

from recommenders.models.tfidf.tfidf_utils import TfidfRecommender
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.datasets import movielens
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, mae, rmse, novelty, historical_item_novelty, user_item_serendipity, user_serendipity, serendipity, catalog_coverage, distributional_coverage
from metrics import precision_at_k, recall_at_k, mrr, accuracy, user_coverage, item_coverage

# Print version
print(f"System version: {sys.version}")

System version: 3.9.21 (main, Dec 11 2024, 16:35:24) [MSC v.1929 64 bit (AMD64)]


### 1. Load the dataset into a dataframe
Let's begin by loading the metadata file for the dataset into a Pandas dataframe. This file contains metadata about each of the scientific articles included in the full dataset.

In [17]:
# Top k items to recommend
TOP_K = 10

In [18]:
data = dataset_loader.loader("movielens", want_col=["userID", "itemID", "rating", "timestamp", 'title', 'genres'], num_rows=5000)
print(data.shape)
print(data['userID'].nunique())

# Convert the float precision to 32-bit in order to reduce memory consumption
data["rating"] = data["rating"].astype(np.float32)
data.head()


(5000, 6)
50


Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure|Children|Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


### 4. Instantiate the recommender
All functions for data preparation and recommendation are contained within the **TfidfRecommender** class we have imported. Prior to running these functions, we must create an object of this class.

Select one of the following tokenization methods to use in the model:

| tokenization_method | Description                                                                                                                      |
|:--------------------|:---------------------------------------------------------------------------------------------------------------------------------|
| 'none'              | No tokenization is applied. Each word is considered a token.                                                                     |
| 'nltk'              | Simple stemming is applied using NLTK.                                                                                           |
| 'bert'              | HuggingFace BERT word tokenization ('bert-base-cased') is applied.                                                               |
| 'scibert'           | SciBERT word tokenization ('allenai/scibert_scivocab_cased') is applied.<br>This is recommended for scientific journal articles. |

In [19]:
# Create the recommender object
recommender = TfidfRecommender(id_col='itemID', tokenization_method='bert')

### 5. Prepare text for use in the TF-IDF recommender

In [20]:
data['genres'] = data['genres'].str.replace('|', ' ', regex=False)
data.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres
0,1,2,3.5,2005-04-02 23:53:47,Jumanji (1995),Adventure Children Fantasy
1,1,29,3.5,2005-04-02 23:31:16,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi
2,1,32,3.5,2005-04-02 23:33:39,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller
3,1,47,3.5,2005-04-02 23:32:07,Seven (a.k.a. Se7en) (1995),Mystery Thriller
4,1,50,3.5,2005-04-02 23:29:40,"Usual Suspects, The (1995)",Crime Mystery Thriller


In [21]:
df_clean = data.drop(columns=['userID', 'rating', 'timestamp'])
df_clean = df_clean.drop_duplicates(subset=['itemID'])
cols_to_clean = ['title','genres']
clean_col = 'cleaned_text'
df_clean = recommender.clean_dataframe(df_clean, cols_to_clean, clean_col)
df_clean.head()

Unnamed: 0,itemID,title,genres,cleaned_text
0,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi,City of Lost Children The Cité des enfants per...
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,Twelve Monkeys aka 12 Monkeys 1995 Mystery Sci...
3,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Seven aka Se7en 1995 Mystery Thriller
4,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,Usual Suspects The 1995 Crime Mystery Thriller


In [22]:
len(df_clean)

2090

In [23]:
df_clean = df_clean.reset_index(drop=True)
df_clean.head()

Unnamed: 0,itemID,title,genres,cleaned_text
0,2,Jumanji (1995),Adventure Children Fantasy,Jumanji 1995 Adventure Children Fantasy
1,29,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery Sci-Fi,City of Lost Children The Cité des enfants per...
2,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery Sci-Fi Thriller,Twelve Monkeys aka 12 Monkeys 1995 Mystery Sci...
3,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,Seven aka Se7en 1995 Mystery Thriller
4,50,"Usual Suspects, The (1995)",Crime Mystery Thriller,Usual Suspects The 1995 Crime Mystery Thriller


In [24]:
header = {
    "col_user": "userID",
    "col_item": "itemID",
    "col_rating": "rating",
    "col_timestamp": "timestamp",
    "col_title": "title",
    "col_genres": "genres",
    "col_year": "year",
    "col_prediction": "prediction",
}

In [25]:
train, test = python_stratified_split(
    data, ratio=0.75, col_user=header["col_user"], col_item=header["col_item"], seed=42
)

In [26]:
train = recommender.clean_dataframe(train, cols_to_clean, clean_col)
train.head()

Unnamed: 0,userID,itemID,rating,timestamp,title,genres,cleaned_text
72,1,2140,4.0,2005-04-02 23:48:25,"Dark Crystal, The (1982)",Adventure Fantasy,Dark Crystal The 1982 Adventure Fantasy
128,1,4915,3.0,2005-04-02 23:54:36,"Beastmaster, The (1982)",Action Adventure Fantasy,Beastmaster The 1982 Action Adventure Fantasy
171,1,8636,4.5,2005-04-02 23:44:53,Spider-Man 2 (2004),Action Adventure Sci-Fi IMAX,SpiderMan 2 2004 Action Adventure SciFi IMAX
10,1,293,4.0,2005-04-02 23:31:43,Léon: The Professional (a.k.a. The Professiona...,Action Crime Drama Thriller,Léon The Professional aka The Professional Léo...
98,1,3000,3.5,2005-04-02 23:29:29,Princess Mononoke (Mononoke-hime) (1997),Action Adventure Animation Drama Fantasy,Princess Mononoke Mononokehime 1997 Action Adv...


Let's also tokenize the cleaned text for use in the TF-IDF model. The tokens are stored within our TfidfRecommender object.

In [27]:
# Tokenize text with tokenization_method specified in class instantiation
tf, vectors_tokenized = recommender.tokenize_text(df_clean, text_col="cleaned_text")

### 6. Recommend articles using TF-IDF
Let's now fit the recommender model to the processed data (tokens) and retrieve the top k recommended articles.

When creating our object, we specified k=5 so the `recommend_top_k_items` function will return the top 5 recommendations for each public domain article.

In [28]:
# Fit the TF-IDF vectorizer
recommender.fit(tf, vectors_tokenized)
tokens = recommender.get_tokens()
print(list(tokens.keys())[:10])


['ju', 'man', 'ji', '1995', 'adventure', 'children', 'fantasy', 'ju man', 'man ji', 'ji 1995']


In [29]:
len(tokens)

15191

In [30]:
top_k_items = recommender.recommend_top_k_items(df_clean, k=5)
top_k_items.sort_values("rec_score", ascending=False)

Unnamed: 0,itemID,rec_rank,rec_score,rec_itemID
3266,550,2,1.000000,289
3265,550,1,1.000000,550
9445,289,1,1.000000,550
9446,289,2,1.000000,289
2165,252,1,1.000000,550
...,...,...,...,...
9330,125916,1,0.020885,4745
9331,125916,2,0.011637,1292
9332,125916,3,0.010832,279
9333,125916,4,0.010518,175


In [31]:
merged_df = data.merge(top_k_items, on='itemID', how='inner')
merged_df['prediction'] = merged_df['rating'] * merged_df['rec_score']
top_k = merged_df[['userID', 'rec_itemID', 'prediction']]
top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)
top_k.sort_values(["userID", "prediction"], ascending=[True, False]).head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_k.rename(columns={'rec_itemID': 'itemID'}, inplace=True)


Unnamed: 0,userID,itemID,prediction
595,1,4886,1.995398
365,1,3479,1.972219
530,1,2143,1.972219
596,1,53121,1.958034
535,1,8580,1.940257


In [32]:
filtered_top_k = top_k.merge(train, on=["userID", "itemID"], how="left", indicator=True)
filtered_top_k = filtered_top_k[filtered_top_k["_merge"] == "left_only"].drop(columns=["_merge"])
filtered_top_k = filtered_top_k[["userID", "itemID", "prediction"]]
filtered_top_k.head()

Unnamed: 0,userID,itemID,prediction
0,1,158,1.04985
1,1,60,1.014141
2,1,8,0.756539
3,1,2093,0.596716
5,1,4874,0.799001


In [33]:
idx = filtered_top_k.groupby("userID")["prediction"].idxmax()
top = filtered_top_k.loc[idx]
top.head()

Unnamed: 0,userID,itemID,prediction
595,1,4886,1.995398
915,2,4638,2.235275
2070,3,519,2.767191
2250,4,95,2.264449
2525,5,4498,2.105869


In [34]:
args = [test, top_k]
kwargs = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=TOP_K
)

eval_map = map_at_k(*args, **kwargs)
eval_ndcg_at_k = ndcg_at_k(*args, **kwargs)
eval_precision_at_k = precision_at_k(*args, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction", k=TOP_K)
eval_recall_at_k = recall_at_k(*args, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction", k=TOP_K)

In [35]:
args1 = [test, top]
kwargs1 = dict(
    col_user="userID",
    col_item="itemID",
    col_rating="rating",
    col_prediction="prediction",
    relevancy_method="top_k",
    k=1
)
eval_ndcg = ndcg_at_k(*args1, **kwargs1)
eval_precision = precision_at_k(*args, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction", k=1)
eval_recall = recall_at_k(*args, col_user="userID", col_item="itemID", col_rating="rating", col_prediction="prediction", k=1)

In [36]:
eval_mae = mae(test, top_k)
eval_rmse = rmse(test, top_k)

In [37]:
# eval_novelty = novelty(train, top)
# eval_historical_item_novelty = historical_item_novelty(train, top)
# eval_user_item_serendipity = user_item_serendipity(train, top)
# eval_user_serendipity = user_serendipity(train, top)
# eval_serendipity = serendipity(train, top)
# eval_catalog_coverage = catalog_coverage(train, top)
# eval_distributional_coverage = distributional_coverage(train, top)

In [38]:
# eval_f1 = f1(test, top)
# eval_mrr = mrr(test, top)
# eval_accuracy = accuracy(test, top)
# eval_user_coverage = user_coverage(test, top)
# eval_item_coverage = item_coverage(test, top)

In [39]:
print("Precision:\t%f" % eval_precision,
      "Precision@K:\t%f" % eval_precision_at_k,
      "Recall:\t%f" % eval_recall,
      "Recall@K:\t%f" % eval_recall_at_k,
      "MAE:\t%f" % eval_mae,
      "RMSE:\t%f" % eval_rmse,
      "NDCG:\t%f" % eval_ndcg,
      # "Novelty:\t%f" % eval_novelty,
      # "Serendipity:\t%f" % eval_serendipity,
      # "Catalog coverage:\t%f" % eval_catalog_coverage,
      # "Distributional coverage:\t%f" % eval_distributional_coverage,
      sep='\n')

Precision:	0.060000
Precision@K:	0.300000
Recall:	0.002937
Recall@K:	0.019380
MAE:	2.863804
RMSE:	3.049122
NDCG:	0.080000


In [40]:
# print("F1:\t%f" % eval_f1,
#       "Accuracy:\t%f" % eval_accuracy,
#       "User covarage:\t%f" % eval_user_coverage,
#       "Item coverage:\t%f" % eval_item_coverage,
#       sep='\n')

In [41]:
# eval_historical_item_novelty

In [42]:
# eval_user_item_serendipity

In [43]:
# eval_user_serendipity