In [1]:
from cornac.models import ItemKNN, UserKNN, SVD, BPR, WMF, GMF, NeuMF
from cornac.eval_methods.base_method import BaseMethod
from cornac.data import Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from load_filter_and_sample import load_and_filter_data, sample_users
import cornac.metrics as met

  from .autonotebook import tqdm as notebook_tqdm
  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)


In [2]:
user_id = 2840
anime_id = 5 

k_values = [5, 10, 15, 30, 50, 100]
threshold = 3.5
top_n = 10

split_percentage = 0.8      # 80% of the data will be used for training and 20% for testing

In [3]:
def plot_similarity_matrix(similarity_matrix, similarity_metric, base):
    plt.figure()
    plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
    plt.title(f"{base}-{base} {similarity_metric} Similarity Matrix")
    plt.colorbar()

In [4]:
IMPORT_PATH_BASE = "datasets/"
user_path = IMPORT_PATH_BASE + "user-filtered.csv"
item_path = IMPORT_PATH_BASE + "anime-dataset-2023.csv"

user_df, anime_df = load_and_filter_data(user_path, item_path, filtering=False, logging=True)

Loading user ratings...
User ratings loaded
Loading animes list...
Animes list loaded


In [5]:
anime_df.drop(columns=['Other name', 'Name', 'Synopsis', 'Source', 'Premiered', 'Status', 'Producers', 'Licensors', 'Duration'], inplace=True) # Drop unnecessary columns
anime_df.rename(columns={'English name': 'Name'}, inplace=True) # Rename 'English name' to 'Name'
anime_df = anime_df.drop(anime_df[anime_df.eq('UNKNOWN').any(axis=1)].index) # Drop rows with 'UNKNOWN' values
anime_df = anime_df[anime_df['Type'].isin(['Movie', 'TV', 'TV Short'])] # Only keep Movies, TV and TV Short
anime_df = anime_df[anime_df['anime_id'].isin(user_df['anime_id'])] # Only keep items that are in user_df
anime_df['Score'] = anime_df['Score'].astype(float)
anime_df['Episodes'] = anime_df['Episodes'].astype(float)
anime_df['Members'] = anime_df['Members'].astype(float)
anime_df['Favorites'] = anime_df['Favorites'].astype(float)
anime_df['Popularity'] = anime_df['Popularity'].astype(float)
anime_df['Rank'] = anime_df['Rank'].astype(float)

In [6]:
anime_df

Unnamed: 0,anime_id,Name,Score,Genres,Type,Episodes,Aired,Studios,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",Sunrise,R - 17+ (violence & profanity),41.0,43.0,78525.0,914193.0,1771505.0,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: The Movie,8.38,"Action, Sci-Fi",Movie,1.0,"Sep 1, 2001",Bones,R - 17+ (violence & profanity),189.0,602.0,1448.0,206248.0,360978.0,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",Madhouse,PG-13 - Teens 13 or older,328.0,246.0,15035.0,356739.0,727252.0,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,26.0,"Jul 3, 2002 to Dec 25, 2002",Sunrise,PG-13 - Teens 13 or older,2764.0,1795.0,613.0,42829.0,111931.0,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Beet the Vandel Buster,6.94,"Adventure, Fantasy, Supernatural",TV,52.0,"Sep 30, 2004 to Sep 29, 2005",Toei Animation,PG - Children,4240.0,5126.0,14.0,6413.0,15001.0,https://cdn.myanimelist.net/images/anime/7/215...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20915,48470,D_Cide Traumerei the Animation,5.81,"Action, Drama, Fantasy",TV,13.0,"Jul 10, 2021 to Oct 2, 2021",SANZIGEN,PG-13 - Teens 13 or older,9647.0,4243.0,102.0,6258.0,23808.0,https://cdn.myanimelist.net/images/anime/1635/...
20916,48471,Irina: The Vampire Cosmonaut,7.31,"Fantasy, Sci-Fi",TV,12.0,"Oct 4, 2021 to Dec 20, 2021",Arvo Animation,PG-13 - Teens 13 or older,2459.0,1499.0,689.0,49075.0,140342.0,https://cdn.myanimelist.net/images/anime/1393/...
20921,48488,Higurashi: When They Cry – SOTSU,6.73,"Avant Garde, Horror, Mystery, Supernatural, Su...",TV,15.0,"Jul 1, 2021 to Sep 30, 2021",Passione,R - 17+ (violence & profanity),5201.0,1882.0,722.0,41407.0,104539.0,https://cdn.myanimelist.net/images/anime/1083/...
20922,48491,Encouragement of Climb: Next Summit,7.63,"Adventure, Slice of Life",TV,12.0,"Oct 5, 2022 to Dec 21, 2022",8bit,PG-13 - Teens 13 or older,1307.0,4347.0,90.0,5556.0,22465.0,https://cdn.myanimelist.net/images/anime/1942/...


In [7]:
user_df = user_df[user_df['anime_id'].isin(anime_df['anime_id'])] # Only keep users that are in item_df
user_df = user_df[user_df['rating'] > 0] # Remove reviews with rating 0 because it is not a valid rating

In [8]:
users_lot_reviews = sample_users(user_df, min_reviews=400, max_reviews=600, n_users=150)
users_medium_reviews = sample_users(user_df, min_reviews=100, max_reviews=200, n_users=400)
users_small_reviews = sample_users(user_df, min_reviews=50, max_reviews=100, n_users=500)

In [9]:
user_df = pd.concat([users_lot_reviews, users_medium_reviews, users_small_reviews]).reset_index(drop=True)
user_df

Unnamed: 0,user_id,anime_id,rating
0,4136,22043,8
1,4136,31630,7
2,4136,3269,10
3,4136,873,6
4,4136,48,8
...,...,...,...
166384,353272,37976,6
166385,353272,6500,7
166386,353272,4063,6
166387,353272,36882,2


In [10]:
items = anime_df['anime_id'].values
users = user_df['user_id'].values

In [11]:
def split_data(data, split_percentage):
    def split_group(group):
        group = group.sample(frac=1, random_state=1).reset_index(drop=True)
        split_index = int(len(group) * split_percentage)
        return group.iloc[:split_index], group.iloc[split_index:]
    
    grouped = data.groupby('user_id')
    
    train_list, test_list = zip(*grouped.apply(lambda x: split_group(x)).values)
    
    train_set = pd.concat(train_list).reset_index(drop=True)
    test_set = pd.concat(test_list).reset_index(drop=True)
    
    return train_set, test_set

In [12]:
train, test = split_data(user_df, split_percentage)

  train_list, test_list = zip(*grouped.apply(lambda x: split_group(x)).values)


In [13]:
train_dataset = Dataset.from_uir(train[['user_id', 'anime_id', 'rating']].values.tolist(), seed=42)
test_dataset = Dataset.from_uir(test[['user_id', 'anime_id', 'rating']].values.tolist(), seed=42)

In [14]:
eval_method = BaseMethod.from_splits(train[['user_id', 'anime_id', 'rating']].values, test[['user_id', 'anime_id', 'rating']].values)
metrics = [met.RMSE(), met.Precision(k=10), met.Recall(k=10)]

# Item-based Collaborative Filtering

In [15]:
itemcf = ItemKNN(k=10, similarity="cosine", verbose=True)
itemcf.fit(train_dataset)
result = eval_method.evaluate(itemcf, metrics, user_based=False)
for metric in result:
    print(metric)

100%|██████████| 3057/3057 [00:00<00:00, 13001.14it/s]
100%|██████████| 3057/3057 [00:00<00:00, 13956.89it/s]


        |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
------- + ------ + ------------ + --------- + --------- + --------
ItemKNN | 1.5465 |       0.0142 |    0.0053 |    1.4632 |  17.9402

None


# User-based Collaborative Filtering

In [16]:
usercf = UserKNN(k=10, similarity="cosine", verbose=True)
usercf.fit(train_dataset)
result = eval_method.evaluate(usercf, metrics, user_based=False)
for metric in result:
    print(metric)

  0%|          | 0/1050 [00:00<?, ?it/s]

100%|██████████| 1050/1050 [00:00<00:00, 14700.45it/s]
100%|██████████| 1050/1050 [00:00<00:00, 16395.10it/s]


        |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
------- + ------ + ------------ + --------- + --------- + --------
UserKNN | 1.3286 |       0.0015 |    0.0006 |    0.1238 |   9.7319

None


## Matrix-Factorization using SVD

In [17]:
svdcf=SVD()
svdcf.fit(train_dataset)
result = eval_method.evaluate(svdcf, metrics, user_based=False)
for metric in result:
    print(metric)

    |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------------ + --------- + --------- + --------
SVD | 1.3251 |       0.0417 |    0.0145 |    0.0692 |   4.2079

None


## Matrix-Factorization using WMF

In [18]:
wmfcf = WMF()
wmfcf.fit(train_dataset)
result = eval_method.evaluate(wmfcf, metrics, user_based=False)
print("WMF")
for metric in result:
    print(metric)

100%|██████████| 100/100 [00:27<00:00,  3.64it/s, loss=132]   


Learning completed!


100%|██████████| 100/100 [00:27<00:00,  3.67it/s, loss=56.7]


Learning completed!
WMF
    |   RMSE | Precision@10 | Recall@10 | Train (s) | Test (s)
--- + ------ + ------------ + --------- + --------- + --------
WMF | 3.2415 |       0.3177 |    0.1282 |   27.4575 |   6.5667

None


## Get recommendations

In [20]:
if user_id in test_dataset.uid_map:
    user_idx = test_dataset.uid_map[user_id]
else:
    print(f"user_id {user_id} not found in test_dataset. Using a random user.")
    user_id = np.random.choice(test_dataset.user_ids)
    user_idx = test_dataset.uid_map[user_id]
    print(f"Random user_id: {user_id}")

user_id 2840 not found in test_dataset. Using a random user.
Random user_id: 147222


In [21]:
def get_recommendations(model, user_idx, top_n=10):
    scores = model.score(user_idx)
    ranked_indices = np.argsort(-scores)
    return ranked_indices[:top_n]

user_idx = test_dataset.uid_map[user_id]

itemcf_recos = get_recommendations(itemcf, user_idx)
print(itemcf_recos)
usercf_recos = get_recommendations(usercf, user_idx)
print(usercf_recos)
svd_recos = get_recommendations(svdcf, user_idx)
print(svd_recos)
wmf_recos = get_recommendations(wmfcf, user_idx)
print(wmf_recos)

[2932 2990 2972 2971 2970 2969 2968 2967 2503 2964]
[2360 2870  928 2610 1257 2652 1605  611 1144 2366]
[ 323  310 1453   13  793 1817 2062  633 2000 1670]
[ 663  393   52  389  980    0 1147  157 2108  263]


In [22]:
def indices_to_anime_ids(indices, dataset):
    return [dataset.iid_map[idx] for idx in indices if idx in dataset.iid_map]

itemcf_anime_ids = indices_to_anime_ids(itemcf_recos, test_dataset)
usercf_anime_ids = indices_to_anime_ids(usercf_recos, test_dataset)
svd_anime_ids = indices_to_anime_ids(svd_recos, test_dataset)
wmf_anime_ids = indices_to_anime_ids(wmf_recos, test_dataset)

print("ItemKNN Recommendations:")
print(itemcf_anime_ids)
print()
print("UserKNN Recommendations:")
print(usercf_anime_ids)
print()
print("SVD Recommendations:")
print(svd_anime_ids)
print()
print("WMF Recommendations:")
print(wmf_anime_ids)

ItemKNN Recommendations:
[1715, 2091]

UserKNN Recommendations:
[1311, 1924]

SVD Recommendations:
[1290, 2210, 2446, 762, 1572, 1638]

WMF Recommendations:
[1070, 1644, 997, 1313, 1293]


# Diversifying Recommendations

In [23]:
similarity_matrix = itemcf.sim_mat

On utilise :
- Maximal Marginal Relevance (MMR)
- Max-Sum Diversification (MSD)
- Diversity-weighted Utility Maximization (DUM)

In [24]:
def divers_recos(method, relevance_scores, similarity_matrix, alpha=0.5, top_n=10):
    selected_items = []
    item_indices = list(range(len(relevance_scores)))
    
    while len(selected_items) < top_n and item_indices:
        if not selected_items:
            next_item = item_indices.pop(np.argmax(relevance_scores[item_indices]))
        else:
            max_score = -np.inf
            next_item = None
            for item in item_indices:
                diversity = sum([1 - similarity_matrix[item, selected_item] for selected_item in selected_items])
                if method == "mmr":
                    score = alpha * relevance_scores[item] - (1 - alpha) * (diversity / len(selected_items))
                elif method == "msd":
                    score = relevance_scores[item] + diversity
                elif method == "dum":
                    score = alpha * relevance_scores[item] + (1 - alpha) * diversity
                
                if score > max_score:
                    max_score = score
                    next_item = item
            item_indices.remove(next_item)
        selected_items.append(next_item)
    
    return selected_items

In [25]:
relevance_scores = np.array([itemcf.score(user_idx, i) for i in range(test_dataset.num_items)])

mmr_recos = divers_recos("mmr", relevance_scores, similarity_matrix, alpha=0.5, top_n=top_n)
msd_recos = divers_recos("msd", relevance_scores, similarity_matrix, alpha=0.5, top_n=top_n)
dum_recos = divers_recos("dum", relevance_scores, similarity_matrix, alpha=0.5, top_n=top_n)

In [26]:
print("MMR Recommendations:")
for idx in mmr_recos:
    if idx in test_dataset.uid_map:
        anime_id = test_dataset.uid_map[idx]
        print(f"\tAnime ID: {anime_id}, Score: {relevance_scores[idx]:.2f}")
print()
print("MSD Recommendations:")
for idx in msd_recos:
    if idx in test_dataset.uid_map:
        anime_id = test_dataset.uid_map[idx]
        print(f"\tAnime ID: {anime_id}, Rating: {relevance_scores[idx]:.2f}")
print()
print("DUM Recommendations:")
for idx in dum_recos:
    if idx in test_dataset.uid_map:
        anime_id = test_dataset.uid_map[idx]
        print(f"\tAnime ID: {anime_id}, Rating: {relevance_scores[idx]:.2f}")


MMR Recommendations:

MSD Recommendations:

DUM Recommendations:


In [27]:
mmr_anime_ids = indices_to_anime_ids(mmr_recos, test_dataset)
msd_anime_ids = indices_to_anime_ids(msd_recos, test_dataset)
dum_anime_ids = indices_to_anime_ids(dum_recos, test_dataset)

print("ItemKNN Recommendations:")
print(itemcf_anime_ids)
print()
print("UserKNN Recommendations:")
print(usercf_anime_ids)
print()
print("SVD Recommendations:")
print(svd_anime_ids)
print()
print("WMF Recommendations:")
print(wmf_anime_ids)
print()
print("MMR Recommendations:")
print(mmr_anime_ids)
print()
print("MSD Recommendations:")
print(msd_anime_ids)
print()
print("DUM Recommendations:")
print(dum_anime_ids)

ItemKNN Recommendations:
[1715, 2091]

UserKNN Recommendations:
[1311, 1924]

SVD Recommendations:
[1290, 2210, 2446, 762, 1572, 1638]

WMF Recommendations:
[1070, 1644, 997, 1313, 1293]

MMR Recommendations:
[91, 168, 1644, 2189, 2283]

MSD Recommendations:
[1644]

DUM Recommendations:
[1644]
