In [None]:
!pip install requests pillow

In [None]:
from cornac.models import ItemKNN, UserKNN, SVD, WMF
from cornac.eval_methods.base_method import BaseMethod
from cornac.data import Dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from load_filter_and_sample import load_and_filter_data, sample_users
import cornac.metrics as met
import requests
from PIL import Image
from io import BytesIO

In [None]:
user_id = 2840  # Raw user ID for whom recommendations are generated further down
anime_id = 5  # NOTE(review): this module-level value is not read by any visible cell - confirm it is still needed

k_values = [5, 10, 15, 30, 50, 100]  # NOTE(review): presumably candidate neighbourhood sizes; unused in the visible cells
threshold = 3.5  # NOTE(review): presumably a rating relevance threshold; unused in the visible cells
top_n = 10  # Length of the diversified recommendation lists

split_percentage = 0.8      # 80% of the data will be used for training and 20% for testing

In [None]:
def plot_similarity_matrix(similarity_matrix, similarity_metric, base):
    """Draw a similarity matrix as a heat map on a new matplotlib figure.

    Parameters
    ----------
    similarity_matrix : 2-D data accepted by ``plt.imshow``.
    similarity_metric : metric label inserted into the figure title.
    base : entity label, rendered as "<base>-<base>" in the figure title.
    """
    heading = f"{base}-{base} {similarity_metric} Similarity Matrix"

    plt.figure()
    plt.imshow(
        similarity_matrix,
        interpolation='nearest',
        cmap='hot',
    )
    plt.title(heading)
    plt.colorbar()

In [None]:
# Folder holding the raw CSV exports, relative to the notebook.
IMPORT_PATH_BASE = "datasets/"
user_path = f"{IMPORT_PATH_BASE}user-filtered.csv"
item_path = f"{IMPORT_PATH_BASE}anime-dataset-2023.csv"

# Load both tables with filtering disabled and logging enabled.
user_df, anime_df = load_and_filter_data(
    user_path,
    item_path,
    filtering=False,
    logging=True,
)

In [None]:
# Clean the anime metadata. Every step returns a new frame instead of mutating
# in place, so no column assignment ever lands on a filtered slice (the pattern
# that triggers pandas' SettingWithCopyWarning).
# Re-run the loading cell before re-running this one: the dropped columns are
# gone from anime_df afterwards.
unused_columns = ['Other name', 'Name', 'Synopsis', 'Source', 'Premiered',
                  'Status', 'Producers', 'Licensors', 'Duration']
numeric_columns = ['Score', 'Episodes', 'Members', 'Favorites', 'Popularity', 'Rank']

anime_df = (
    anime_df
    .drop(columns=unused_columns)               # Drop unnecessary columns
    .rename(columns={'English name': 'Name'})   # Rename 'English name' to 'Name'
)
anime_df = anime_df.drop(anime_df[anime_df.eq('UNKNOWN').any(axis=1)].index)  # Drop rows with 'UNKNOWN' values

keep_rows = (
    anime_df['Type'].isin(['Movie', 'TV', 'TV Short'])   # Only keep Movies, TV and TV Short
    & anime_df['anime_id'].isin(user_df['anime_id'])      # Only keep items that appear in user_df
)
# A single astype call converts all numeric columns and returns an independent copy.
anime_df = anime_df[keep_rows].astype({column: float for column in numeric_columns})

In [None]:
# Display the cleaned anime table as the cell output.
anime_df

In [None]:
# Keep a rating row only if its anime is present in anime_df and the rating is
# strictly positive (a rating of 0 is not a valid rating).
rates_known_anime = user_df['anime_id'].isin(anime_df['anime_id'])
has_valid_rating = user_df['rating'] > 0
user_df = user_df[rates_known_anime & has_valid_rating]

In [None]:
# Sample three user groups by review count; each tuple is
# (min_reviews, max_reviews, n_users), in the order lot / medium / small.
users_lot_reviews, users_medium_reviews, users_small_reviews = (
    sample_users(user_df, min_reviews=low, max_reviews=high, n_users=count)
    for low, high, count in ((400, 600, 150), (100, 200, 400), (50, 100, 500))
)

In [None]:
# Stack the three sampled groups into one ratings table with a fresh 0..n-1 index.
sampled_groups = [users_lot_reviews, users_medium_reviews, users_small_reviews]
user_df = pd.concat(sampled_groups, ignore_index=True)
user_df

In [None]:
# Raw ID arrays: one entry per anime row and one entry per rating row.
items = anime_df.loc[:, 'anime_id'].values
users = user_df.loc[:, 'user_id'].values

In [None]:
def split_data(data, split_percentage, random_state=1):
    """Split the ratings of every user into a train part and a test part.

    Each user's rows are shuffled with a fixed seed, then the first
    ``int(n * split_percentage)`` rows go to the train set and the remaining
    rows go to the test set.

    Parameters
    ----------
    data : pandas.DataFrame
        Ratings table; must contain a ``'user_id'`` column.
    split_percentage : float
        Fraction of each user's rows assigned to the train set.
    random_state : int, default 1
        Seed used to shuffle each user's rows.

    Returns
    -------
    (train_set, test_set) : tuple of pandas.DataFrame
        Both frames keep every column of ``data`` (including ``'user_id'``)
        and carry a fresh 0..n-1 index. Both are empty when ``data`` is empty.
    """
    train_parts, test_parts = [], []

    # Iterate over the groups explicitly instead of groupby.apply: each group
    # then always carries the 'user_id' column, and no tuple-valued apply
    # result has to be unpacked.
    for _, user_rows in data.groupby('user_id'):
        shuffled = user_rows.sample(frac=1, random_state=random_state).reset_index(drop=True)
        split_index = int(len(shuffled) * split_percentage)
        train_parts.append(shuffled.iloc[:split_index])
        test_parts.append(shuffled.iloc[split_index:])

    if not train_parts:
        # No users at all: return two empty frames with the input's columns.
        return data.iloc[0:0].reset_index(drop=True), data.iloc[0:0].reset_index(drop=True)

    train_set = pd.concat(train_parts).reset_index(drop=True)
    test_set = pd.concat(test_parts).reset_index(drop=True)

    return train_set, test_set

In [None]:
# Per-user hold-out split of the sampled ratings; the train share is split_percentage.
train, test = split_data(data=user_df, split_percentage=split_percentage)

In [None]:
# Build one cornac Dataset per split from (user, item, rating) triples.
# NOTE(review): the two datasets are built independently, so their internal
# user/item indices are presumably not interchangeable - confirm before mixing them.
uir_columns = ['user_id', 'anime_id', 'rating']
train_dataset = Dataset.from_uir(train[uir_columns].values.tolist(), seed=42)
test_dataset = Dataset.from_uir(test[uir_columns].values.tolist(), seed=42)

In [None]:
# Evaluation protocol built from the pre-computed train/test split.
# NOTE(review): `threshold` from the config cell is not passed here - confirm
# whether the evaluation should use it as its rating threshold.
uir_columns = ['user_id', 'anime_id', 'rating']
eval_method = BaseMethod.from_splits(
    train[uir_columns].values,
    test[uir_columns].values,
)

# Rating-prediction error plus top-10 ranking quality.
metrics = [met.RMSE(), met.Precision(k=10), met.Recall(k=10)]

# Item-based Collaborative Filtering

In [None]:
# Item-based nearest-neighbour model: 10 neighbours, cosine similarity.
itemcf = ItemKNN(similarity="cosine", k=10, verbose=True)
itemcf.fit(train_dataset)

# Run the evaluation and print every entry it returns.
result = eval_method.evaluate(itemcf, metrics, user_based=False)
for entry in result:
    print(entry)

# User-based Collaborative Filtering

In [None]:
# User-based nearest-neighbour model: 10 neighbours, cosine similarity.
usercf = UserKNN(similarity="cosine", k=10, verbose=True)
usercf.fit(train_dataset)

# Run the evaluation and print every entry it returns.
result = eval_method.evaluate(usercf, metrics, user_based=False)
for entry in result:
    print(entry)

## Matrix-Factorization using SVD

In [None]:
# SVD matrix factorisation with the library's default hyper-parameters.
svdcf = SVD()
svdcf.fit(train_dataset)

# Run the evaluation and print every entry it returns.
result = eval_method.evaluate(svdcf, metrics, user_based=False)
for entry in result:
    print(entry)

## Matrix-Factorization using WMF

In [None]:
# Weighted matrix factorisation with the library's default hyper-parameters.
wmfcf = WMF()
wmfcf.fit(train_dataset)

# Run the evaluation and print every entry it returns under a "WMF" banner.
result = eval_method.evaluate(wmfcf, metrics, user_based=False)
print("WMF")
for entry in result:
    print(entry)

## Get recommendations

In [None]:
# Resolve the raw user ID to the internal user index expected by the models.
# The models are fitted on train_dataset, so the lookup must use that dataset's
# mapping; test_dataset is built separately and has its own index maps.
if user_id not in train_dataset.uid_map:
    print(f"user_id {user_id} not found in train_dataset. Using a random user.")
    # uid_map is keyed by raw user ID, so its keys are the candidate users.
    user_id = np.random.choice(list(train_dataset.uid_map))
    print(f"Random user_id: {user_id}")

user_idx = train_dataset.uid_map[user_id]

In [None]:
def get_recommendations(model, user_idx, top_n=10):
    """Return the indices of the ``top_n`` best-scored items for one user.

    ``model.score(user_idx)`` must return an array with one score per item.
    The result holds the positions of the highest scores, best first.
    """
    all_scores = model.score(user_idx)
    # Sorting the negated scores ascending yields a descending ranking.
    best_first = np.argsort(-all_scores)
    top_indices = best_first[:top_n]
    return top_indices

# The models are fitted on train_dataset, so the user index handed to them
# must come from train_dataset's mapping (test_dataset has its own indices).
user_idx = train_dataset.uid_map[user_id]

# Top-N item indices per model, in the models' internal item index space.
itemcf_recos = get_recommendations(itemcf, user_idx, top_n=top_n)
print(itemcf_recos)
usercf_recos = get_recommendations(usercf, user_idx, top_n=top_n)
print(usercf_recos)
svd_recos = get_recommendations(svdcf, user_idx, top_n=top_n)
print(svd_recos)
wmf_recos = get_recommendations(wmfcf, user_idx, top_n=top_n)
print(wmf_recos)

In [None]:
def indices_to_anime_ids(indices, dataset):
    """Translate internal item indices into raw anime IDs.

    Parameters
    ----------
    indices : iterable of int
        Internal item indices, e.g. the output of a ranking step.
    dataset : object exposing ``iid_map``
        ``iid_map`` maps raw item ID -> internal index, so it is inverted
        here before the lookup.

    Returns
    -------
    list
        Raw anime IDs in the order of ``indices``; indices unknown to the
        dataset are skipped.
    """
    index_to_raw_id = {item_idx: raw_id for raw_id, item_idx in dataset.iid_map.items()}
    return [index_to_raw_id[idx] for idx in indices if idx in index_to_raw_id]

# The recommended indices come from models fitted on train_dataset, so they
# have to be translated with train_dataset (test_dataset has its own indices).
itemcf_anime_ids = indices_to_anime_ids(itemcf_recos, train_dataset)
usercf_anime_ids = indices_to_anime_ids(usercf_recos, train_dataset)
svd_anime_ids = indices_to_anime_ids(svd_recos, train_dataset)
wmf_anime_ids = indices_to_anime_ids(wmf_recos, train_dataset)

model_recommendations = [
    ("ItemKNN", itemcf_anime_ids),
    ("UserKNN", usercf_anime_ids),
    ("SVD", svd_anime_ids),
    ("WMF", wmf_anime_ids),
]
for position, (label, recommended_ids) in enumerate(model_recommendations):
    if position:
        print()  # Blank line between two models
    print(f"{label} Recommendations:")
    print(recommended_ids)

# Diversifying Recommendations

In [None]:
# Similarity matrix exposed by the fitted ItemKNN model; it is passed to the
# diversification step below as the item-to-item similarity term.
# NOTE(review): presumably indexed by the model's internal item indices - confirm.
similarity_matrix = itemcf.sim_mat

We use the following diversification methods:
- Maximal Marginal Relevance (MMR)
- Max-Sum Diversification (MSD)
- Diversity-weighted Utility Maximization (DUM)

In [None]:
def _similarity_to_item(similarity_matrix, item, n_items):
    """Return ``similarity_matrix[:n_items, item]`` as a dense 1-D float array."""
    column = similarity_matrix[:n_items, [item]]
    if hasattr(column, "toarray"):  # sparse matrices expose toarray()
        column = column.toarray()
    return np.asarray(column, dtype=float).ravel()


def divers_recos(method, relevance_scores, similarity_matrix, alpha=0.5, top_n=10):
    """Greedily build a diversified top-N list of item indices.

    The first pick is the most relevant item. Every following pick maximises
    a score that trades relevance against the already selected items:

    - ``"mmr"``: ``alpha * relevance - (1 - alpha) * max similarity to the selection``
    - ``"msd"``: ``relevance + sum of (1 - similarity) over the selection``
    - ``"dum"``: ``alpha * relevance + (1 - alpha) * sum of (1 - similarity)``

    Parameters
    ----------
    method : {"mmr", "msd", "dum"}
        Diversification strategy.
    relevance_scores : array-like of float
        One relevance score per item.
    similarity_matrix : 2-D numpy array or sparse matrix
        ``similarity_matrix[i, j]`` is the similarity of item ``i`` to item ``j``.
    alpha : float, default 0.5
        Weight of relevance versus diversity for "mmr" and "dum".
    top_n : int, default 10
        Maximum number of items to select.

    Returns
    -------
    list of int
        Selected item indices in selection order.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported strategies.
    """
    if method not in ("mmr", "msd", "dum"):
        raise ValueError(f"Unknown diversification method: {method!r}")

    relevance = np.asarray(relevance_scores, dtype=float)
    n_items = relevance.shape[0]
    selected_items = []
    if n_items == 0 or top_n <= 0:
        return selected_items

    available = np.ones(n_items, dtype=bool)
    # Running statistics of every item against the current selection, updated
    # after each pick so that no pairwise lookup is repeated.
    diversity_sum = np.zeros(n_items)
    max_similarity = np.full(n_items, -np.inf)

    while len(selected_items) < top_n and available.any():
        if not selected_items:
            scores = relevance
        elif method == "mmr":
            # MMR penalises similarity to what is already selected.
            scores = alpha * relevance - (1 - alpha) * max_similarity
        elif method == "msd":
            scores = relevance + diversity_sum
        else:  # "dum"
            scores = alpha * relevance + (1 - alpha) * diversity_sum

        candidates = np.flatnonzero(available)
        # argmax returns the first maximum, i.e. the lowest index on ties.
        next_item = int(candidates[np.argmax(scores[candidates])])
        selected_items.append(next_item)
        available[next_item] = False

        similarity = _similarity_to_item(similarity_matrix, next_item, n_items)
        diversity_sum += 1.0 - similarity
        np.maximum(max_similarity, similarity, out=max_similarity)

    return selected_items

In [None]:
# Score every item known to the model in a single call. The score vector then
# has one entry per model item, matching the index space of similarity_matrix;
# sizing it by test_dataset.num_items would not guarantee that.
relevance_scores = np.asarray(itemcf.score(user_idx), dtype=float)

mmr_recos = divers_recos("mmr", relevance_scores, similarity_matrix, alpha=0.5, top_n=top_n)
msd_recos = divers_recos("msd", relevance_scores, similarity_matrix, alpha=0.5, top_n=top_n)
dum_recos = divers_recos("dum", relevance_scores, similarity_matrix, alpha=0.5, top_n=top_n)

In [None]:
def display_anime(anime_id):
    """Download and show the cover image of one anime, titled with its name.

    Display is best-effort: when the anime has no row in ``anime_df``, or its
    image cannot be downloaded or decoded, a short message is printed and the
    anime is skipped instead of raising.

    Parameters
    ----------
    anime_id : raw anime ID, matched against ``anime_df['anime_id']``.
    """
    anime = anime_df[anime_df['anime_id'] == anime_id]
    if anime.empty:
        print(f"Anime ID {anime_id}: no metadata in anime_df, skipping.")
        return

    try:
        url = anime['Image URL'].values[0]
        title = anime['Name'].values[0]
    except KeyError as exc:
        print(f"Anime ID {anime_id}: missing column {exc}, skipping.")
        return

    try:
        # A timeout keeps one unreachable host from blocking the whole cell.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content))
        img.load()  # Force decoding here so a corrupt image is caught below
    except (requests.RequestException, OSError) as exc:
        print(f"Anime ID {anime_id}: could not load image ({exc}), skipping.")
        return

    plt.imshow(img)
    plt.axis('off')
    plt.title(title)
    plt.show()

In [None]:
# iid_map maps raw anime ID -> internal index; invert it once so that the
# recommended indices can be translated back to raw IDs. train_dataset is used
# because the recommended indices come from a model fitted on it.
index_to_anime_id = {item_idx: raw_id for raw_id, item_idx in train_dataset.iid_map.items()}

diversified_recos = [("MMR", mmr_recos), ("MSD", msd_recos), ("DUM", dum_recos)]
for position, (label, recos) in enumerate(diversified_recos):
    if position:
        print()  # Blank line between two methods
    print(f"{label} Recommendations:")
    for idx in recos:
        if idx in index_to_anime_id:
            print(f"Anime ID: {index_to_anime_id[idx]}, Rating: {relevance_scores[idx]:.2f}")


In [None]:
def display_anime_list(anime_ids):
    """Show the cover of every anime in ``anime_ids``, in the given order."""
    for current_id in anime_ids:
        display_anime(current_id)

In [None]:
# The diversified indices come from a model fitted on train_dataset, so they
# have to be translated with train_dataset (test_dataset has its own indices).
mmr_anime_ids = indices_to_anime_ids(mmr_recos, train_dataset)
msd_anime_ids = indices_to_anime_ids(msd_recos, train_dataset)
dum_anime_ids = indices_to_anime_ids(dum_recos, train_dataset)

# One (label, anime IDs) pair per recommender, shown in this order.
all_recommendations = [
    ("ItemKNN", itemcf_anime_ids),
    ("UserKNN", usercf_anime_ids),
    ("SVD", svd_anime_ids),
    ("WMF", wmf_anime_ids),
    ("MMR", mmr_anime_ids),
    ("MSD", msd_anime_ids),
    ("DUM", dum_anime_ids),
]
for position, (label, recommended_ids) in enumerate(all_recommendations):
    if position:
        print()  # Blank line between two recommenders
    print(f"{label} Recommendations:")
    display_anime_list(recommended_ids)