In [None]:
from cornac.models import ItemKNN, UserKNN
import cornac
from cornac.eval_methods import RatioSplit
from cornac.data import Reader, Dataset
import scipy.sparse as sparse
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from load_filter_and_sample import load_and_filter_data

In [None]:
# --- User / anime pair handed to the models' score() calls further down ---
user_id, anime_id = 10, 5

# --- Recommender settings ---
k_values = [5, 10, 15, 30, 50, 100]  # presumably candidate neighbourhood sizes; not read by the cells shown here
threshold = 3.5                      # forwarded to RatioSplit as rating_threshold
top_n = 10                           # not read by the cells shown here

# --- Data preparation ---
core = 500                # minimum number of reviews a user must have; also passed to load_and_filter_data
split_percentage = 0.8    # 80% of the data will be used for training and 20% for testing
# NOTE(review): the RatioSplit calls below hard-code test_size instead of reading split_percentage.

In [None]:
def plot_similarity_matrix(similarity_matrix, similarity_metric, base):
    """Draw a similarity matrix as a heat map on a new matplotlib figure.

    Parameters
    ----------
    similarity_matrix : array-like or scipy.sparse matrix
        Pairwise similarity values. SciPy sparse matrices are converted to a
        dense array first, because ``plt.imshow`` only accepts dense image
        data.
    similarity_metric : str
        Name of the metric; used only in the figure title.
    base : str
        Entity being compared (e.g. ``"User"`` or ``"Item"``); used only in
        the figure title.
    """
    if sparse.issparse(similarity_matrix):
        # imshow cannot render SciPy sparse containers. Note that the dense
        # copy needs memory for every cell of the matrix.
        similarity_matrix = similarity_matrix.toarray()

    plt.figure()
    plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
    plt.title(f"{base}-{base} {similarity_metric} Similarity Matrix")
    plt.colorbar()

In [None]:
# Folder that holds the raw CSV exports; both input files sit directly inside it.
IMPORT_PATH_BASE = "datasets/"
user_path, item_path = (
    IMPORT_PATH_BASE + csv_name
    for csv_name in ("user-filtered.csv", "anime-dataset-2023.csv")
)

# Load the user-rating and anime dataframes; `core` is forwarded as the
# helper's `threshold` argument and logging is switched on.
user_df, anime_df = load_and_filter_data(
    user_path,
    item_path,
    threshold=core,
    logging=True,
)

In [None]:
# Clean the anime table without mutating the loaded frame in place: every step
# returns a new frame, and the numeric casts go through a single `astype` call
# rather than column assignment on a filtered slice (SettingWithCopyWarning risk).

# Columns that are not needed further down.
unused_columns = ['Other name', 'Name', 'Synopsis', 'Source', 'Premiered', 'Status', 'Producers', 'Licensors', 'Duration']
# Columns that are cast to float once the 'UNKNOWN' placeholder rows are gone.
numeric_columns = ['Score', 'Episodes', 'Members', 'Favorites', 'Popularity', 'Rank']

# Drop unnecessary columns, then let 'English name' take over the 'Name' label.
anime_df = anime_df.drop(columns=unused_columns).rename(columns={'English name': 'Name'})

# Drop rows with 'UNKNOWN' in any column.
has_unknown = anime_df.eq('UNKNOWN').any(axis=1)
anime_df = anime_df.drop(index=anime_df[has_unknown].index)

# Only keep Movies, TV and TV Short.
anime_df = anime_df[anime_df['Type'].isin(['Movie', 'TV', 'TV Short'])]

# Only keep items that are in user_df.
anime_df = anime_df[anime_df['anime_id'].isin(user_df['anime_id'])]

# One cast for all numeric columns; `astype` returns a fresh frame.
anime_df = anime_df.astype({column: float for column in numeric_columns})

In [None]:
# Display the cleaned anime table to sanity-check the filtering done above.
anime_df

In [None]:
# Keep only the rating rows whose anime is still present in anime_df.
known_anime_mask = user_df['anime_id'].isin(anime_df['anime_id'])
user_df = user_df[known_anime_mask]

In [None]:
# A rating of 0 is not a valid rating, so only strictly positive ratings are kept.
user_df = user_df.loc[user_df['rating'].gt(0)]

In [None]:
# Number of remaining rating rows per user; the resulting index holds the user_id values.
user_counts = user_df['user_id'].value_counts() # Count the number of reviews per user

In [None]:
# Keep only the users that have at least `core` reviews (the bound is inclusive).
has_enough_reviews = user_counts.ge(core)
user_counts = user_counts.loc[has_enough_reviews]

In [None]:
# Retain only the rating rows of users that passed the review-count filter.
active_user_ids = user_counts.index
user_df = user_df[user_df['user_id'].isin(active_user_ids)]

In [None]:
# Raw id arrays: anime ids from the cleaned anime table, and user ids from the
# filtered ratings (one entry per rating row, so user ids repeat).
items, users = anime_df['anime_id'].values, user_df['user_id'].values

In [None]:
# Display the array of anime ids taken from anime_df.
items

In [None]:
# Display the rating column of the filtered user ratings.
user_df['rating']

In [None]:
# Number of distinct anime ids left in anime_df after cleaning.
anime_df['anime_id'].nunique()

In [None]:
# cornac evaluation methods expect interaction data, i.e. (user, item, rating)
# triplets. Anime metadata rows are not interactions: feeding them in would make
# cornac read anime_id / Name / Score as if they were user / item / rating.
# ItemKNN is a collaborative-filtering model, so it is trained on the filtered
# user ratings instead.
data = user_df[['user_id', 'anime_id', 'rating']].values.tolist()

# Hold out 20% of the ratings for testing; the fixed seed keeps the split reproducible.
ratio_split = RatioSplit(data=data, test_size=0.2, rating_threshold=threshold, seed=42)

In [None]:
# Initialize ItemKNN model
# Item-based nearest-neighbour model: 20 neighbours, cosine similarity, verbose output.
item_knn_params = {"k": 20, "similarity": "cosine", "verbose": True}
itemcf = ItemKNN(**item_knn_params)

# Train on the training portion of the current ratio split.
itemcf.fit(ratio_split.train_set)

In [None]:
# plot_similarity_matrix(itemcf.sim_mat, 'cosine', 'Item')

In [None]:
# Score the configured user/anime pair with the item-based model.
# NOTE(review): cornac documents `score` as taking internal user/item indices rather
# than raw ids — confirm that user_id / anime_id are meant as indices here.
itemcf.score(user_id, anime_id)

In [None]:
# Flatten the filtered ratings into a list of [user_id, anime_id, rating] rows.
rating_columns = ['user_id', 'anime_id', 'rating']
data = user_df[rating_columns].values.tolist()

# Evaluation method: ratio split holding out 40% of the rows for testing, with a fixed seed.
ratio_split = RatioSplit(
    data=data,
    test_size=0.4,
    rating_threshold=threshold,
    seed=42,
)

In [None]:
# User-based nearest-neighbour model with 10 neighbours; every other setting is left at the library default.
user_knn_params = {"k": 10}
usercf = UserKNN(**user_knn_params)

# Train on the training portion of the current ratio split.
usercf.fit(ratio_split.train_set)

In [None]:
# Heat map of the similarity matrix held by the fitted UserKNN model.
plot_similarity_matrix(
    similarity_matrix=usercf.sim_mat,
    similarity_metric='cosine',
    base='User',
)

In [None]:
# Score the configured user/anime pair with the user-based model.
# NOTE(review): cornac documents `score` as taking internal user/item indices rather
# than raw ids — confirm that user_id / anime_id are meant as indices here.
usercf.score(user_id, anime_id)