In [19]:
from cornac.models import ItemKNN, UserKNN
from cornac.eval_methods.base_method import BaseMethod
from cornac.data import Dataset
import pandas as pd
import matplotlib.pyplot as plt
from load_filter_and_sample import load_and_filter_data, sample_users
import cornac.metrics as met


In [2]:
user_id = 10
anime_id = 5 

k_values = [5, 10, 15, 30, 50, 100]
threshold = 3.5
top_n = 10

split_percentage = 0.8      # 80% of the data will be used for training and 20% for testing
core = 500   

In [3]:
def plot_similarity_matrix(similarity_matrix, similarity_metric, base):
    plt.figure()
    plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
    plt.title(f"{base}-{base} {similarity_metric} Similarity Matrix")
    plt.colorbar()

In [4]:
IMPORT_PATH_BASE = "datasets/"
user_path = IMPORT_PATH_BASE + "user-filtered.csv"
item_path = IMPORT_PATH_BASE + "anime-dataset-2023.csv"

# Loading the dataframes
user_df, anime_df = load_and_filter_data(user_path, item_path, threshold=core, filtering=False, logging=True)

Loading user ratings...
User ratings loaded
Loading animes list...
Animes list loaded


In [5]:
anime_df.drop(columns=['Other name', 'Name', 'Synopsis', 'Source', 'Premiered', 'Status', 'Producers', 'Licensors', 'Duration'], inplace=True) # Drop unnecessary columns
anime_df.rename(columns={'English name': 'Name'}, inplace=True) # Rename 'English name' to 'Name'
anime_df = anime_df.drop(anime_df[anime_df.eq('UNKNOWN').any(axis=1)].index) # Drop rows with 'UNKNOWN' values
anime_df = anime_df[anime_df['Type'].isin(['Movie', 'TV', 'TV Short'])] # Only keep Movies, TV and TV Short
anime_df = anime_df[anime_df['anime_id'].isin(user_df['anime_id'])] # Only keep items that are in user_df
anime_df['Score'] = anime_df['Score'].astype(float)
anime_df['Episodes'] = anime_df['Episodes'].astype(float)
anime_df['Members'] = anime_df['Members'].astype(float)
anime_df['Favorites'] = anime_df['Favorites'].astype(float)
anime_df['Popularity'] = anime_df['Popularity'].astype(float)
anime_df['Rank'] = anime_df['Rank'].astype(float)

In [6]:
anime_df

Unnamed: 0,anime_id,Name,Score,Genres,Type,Episodes,Aired,Studios,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",Sunrise,R - 17+ (violence & profanity),41.0,43.0,78525.0,914193.0,1771505.0,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: The Movie,8.38,"Action, Sci-Fi",Movie,1.0,"Sep 1, 2001",Bones,R - 17+ (violence & profanity),189.0,602.0,1448.0,206248.0,360978.0,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",Madhouse,PG-13 - Teens 13 or older,328.0,246.0,15035.0,356739.0,727252.0,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,26.0,"Jul 3, 2002 to Dec 25, 2002",Sunrise,PG-13 - Teens 13 or older,2764.0,1795.0,613.0,42829.0,111931.0,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Beet the Vandel Buster,6.94,"Adventure, Fantasy, Supernatural",TV,52.0,"Sep 30, 2004 to Sep 29, 2005",Toei Animation,PG - Children,4240.0,5126.0,14.0,6413.0,15001.0,https://cdn.myanimelist.net/images/anime/7/215...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20915,48470,D_Cide Traumerei the Animation,5.81,"Action, Drama, Fantasy",TV,13.0,"Jul 10, 2021 to Oct 2, 2021",SANZIGEN,PG-13 - Teens 13 or older,9647.0,4243.0,102.0,6258.0,23808.0,https://cdn.myanimelist.net/images/anime/1635/...
20916,48471,Irina: The Vampire Cosmonaut,7.31,"Fantasy, Sci-Fi",TV,12.0,"Oct 4, 2021 to Dec 20, 2021",Arvo Animation,PG-13 - Teens 13 or older,2459.0,1499.0,689.0,49075.0,140342.0,https://cdn.myanimelist.net/images/anime/1393/...
20921,48488,Higurashi: When They Cry – SOTSU,6.73,"Avant Garde, Horror, Mystery, Supernatural, Su...",TV,15.0,"Jul 1, 2021 to Sep 30, 2021",Passione,R - 17+ (violence & profanity),5201.0,1882.0,722.0,41407.0,104539.0,https://cdn.myanimelist.net/images/anime/1083/...
20922,48491,Encouragement of Climb: Next Summit,7.63,"Adventure, Slice of Life",TV,12.0,"Oct 5, 2022 to Dec 21, 2022",8bit,PG-13 - Teens 13 or older,1307.0,4347.0,90.0,5556.0,22465.0,https://cdn.myanimelist.net/images/anime/1942/...


In [7]:
user_df = user_df[user_df['anime_id'].isin(anime_df['anime_id'])] # Only keep users that are in item_df
user_df = user_df[user_df['rating'] > 0] # Remove reviews with rating 0 because it is not a valid rating

In [8]:
users_lot_reviews = sample_users(user_df, min_reviews=400, max_reviews=600, n_users=150)
users_medium_reviews = sample_users(user_df, min_reviews=100, max_reviews=200, n_users=400)
users_small_reviews = sample_users(user_df, min_reviews=50, max_reviews=100, n_users=500)

In [9]:
user_df = pd.concat([users_lot_reviews, users_medium_reviews, users_small_reviews]).reset_index(drop=True)
user_df

Unnamed: 0,user_id,anime_id,rating
0,4136,22043,8
1,4136,31630,7
2,4136,3269,10
3,4136,873,6
4,4136,48,8
...,...,...,...
166384,353272,37976,6
166385,353272,6500,7
166386,353272,4063,6
166387,353272,36882,2


In [10]:
items = anime_df['anime_id'].values
users = user_df['user_id'].values

In [11]:
def split_data(data, split_percentage):
    def split_group(group):
        group = group.sample(frac=1, random_state=1).reset_index(drop=True)
        split_index = int(len(group) * split_percentage)
        return group.iloc[:split_index], group.iloc[split_index:]
    
    grouped = data.groupby('user_id')
    
    train_list, test_list = zip(*grouped.apply(lambda x: split_group(x)).values)
    
    train_set = pd.concat(train_list).reset_index(drop=True)
    test_set = pd.concat(test_list).reset_index(drop=True)
    
    return train_set, test_set

In [13]:
train, test = split_data(user_df, split_percentage)

  train_list, test_list = zip(*grouped.apply(lambda x: split_group(x)).values)


In [14]:
train_dataset = Dataset.from_uir(train[['user_id', 'anime_id', 'rating']].values.tolist(), seed=42)
test_dataset = Dataset.from_uir(test[['user_id', 'anime_id', 'rating']].values.tolist(), seed=42)

# Item-based Collaborative Filtering

In [15]:
# Initialize ItemKNN model
itemcf = ItemKNN(k=10, similarity="cosine", verbose=True)

# Fit the model using the training set from ratio split
itemcf.fit(train_dataset)

100%|██████████| 3057/3057 [00:00<00:00, 23955.36it/s]


<cornac.models.knn.recom_knn.ItemKNN at 0x7f6911b24380>

In [24]:
eval_method = BaseMethod.from_splits(train[['user_id', 'anime_id', 'rating']].values, test[['user_id', 'anime_id', 'rating']].values)
metrics = [met.MSE(), met.Precision(k=10), met.Recall(k=10)]

result = eval_method.evaluate(itemcf, metrics, user_based=False)

100%|██████████| 3057/3057 [00:00<00:00, 27925.43it/s]


In [21]:
for metric in result:
    print(metric)

        |    MSE | Precision@10 | Recall@10 | Train (s) | Test (s)
------- + ------ + ------------ + --------- + --------- + --------
ItemKNN | 2.3917 |       0.0142 |    0.0053 |    0.2942 |   6.8114

None


# User-based Collaborative Filtering

In [22]:
# Initialize UserKNN model
usercf = UserKNN(k=10, similarity="cosine", verbose=True)

# Fit the model using the training set from ratio split
usercf.fit(train_dataset)

100%|██████████| 1050/1050 [00:00<00:00, 26524.80it/s]


<cornac.models.knn.recom_knn.UserKNN at 0x7f6911b24830>

In [25]:
result = eval_method.evaluate(usercf, metrics, user_based=False)

100%|██████████| 1050/1050 [00:00<00:00, 31963.91it/s]


In [26]:
for metric in result:
    print(metric)

        |    MSE | Precision@10 | Recall@10 | Train (s) | Test (s)
------- + ------ + ------------ + --------- + --------- + --------
UserKNN | 1.7651 |       0.0015 |    0.0006 |    0.0651 |   6.5035

None
