# Anime Dataset 2023

Dataset Kaggle : https://www.kaggle.com/datasets/dbdmobile/myanimelist-dataset
- anime-dataset-2023.csv
- user-filtered.csv

The Goal of this project is to recommand anime for a specific user considering his anime list.

In [1]:
from cornac.models import ItemKNN, UserKNN

import scipy.sparse as sparse
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances

  from .autonotebook import tqdm as notebook_tqdm
  match = re.match("^#\s*version\s*([0-9a-z]*)\s*$", line)


In [2]:
IMPORT_PATH_BASE = "datasets/"

## Loading the datasets

In [3]:
# Read the anime dataset
anime_df = pd.read_csv(IMPORT_PATH_BASE + 'anime-dataset-2023.csv')

# Read the user dataset
user_df = pd.read_csv(IMPORT_PATH_BASE + 'user-filtered.csv')

In [4]:
anime_df.columns

Index(['anime_id', 'Name', 'English name', 'Other name', 'Score', 'Genres',
       'Synopsis', 'Type', 'Episodes', 'Aired', 'Premiered', 'Status',
       'Producers', 'Licensors', 'Studios', 'Source', 'Duration', 'Rating',
       'Rank', 'Popularity', 'Favorites', 'Scored By', 'Members', 'Image URL'],
      dtype='object')

## Filtering the data

In [5]:
anime_df.drop(columns=['Other name', 'Name', 'Synopsis', 'Source', 'Premiered', 'Status', 'Producers', 'Licensors', 'Duration'], inplace=True)
anime_df.rename(columns={'English name': 'Name'}, inplace=True)
anime_df = anime_df.drop(anime_df[anime_df.eq('UNKNOWN').any(axis=1)].index)
anime_df = anime_df[anime_df['Type'].isin(['Movie', 'TV', 'TV Short'])]
user_df = user_df[user_df['anime_id'].isin(anime_df['anime_id'])]
anime_df = anime_df[anime_df['anime_id'].isin(user_df['anime_id'])]


In [6]:
anime_df

Unnamed: 0,anime_id,Name,Score,Genres,Type,Episodes,Aired,Studios,Rating,Rank,Popularity,Favorites,Scored By,Members,Image URL
0,1,Cowboy Bebop,8.75,"Action, Award Winning, Sci-Fi",TV,26.0,"Apr 3, 1998 to Apr 24, 1999",Sunrise,R - 17+ (violence & profanity),41.0,43,78525,914193.0,1771505,https://cdn.myanimelist.net/images/anime/4/196...
1,5,Cowboy Bebop: The Movie,8.38,"Action, Sci-Fi",Movie,1.0,"Sep 1, 2001",Bones,R - 17+ (violence & profanity),189.0,602,1448,206248.0,360978,https://cdn.myanimelist.net/images/anime/1439/...
2,6,Trigun,8.22,"Action, Adventure, Sci-Fi",TV,26.0,"Apr 1, 1998 to Sep 30, 1998",Madhouse,PG-13 - Teens 13 or older,328.0,246,15035,356739.0,727252,https://cdn.myanimelist.net/images/anime/7/203...
3,7,Witch Hunter Robin,7.25,"Action, Drama, Mystery, Supernatural",TV,26.0,"Jul 3, 2002 to Dec 25, 2002",Sunrise,PG-13 - Teens 13 or older,2764.0,1795,613,42829.0,111931,https://cdn.myanimelist.net/images/anime/10/19...
4,8,Beet the Vandel Buster,6.94,"Adventure, Fantasy, Supernatural",TV,52.0,"Sep 30, 2004 to Sep 29, 2005",Toei Animation,PG - Children,4240.0,5126,14,6413.0,15001,https://cdn.myanimelist.net/images/anime/7/215...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20915,48470,D_Cide Traumerei the Animation,5.81,"Action, Drama, Fantasy",TV,13.0,"Jul 10, 2021 to Oct 2, 2021",SANZIGEN,PG-13 - Teens 13 or older,9647.0,4243,102,6258.0,23808,https://cdn.myanimelist.net/images/anime/1635/...
20916,48471,Irina: The Vampire Cosmonaut,7.31,"Fantasy, Sci-Fi",TV,12.0,"Oct 4, 2021 to Dec 20, 2021",Arvo Animation,PG-13 - Teens 13 or older,2459.0,1499,689,49075.0,140342,https://cdn.myanimelist.net/images/anime/1393/...
20921,48488,Higurashi: When They Cry – SOTSU,6.73,"Avant Garde, Horror, Mystery, Supernatural, Su...",TV,15.0,"Jul 1, 2021 to Sep 30, 2021",Passione,R - 17+ (violence & profanity),5201.0,1882,722,41407.0,104539,https://cdn.myanimelist.net/images/anime/1083/...
20922,48491,Encouragement of Climb: Next Summit,7.63,"Adventure, Slice of Life",TV,12.0,"Oct 5, 2022 to Dec 21, 2022",8bit,PG-13 - Teens 13 or older,1307.0,4347,90,5556.0,22465,https://cdn.myanimelist.net/images/anime/1942/...


In [7]:
user_df

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
5,0,24,9
...,...,...,...
109224741,353404,243,7
109224743,353404,392,9
109224744,353404,882,6
109224745,353404,883,8


# Data evaluation and recommendation

In [8]:
def filter_users(train, test, core):
    users_r_count = train.groupby("user_id")["rating"].count().rename("count").reset_index()
    users_to_remove = users_r_count[users_r_count["count"] < core]["user_id"].values

    train_filtered = train[~train["user_id"].isin(users_to_remove)]
    test_filtered = test[~test["user_id"].isin(users_to_remove)]

    users_in_train = set(train.user_id.unique())
    users_in_test = set(test.user_id.unique())
    users_not_in_train = users_in_test - users_in_train

    test_final = test_filtered[~test_filtered["user_id"].isin(list(users_not_in_train))]

    return (train_filtered, test_final)

def filter_items(train, test):
    items_in_train = set(train.anime_id.unique())
    items_in_test = set(test.anime_id.unique())
    items_not_in_train = items_in_test - items_in_train

    test_final = test[~test["anime_id"].isin(list(items_not_in_train))]

    def all_negative(l):
            return sum(l) == -(len(l))

    user_ratings = test_final.groupby("user_id")["rating"].apply(list).reset_index()
    user_ratings["all_negative_sampling"] = user_ratings.apply(lambda r, f=all_negative: f(r["rating"]), axis=1)
    users_with_no_real_test_ratings = user_ratings[user_ratings.all_negative_sampling == True]["user_id"].unique().tolist()

    test_final = test_final[~test_final.user_id.isin(users_with_no_real_test_ratings)]

    return (train, test_final)

def split_data(data, split_percentage, core, items, users):
    train_set = data[0 : int(split_percentage * data.shape[0])]
    test_set = data[int(split_percentage * data.shape[0]):]

    train1 , test1 = filter_users(train_set, test_set, core)
    train2, test2 = filter_items(train1, test1)

    users = list(train2.user_id.unique())    # get all unique users
    items = list(train2.anime_id.unique())   # get all unique animes
    rating = list(train2.rating)            # get ratings

    rows = train2.user_id.astype(pd.api.types.CategoricalDtype(categories = users)).cat.codes    # Get the associated row indices
    cols = train2.anime_id.astype(pd.api.types.CategoricalDtype(categories = items)).cat.codes   # Get the associated row indices

    rating_matrix = sparse.csr_matrix((rating, (rows, cols)), shape = (len(users), len(items)))

    return rating_matrix, train2, test2
    
def plot_similarity_matrix(similarity_matrix, similarity_metric, base):
    """
    Plot the similarity matrix.

    Parameters
    ----------
        similarity_matrix : numpy.ndarray
            -> The similarity matrix between users or items.
        similarity_metric : str
            -> The metric used to compute the similarity matrix.
        base : str
            -> The base (users or items).
    """
    _ = plt.figure()
    plt.imshow(similarity_matrix, cmap='hot', interpolation='nearest')
    plt.title(f"{base}-{base} {similarity_metric} Similarity Matrix")
    plt.colorbar()

In [13]:
user_id = 10
k_values = [5, 10, 15, 30, 50, 100]
threshold = 3.5
top_n = 10

split_percentage = 0.8      # 80% of the data will be used for training and 20% for testing
core = 1000                  # keep only items that where rated by at least $core$ users

items = anime_df['anime_id'].values
users = user_df['user_id'].values

user_counts = user_df.value_counts('user_id')
user_counts = user_counts[user_counts >= core]

data = user_df[user_df['user_id'].isin(user_counts.index)]

In [14]:
print(data.shape[0])

6330047


In [15]:
rating_matrix, train, test = split_data(data, split_percentage, core, items, users)

ValueError: Cannot set a DataFrame with multiple columns to the single column all_negative_sampling

## 3.2 Item-based Collaborative Filtering

### • Item Similarity Computation

In [None]:
itemcf=ItemKNN(k=10)
itemcf.fit(train.T)

### • Neighborhood Formation

In [None]:
plot_similarity_matrix(itemcf.sim_mat, 'cosine', 'Item')

### • Recommendation Generation

In [None]:
itemcf.predict(user_id, top_n)

## 3.3 User-based Collaborative Filtering

### • Item Similarity Computation

In [12]:
usercf=ItemKNN(k=10)
usercf.fit(train)

AttributeError: 'DataFrame' object has no attribute 'reset'

### • Neighborhood Formation

In [None]:
plot_similarity_matrix(usercf.sim_mat, 'cosine', 'User')

### • Recommendation Generation

In [None]:
usercf.predict(user_id, top_n)