## Importing stuff

In [2]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import matplotlib.pyplot as plt
import json
import requests
import string
import random

In [3]:
token_file = 'D:/Nitty/Personal Projects/Anime-recommendation-system/API/key.json'
with open(token_file) as f:
    data = json.load(f)
ACCESS_TOKEN = data['access_token']

In [4]:
anime_df = pd.read_csv("D:/Nitty/Personal Projects/Anime-recommendation-system/anime_dataset/anime_cleaned.csv")
user_df= pd.read_csv("D:/Nitty/Personal Projects/Anime-recommendation-system/anime_dataset/users_cleaned.csv")
anime_list_df = pd.read_csv("D:/Nitty/Personal Projects/Anime-recommendation-system/anime_dataset/animelists_cleaned.csv")

## Pre processing 

In [5]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   anime_id         6668 non-null   int64  
 1   title            6668 non-null   object 
 2   title_english    3438 non-null   object 
 3   title_japanese   6663 non-null   object 
 4   title_synonyms   4481 non-null   object 
 5   image_url        6666 non-null   object 
 6   type             6668 non-null   object 
 7   source           6668 non-null   object 
 8   episodes         6668 non-null   int64  
 9   status           6668 non-null   object 
 10  airing           6668 non-null   bool   
 11  aired_string     6668 non-null   object 
 12  aired            6668 non-null   object 
 13  duration         6668 non-null   object 
 14  rating           6586 non-null   object 
 15  score            6668 non-null   float64
 16  scored_by        6668 non-null   int64  
 17  rank          

In [6]:
anime_df.columns

Index(['anime_id', 'title', 'title_english', 'title_japanese',
       'title_synonyms', 'image_url', 'type', 'source', 'episodes', 'status',
       'airing', 'aired_string', 'aired', 'duration', 'rating', 'score',
       'scored_by', 'rank', 'popularity', 'members', 'favorites', 'background',
       'premiered', 'broadcast', 'related', 'producer', 'licensor', 'studio',
       'genre', 'opening_theme', 'ending_theme', 'duration_min',
       'aired_from_year'],
      dtype='object')

In [7]:
anime_df.isna().sum()

anime_id              0
title                 0
title_english      3230
title_japanese        5
title_synonyms     2187
image_url             2
type                  0
source                0
episodes              0
status                0
airing                0
aired_string          0
aired                 0
duration              0
rating               82
score                 0
scored_by             0
rank                356
popularity            0
members               0
favorites             0
background         5855
premiered          3702
broadcast          3688
related               0
producer           2266
licensor           3881
studio                0
genre                 4
opening_theme         0
ending_theme          0
duration_min          0
aired_from_year       0
dtype: int64

In [8]:
def fetch_anime_data(anime_id):
    url = f"https://api.myanimelist.net/v2/anime/{anime_id}?fields=genres,main_picture,title,title_english,title_japanese,rating"
    headers = {"Authorization": f"Bearer {ACCESS_TOKEN}"}
    
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            data = response.json()
            
            genre = ", ".join([g['name'] for g in data.get('genres', [])])
            image_url = data.get('main_picture', {}).get('large', None)
            
            print(f"anime {anime_id}:{genre}, {image_url}")
            return genre, image_url
        
    except Exception as e:
        print(f"Error fetching data for anime_id {anime_id}: {e}")
    return None, None, None, None, None

for index, row in anime_df.iterrows():
    if any(pd.isna(row[col]) for col in ['genre', 'image_url']):
        genre, image_url= fetch_anime_data(row['anime_id'])
        
        if pd.isna(row['genre']):
            anime_df.at[index, 'genre'] = genre
        if pd.isna(row['image_url']):
            anime_df.at[index, 'image_url'] = image_url

anime_df.isna().sum(), anime_df.info()

anime 33389:, https://cdn.myanimelist.net/images/anime/2/84200l.webp
anime 35576:Comedy, https://cdn.myanimelist.net/images/anime/1375/93755l.jpg
anime 32695:, https://cdn.myanimelist.net/images/anime/12/78478l.jpg
anime 17813:Supernatural, https://cdn.myanimelist.net/images/anime/4/68993l.jpg
anime 23971:Comedy, Historical, None
anime 37018:Kids, https://cdn.myanimelist.net/images/anime/6/89762l.jpg
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   anime_id         6668 non-null   int64  
 1   title            6668 non-null   object 
 2   title_english    3438 non-null   object 
 3   title_japanese   6663 non-null   object 
 4   title_synonyms   4481 non-null   object 
 5   image_url        6667 non-null   object 
 6   type             6668 non-null   object 
 7   source           6668 non-null   object 
 8   episodes         6668 no

(anime_id              0
 title                 0
 title_english      3230
 title_japanese        5
 title_synonyms     2187
 image_url             1
 type                  0
 source                0
 episodes              0
 status                0
 airing                0
 aired_string          0
 aired                 0
 duration              0
 rating               82
 score                 0
 scored_by             0
 rank                356
 popularity            0
 members               0
 favorites             0
 background         5855
 premiered          3702
 broadcast          3688
 related               0
 producer           2266
 licensor           3881
 studio                0
 genre                 0
 opening_theme         0
 ending_theme          0
 duration_min          0
 aired_from_year       0
 dtype: int64,
 None)

In [9]:
anime_samp_df=anime_df[['anime_id','title','genre']]

In [10]:
"""anime_samp_df['genre'] = anime_samp_df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)
genre_dummies = anime_samp_df['genre'].apply(pd.Series).stack().str.get_dummies().reset_index(level=1, drop=True)
anime_samp_df = anime_samp_df.join(genre_dummies)
anime_samp_df = anime_samp_df.drop(columns=['genre'])
anime_samp_df.info()"""


"anime_samp_df['genre'] = anime_samp_df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)\ngenre_dummies = anime_samp_df['genre'].apply(pd.Series).stack().str.get_dummies().reset_index(level=1, drop=True)\nanime_samp_df = anime_samp_df.join(genre_dummies)\nanime_samp_df = anime_samp_df.drop(columns=['genre'])\nanime_samp_df.info()"

In [11]:
anime_samp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  6668 non-null   int64 
 1   title     6668 non-null   object
 2   genre     6668 non-null   object
dtypes: int64(1), object(2)
memory usage: 156.4+ KB


In [12]:
anime_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31284030 entries, 0 to 31284029
Data columns (total 11 columns):
 #   Column               Dtype  
---  ------               -----  
 0   username             object 
 1   anime_id             int64  
 2   my_watched_episodes  int64  
 3   my_start_date        object 
 4   my_finish_date       object 
 5   my_score             int64  
 6   my_status            int64  
 7   my_rewatching        float64
 8   my_rewatching_ep     int64  
 9   my_last_updated      object 
 10  my_tags              object 
dtypes: float64(1), int64(5), object(5)
memory usage: 2.6+ GB


In [13]:
anime_list_df.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated,my_tags
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,,0,2013-03-03 10:52:53,
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,,0,2013-03-10 13:54:51,
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,,0,2013-04-27 16:43:35,
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,,0,2013-03-03 10:53:57,
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13,


In [14]:
anime_list_df.isna().sum()

username                    243
anime_id                      0
my_watched_episodes           0
my_start_date                 0
my_finish_date                0
my_score                      0
my_status                     0
my_rewatching           6878247
my_rewatching_ep              0
my_last_updated               0
my_tags                29290429
dtype: int64

In [15]:
def generate_random_string(length=8):
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

anime_list_df['username'] = anime_list_df['username'].apply(lambda x: generate_random_string() if pd.isna(x) else x)
anime_list_df['my_rewatching'] = anime_list_df['my_rewatching'].fillna(0)
anime_list_df = anime_list_df.drop(columns=['my_tags'])

In [16]:
anime_samp_list_df=anime_list_df[['username','anime_id','my_score']]

In [17]:
anime_samp_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31284030 entries, 0 to 31284029
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   username  object
 1   anime_id  int64 
 2   my_score  int64 
dtypes: int64(2), object(1)
memory usage: 716.0+ MB


In [18]:
anime_list_df.isna().sum()

username               0
anime_id               0
my_watched_episodes    0
my_start_date          0
my_finish_date         0
my_score               0
my_status              0
my_rewatching          0
my_rewatching_ep       0
my_last_updated        0
dtype: int64

In [19]:
user_df.isna().sum()

username                         1
user_id                          0
user_watching                    0
user_completed                   0
user_onhold                      0
user_dropped                     0
user_plantowatch                 0
user_days_spent_watching         0
gender                           0
location                         5
birth_date                       0
access_rank                 108711
join_date                        0
last_online                      0
stats_mean_score                 0
stats_rewatched                  0
stats_episodes                   0
dtype: int64

In [20]:
user_df=user_df.drop(columns=['access_rank'])

In [21]:
user_df = user_df.dropna(subset=['location'])
user_df['username'] = user_df['username'].apply(lambda x: generate_random_string() if pd.isna(x) else x)
user_df.isna().sum()

username                    0
user_id                     0
user_watching               0
user_completed              0
user_onhold                 0
user_dropped                0
user_plantowatch            0
user_days_spent_watching    0
gender                      0
location                    0
birth_date                  0
join_date                   0
last_online                 0
stats_mean_score            0
stats_rewatched             0
stats_episodes              0
dtype: int64

In [22]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 33 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   anime_id         6668 non-null   int64  
 1   title            6668 non-null   object 
 2   title_english    3438 non-null   object 
 3   title_japanese   6663 non-null   object 
 4   title_synonyms   4481 non-null   object 
 5   image_url        6667 non-null   object 
 6   type             6668 non-null   object 
 7   source           6668 non-null   object 
 8   episodes         6668 non-null   int64  
 9   status           6668 non-null   object 
 10  airing           6668 non-null   bool   
 11  aired_string     6668 non-null   object 
 12  aired            6668 non-null   object 
 13  duration         6668 non-null   object 
 14  rating           6586 non-null   object 
 15  score            6668 non-null   float64
 16  scored_by        6668 non-null   int64  
 17  rank          

In [23]:
anime_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31284030 entries, 0 to 31284029
Data columns (total 10 columns):
 #   Column               Dtype  
---  ------               -----  
 0   username             object 
 1   anime_id             int64  
 2   my_watched_episodes  int64  
 3   my_start_date        object 
 4   my_finish_date       object 
 5   my_score             int64  
 6   my_status            int64  
 7   my_rewatching        float64
 8   my_rewatching_ep     int64  
 9   my_last_updated      object 
dtypes: float64(1), int64(5), object(4)
memory usage: 2.3+ GB


In [24]:
anime_list_df.head()

Unnamed: 0,username,anime_id,my_watched_episodes,my_start_date,my_finish_date,my_score,my_status,my_rewatching,my_rewatching_ep,my_last_updated
0,karthiga,21,586,0000-00-00,0000-00-00,9,1,0.0,0,2013-03-03 10:52:53
1,karthiga,59,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-10 13:54:51
2,karthiga,74,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-04-27 16:43:35
3,karthiga,120,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-03 10:53:57
4,karthiga,178,26,0000-00-00,0000-00-00,7,2,0.0,0,2013-03-27 15:59:13


In [25]:
anime_list_df['my_status'].unique()

array([ 1,  2,  3,  4,  6, 55, 33,  5,  0])

In [26]:
user_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 108706 entries, 0 to 108710
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   username                  108706 non-null  object 
 1   user_id                   108706 non-null  int64  
 2   user_watching             108706 non-null  int64  
 3   user_completed            108706 non-null  int64  
 4   user_onhold               108706 non-null  int64  
 5   user_dropped              108706 non-null  int64  
 6   user_plantowatch          108706 non-null  int64  
 7   user_days_spent_watching  108706 non-null  float64
 8   gender                    108706 non-null  object 
 9   location                  108706 non-null  object 
 10  birth_date                108706 non-null  object 
 11  join_date                 108706 non-null  object 
 12  last_online               108706 non-null  object 
 13  stats_mean_score          108706 non-null  float6

In [27]:
user_df.head()

Unnamed: 0,username,user_id,user_watching,user_completed,user_onhold,user_dropped,user_plantowatch,user_days_spent_watching,gender,location,birth_date,join_date,last_online,stats_mean_score,stats_rewatched,stats_episodes
0,karthiga,2255153,3,49,1,0,0,55.091667,Female,"Chennai, India",1990-04-29 00:00:00,2013-03-03 00:00:00,2014-02-04 01:32:00,7.43,0.0,3391
1,Damonashu,37326,45,195,27,25,59,82.574306,Male,"Detroit,Michigan",1991-08-01 00:00:00,2008-02-13 00:00:00,2017-07-10 06:52:54,6.15,6.0,4903
2,bskai,228342,25,414,2,5,11,159.483333,Male,"Nayarit, Mexico",1990-12-14 00:00:00,2009-08-31 00:00:00,2014-05-12 16:35:00,8.27,1.0,9701
3,terune_uzumaki,327311,5,5,0,0,0,11.394444,Female,"Malaysia, Kuantan",1998-08-24 00:00:00,2010-05-10 00:00:00,2012-10-18 19:06:00,9.7,6.0,697
4,Bas_G,5015094,35,114,6,20,175,30.458333,Male,"Nijmegen, Nederland",1999-10-24 00:00:00,2015-11-26 00:00:00,2018-05-10 20:53:37,7.86,0.0,1847


In [28]:
user_samp_df = user_df[["username","user_id"]]

In [29]:
anime_samp_list_df = pd.merge(user_samp_df,anime_samp_list_df, on='username')
anime_samp_list_df.head()

Unnamed: 0,username,user_id,anime_id,my_score
0,karthiga,2255153,21,9
1,karthiga,2255153,59,7
2,karthiga,2255153,74,7
3,karthiga,2255153,120,7
4,karthiga,2255153,178,7


In [30]:
anime_samp_list_df.isna().sum(),anime_samp_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31282812 entries, 0 to 31282811
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   username  object
 1   user_id   int64 
 2   anime_id  int64 
 3   my_score  int64 
dtypes: int64(3), object(1)
memory usage: 954.7+ MB


(username    0
 user_id     0
 anime_id    0
 my_score    0
 dtype: int64,
 None)

In [31]:
n_users = anime_samp_list_df.user_id.unique().shape[0]
n_anime = anime_samp_list_df.anime_id.unique().shape[0]
print( str(n_users) + ' users')
print( str(n_anime) + ' anime')

108704 users
6668 anime


In [32]:
anime_final_df = pd.merge(anime_samp_list_df,anime_samp_df, on='anime_id')
anime_final_df.head()

Unnamed: 0,username,user_id,anime_id,my_score,title,genre
0,karthiga,2255153,21,9,One Piece,"Action, Adventure, Comedy, Super Power, Drama,..."
1,karthiga,2255153,59,7,Chobits,"Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen"
2,karthiga,2255153,74,7,Gakuen Alice,"Comedy, School, Shoujo, Super Power"
3,karthiga,2255153,120,7,Fruits Basket,"Slice of Life, Comedy, Drama, Romance, Fantasy..."
4,karthiga,2255153,178,7,Ultra Maniac,"Magic, Comedy, Romance, School, Shoujo"


In [33]:
anime_final_df.info(),anime_final_df.shape,anime_final_df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31282812 entries, 0 to 31282811
Data columns (total 6 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   username  object
 1   user_id   int64 
 2   anime_id  int64 
 3   my_score  int64 
 4   title     object
 5   genre     object
dtypes: int64(3), object(3)
memory usage: 1.4+ GB


(None,
 (31282812, 6),
 username    0
 user_id     0
 anime_id    0
 my_score    0
 title       0
 genre       0
 dtype: int64)

In [34]:
anime_final_df.head()

Unnamed: 0,username,user_id,anime_id,my_score,title,genre
0,karthiga,2255153,21,9,One Piece,"Action, Adventure, Comedy, Super Power, Drama,..."
1,karthiga,2255153,59,7,Chobits,"Sci-Fi, Comedy, Drama, Romance, Ecchi, Seinen"
2,karthiga,2255153,74,7,Gakuen Alice,"Comedy, School, Shoujo, Super Power"
3,karthiga,2255153,120,7,Fruits Basket,"Slice of Life, Comedy, Drama, Romance, Fantasy..."
4,karthiga,2255153,178,7,Ultra Maniac,"Magic, Comedy, Romance, School, Shoujo"


In [35]:
anime_final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31282812 entries, 0 to 31282811
Data columns (total 6 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   username  object
 1   user_id   int64 
 2   anime_id  int64 
 3   my_score  int64 
 4   title     object
 5   genre     object
dtypes: int64(3), object(3)
memory usage: 1.4+ GB


In [36]:
anime_samp_df.info(),anime_samp_list_df.info(),user_samp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   anime_id  6668 non-null   int64 
 1   title     6668 non-null   object
 2   genre     6668 non-null   object
dtypes: int64(1), object(2)
memory usage: 156.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31282812 entries, 0 to 31282811
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   username  object
 1   user_id   int64 
 2   anime_id  int64 
 3   my_score  int64 
dtypes: int64(3), object(1)
memory usage: 954.7+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 108706 entries, 0 to 108710
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   username  108706 non-null  object
 1   user_id   108706 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


(None, None, None)

## Recomendation system ##

### MAL USER 

In [37]:
output_file = 'D:/Nitty/Personal Projects/Anime-recommendation-system/API/MAL_user_data.json'
with open(output_file) as f:
    data = json.load(f)


In [38]:
user_id=5656156
username='Nitheeshwar'
animelist=set(anime_samp_df['anime_id'])
watched_anime_ids = set()
df=[]
for i in range(len(data)):
    if data[i]['node']['id'] in animelist:
        watched_anime_ids.add(data[i]['node']['id'])
        if data[i]['list_status']['score']>0:
            genre_series = anime_samp_df[anime_samp_df['anime_id'] == data[i]['node']['id']]['genre']
            genre = genre_series.iloc[0] if not genre_series.empty else None
            df.append([username,user_id,data[i]['node']['id'],data[i]['node']['title'],data[i]['list_status']['score'],genre])
MAL_user_df = pd.DataFrame(df, columns=['user_id','username','anime_id','anime_title', 'my_score', 'genre'])

In [39]:
anime_df['related'][0]

"{'Adaptation': [{'mal_id': 17207, 'type': 'manga', 'url': 'https://myanimelist.net/manga/17207/Inu_x_Boku_SS', 'title': 'Inu x Boku SS'}], 'Sequel': [{'mal_id': 13403, 'type': 'anime', 'url': 'https://myanimelist.net/anime/13403/Inu_x_Boku_SS_Special', 'title': 'Inu x Boku SS Special'}]}"

In [40]:
import ast

def extract_related_ids(related_entry):
    related_ids = set()
    try:
        related_dict = ast.literal_eval(related_entry)
        if isinstance(related_dict, dict):
            for key, related_list in related_dict.items():
                for item in related_list:
                    if isinstance(item, dict) and 'mal_id' in item:
                        related_ids.add(item['mal_id'])

    except (ValueError, SyntaxError) as e:
        print(f"Error processing related entry: {e}")

    return related_ids


In [41]:
def get_related_anime_ids(user_watched_ids, anime_df):
    related_ids = set()
    user_watched_anime = anime_df[anime_df['anime_id'].isin(user_watched_ids)]

    for _, row in user_watched_anime.iterrows():
        related_ids.update(extract_related_ids(row['related']))

    return related_ids


In [42]:
related_anime_ids = get_related_anime_ids(watched_anime_ids, anime_df)
exclude_ids = watched_anime_ids.union(related_anime_ids)

In [43]:
MAL_user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      80 non-null     object
 1   username     80 non-null     int64 
 2   anime_id     80 non-null     int64 
 3   anime_title  80 non-null     object
 4   my_score     80 non-null     int64 
 5   genre        80 non-null     object
dtypes: int64(3), object(3)
memory usage: 3.9+ KB


In [44]:
MAL_user_df.shape

(80, 6)

#### Content based ####

In [45]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [46]:
tfidf = TfidfVectorizer(tokenizer=lambda x: x.split(', '))
anime_tfidf = tfidf.fit_transform(anime_df['genre'])



In [47]:
user_genres = " ".join(MAL_user_df['genre'])
user_vector = tfidf.transform([user_genres])
similarities = cosine_similarity(user_vector, anime_tfidf).flatten()

In [48]:
def filter_related_recommendations(recommendations, anime_df):
    recommended_anime_ids = recommendations['anime_id'].tolist()
    filtered_ids = set()

    for anime_id in recommended_anime_ids:
        if anime_id in filtered_ids:
            continue

        related_ids = set()
        related_entry = anime_df.loc[anime_df['anime_id'] == anime_id, 'related'].values
        if len(related_entry) > 0:
            related_ids = extract_related_ids(related_entry[0]) 

        filtered_ids.update(related_ids)

    return recommendations[~recommendations['anime_id'].isin(filtered_ids)]


In [49]:
anime_df['similarity'] = similarities
content_recommendations = anime_df.sort_values('similarity', ascending=False)
filtered_content_recommendations = content_recommendations[~content_recommendations['anime_id'].isin(exclude_ids)]
filtered_recommendations = filter_related_recommendations(filtered_content_recommendations, anime_df)

final_recommendations = filtered_recommendations.head(10)
final_recommendations

Unnamed: 0,anime_id,title,title_english,title_japanese,title_synonyms,image_url,type,source,episodes,status,...,related,producer,licensor,studio,genre,opening_theme,ending_theme,duration_min,aired_from_year,similarity
2936,4197,Blue Dragon: Tenkai no Shichi Ryuu,Blue Dragon: The Seven Dragons of the Heavens,BLUE DRAGON 天界の七竜,"Blue Dragon: Tenkuu no Nana Ryuu, Blue Dragon:...",https://myanimelist.cdn-dena.com/images/anime/...,TV,Game,51,Finished Airing,...,"{'Prequel': [{'mal_id': 2142, 'type': 'anime',...",TV Tokyo,Viz Media,Studio Pierrot,"Adventure, Comedy, Fantasy, Supernatural","['#1: ""Hikari no Sasu Hō e (光の指す方へ)"" by JiLL-D...","['#1: ""Tsubomi (蕾)"" by JiLL-Decoy association ...",20.0,2008.0,0.646504
794,1961,Arashi no Yoru ni,,あらしのよるに,"In the Night of the Storm, On A Stormy Night, ...",https://myanimelist.cdn-dena.com/images/anime/...,Movie,Book,1,Finished Airing,...,"{'Adaptation': [{'mal_id': 7330, 'type': 'mang...","Shogakukan Productions, Mainichi Broadcasting ...",,Group TAC,"Adventure, Comedy, Drama, Fantasy",[],"['""aiko"" by Star']",107.0,2005.0,0.635193
3281,4712,Digimon Savers: Agumon! Gaomon! Lalamon! Bakur...,,デジモンセイバーズ 特典映像 アグモン！ガオモン！ララモン！爆裂！場外ラストバトル！,Digimon Savers Special: Agumon! Gaomon! Lalamo...,https://myanimelist.cdn-dena.com/images/anime/...,Special,Original,1,Finished Airing,...,"{'Full story': [{'mal_id': 859, 'type': 'anime...",,,Toei Animation,"Action, Adventure, Comedy, Fantasy, Sci-Fi, Su...","['""Hirari"" by Wada Kouji']","['""Ryuusei"" by MiyuMiyu']",23.0,2007.0,0.632691
3552,893,Dragon Ball Movie 4: Saikyou e no Michi,Dragon Ball Movie 4: The Path to Power,ドラゴンボール 最強への道,Dragon Ball 10th Anniversary Movie,https://myanimelist.cdn-dena.com/images/anime/...,Movie,Manga,1,Finished Airing,...,"{'Alternative version': [{'mal_id': 223, 'type...",,Funimation,Toei Animation,"Action, Adventure, Comedy, Fantasy, Sci-Fi, Sh...",[],"['""DAN DAN Kokoro Hikareteku"" by FIELD OF VIEW']",74.0,1996.0,0.620841
4942,33142,Re:Zero kara Hajimeru Break Time,Re:ZERO ~Starting Break Time From Zero~,Re:ゼロから始める休憩時間〈ブレイクタイム〉,Re:Zero kara Hajimeru Kyuukei Jikan,https://myanimelist.cdn-dena.com/images/anime/...,Special,Light novel,11,Finished Airing,...,"{'Other': [{'mal_id': 31240, 'type': 'anime', ...",TV Tokyo,Funimation,Studio PuYUKAI,"Adventure, Comedy, Parody, Fantasy",[],[],2.0,2016.0,0.620825
2611,2832,Ani*Kuri15,,アニ＊クリ15,"Ani*Cre15, Ani Kuri 15, Ani-Kuri 15, Anikuri 1...",https://myanimelist.cdn-dena.com/images/anime/...,Special,Original,15,Finished Airing,...,[],,,"Gonzo, Gainax, Production I.G, Madhouse, Studi...","Slice of Life, Adventure, Fantasy, Magic, Game...",[],[],1.0,2007.0,0.617019
3570,225,Dragon Ball GT,Dragon Ball GT,ドラゴンボールGT,"Dragonball GT, DBGT",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,64,Finished Airing,...,"{'Prequel': [{'mal_id': 813, 'type': 'anime', ...",Fuji TV,Funimation,Toei Animation,"Action, Adventure, Comedy, Fantasy, Magic, Sci...","['""DAN DAN Kokoro Hikarete ku"" by Field of View']","['#1: ""Hitori Ja Nai"" by Deen (eps 1-26)', '#2...",23.0,1996.0,0.616174
4229,231,Asagiri no Miko,Shrine of the Morning Mist,朝霧の巫女,"Maidens of Morning Mist, Priestesses of the Mo...",https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,"{'Adaptation': [{'mal_id': 3730, 'type': 'mang...","Starchild Records, TV Tokyo Music, Shounen Gah...","Media Blasters, NYAV Post","Chaos Project, GANSIS","Action, Comedy, Drama, Fantasy, Magic, School,...","['""Faint Love"" by MEGUMI HAYASHIBARA']","['""Koibuni"" by MEGUMI HAYASHIBARA']",12.0,2002.0,0.61321
5261,1250,Erementar Gerad,Elemental Gelade,エレメンタルジェレイド,Erementar Gerade,https://myanimelist.cdn-dena.com/images/anime/...,TV,Manga,26,Finished Airing,...,"{'Adaptation': [{'mal_id': 114, 'type': 'manga...","TV Tokyo, Geneon Universal Entertainment, Sots...","Funimation, Geneon Entertainment USA",Xebec,"Adventure, Comedy, Fantasy, Magic, Romance, Su...","['""Forever..."" by Savage Genius']","['#1: ""約束 (Yakusoku)"" by Kuroda Michihiro', '#...",24.0,2005.0,0.611952
2202,867,Slayers Gorgeous,,スレイヤーズごぅじゃす,Slayers Movie 4,https://myanimelist.cdn-dena.com/images/anime/...,Movie,Light novel,1,Finished Airing,...,"{'Adaptation': [{'mal_id': 20871, 'type': 'man...",Kadokawa Shoten,ADV Films,J.C.Staff,"Adventure, Comedy, Fantasy, Magic, Supernatural",[],"['""Raging Waves"" by Megumi Hayashibara']",60.0,1998.0,0.60912


In [50]:
final_recommendations['title']

2936                   Blue Dragon: Tenkai no Shichi Ryuu
794                                     Arashi no Yoru ni
3281    Digimon Savers: Agumon! Gaomon! Lalamon! Bakur...
3552              Dragon Ball Movie 4: Saikyou e no Michi
4942                     Re:Zero kara Hajimeru Break Time
2611                                           Ani*Kuri15
3570                                       Dragon Ball GT
4229                                      Asagiri no Miko
5261                                      Erementar Gerad
2202                                     Slayers Gorgeous
Name: title, dtype: object

#### CF ####

In [51]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

In [52]:
anime_samp_list_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31282812 entries, 0 to 31282811
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   username  object
 1   user_id   int64 
 2   anime_id  int64 
 3   my_score  int64 
dtypes: int64(3), object(1)
memory usage: 954.7+ MB


In [53]:
user_item_matrix = anime_samp_list_df.pivot(index='user_id', columns='anime_id', values='my_score').fillna(0)

In [54]:
knn = NearestNeighbors(metric='cosine', algorithm='auto', n_neighbors=3, n_jobs=-1)
knn.fit(user_item_matrix)

In [55]:
user_id = 1  
distances, indices = knn.kneighbors(user_item_matrix.loc[user_id].values.reshape(1, -1))
similar_users = user_item_matrix.index[indices.flatten()]
rated_by_similar_users = user_item_matrix.loc[similar_users].gt(0).sum(axis=0)
unrated_anime = user_item_matrix.loc[user_id].eq(0)

knn_recommendations = rated_by_similar_users[unrated_anime].sort_values(ascending=False).head(10)
print("\nKNN-based Collaborative Filtering Recommendations:")
knn_recommendations


KNN-based Collaborative Filtering Recommendations:


anime_id
430     2
3230    2
395     2
1482    2
1142    2
856     2
237     2
1576    2
467     2
790     2
dtype: int64

In [56]:
recommended_anime_ids = knn_recommendations.index 
recommended_anime = anime_df[anime_df['anime_id'].isin(recommended_anime_ids)]  
print("SVD-based Recommendations:")
recommended_anime[['anime_id', 'title', 'genre']]

SVD-based Recommendations:


Unnamed: 0,anime_id,title,genre
202,790,Ergo Proxy,"Sci-Fi, Mystery, Psychological"
474,1142,Hachimitsu to Clover II,"Drama, Josei, Romance, Slice of Life"
1168,467,Ghost in the Shell: Stand Alone Complex,"Action, Military, Sci-Fi, Police, Mecha, Seinen"
2121,1482,D.Gray-man,"Action, Adventure, Comedy, Super Power, Demons..."
2580,3230,Druaga no Tou: The Aegis of Uruk,"Adventure, Comedy, Fantasy"
2608,237,Eureka Seven,"Sci-Fi, Adventure, Drama, Romance, Mecha"
4930,430,Fullmetal Alchemist: The Conqueror of Shamballa,"Military, Comedy, Historical, Drama, Fantasy, ..."
5398,1576,Bakumatsu Kikansetsu Irohanihoheto,"Action, Drama, Historical, Samurai"
5791,395,Gantz 2nd Stage,"Action, Sci-Fi, Horror, Psychological, Superna..."
6459,856,Utawarerumono,"Action, Sci-Fi, Drama, Fantasy"


In [57]:
from sklearn.decomposition import TruncatedSVD


svd = TruncatedSVD(n_components=3, random_state=42)
svd_matrix = svd.fit_transform(user_item_matrix)


svd_reconstructed = pd.DataFrame(svd_matrix.dot(svd.components_), index=user_item_matrix.index, columns=user_item_matrix.columns)

user_recommendations = svd_reconstructed.loc[user_id].sort_values(ascending=False).head(10)
user_recommendations


anime_id
1535    9.144493
121     8.462868
199     7.867781
1575    7.804190
226     6.960308
431     6.935286
164     6.835676
1       6.819605
2904    6.818301
853     6.774023
Name: 1, dtype: float64

In [58]:
recommended_anime_ids_knn = user_recommendations.index  
recommended_anime_knn = anime_df[anime_df['anime_id'].isin(recommended_anime_ids_knn)]  
print("\nKNN-based Collaborative Filtering Recommendations:")
recommended_anime_knn[['anime_id', 'title', 'genre']]


KNN-based Collaborative Filtering Recommendations:


Unnamed: 0,anime_id,title,genre
9,853,Ouran Koukou Host Club,"Comedy, Harem, Romance, School, Shoujo"
1013,164,Mononoke Hime,"Action, Adventure, Fantasy"
1118,431,Howl no Ugoku Shiro,"Adventure, Drama, Fantasy, Romance"
3404,199,Sen to Chihiro no Kamikakushi,"Adventure, Supernatural, Drama"
3550,2904,Code Geass: Hangyaku no Lelouch R2,"Action, Military, Sci-Fi, Super Power, Drama, ..."
3802,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ..."
4171,226,Elfen Lied,"Action, Horror, Psychological, Supernatural, D..."
4358,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space"
5112,121,Fullmetal Alchemist,"Action, Adventure, Comedy, Drama, Fantasy, Mag..."
6579,1575,Code Geass: Hangyaku no Lelouch,"Action, Military, Sci-Fi, Super Power, Drama, ..."


In [59]:
MAL_user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      80 non-null     object
 1   username     80 non-null     int64 
 2   anime_id     80 non-null     int64 
 3   anime_title  80 non-null     object
 4   my_score     80 non-null     int64 
 5   genre        80 non-null     object
dtypes: int64(3), object(3)
memory usage: 3.9+ KB


In [60]:
anime_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6668 entries, 0 to 6667
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   anime_id         6668 non-null   int64  
 1   title            6668 non-null   object 
 2   title_english    3438 non-null   object 
 3   title_japanese   6663 non-null   object 
 4   title_synonyms   4481 non-null   object 
 5   image_url        6667 non-null   object 
 6   type             6668 non-null   object 
 7   source           6668 non-null   object 
 8   episodes         6668 non-null   int64  
 9   status           6668 non-null   object 
 10  airing           6668 non-null   bool   
 11  aired_string     6668 non-null   object 
 12  aired            6668 non-null   object 
 13  duration         6668 non-null   object 
 14  rating           6586 non-null   object 
 15  score            6668 non-null   float64
 16  scored_by        6668 non-null   int64  
 17  rank          

In [61]:
def recommend_anime_from_mal(mal_user_df, anime_df, genre_filters=None, top_n=10, match_all=False):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    #high_rated_anime = mal_user_df[mal_user_df['my_score'] >= 7]
    
    # Combine genres of the user's preferred anime
    user_genre_pref = ' '.join(mal_user_df['genre'])
    
    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer()
    combined_genres = anime_df['genre'].tolist() + [user_genre_pref]
    genre_tfidf_matrix = tfidf_vectorizer.fit_transform(combined_genres)
    
    # Calculate Cosine Similarity
    user_vector = genre_tfidf_matrix[-1]
    cosine_sim = cosine_similarity(user_vector, genre_tfidf_matrix[:-1]).flatten()
    
    # Add similarity scores to the anime dataframe
    anime_df['similarity'] = cosine_sim

    # Filter by genres if specified
    if len(genre_filters):
        if match_all:
            # Match all genres: Anime must include all genres in the filter
            genre_filter_set = set(genre_filters)
            anime_df = anime_df[
                anime_df['genre'].apply(lambda x: genre_filter_set.issubset(set(x.split(', '))))
            ]
        else:
            # Match any genre: Anime must include at least one genre in the filter
            genre_regex = '|'.join(genre_filters)  # Create a regex to match any genre
            anime_df = anime_df[anime_df['genre'].str.contains(genre_regex, case=False)]

    # Sort by similarity and return top N
    #recommendations = anime_df.sort_values(by='similarity', ascending=False).head(top_n)
    content_recommendations = anime_df.sort_values('similarity', ascending=False)
    filtered_content_recommendations = content_recommendations[~content_recommendations['anime_id'].isin(exclude_ids)]
    filtered_recommendations = filter_related_recommendations(filtered_content_recommendations, anime_df)

    final_recommendations = filtered_recommendations.head(top_n)
    final_recommendations
    
    return final_recommendations[['anime_id', 'title', 'genre', 'similarity']]

# Example Usage
genre_filters = []  # Multiple genres
recommendations = recommend_anime_from_mal(MAL_user_df, anime_df, genre_filters, top_n=10, match_all=False)
recommendations


Unnamed: 0,anime_id,title,genre,similarity
4204,891,Dragon Ball Movie 2: Majinjou no Nemuri Hime,"Action, Adventure, Comedy, Fantasy, Sci-Fi, Sh...",0.7776
3552,893,Dragon Ball Movie 4: Saikyou e no Michi,"Action, Adventure, Comedy, Fantasy, Sci-Fi, Sh...",0.7776
3570,225,Dragon Ball GT,"Action, Adventure, Comedy, Fantasy, Magic, Sci...",0.752719
6209,2589,Black Cat: Toozakaru Neko,"Action, Sci-Fi, Adventure, Comedy, Super Power...",0.746069
4229,231,Asagiri no Miko,"Action, Comedy, Drama, Fantasy, Magic, School,...",0.729076
3314,902,Dragon Ball Z Movie 09: Ginga Girigiri!! Bucch...,"Action, Sci-Fi, Adventure, Comedy, Fantasy, Sh...",0.72813
1220,898,Dragon Ball Z Movie 05: Tobikkiri no Saikyou t...,"Action, Sci-Fi, Adventure, Comedy, Fantasy, Sh...",0.72813
5610,900,Dragon Ball Z Movie 07: Kyokugen Battle!! Sand...,"Action, Sci-Fi, Adventure, Comedy, Fantasy, Sh...",0.72813
2439,906,Dragon Ball Z Movie 13: Ryuuken Bakuhatsu!! Go...,"Action, Sci-Fi, Adventure, Comedy, Fantasy, Sh...",0.72813
2191,905,Dragon Ball Z Movie 12: Fukkatsu no Fusion!! G...,"Action, Sci-Fi, Adventure, Comedy, Fantasy, Sh...",0.72813


#### ANIME + GENRE (OPTIONAL)

In [62]:
import requests
import json

def search_anime_by_name(anime_name, limit=5):
    """
    Search for an anime by name using MyAnimeList API v2 and return the anime list.
    
    Parameters:
    - access_token (str): The OAuth access token.
    - anime_name (str): The anime name to search for.
    - limit (int): The number of results to return (default is 4).
    
    Returns:
    - List of anime data if found, or an error message if not found.
    """
    token_file = 'D:/Nitty/Personal Projects/Anime-recommendation-system/API/key.json'
    with open(token_file) as f:
        data = json.load(f)
    ACCESS_TOKEN = data['access_token']

    url = f'https://api.myanimelist.net/v2/anime?q={anime_name}&limit={limit}'
    
    headers = {
        'Authorization': f'Bearer {ACCESS_TOKEN}'  # Authorization header with Bearer token
    }
    
    try:
        response = requests.get(url, headers=headers)
        
        if response.status_code == 200:
            data = response.json()  # Parse the JSON response
            return data['data']  # Extract the list of anime data from the response
        else:
            return f"Error {response.status_code}: {response.text}"
    
    except Exception as e:
        return f"An error occurred: {e}"

def get_anime_id(anime_name):
    anime_data = search_anime_by_name(anime_name)
    anime_data = sorted(anime_data, key=lambda x: x['node']['id'])

    if isinstance(anime_data, list):
        for anime in anime_data:
            print(f"Title: {anime['node']['title']}, ID: {anime['node']['id']}")
            return anime['node']['id']
    else:
        print(f"No anime found with the name '{anime_name}'. Please check the spelling or try a different name.")
        return None

In [63]:
def recommend_anime_by_name_and_genrefilter(anime_df, anime_name, genre_filters=None, top_n=10, match_all=False):
    """
    Recommend anime based on the name of a liked anime and an optional genre filter.
    
    Parameters:
    - anime_df (DataFrame): Global anime dataset (with anime_id, title, genre).
    - anime_name (str): The title of the anime the user likes.
    - genre_filters (list of str, optional): List of genres to filter by (e.g., ["Action", "Adventure"]).
    - top_n (int): Number of recommendations to return.
    - match_all (bool): If True, match anime that include all specified genres. Otherwise, match any genre.
    
    Returns:
    - DataFrame: Top N recommended anime with their similarity scores or an error message if the anime is not found.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    
    # Ensure `get_anime_id` function returns the correct anime ID for the given anime name
    anime_id = get_anime_id(anime_name)
    
    # Filter anime_df to get the selected anime by name (assuming 'anime_id' is an integer)
    selected_anime = anime_df[anime_df['anime_id'] == anime_id]
    
    if selected_anime.empty:
        return f"No anime found with the name '{anime_name}'. Please check the spelling or try a different name."
    
    # Get the anime_id and genre of the selected anime
    selected_anime_id = selected_anime.iloc[0]['anime_id']
    selected_anime_genre = selected_anime.iloc[0]['genre']
    
    # TF-IDF Vectorization for genre similarity
    tfidf_vectorizer = TfidfVectorizer()
    combined_genres = anime_df['genre'].tolist() + [selected_anime_genre]
    genre_tfidf_matrix = tfidf_vectorizer.fit_transform(combined_genres)
    
    # Calculate Cosine Similarity
    selected_anime_vector = genre_tfidf_matrix[-1]  # Vector for selected anime
    cosine_sim = cosine_similarity(selected_anime_vector, genre_tfidf_matrix[:-1]).flatten()
    
    # Add similarity scores to the anime dataframe
    anime_df['similarity'] = cosine_sim
    
    # Filter by genres if specified
    if genre_filters:
        if match_all:
            # Match all genres: Anime must include all genres in the filter
            genre_filter_set = set(genre_filters)
            anime_df = anime_df[
                anime_df['genre'].apply(lambda x: genre_filter_set.issubset(set(x.split(', '))))
            ]
        else:
            # Match any genre: Anime must include at least one genre in the filter
            genre_regex = '|'.join(genre_filters)  # Create a regex to match any genre
            anime_df = anime_df[anime_df['genre'].str.contains(genre_regex, case=False)]

    # Sort by similarity and return top N
    #recommendations = anime_df.sort_values(by='similarity', ascending=False).head(top_n)
    content_recommendations = anime_df.sort_values('similarity', ascending=False)
    filtered_content_recommendations = content_recommendations[~content_recommendations['anime_id'].isin(exclude_ids)]
    filtered_recommendations = filter_related_recommendations(filtered_content_recommendations, anime_df)

    final_recommendations = filtered_recommendations.head(top_n)
    final_recommendations
    
    return final_recommendations[['anime_id', 'title', 'genre', 'similarity']]
    #return recommendations[['anime_id', 'title', 'genre', 'similarity']]

# Example Usage
anime_name = "Gosick"  # Specify the anime the user likes
genre_filters = ["Action"]  # Multiple genres (optional)
recommendations = recommend_anime_by_name_and_genrefilter(anime_df, anime_name, genre_filters, top_n=10, match_all=False)
recommendations


Title: Gosick, ID: 8425


Unnamed: 0,anime_id,title,genre,similarity
4961,1252,Fushigi no Umi no Nadia: Original Movie,"Action, Adventure, Drama, Historical, Mystery,...",0.790064
158,34430,Detective Conan Movie 21: The Crimson Love Letter,"Action, Mystery, Historical, Police, Drama, Ro...",0.777675
109,35240,Princess Principal,"Action, Mystery, Historical",0.750201
431,12879,Dantalian no Shoka: Ibarahime,"Action, Historical, Mystery",0.750201
608,1033,Sennen Joyuu,"Action, Adventure, Historical, Drama, Romance,...",0.64369
3473,2251,Baccano!,"Action, Mystery, Comedy, Historical, Supernatural",0.637251
5703,33964,91 Days Recap,"Action, Historical, Drama",0.636656
2709,34777,91 Days Special,"Action, Historical, Drama",0.636656
408,282,Angel Heart,"Action, Mystery, Drama, Romance, Seinen",0.634616
1222,272,Noir,"Action, Mystery, Drama",0.627756


#### Genre only

In [64]:
def recommend_anime_by_genrefilter(anime_df, genre_filters, top_n=10, match_all=False):
    """
    Recommend anime based on the genre filter.
    
    Parameters:
    - anime_df (DataFrame): Global anime dataset (with anime_id, title, genre).
    - genre_filters (list of str): List of genres to filter by (e.g., ["Action", "Adventure"]).
    - top_n (int): Number of recommendations to return.
    - match_all (bool): If True, match anime that include all specified genres. Otherwise, match any genre.
    
    Returns:
    - DataFrame: Top N recommended anime.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    
    # Filter by genres if specified
    if genre_filters:
        if match_all:
            # Match all genres: Anime must include all genres in the filter
            genre_filter_set = set(genre_filters)
            anime_df = anime_df[
                anime_df['genre'].apply(lambda x: genre_filter_set.issubset(set(x.split(', '))))
            ]
        else:
            # Match any genre: Anime must include at least one genre in the filter
            genre_regex = '|'.join(genre_filters)  # Create a regex to match any genre
            anime_df = anime_df[anime_df['genre'].str.contains(genre_regex, case=False)]
    
    # TF-IDF Vectorization for genre similarity
    tfidf_vectorizer = TfidfVectorizer()
    genre_tfidf_matrix = tfidf_vectorizer.fit_transform(anime_df['genre'].tolist())
    
    # Calculate Cosine Similarity among anime based on genres
    cosine_sim = cosine_similarity(genre_tfidf_matrix, genre_tfidf_matrix)
    
    # Add similarity scores to the anime dataframe
    anime_df['similarity'] = cosine_sim.mean(axis=1)  # You can customize this part
    
    # Sort by similarity and return top N
    #recommendations = anime_df.sort_values(by='similarity', ascending=False).head(top_n)
    content_recommendations = anime_df.sort_values('similarity', ascending=False)
    filtered_content_recommendations = content_recommendations[~content_recommendations['anime_id'].isin(exclude_ids)]
    filtered_recommendations = filter_related_recommendations(filtered_content_recommendations, anime_df)

    final_recommendations = filtered_recommendations.head(top_n)
    final_recommendations
    
    return final_recommendations[['anime_id', 'title', 'genre', 'similarity']]
    #return recommendations[['anime_id', 'title', 'genre', 'similarity']]

# Example Usage
genre_filters = ["Romance"]  # List of genres to filter by
recommendations = recommend_anime_by_genrefilter(anime_df, genre_filters, top_n=10, match_all=False)
recommendations

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_df['similarity'] = cosine_sim.mean(axis=1)  # You can customize this part


Unnamed: 0,anime_id,title,genre,similarity
6335,1734,Ajimu: Kaigan Monogatari,"Comedy, Drama, Romance, School",0.298543
1465,36220,Itsudatte Bokura no Koi wa 10 cm Datta.,"Comedy, Drama, Romance, School",0.298543
6426,33036,Suki ni Naru Sono Shunkan wo.: Kokuhaku Jikkou...,"Comedy, Drama, Romance, School",0.298543
1052,1725,Keitai Shoujo,"Comedy, Romance, School, Drama",0.298543
5387,15379,Kotoura-san,"Comedy, Drama, Romance, School",0.298543
1044,1056,Good Morning Call,"Comedy, Drama, Romance, School, Shoujo",0.295603
3574,21995,Ao Haru Ride,"Comedy, Drama, Romance, School, Shoujo, Slice ...",0.291504
927,145,Kareshi Kanojo no Jijou,"Comedy, Drama, Romance, School, Shoujo, Slice ...",0.291504
2973,471,To Heart 2,"Comedy, Drama, Harem, Romance, School, Slice o...",0.285284
388,510,Kakyuusei 2: Hitomi no Naka no Shoujo-tachi,"Comedy, Drama, Harem, Romance, School",0.284994


In [65]:
anime_df['genre'] = anime_df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)

# Flatten the list of genres and get unique genres
all_genres = anime_df['genre'].explode().unique()

# Sort the genres alphabetically
all_genres = sorted(all_genres)

# Display the unique genres
print(all_genres)

['', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri']


In [69]:
anime_samp_df['genre'] = anime_samp_df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)

# Flatten the list of genres and get unique genres
all_genres = anime_samp_df['genre'].explode().unique()

# Sort the genres alphabetically
all_genres = sorted(all_genres)

# Display the unique genres
print(all_genres)

['', 'Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai', 'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'Martial Arts', 'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police', 'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai', 'Shounen', 'Shounen Ai', 'Slice of Life', 'Space', 'Sports', 'Super Power', 'Supernatural', 'Thriller', 'Vampire', 'Yaoi', 'Yuri']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anime_samp_df['genre'] = anime_samp_df['genre'].apply(lambda x: x.split(', ') if isinstance(x, str) else x)
