# Importing Libraries

In [1]:
# Basic Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import warnings
warnings.filterwarnings('ignore')
from joblib import load, dump, Parallel, delayed

# Visualization
import plotly.express as px
import plotly.graph_objects as go  # for 3D plot visualization
import plotly.figure_factory as ff
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)

from langdetect import detect
from datetime import datetime


# Data Prepocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Model Training
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
import faiss
from sklearn.neighbors import NearestNeighbors

'''
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Necessary modules for collaborative filtering
from keras.layers import Input, Embedding, Dot, Flatten, Dense
from keras.models import Model
from keras.optimizers import Adam
from wordcloud import WordCloud
from collections import defaultdict
from collections import Counter
'''
# Necessary modules for content-based filtering
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


# Reading Datasets

In [2]:
# Setting column display to 50
pd.set_option('display.max_columns', 50)

In [3]:
# Importing anime details dataframe
df_anime = pd.read_csv('../AllData/Anime/anime.csv')
print("Shape of the Dataset:",df_anime.shape)
df_anime.head(3)

Shape of the Dataset: (23748, 21)


Unnamed: 0,id,title,score,genres,synopsis,type,episodes,status,producers,licensors,studios,source,duration,rating,rank,popularity,favorites,scored_by,members,image_url,is_hentai
0,1,cowboy bebop,8.75,"award winning, action, sci-fi","crime is timeless. by the year 2071, humanity ...",tv,26,finished airing,bandai visual,"funimation, bandai entertainment",sunrise,original,24 min per ep,R - 17+ (violence & profanity),41,43,78525,914193,1771505,https://cdn.myanimelist.net/images/anime/4/196...,0
1,5,cowboy bebop tengoku no tobira,8.38,"action, sci-fi","another day, another bounty—such is the life o...",movie,1,finished airing,"sunrise, bandai visual",sony pictures entertainment,bones,original,1 hr 55 min,R - 17+ (violence & profanity),189,602,1448,206248,360978,https://cdn.myanimelist.net/images/anime/1439/...,0
2,6,trigun,8.22,"adventure, action, sci-fi","vash the stampede is the man with a $$60,000,0...",tv,26,finished airing,victor entertainment,"funimation, geneon entertainment usa",madhouse,manga,24 min per ep,PG-13 - Teens 13 or older,328,246,15035,356739,727252,https://cdn.myanimelist.net/images/anime/7/203...,0


In [4]:
# Importing user details dataframe
df_user = pd.read_csv('../AllData/Anime/users-details.csv')
print("Shape of the Dataset:",df_user.shape)
df_user.head()

Shape of the Dataset: (731282, 14)


Unnamed: 0,id,name,gender,joined,days_watched,mean_score,watching,completed,on_hold,dropped,plan_to_watch,total_entries,rewatched,episodes_watched
0,1,xinil,male,2004-11-05t00:00:00+00:00,142.3,7.37,1,233,8,93,64,399,60,8458
1,3,aokaado,male,2004-11-11t00:00:00+00:00,68.6,7.34,23,137,99,44,40,343,15,4072
2,4,crystal,female,2004-11-13t00:00:00+00:00,212.8,6.68,16,636,303,0,45,1000,10,12781
3,9,arcane,-,2004-12-05t00:00:00+00:00,30.0,7.71,5,54,4,3,0,66,0,1817
4,18,mad,-,2005-01-03t00:00:00+00:00,52.0,6.27,1,114,10,5,23,153,42,3038


In [7]:
# Importing user score dataframe
df_score = pd.read_excel('../AllData/Anime/anime_user_ratings.xlsx')
print("Shape of the dataset:",df_score.shape)
df_score.head()

Shape of the dataset: (1048558, 3)


Unnamed: 0,anime_id,user_id,rating
0,1,1,10
1,1,4,8
2,1,20,9
3,1,23,9
4,1,47,7


# Data Preparation

In [8]:
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23748 entries, 0 to 23747
Data columns (total 21 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          23748 non-null  int64  
 1   title       23748 non-null  object 
 2   score       23748 non-null  float64
 3   genres      23748 non-null  object 
 4   synopsis    23748 non-null  object 
 5   type        23748 non-null  object 
 6   episodes    23748 non-null  int64  
 7   status      23748 non-null  object 
 8   producers   23748 non-null  object 
 9   licensors   23748 non-null  object 
 10  studios     23748 non-null  object 
 11  source      23748 non-null  object 
 12  duration    23748 non-null  object 
 13  rating      23748 non-null  object 
 14  rank        23748 non-null  int64  
 15  popularity  23748 non-null  int64  
 16  favorites   23748 non-null  int64  
 17  scored_by   23748 non-null  int64  
 18  members     23748 non-null  int64  
 19  image_url   23748 non-nul

In [9]:
df_anime = df_anime.drop(columns=['synopsis', 'episodes', 'status', 'producers', 'licensors', 'source', 'duration', 'rating', 'rank', 'popularity', 'favorites', 'scored_by', 'members', 'is_hentai'])

In [10]:
df_anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23748 entries, 0 to 23747
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         23748 non-null  int64  
 1   title      23748 non-null  object 
 2   score      23748 non-null  float64
 3   genres     23748 non-null  object 
 4   type       23748 non-null  object 
 5   studios    23748 non-null  object 
 6   rating     23748 non-null  object 
 7   image_url  23748 non-null  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 1.4+ MB


In [10]:
# Preprocessing Score column
df_anime['score'].value_counts()

score
-1.00    8064
 6.31      80
 6.54      80
 6.25      79
 6.51      79
         ... 
 3.21       1
 3.29       1
 1.85       1
 3.69       1
 4.07       1
Name: count, Length: 567, dtype: int64

In [11]:
scores = df_anime['score'][df_anime['score'] != -1]
scores = scores.astype('float')
score_mean = round(scores.mean() , 2)

In [12]:
df_anime['score'] = df_anime['score'].replace(-1, score_mean)
df_anime['score'] = df_anime['score'].astype('float64')

In [13]:
df_anime['score'].value_counts()

score
6.38    8133
6.31      80
6.54      80
6.51      79
6.25      79
        ... 
3.21       1
3.29       1
1.85       1
3.69       1
4.07       1
Name: count, Length: 566, dtype: int64

In [14]:
# Processing Ranked column
df_anime['rank'].value_counts()

rank
-1        4149
 0         141
 18459       4
 17431       4
 6491        4
          ... 
 1956        1
 18626       1
 10974       1
 10733       1
 14536       1
Name: count, Length: 14795, dtype: int64

In [15]:
df_anime['rank'] = df_anime['rank'].replace(-1, np.nan)
df_anime['rank'] = df_anime['rank'].astype('float64')

In [16]:
df_anime['rank'].value_counts()

rank
0.0        141
12591.0      4
18448.0      4
14506.0      4
18804.0      4
          ... 
1956.0       1
18626.0      1
10974.0      1
10733.0      1
14536.0      1
Name: count, Length: 14794, dtype: int64

In [17]:
df_user['id'] = df_user['id'].astype('str')
df_user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731282 entries, 0 to 731281
Data columns (total 14 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                731282 non-null  object 
 1   name              731281 non-null  object 
 2   gender            731282 non-null  object 
 3   joined            731282 non-null  object 
 4   days_watched      731282 non-null  float64
 5   mean_score        731282 non-null  float64
 6   watching          731282 non-null  int64  
 7   completed         731282 non-null  int64  
 8   on_hold           731282 non-null  int64  
 9   dropped           731282 non-null  int64  
 10  plan_to_watch     731282 non-null  int64  
 11  total_entries     731282 non-null  int64  
 12  rewatched         731282 non-null  int64  
 13  episodes_watched  731282 non-null  int64  
dtypes: float64(2), int64(8), object(4)
memory usage: 78.1+ MB


In [18]:
df_user = df_user[df_user['name'].notnull()]

In [19]:
df_user.isnull().sum()

id                  0
name                0
gender              0
joined              0
days_watched        0
mean_score          0
watching            0
completed           0
on_hold             0
dropped             0
plan_to_watch       0
total_entries       0
rewatched           0
episodes_watched    0
dtype: int64

In [17]:
df_score['user_id'] = df_score['user_id'].astype('str')
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048558 entries, 0 to 1048557
Data columns (total 5 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   user_id      1048558 non-null  object
 1   username     1048449 non-null  object
 2   anime_id     1048558 non-null  int64 
 3   anime_title  1048558 non-null  object
 4   rating       1048558 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 40.0+ MB


In [18]:
df_score = df_score[df_score['username'].notnull()]

In [19]:
df_score.isnull().sum()

user_id        0
username       0
anime_id       0
anime_title    0
rating         0
dtype: int64

# Data Visualization

In [22]:
# Filter out anime titles with popularity value 0
df_valid_popularity = df_anime[df_anime['popularity'] > 0]

# Sort the dataframe by popularity and select the top 15
top_15_popular = df_valid_popularity.sort_values(by='popularity', ascending=True).head(15)

# Create a bar chart with different colors for each bar
fig = px.bar(top_15_popular, x='title', y='popularity',
             labels={'title': 'Anime Title', 'popularity': 'Popularity'},
             title='Top 15 Most Popular Animes',
             color='title')
# Note:- Less the popularity no. is more popular is the anime.
fig.show()

# Data Preprocessing

In [23]:
# Copying relevant columns
df = df_score[['user_id','anime_id','rating']].copy()
print("Shape of the Dataset:",df.shape)
df.head()

Shape of the Dataset: (1048449, 3)


Unnamed: 0,user_id,anime_id,rating
0,1,21,9
1,1,48,7
2,1,320,5
3,1,49,8
4,1,304,8


In [24]:
# Checking if there are any duplicate rows
duplicated_rows = df[df.duplicated()]
print("Duplicated Rows:")
print(duplicated_rows)

Duplicated Rows:
Empty DataFrame
Columns: [user_id, anime_id, rating]
Index: []


In [25]:
# Calculating the average score
avg_score = np.mean(df['rating'])
print('Average Score:', avg_score)

Average Score: 7.474533334477881


In [26]:
# Scaling our "rating" column
# Create a MinMaxScaler object
scaler = MinMaxScaler(feature_range=(0, 1))

# Scale the 'score' column between 0 and 1
df['scaled_score'] = scaler.fit_transform(df[['rating']])

In [27]:
# Encoding categorical data

## Encoding user IDs
user_encoder = LabelEncoder()
df["user_encoded"] = user_encoder.fit_transform(df["user_id"])
num_users = len(user_encoder.classes_)

## Encoding anime IDs
anime_encoder = LabelEncoder()
df["anime_encoded"] = anime_encoder.fit_transform(df["anime_id"])
num_animes = len(anime_encoder.classes_)

# Printing dataset information
print("Number of unique users: {}, Number of unique anime: {}".format(num_users, num_animes))
print("Minimum rating: {}, Maximum rating: {}".format(min(df['rating']), max(df['rating'])))

Number of unique users: 8701, Number of unique anime: 11820
Minimum rating: 1, Maximum rating: 10


In [28]:
print("Shape of the Dataset:",df.shape)
df.head()

Shape of the Dataset: (1048449, 6)


Unnamed: 0,user_id,anime_id,rating,scaled_score,user_encoded,anime_encoded
0,1,21,9,0.888889,0,11
1,1,48,7,0.666667,0,29
2,1,320,5,0.444444,0,296
3,1,49,8,0.777778,0,30
4,1,304,8,0.777778,0,280


# Model Training (Content-Based V1&V2)

In [20]:
# Create a TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Define a generator to compute TF-IDF matrix on the fly
tfidf_matrix = tfidf.fit_transform((genre for genre in df_anime['genres'].values.astype('U')))

# Compute cosine similarity matrix as a sparse matrix (V1)
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Convert to sparse matrix (V2)
tfidf_matrix_sparse = csr_matrix(tfidf_matrix)

# Content-Based Recommendation V1

In [30]:
# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_title_v1(title, cosine_sim, df, n=10):
    # Get the index of the anime that matches the title
    idx = df[df['title'] == title].index[0]

    # Compute the similarity scores between the anime at the given index and all other animes
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Filter out animes with unknown scores
    valid_scores = [x for x in sim_scores if df.iloc[x[0]]['score'] != -1]

    # Sort the valid anime similarity scores based on the cosine similarity and ratings score in descending order
    sorted_scores = sorted(valid_scores, key=lambda x: (x[1], df.iloc[x[0]]['score']), reverse=True)

    # Get the top n similar animes (excluding the anime itself)
    top_animes = [x for x in sorted_scores if x[0] != idx][:n]

    # Extract the indices of the recommended animes
    recommended_indices = [x[0] for x in top_animes]
    return df.iloc[recommended_indices][['id', 'title', 'genres', 'score', 'image_url']]

In [31]:
anime_title = 'kono subarashii sekai ni shukufuku wo'
print(f'Recommendations for "{anime_title}":')
get_recommendations_by_title_v1(anime_title, cosine_sim, df_anime)

Recommendations for "kono subarashii sekai ni shukufuku wo":


Unnamed: 0,title,genres,score
14487,kono subarashii sekai ni shukufuku wo movie ku...,"adventure, comedy, fantasy",8.45
11552,kono subarashii sekai ni shukufuku wo 2,"adventure, comedy, fantasy",8.27
6241,doraemon movie 31 shin nobita to tetsujin heid...,"adventure, comedy, fantasy",8.14
12422,kono subarashii sekai ni shukufuku wo 2 kono s...,"adventure, comedy, fantasy",8.03
502,slayers next,"adventure, comedy, fantasy",8.02
1068,slayers try,"adventure, comedy, fantasy",7.81
11838,little witch academia tv,"adventure, comedy, fantasy",7.81
12497,mahoujin guruguru 2017,"adventure, comedy, fantasy",7.81
7105,little witch academia,"adventure, comedy, fantasy",7.8
8061,little witch academia mahoujikake no parade,"adventure, comedy, fantasy",7.75


# Content-Based Recommendation V2

In [21]:
# Function to compute cosine similarity for a single anime
def compute_cosine_similarity_for_anime(matrix, idx):
    anime_vector = matrix[idx]
    similarity_scores = cosine_similarity(anime_vector, matrix).flatten()
    return similarity_scores

# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_title_v2(title, matrix, df, n=10):
    # Get the index of the anime that matches the title
    idx = df[df['title'] == title].index[0]

    # Compute the similarity scores between the anime at the given index and all other animes
    sim_scores = compute_cosine_similarity_for_anime(matrix, idx)

    # Filter out animes with unknown scores
    valid_scores = [(i, score) for i, score in enumerate(sim_scores) if df.iloc[i]['score'] != -1]

    # Sort the valid anime similarity scores based on the cosine similarity and ratings score in descending order
    sorted_scores = sorted(valid_scores, key=lambda x: (x[1], df.iloc[x[0]]['score']), reverse=True)

    # Get the top n similar animes (excluding the anime itself)
    top_animes = [x for x in sorted_scores if x[0] != idx][:n]

    # Extract the indices of the recommended animes
    recommended_indices = [x[0] for x in top_animes]
    return df.iloc[recommended_indices][['id', 'title', 'genres', 'score', 'image_url']]

# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_id_v2(id, matrix, df, n=10):
    # Get the index of the anime that matches the id
    idx = df[df['id'] == id].index[0]

    # Compute the similarity scores between the anime at the given index and all other animes
    sim_scores = compute_cosine_similarity_for_anime(matrix, idx)

    # Filter out animes with unknown scores
    valid_scores = [(i, score) for i, score in enumerate(sim_scores) if df.iloc[i]['score'] != -1]

    # Sort the valid anime similarity scores based on the cosine similarity and ratings score in descending order
    sorted_scores = sorted(valid_scores, key=lambda x: (x[1], df.iloc[x[0]]['score']), reverse=True)

    # Get the top n similar animes (excluding the anime itself)
    top_animes = [x for x in sorted_scores if x[0] != idx][:n]

    # Extract the indices of the recommended animes
    recommended_indices = [x[0] for x in top_animes]
    return df.iloc[recommended_indices][['id', 'title', 'genres', 'score', 'image_url']]

In [30]:
anime_id = 30831
anime_title = 'kono subarashii sekai ni shukufuku wo'
print(f'Recommendations for "{anime_title}":')
get_recommendations_by_title_v2(anime_title, tfidf_matrix_sparse, df_anime)

Recommendations for "kono subarashii sekai ni shukufuku wo":


Unnamed: 0,id,title,genres,score,image_url
14487,38040,kono subarashii sekai ni shukufuku wo movie ku...,"adventure, comedy, fantasy",8.45,https://cdn.myanimelist.net/images/anime/1638/...
11552,32937,kono subarashii sekai ni shukufuku wo 2,"adventure, comedy, fantasy",8.27,https://cdn.myanimelist.net/images/anime/2/831...
6241,10534,doraemon movie 31 shin nobita to tetsujin heid...,"adventure, comedy, fantasy",8.14,https://cdn.myanimelist.net/images/anime/7/724...
12422,34626,kono subarashii sekai ni shukufuku wo 2 kono s...,"adventure, comedy, fantasy",8.03,https://cdn.myanimelist.net/images/anime/1115/...
502,535,slayers next,"adventure, comedy, fantasy",8.02,https://cdn.myanimelist.net/images/anime/1165/...
1068,1172,slayers try,"adventure, comedy, fantasy",7.81,https://cdn.myanimelist.net/images/anime/10/50...
11838,33489,little witch academia tv,"adventure, comedy, fantasy",7.81,https://cdn.myanimelist.net/images/anime/13/83...
12497,34745,mahoujin guruguru 2017,"adventure, comedy, fantasy",7.81,https://cdn.myanimelist.net/images/anime/1880/...
7105,14349,little witch academia,"adventure, comedy, fantasy",7.8,https://cdn.myanimelist.net/images/anime/2/429...
8061,19489,little witch academia mahoujikake no parade,"adventure, comedy, fantasy",7.75,https://cdn.myanimelist.net/images/anime/12/75...


# Model Training (Content-Based V3)

In [21]:
# Compute the TF-IDF matrix
tfidf_matrix_v2 = tfidf.fit_transform(df_anime['genres'].values.astype('U')).astype(np.float32)

# Convert the sparse matrix to a dense matrix
tfidf_matrix_dense = tfidf_matrix_v2.toarray()

# Normalize the TF-IDF matrix
tfidf_matrix_dense = tfidf_matrix_dense / np.linalg.norm(tfidf_matrix_dense, axis=1, keepdims=True)

# Build the Faiss index
index = faiss.IndexFlatIP(tfidf_matrix_dense.shape[1])
index.add(tfidf_matrix_dense)

# Content-Based Recommendation V3

In [22]:
# Function to get recommendations based on cosine similarity, genre, and ratings based on score
def get_recommendations_by_id_v3(id, index, matrix, df, n=10):
    # Get the index of the anime that matches the title
    idx = df[df['id'] == id].index[0]

    # Compute the similarity scores between the anime at the given index and all other animes
    anime_vector = matrix[idx].reshape(1, -1).astype(np.float32)
    distances, indices = index.search(anime_vector, n + 1)  # n+1 because the anime itself will be included

    # Filter out the anime itself
    indices = indices[0]
    distances = distances[0]
    filtered_indices = [(i, d) for i, d in zip(indices, distances) if i != idx]

    # Sort the results by similarity score
    filtered_indices = sorted(filtered_indices, key=lambda x: (x[1], df.iloc[x[0]]['score']), reverse=True)

    # Get the top n similar animes
    top_indices = [i for i, _ in filtered_indices[:n]]

    # Extract the indices of the recommended animes
    recommended_indices = [i for i in top_indices if df.iloc[i]['score'] != -1]
    return df.iloc[recommended_indices][['id', 'title', 'genres', 'score', 'image_url']]

In [24]:
anime_id = 30831
print(f'Recommendations for "{anime_id}":')
get_recommendations_by_id_v3(anime_id, index, tfidf_matrix_dense, df_anime)

Recommendations for "30831":


Unnamed: 0,id,title,genres,score,image_url
227,251,kyou kara maou,"adventure, comedy, fantasy",7.63,https://cdn.myanimelist.net/images/anime/11/75...
307,331,mahoujin guruguru,"adventure, comedy, fantasy",7.6,https://cdn.myanimelist.net/images/anime/2/758...
468,501,doraemon,"adventure, comedy, fantasy",7.37,https://cdn.myanimelist.net/images/anime/10/33...
256,280,animal yokochou,"adventure, comedy, fantasy",7.26,https://cdn.myanimelist.net/images/anime/5/534...
87,108,ou dorobou jing in seventh heaven,"adventure, comedy, fantasy",7.25,https://cdn.myanimelist.net/images/anime/5/303...
308,332,dokidoki densetsu mahoujin guruguru,"adventure, comedy, fantasy",7.1,https://cdn.myanimelist.net/images/anime/1153/...
94,115,el hazard the alternative world,"adventure, comedy, fantasy",6.79,https://cdn.myanimelist.net/images/anime/3/610...
97,118,el hazard 2 the magnificent world,"adventure, comedy, fantasy",6.78,https://cdn.myanimelist.net/images/anime/11/27...
274,298,hack tasogare no udewa densetsu,"adventure, comedy, fantasy",6.56,https://cdn.myanimelist.net/images/anime/4/783...
141,163,power stone,"adventure, comedy, fantasy",6.55,https://cdn.myanimelist.net/images/anime/1110/...


# Model Training (SVD : User-Based Collaborative)

In [34]:
# # Load the dataset into Surprise's format
# reader = Reader(rating_scale=(df_score['rating'].min(), df_score['rating'].max()))
# data = Dataset.load_from_df(df_score[['user_id', 'anime_id', 'rating']], reader)

In [35]:
# # Define the SVD model
# svd_model = SVD()

# # Evaluate the model using cross-validation
# cross_validate(svd_model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# # Train the SVD model on the entire dataset
# svd_trainset = data.build_full_trainset()
# svd_model.fit(svd_trainset)

In [36]:
# dump(svd_model, '../AllModel/anime_svd_model.joblib')

# User-Based Collaborative Recommendation

In [25]:
# Create a pivot table with users as rows and animes as columns
user_anime_matrix = df_score.pivot(index='user_id', columns='anime_id', values='rating').fillna(0)

# Convert to sparse matrix
user_anime_sparse_matrix = csr_matrix(user_anime_matrix.values)

# Fit the NearestNeighbors model
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_anime_sparse_matrix)

In [26]:
# Function to get real-time recommendations using user-based collaborative filtering
def get_recommendations_by_user(user_id, knn, user_anime_matrix, n=10):
    user_index = user_anime_matrix.index.get_loc(user_id)
    n_neighbors = min(n + 1, user_anime_sparse_matrix.shape[0])  # Ensure n_neighbors is not greater than the number of samples
    distances, indices = knn.kneighbors(user_anime_sparse_matrix[user_index], n_neighbors=n_neighbors)
    
    # Get the indices of the most similar users
    similar_users_indices = indices.flatten()[1:]  # Exclude the user itself
    similar_users_distances = distances.flatten()[1:]  # Exclude the user itself
    
    # Get the animes rated by the user
    user_rated_animes = user_anime_matrix.iloc[user_index]
    
    # Initialize a dictionary to store the weighted sum of ratings
    weighted_ratings = {}
    
    # Iterate over similar users
    for similar_user_index, distance in zip(similar_users_indices, similar_users_distances):
        similarity_score = 1 - distance
        similar_user_ratings = user_anime_matrix.iloc[similar_user_index]
        
        # Iterate over the animes rated by the similar user
        for anime_id, rating in similar_user_ratings.items():
            if user_rated_animes[anime_id] == 0:  # Only consider animes not rated by the user
                if anime_id not in weighted_ratings:
                    weighted_ratings[anime_id] = 0
                weighted_ratings[anime_id] += similarity_score * rating
    
    # Sort the animes based on the weighted sum of ratings
    sorted_animes = sorted(weighted_ratings.items(), key=lambda x: x[1], reverse=True)
    
    # Get the top n animes
    top_n_animes = sorted_animes[:n]
    return top_n_animes

def hybrid_recommendations(user_id, svd, knn, user_anime_matrix, n=10):
    svd_recommendations = []

    if svd is not None:
        # Get initial recommendations using the SVD model
        user_rated_animes = user_anime_matrix.loc[user_id]
        all_animes = user_anime_matrix.columns
        
        for anime_id in all_animes:
            if user_rated_animes[anime_id] == 0:  # Only consider animes not rated by the user
                svd_recommendations.append((anime_id, svd.predict(user_id, anime_id).est))

        # Normalize the SVD recommendation scores
        if svd_recommendations:
            max_svd_score = max(svd_recommendations, key=lambda x: x[1])[1]
            min_svd_score = min(svd_recommendations, key=lambda x: x[1])[1]
            svd_recommendations = [(anime_id, (score - min_svd_score) / (max_svd_score - min_svd_score), score) for anime_id, score in svd_recommendations]
        
        # Sort the SVD recommendations
        svd_recommendations = sorted(svd_recommendations, key=lambda x: x[1], reverse=True)
        
        # Get the top n SVD recommendations
        top_svd_recommendations = svd_recommendations[:n]
    else:
        top_svd_recommendations = []
    
    # Refine the recommendations using user-based collaborative filtering
    user_based_recommendations = get_recommendations_by_user(user_id, knn, user_anime_matrix, n)

    # Normalize the user-based recommendation scores
    if user_based_recommendations:
        max_user_score = max(user_based_recommendations, key=lambda x: x[1])[1]
        min_user_score = min(user_based_recommendations, key=lambda x: x[1])[1]
        user_based_recommendations = [(anime_id, (score - min_user_score) / (max_user_score - min_user_score), score) for anime_id, score in user_based_recommendations]

    # Combine both sets of recommendations and remove duplicates
    combined_recommendations = list({tmdbId: (score, original_score) for tmdbId, score, original_score in top_svd_recommendations + user_based_recommendations}.items())

    # Sort by recommendation score and then by tv score
    combined_recommendations = sorted(combined_recommendations, key=lambda x: (x[1][0], x[1][1]), reverse=True)
    
    final_recommendations = [anime_id for anime_id, _ in combined_recommendations[:n]]
    return final_recommendations

def get_anime_details_by_ids(anime_ids):
    animes = []
    for anime_id in anime_ids:
        anime = df_anime[df_anime['id'] == anime_id]
        animes.append(anime)
    result = pd.concat(animes)
    return result[['title', 'genres', 'score']]

In [27]:
svd_model = load('../WatcherAPI/app/Model/anime_svd_model.joblib')

In [28]:
user_id = '9'
recommended_animes_ids = hybrid_recommendations(user_id, svd_model, knn_model, user_anime_matrix)
print(f'Top 10 recommended animes using hybrid approach for user {user_id}: {recommended_animes_ids}')
get_anime_details_by_ids(recommended_animes_ids)

Top 10 recommended animes using hybrid approach for user 9: [71, 427, 42938, 21, 3572, 1914, 32281, 3092, 12431, 94]


Unnamed: 0,title,genres,score
51,full metal panic,"comedy, action, sci-fi",7.61
400,kaleido star,"comedy, fantasy, drama",7.91
17073,fruits basket the final,"supernatural, romance, drama",9.0
11,one piece,"adventure, fantasy, action",8.69
3131,macross f,"award winning, action, sci-fi, romance",7.89
1745,saiunkoku monogatari 2nd season,"adventure, fantasy, romance",8.02
11239,kimi no na wa,"supernatural, award winning, drama",8.85
2803,junjou romantica,"boys love, comedy, romance, drama",7.5
6732,uchuu kyoudai,"comedy, sci-fi",8.5
73,kidou senshi gundam seed destiny,"romance, action, sci-fi, drama",7.18


# Integrating Real-Time Data

In [29]:
from pymongo import MongoClient

# Function to fetch real-time data from the database
def fetch_real_time_data(uri, db_name, collection_name, data_type='anime'):
    # Connect to the MongoDB database
    client = MongoClient(uri)
    db = client[db_name]
    collection = db[collection_name]
    
    # Fetch the latest user ratings
    cursor = collection.find({'mediaType': 'anime', 'point': {'$gte': 1}}, {'_id': 0, 'userId': 1, 'mediaId': 1, 'point': 1})
    df_real_time = pd.DataFrame(list(cursor))
    df_real_time.rename(columns={'userId': 'user_id', 'mediaId': 'anime_id', 'point': 'rating'}, inplace=True)

    # Convert the data types
    df_real_time['user_id'] = df_real_time['user_id'].astype('str')
    df_real_time['anime_id'] = df_real_time['anime_id'].astype('int64')
    
    # Close the connection
    client.close()
    
    return df_real_time

# Function to update the user-item matrix and recompute nearest neighbors
def update_user_item_matrix(df_score, df_real_time):
    # Combine the existing data with the real-time data
    df_combined = pd.concat([df_score, df_real_time]).drop_duplicates(subset=['user_id', 'anime_id'], keep='last')
    
    # Create a pivot table with users as rows and animes as columns
    user_anime_matrix = df_combined.pivot(index='user_id', columns='anime_id', values='rating').fillna(0)
    
    # Convert to sparse matrix
    user_anime_sparse_matrix = csr_matrix(user_anime_matrix.values)
    
    return user_anime_matrix, user_anime_sparse_matrix

def update_knn_model(user_anime_sparse_matrix):
    # Fit the NearestNeighbors model
    knn = NearestNeighbors(metric='cosine', algorithm='brute')
    knn.fit(user_anime_sparse_matrix)
    
    return knn

In [None]:
mongouser = ""
mongopass = ""

uri = f'mongodb+srv://{mongouser}:{mongopass}@cluster0.ls3onag.mongodb.net/next-auth-prisma?retryWrites=true&w=majority&appName=Cluster0'
df_real_time = fetch_real_time_data(uri, 'next-auth-prisma', 'Media')
user_anime_matrix, user_anime_sparse_matrix = update_user_item_matrix(df_score, df_real_time)
knn = update_knn_model(user_anime_sparse_matrix)

user_id = '66ab7c9293c726da200f6fe2'
recommended_animes_ids = hybrid_recommendations(user_id, svd_model, knn, user_anime_matrix)
print(f'Top 10 recommended animes using hybrid approach for user {user_id}: {recommended_animes_ids}')
get_anime_details_by_ids(recommended_animes_ids)

Top 10 recommended animes using hybrid approach for user 66ab7c9293c726da200f6fe2: [5114, 49387, 820, 44, 24701, 15335, 9969, 35180, 28977, 32281]


Unnamed: 0,title,genres,score
3961,fullmetal alchemist brotherhood,"adventure, fantasy, action, drama",9.1
20589,vinland saga season 2,"adventure, action, drama",8.81
741,ginga eiyuu densetsu,"sci-fi, drama",9.02
25,rurouni kenshin meiji kenkaku romantan - tsuio...,"romance, action, drama",8.71
9150,mushishi zoku shou 2nd season,"adventure, supernatural, slice of life, mystery",8.73
7228,gintama movie 2 kanketsu-hen - yorozuya yo eie...,"comedy, action, sci-fi",8.91
5989,gintama,"comedy, action, sci-fi",9.04
12757,3-gatsu no lion 2nd season,drama,8.93
9875,gintama,"comedy, action, sci-fi",9.06
11239,kimi no na wa,"supernatural, award winning, drama",8.85
