In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Activation,BatchNormalization,InputLayer,Embedding,Dot,Dense,Flatten,Input
from tensorflow.keras.callbacks import ModelCheckpoint,LearningRateScheduler,TensorBoard,EarlyStopping

from wordcloud import WordCloud
%matplotlib inline

###  READING animelist.csv data


In [7]:
import os

In [8]:
INPUT_DIR = os.path.join('..','artifacts','raw')

In [118]:
rating_df = pd.read_csv(INPUT_DIR+'/animelist.csv',low_memory=True,usecols=['user_id','anime_id','rating'])

In [119]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,4898,0
4,0,21,10


In [120]:
len(rating_df)

499999

### Data Preprocessing

In [121]:
n_ratings = rating_df['user_id'].value_counts()
rating_df = rating_df[rating_df["user_id"].isin(n_ratings[n_ratings>=400].index)].copy()

In [122]:
len(rating_df)

331475

In [123]:
min_rating = min(rating_df['rating'])
min_rating

0

In [124]:
max_rating = max(rating_df['rating'])
max_rating

10

In [125]:
avg_rating = np.mean(rating_df['rating'])
avg_rating


np.float64(4.032913492721924)

In [126]:
## performing min-max scaling on the ratings
rating_df['rating'] = rating_df['rating'].apply(lambda x: (x-min_rating)/(max_rating-min_rating)).values.astype(np.float64)

In [127]:
## Check for duplicate values
rating_df.duplicated().sum()

np.int64(0)

In [19]:
## Check for Null Values
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
dtype: int64

In [217]:
# Extract all unique user IDs
user_ids = rating_df['user_id'].unique().tolist()

# Encoding user IDs (Fix: Swap `x` and `i` in enumerate)
user2user_encoder = {x: i for i, x in enumerate(user_ids)}

# Decoding user IDs
user2user_decoded = {i : x for i , x in enumerate(user_ids)}

# Map user_id to encoded values
rating_df['user'] = rating_df['user_id'].map(user2user_encoder)

# Check for missing values
print(rating_df.isnull().sum())


user_id     0
anime_id    0
rating      0
user        0
anime       0
dtype: int64


In [218]:
## Check for number of users
n_users = len(user2user_encoder)
n_users

426

In [219]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
0,1316,32772,0.0,0,407
1,301,37510,0.9,1,1160
2,381,329,0.0,2,2034
3,1348,31240,0.8,3,2130
4,445,3226,0.9,4,1043


In [220]:
## Same thing we've done for user id same will do for anime id's
anime_ids = rating_df['anime_id'].unique().tolist()
anime2anime_encoder = {x:i for i,x in enumerate(anime_ids)}
anime2anime_decoder = {i : x for i , x in enumerate(anime_ids)}
rating_df['anime'] = rating_df['anime_id'].map(anime2anime_encoder)
print(rating_df.isnull().sum())

user_id     0
anime_id    0
rating      0
user        0
anime       0
dtype: int64


In [221]:
n_anime = len(anime2anime_encoder)
n_anime

12689

In [222]:
## to prevent from data leakage we will do random sampling of the data
rating_df = rating_df.sample(frac=1,random_state=42).reset_index(drop=True)

In [223]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
0,1309,36316,0.3,133,3086
1,313,37982,0.5,316,1839
2,98,23289,0.0,233,246
3,41,6893,0.4,39,2295
4,1203,18119,0.8,412,740


In [185]:
user2user_encoder

{944: 0,
 1470: 1,
 1023: 2,
 681: 3,
 386: 4,
 1310: 5,
 1113: 6,
 794: 7,
 785: 8,
 446: 9,
 864: 10,
 590: 11,
 781: 12,
 559: 13,
 1491: 14,
 228: 15,
 516: 16,
 1563: 17,
 1264: 18,
 965: 19,
 760: 20,
 146: 21,
 423: 22,
 1498: 23,
 436: 24,
 642: 25,
 899: 26,
 1601: 27,
 350: 28,
 1008: 29,
 1493: 30,
 1000: 31,
 694: 32,
 1420: 33,
 60: 34,
 1248: 35,
 1329: 36,
 1488: 37,
 631: 38,
 346: 39,
 774: 40,
 1433: 41,
 1185: 42,
 1374: 43,
 1348: 44,
 889: 45,
 314: 46,
 478: 47,
 1616: 48,
 1407: 49,
 41: 50,
 147: 51,
 620: 52,
 267: 53,
 483: 54,
 932: 55,
 731: 56,
 517: 57,
 577: 58,
 156: 59,
 563: 60,
 1049: 61,
 888: 62,
 652: 63,
 1127: 64,
 936: 65,
 1347: 66,
 19: 67,
 486: 68,
 1073: 69,
 1231: 70,
 1203: 71,
 995: 72,
 890: 73,
 438: 74,
 238: 75,
 976: 76,
 1036: 77,
 1523: 78,
 842: 79,
 865: 80,
 442: 81,
 112: 82,
 783: 83,
 571: 84,
 1247: 85,
 389: 86,
 444: 87,
 325: 88,
 1205: 89,
 664: 90,
 1439: 91,
 172: 92,
 1245: 93,
 1046: 94,
 841: 95,
 1430: 96,
 902: 9

In [224]:
rating_df.dropna(inplace=True)

In [225]:
X = rating_df[['user','anime']].values
y = rating_df['rating']

In [226]:
rating_df.isnull().sum()

user_id     0
anime_id    0
rating      0
user        0
anime       0
dtype: int64

In [229]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42  # 10% test size (adjust if needed)
)

print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (298327, 2), y_train shape: (298327,)
X_test shape: (33148, 2), y_test shape: (33148,)


In [230]:
import numpy as np

X_train_array = [np.array(X_train[:, 0]), np.array(X_train[:, 1])]
X_test_array = [np.array(X_test[:, 0]), np.array(X_test[:, 1])]
len(X_test_array)


2

### MODEL ARCHITECTURE

In [33]:
from tensorflow.keras.layers import Input, Embedding, Dot, Flatten, Dense, BatchNormalization, Activation
from tensorflow.keras.models import Model

def RecommenderNet():
    embedding_size = 128

    user = Input(name='user', shape=[1])
    user_embedding = Embedding(name='user_embedding', input_dim=n_users, output_dim=embedding_size)(user)

    anime = Input(name='anime', shape=[1])
    anime_embedding = Embedding(name='anime_embedding', input_dim=n_anime, output_dim=embedding_size)(anime)

    # Fix: Corrected axes to 1
    x = Dot(name='dot_product', normalize=True, axes=1)([user_embedding, anime_embedding])

    x = Flatten()(x)

    # Fix: Corrected kernel_initializer typo
    x = Dense(1, kernel_initializer='he_normal')(x)
    x = BatchNormalization()(x)
    x = Activation('sigmoid')(x)

    model = Model(inputs=[user, anime], outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['mae', 'mse'])

    return model


In [34]:
model = RecommenderNet()

In [35]:
model.summary()

In [36]:
# To find the best learning rate, we can use the LearningRateScheduler callback
# Define the learning rate schedule
start_lr = 0.00001
min_lr = 0.00001
max_lr = 0.00005
batch_size = 10000

ramum_epochs = 5
sustain_epochs = 0
exp_decay  = 0.8
def lrfn(epoch):
    if epoch < ramum_epochs:
        return (max_lr-start_lr)/ramum_epochs*epoch + start_lr
    elif epoch < ramum_epochs + sustain_epochs:
        return max_lr
    else:
        return (max_lr-min_lr)*exp_decay**(epoch-ramum_epochs-sustain_epochs)+min_lr

In [37]:
lr_callback = LearningRateScheduler(lambda epoch: lrfn(epoch),verbose=0)
checkpoint_filepath = './weights.weights.h5'

model_checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    save_weights_only=True,
    save_best_only=True,
    mode='min',
    verbose=1
)

early_stopping = EarlyStopping(
    patience=3,
    monitor='val_loss',
    mode='min',
    restore_best_weights=True,
)

In [38]:
my_callbacks = [model_checkpoint,lr_callback,early_stopping]

In [39]:
#import numpy as np

#X_train_array = [np.array(X_train[:, 0]), np.array(X_train[:, 1])]
#X_test_array = [np.array(X_test[:, 0]), np.array(X_test[:, 1])]
history = model.fit(
    x=X_train_array,  # Pass as a list of two inputs
    y=y_train,
    batch_size=batch_size,
    epochs=20,
    verbose=1,
    validation_data=(X_test_array, y_test) , # Validation data must also be in the same format
    callbacks = my_callbacks
)

Epoch 1/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 274ms/step - loss: 0.8055 - mae: 0.3830 - mse: 0.1990
Epoch 1: val_loss improved from inf to 0.86866, saving model to ./weights.weights.h5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 300ms/step - loss: 0.8055 - mae: 0.3831 - mse: 0.1990 - val_loss: 0.8687 - val_mae: 0.3936 - val_mse: 0.2170 - learning_rate: 1.0000e-05
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 279ms/step - loss: 0.8058 - mae: 0.3832 - mse: 0.1990
Epoch 2: val_loss improved from 0.86866 to 0.84734, saving model to ./weights.weights.h5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 294ms/step - loss: 0.8058 - mae: 0.3832 - mse: 0.1990 - val_loss: 0.8473 - val_mae: 0.3905 - val_mse: 0.2115 - learning_rate: 1.8000e-05
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 604ms/step - loss: 0.8046 - mae: 0.3829 - mse: 0.1988
Epoch 3: val_loss improved from 0.84

In [40]:
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (331475, 2)
y shape: (331475,)


In [41]:
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")


X_train shape: (298327, 2)
X_test shape: (33148, 2)
y_train shape: (298327,)
y_test shape: (33148,)


In [42]:
def extract_weights(name,model):
    weight_layer = model.get_layer(name)
    weights = weight_layer.get_weights()[0]
    weights = weights/np.linalg.norm(weights,axis=1).reshape(-1,1)
    return weights

In [43]:
anime_weights = extract_weights('anime_embedding',model)

In [44]:
user_weights = extract_weights('user_embedding',model)

In [45]:
anime_weights

array([[ 0.01571589, -0.01785725,  0.0947482 , ..., -0.03620861,
        -0.06169871,  0.03265613],
       [-0.1325622 ,  0.08138491, -0.00495628, ...,  0.13067654,
         0.09007382, -0.04756843],
       [-0.10643987,  0.05373679, -0.10999032, ..., -0.00061054,
        -0.01142685, -0.14388621],
       ...,
       [-0.11642922, -0.10335897, -0.01660006, ...,  0.02909606,
        -0.00896194, -0.1104106 ],
       [-0.13122068, -0.05646526, -0.06611496, ...,  0.01903584,
         0.11218029,  0.07635825],
       [-0.04937718, -0.12621294,  0.04940003, ..., -0.07821552,
         0.04490033, -0.07393818]], dtype=float32)

### Read Anime.csv

In [231]:
import pandas as pd
df = pd.read_csv(INPUT_DIR+'/anime.csv',low_memory=True)
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,50229.0,75651.0,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, Magic",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,2182.0,4806.0,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,312.0,529.0,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0


In [232]:
df.replace('Unknown',np.nan,inplace=True)

In [233]:
def getAnimeName(anime_id):
    try:
        name = df[df.anime_id == anime_id].eng_version.values[0]
        if name is np.nan:
            name = df[df.anime_id == anime_id].Name.values[0]
    except:
        print("error")
    return name

In [234]:
df['anime_id'] = df['MAL_ID']
df['eng_version'] = df['English name']
df['eng_version'] = df.anime_id.apply(lambda x: getAnimeName(x))

In [235]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id,eng_version
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0,1,Cowboy Bebop
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",,...,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0,5,Cowboy Bebop:The Movie
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",Trigun,トライガン,TV,26,"Apr 1, 1998 to Sep 30, 1998",Spring 1998,...,86142.0,49432.0,15376.0,5838.0,1965.0,664.0,316.0,533.0,6,Trigun
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, Magic",Witch Hunter Robin,Witch Hunter ROBIN (ウイッチハンターロビン),TV,26,"Jul 2, 2002 to Dec 24, 2002",Summer 2002,...,10128.0,11618.0,5709.0,2920.0,1083.0,353.0,164.0,131.0,7,Witch Hunter Robin
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",Beet the Vandel Buster,冒険王ビィト,TV,52,"Sep 30, 2004 to Sep 29, 2005",Fall 2004,...,1242.0,1713.0,1068.0,634.0,265.0,83.0,50.0,27.0,8,Beet the Vandel Buster


In [236]:
getAnimeName(6702)

'Fairy Tail'

In [237]:
df.sort_values('Score',inplace=True,ascending=False,
               kind='quicksort',
               na_position='last')

In [198]:
df.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1,anime_id,eng_version
3971,5114,Fullmetal Alchemist: Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Magic, Fantasy, Shounen",Fullmetal Alchemist:Brotherhood,鋼の錬金術師 FULLMETAL ALCHEMIST,TV,64,"Apr 5, 2009 to Jul 4, 2010",Spring 2009,...,199160.0,70045.0,20210.0,9308.0,3222.0,1536.0,2162.0,16806.0,5114,Fullmetal Alchemist:Brotherhood
15926,40028,Shingeki no Kyojin: The Final Season,9.17,"Action, Military, Mystery, Super Power, Drama, Fantasy, Shounen",Attack on Titan Final Season,進撃の巨人 The Final Season,TV,16,"Dec 7, 2020 to ?",Winter 2021,...,26016.0,8793.0,2674.0,1336.0,588.0,382.0,514.0,11061.0,40028,Attack on Titan Final Season
5683,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",Steins;Gate,STEINS;GATE,TV,24,"Apr 6, 2011 to Sep 14, 2011",Spring 2011,...,140914.0,57740.0,21375.0,11126.0,5061.0,2292.0,1678.0,5255.0,9253,Steins;Gate
14963,38524,Shingeki no Kyojin Season 3 Part 2,9.1,"Action, Drama, Fantasy, Military, Mystery, Shounen, Super Power",Attack on Titan Season 3 Part 2,進撃の巨人 Season3 Part.2,TV,10,"Apr 29, 2019 to Jul 1, 2019",Spring 2019,...,110481.0,33662.0,8365.0,2974.0,1108.0,550.0,385.0,4169.0,38524,Attack on Titan Season 3 Part 2
9913,28977,Gintama°,9.1,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",Gintama Season 4,銀魂°,TV,51,"Apr 8, 2015 to Mar 30, 2016",Spring 2015,...,21360.0,10215.0,3898.0,2311.0,952.0,648.0,1100.0,4508.0,28977,Gintama Season 4


In [238]:
df = df[['anime_id','eng_version','Score','Genres','Episodes','Type','Premiered','Members']].copy()

In [239]:
df.head()

Unnamed: 0,anime_id,eng_version,Score,Genres,Episodes,Type,Premiered,Members
3971,5114,Fullmetal Alchemist:Brotherhood,9.19,"Action, Military, Adventure, Comedy, Drama, Magic, Fantasy, Shounen",64,TV,Spring 2009,2248456
15926,40028,Attack on Titan Final Season,9.17,"Action, Military, Mystery, Super Power, Drama, Fantasy, Shounen",16,TV,Winter 2021,733260
5683,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",24,TV,Spring 2011,1771162
14963,38524,Attack on Titan Season 3 Part 2,9.1,"Action, Drama, Fantasy, Military, Mystery, Shounen, Super Power",10,TV,Spring 2019,1073626
9913,28977,Gintama Season 4,9.1,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",51,TV,Spring 2015,404121


In [240]:
def getAnimeFrame(anime,df):
    if isinstance(anime,int):
        return df[df.anime_id == anime]
    if isinstance(anime,str):
        return df[df.eng_version == anime]

In [241]:
getAnimeFrame('Steins;Gate',df)

Unnamed: 0,anime_id,eng_version,Score,Genres,Episodes,Type,Premiered,Members
5683,9253,Steins;Gate,9.11,"Thriller, Sci-Fi",24,TV,Spring 2011,1771162


## Anime with synopsis

In [242]:
cols=['MAL_ID','Name','Genres','sypnopsis']

In [243]:
synopsis_df = pd.read_csv(INPUT_DIR+'/anime_with_synopsis.csv',low_memory=True,usecols=cols)
synopsis_df.head(1)

Unnamed: 0,MAL_ID,Name,Genres,sypnopsis
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space","In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind. The Inter Solar System Police attempts to keep peace in the galaxy, aided in part by outlaw bounty hunters, referred to as ""Cowboys."" The ragtag team aboard the spaceship Bebop are two such individuals. Mellow and carefree Spike Spiegel is balanced by his boisterous, pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards. Thrown off course by the addition of new members that they meet in their travels—Ein, a genetically engineered, highly intelligent Welsh Corgi; femme fatale Faye Valentine, an enigmatic trickster with memory loss; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member's dark and mysterious past little by little. Well-balanced with high density action and light-hearted comedy, Cowboy Bebop is a space Western classic and an homage to the smooth and improvised music it is named after."


In [244]:
def getSynopsis(anime,df):
    if isinstance(anime,int):
        return synopsis_df[synopsis_df.MAL_ID == anime].sypnopsis.values[0]
    if isinstance(anime,str):
        return synopsis_df[synopsis_df.Name == anime].sypnopsis.values[0]

In [245]:
getSynopsis('Steins;Gate',synopsis_df)

'The self-proclaimed mad scientist Rintarou Okabe rents out a room in a rickety old building in Akihabara, where he indulges himself in his hobby of inventing prospective "future gadgets" with fellow lab members: Mayuri Shiina, his air-headed childhood friend, and Hashida Itaru, a perverted hacker nicknamed "Daru." The three pass the time by tinkering with their most promising contraption yet, a machine dubbed the "Phone Microwave," which performs the strange function of morphing bananas into piles of green gel. Though miraculous in itself, the phenomenon doesn\'t provide anything concrete in Okabe\'s search for a scientific breakthrough; that is, until the lab members are spurred into action by a string of mysterious happenings before stumbling upon an unexpected success—the Phone Microwave can send emails to the past, altering the flow of history. Adapted from the critically acclaimed visual novel by 5pb. and Nitroplus, Steins;Gate takes Okabe through the depths of scientific theory 

### Content Based Recommendation System

In [246]:
pd.set_option('max_colwidth',None)

In [247]:
def find_similar_animes(name, anime_weights, anime2anime_encoded, anime2anime_decoded, df, synopsis_df, n=10, return_dist=False, neg=False):
    # Get the anime_id for the given name
    index = getAnimeFrame(name, df).anime_id.values[0]
    encoded_index = anime2anime_encoded.get(index)

    if encoded_index is None:
        raise ValueError(f"Encoded index not found for anime ID: {index}")

    weights = anime_weights

    # Compute the similarity distances
    dists = np.dot(weights, weights[encoded_index])  # Ensure weights[encoded_index] is a 1D array
    sorted_dists = np.argsort(dists)

    n = n + 1

    # Select closest or farthest based on 'neg' flag
    if neg:
        closest = sorted_dists[:n]
    else:
        closest = sorted_dists[-n:]

    # Return distances and closest indices if requested
    if return_dist:
        return dists, closest

    # Build the similarity array
    SimilarityArr = []
    for close in closest:
        decoded_id = anime2anime_decoded.get(close)
       

       
        anime_frame = getAnimeFrame(decoded_id, df)

        anime_name = anime_frame.eng_version.values[0]
        genre = anime_frame.Genres.values[0]
        similarity = dists[close]
   

        SimilarityArr.append({
            "anime_id": decoded_id,
            "name": anime_name,
            "similarity": similarity,
            "genre": genre,
        })
       

    # Create a DataFrame with results and sort by similarity
    Frame = pd.DataFrame(SimilarityArr).sort_values(by="similarity", ascending=False)
    return Frame[Frame.anime_id != index].drop(['anime_id'], axis=1).reset_index(drop=True)


In [248]:
find_similar_animes(
    "Naruto",
    anime_weights,
    anime2anime_encoder,
    anime2anime_decoder,
    df,
    synopsis_df,
)

Unnamed: 0,name,similarity,genre
0,Flip Flappers,0.36873,"Sci-Fi, Adventure, Comedy, Magic"
1,Circuit Angel: Ketsui no Starting Grid,0.33526,"Action, Sports"
2,Neighborhood Story The Movie,0.329488,"Comedy, Drama, Romance"
3,Rokudou Juku,0.325323,Music
4,Natural,0.313827,Hentai
5,Dan Doh!!,0.30524,"Adventure, Sports, Shounen"
6,Futsuu tte Nandarou?,0.301039,Slice of Life
7,Eden (ONA),0.293667,"Sci-Fi, Slice of Life, Fantasy"
8,Buddy Go! 2,0.281123,"Comedy, Romance, Shoujo"
9,Dragon Ball Z Kai,0.273445,"Action, Adventure, Comedy, Fantasy, Martial Arts, Shounen, Super Power"


In [294]:
def find_similar_users(input_item_id, user_weights, user2user_encoded, user2user_decoded,n=10, return_dist=False, neg=False):
    # Get the anime_id for the given name
    index = input_item_id
    encoded_index = user2user_encoded.get(index)

    if encoded_index is None:
        raise ValueError(f"Encoded index not found for anime ID: {index}")

    weights = user_weights

    # Compute the similarity distances
    dists = np.dot(weights, weights[encoded_index])  # Ensure weights[encoded_index] is a 1D array
    sorted_dists = np.argsort(dists)

    n = n + 1

    # Select closest or farthest based on 'neg' flag
    if neg:
        closest = sorted_dists[:n]
    else:
        closest = sorted_dists[-n:]

    # Return distances and closest indices if requested
    if return_dist:
        return dists, closest

    # Build the similarity array
    SimilarityArr = []
    for close in closest:
        similarity = dists[close]

        if isinstance(input_item_id,int):
            decoded_id = user2user_decoded.get(close)

            SimilarityArr.append({
                'similar_user': decoded_id,
                "similarity": similarity,
            })
       

    # Create a DataFrame with results and sort by similarity
    similar_users = pd.DataFrame(SimilarityArr).sort_values(by="similarity", ascending=False)
    return similar_users[similar_users.similar_user != input_item_id]


In [None]:
find_similar_users(int(869),user_weights,user2user_encoder,user2user_decoded)

Unnamed: 0,similar_user,similarity
9,1289,0.247572
8,386,0.236648
7,257,0.23345
6,948,0.204745
5,785,0.203347
4,413,0.194206
3,760,0.18896
2,519,0.184187
1,870,0.181269
0,1177,0.174642


In [251]:
def show_wordcloud(all_genres):
    genres_cloud = WordCloud(width=700,height=400,background_color='white',colormap='gnuplot').generate_from_frequencies(all_genres)
    plt.figure(figsize=(10,8))
    plt.imshow(genres_cloud,interpolation='bilinear')
    plt.axis('off')
    plt.show()

In [252]:
from collections import defaultdict

In [279]:
def getFavGenre(frame,plot=False):
    frame.dropna(inplace=False)
    all_genres = defaultdict(int)
    print(all_genres)

    genres_list = []
    for genres in frame['Genres']:
        if isinstance(genres,str):
            for genre in genres.split(','):
                genres_list.append(genre)
                all_genres[genre.strip()] +=1
    if plot:
        show_wordcloud(all_genres)
    return genres_list,all_genres


In [280]:
x = getAnimeFrame(1,df)

In [281]:
x

Unnamed: 0,anime_id,eng_version,Score,Genres,Episodes,Type,Premiered,Members
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",26,TV,Spring 1998,1251960


In [282]:
getFavGenre(x)

defaultdict(<class 'int'>, {})


(['Action', ' Adventure', ' Comedy', ' Drama', ' Sci-Fi', ' Space'],
 defaultdict(int,
             {'Action': 1,
              'Adventure': 1,
              'Comedy': 1,
              'Drama': 1,
              'Sci-Fi': 1,
              'Space': 1}))

In [316]:
def get_user_preferences(user_id , rating_df , df ,plot=False):

    animes_watched_by_user = rating_df[rating_df.user_id == user_id]

    user_rating_percentile = np.percentile(animes_watched_by_user.rating , 75)

    animes_watched_by_user = animes_watched_by_user[animes_watched_by_user.rating >= user_rating_percentile]

    top_animes_user = (
        animes_watched_by_user.sort_values(by="rating" , ascending=False).anime_id.values
    )

    anime_df_rows = df[df["anime_id"].isin(top_animes_user)]
    anime_df_rows = anime_df_rows[["eng_version","Genres"]]

    if plot:
        getFavGenre(anime_df_rows,plot)

    return anime_df_rows

In [318]:
get_user_preferences(1309,rating_df,df)

Unnamed: 0,eng_version,Genres
3971,Fullmetal Alchemist:Brotherhood,"Action, Military, Adventure, Comedy, Drama, Magic, Fantasy, Shounen"
5683,Steins;Gate,"Thriller, Sci-Fi"
14963,Attack on Titan Season 3 Part 2,"Action, Drama, Fantasy, Military, Mystery, Shounen, Super Power"
6474,Hunter x Hunter,"Action, Adventure, Fantasy, Shounen, Super Power"
3537,Clannad ~After Story~,"Slice of Life, Comedy, Supernatural, Drama, Romance"
...,...,...
14822,"Demon Lord, Retry!","Action, Adventure, Fantasy"
14197,Plunderer,"Action, Ecchi, Fantasy, Shounen"
17180,The Warrior From Qin,"Action, Adventure, Fantasy"
16344,Redo of Healer,"Ecchi, Fantasy"


In [283]:
rating_df.head()

Unnamed: 0,user_id,anime_id,rating,user,anime
0,1309,36316,0.3,133,3086
1,313,37982,0.5,316,1839
2,98,23289,0.0,233,246
3,41,6893,0.4,39,2295
4,1203,18119,0.8,412,740


In [321]:
def get_user_recommendations(similar_users , user_pref ,df , synopsis_df, rating_df, n=5):

    recommended_animes = []
    anime_list = []

    for user_id in similar_users.similar_user.values:
        pref_list = get_user_preferences(int(user_id) , rating_df, df)

        pref_list = pref_list[~pref_list.eng_version.isin(user_pref.eng_version.values)]

        if not pref_list.empty:
            anime_list.append(pref_list.eng_version.values)

    if anime_list:
            anime_list = pd.DataFrame(anime_list)

            sorted_list = pd.DataFrame(pd.Series(anime_list.values.ravel()).value_counts()).head(n)

            for i,anime_name in enumerate(sorted_list.index):
                n_user_pref = sorted_list[sorted_list.index == anime_name].values[0][0]

                if isinstance(anime_name,str):
                    frame = getAnimeFrame(anime_name,df)
                    anime_id = frame.anime_id.values[0]
                    genre = frame.Genres.values[0]
                    synopsis = getSynopsis(int(anime_id),synopsis_df)

                    recommended_animes.append({
                        "n" : n_user_pref,
                        "anime_name" : anime_name,
                        "Genres" : genre,
                        "Synopsis": synopsis
                    })
    return pd.DataFrame(recommended_animes).head(n)    

In [313]:
similar_users = find_similar_users(int(869),user_weights,user2user_encoder,user2user_decoded)

In [319]:
user_pref = get_user_preferences(869,rating_df,df)

In [322]:
get_user_recommendations(similar_users , user_pref ,df , synopsis_df, rating_df)

Unnamed: 0,n,anime_name,Genres,Synopsis
0,9,Toradora!,"Slice of Life, Comedy, Romance, School","uuji Takasu is a gentle high school student with a love for housework; but in contrast to his kind nature, he has an intimidating face that often gets him labeled as a delinquent. On the other hand is Taiga Aisaka, a small, doll-like student, who is anything but a cute and fragile girl. Equipped with a wooden katana and feisty personality, Taiga is known throughout the school as the ""Palmtop Tiger."" One day, an embarrassing mistake causes the two students to cross paths. Ryuuji discovers that Taiga actually has a sweet side: she has a crush on the popular vice president, Yuusaku Kitamura, who happens to be his best friend. But things only get crazier when Ryuuji reveals that he has a crush on Minori Kushieda—Taiga's best friend! Toradora! is a romantic comedy that follows this odd duo as they embark on a quest to help each other with their respective crushes, forming an unlikely alliance in the process."
1,9,Bakuman.,"Comedy, Drama, Romance, Shounen","Onto their third serialization, manga duo Moritaka Mashiro and Akito Takagi—also known by their pen name, Muto Ashirogi—are ever closer to their dream of an anime adaption. However, the real challenge is only just beginning: if they are unable to compete with the artist Eiji Niizuma in the rankings within the span of six months, they will be canceled. To top it off, numerous rivals are close behind and declaring war. They don't even have enough time to spare thinking about an anime! In Bakuman. 3rd Season , Muto Ashirogi must find a way to stay atop the colossal mountain known as the Shounen Jack rankings. With new problems and new assistants, the pair continue to strive for their dream."
2,8,Parasyte -the maxim-,"Action, Sci-Fi, Horror, Psychological, Drama, Seinen","ll of a sudden, they arrived: parasitic aliens that descended upon Earth and quickly infiltrated humanity by burrowing into the brains of vulnerable targets. These insatiable beings acquire full control of their host and are able to morph into a variety of forms in order to feed on unsuspecting prey. Sixteen-year-old high school student Shinichi Izumi falls victim to one of these parasites, but it fails to take over his brain, ending up in his right hand instead. Unable to relocate, the parasite, now named Migi, has no choice but to rely on Shinichi in order to stay alive. Thus, the pair is forced into an uneasy coexistence and must defend themselves from hostile parasites that hope to eradicate this new threat to their species."
3,8,Hunter x Hunter,"Action, Adventure, Fantasy, Shounen, Super Power","Hunter x Hunter is set in a world where Hunters exist to perform all manner of dangerous tasks like capturing criminals and bravely searching for lost treasures in uncharted territories. Twelve-year-old Gon Freecss is determined to become the best Hunter possible in hopes of finding his father, who was a Hunter himself and had long ago abandoned his young son. However, Gon soon realizes the path to achieving his goals is far more challenging than he could have ever imagined. Along the way to becoming an official Hunter, Gon befriends the lively doctor-in-training Leorio, vengeful Kurapika, and rebellious ex-assassin Killua. To attain their own goals and desires, together the four of them take the Hunter Exam, notorious for its low success rate and high probability of death. Throughout their journey, Gon and his friends embark on an adventure that puts them through many hardships and struggles. They will meet a plethora of monsters, creatures, and characters—all while learning what being a Hunter truly means."
4,8,Princess Mononoke,"Action, Adventure, Fantasy","hen an Emishi village is attacked by a fierce demon boar, the young prince Ashitaka puts his life at stake to defend his tribe. With its dying breath, the beast curses the prince's arm, granting him demonic powers while gradually siphoning his life away. Instructed by the village elders to travel westward for a cure, Ashitaka arrives at Tatara, the Iron Town, where he finds himself embroiled in a fierce conflict: Lady Eboshi of Tatara, promoting constant deforestation, stands against Princess San and the sacred spirits of the forest, who are furious at the destruction brought by the humans. As the opposing forces of nature and mankind begin to clash in a desperate struggle for survival, Ashitaka attempts to seek harmony between the two, all the while battling the latent demon inside of him. Princess Mononoke is a tale depicting the connection of technology and nature, while showing the path to harmony that could be achieved by mutual acceptance."


In [None]:
def hybrid_recommendation(user_id , user_weight=0.5, content_weight =0.5):

    ## User Recommndation

    similar_users =find_similar_users(user_id,user_weights,user2user_encoded,user2user_decoded)
    user_pref = get_user_preferences(user_id , rating_df, df)
    user_recommended_animes =get_user_recommendations(similar_users,user_pref,df, synopsis_df,rating_df)
    

    user_recommended_anime_list = user_recommended_animes["anime_name"].tolist()
    print(user_recommended_anime_list)

    #### Content recommendation
    content_recommended_animes = []

    for anime in user_recommended_anime_list:
        similar_animes = find_similar_animes(anime, anime_weights, anime2anime_encoded, anime2anime_decoded, df, synopsis_df)

        if similar_animes is not None and not similar_animes.empty:
            content_recommended_animes.extend(similar_animes["name"].tolist())
        else:
            print(f"No similar anime found {anime}")
    
    combined_scores = {}

    for anime in user_recommended_anime_list:
        combined_scores[anime] = combined_scores.get(anime,0) + user_weight

    for anime in content_recommended_animes:
        combined_scores[anime] = combined_scores.get(anime,0) + content_weight  

    sorted_animes = sorted(combined_scores.items() , key=lambda x:x[1] , reverse=True)

    return [anime for anime , score in sorted_animes[:10]] 

