<a href="https://colab.research.google.com/github/Rahul19982022/anime-recommendation-system-exp/blob/main/archives/2_collaborative_filtering_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import random

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
import tensorflow as tf

In [None]:
# Reproducibility setup: a single integer seed is pushed into every random
# number generator this notebook relies on.
SEED = 1

# Seed Python's `random`, NumPy's global RNG and TensorFlow/Keras.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Export the seed for Python's hash randomisation.
# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so setting it
# here presumably only affects subprocesses, not this running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Ask TensorFlow to run its ops deterministically, so repeated runs with the
# same seed can produce the same weights (including on GPU).
tf.config.experimental.enable_op_determinism()

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

from tensorflow.keras.layers import Input, Embedding, Dot, Reshape, Dense, Flatten, Lambda, Multiply, Add
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the project
# folder used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Make the project folder on Drive the working directory so that the relative
# paths used below (datasets/, model/, data_split/) resolve inside it.
proj_path = '/content/drive/MyDrive/anime_proj'
os.chdir(proj_path)

In [None]:
# Anime metadata table; its MAL_ID, Genres and Type columns drive the filtering below.
anime = pd.read_csv('datasets/kaggle_dataset/anime.csv')

In [None]:
# Per-user rating records, presumably written by an earlier preprocessing step
# (file name ends in "Step1"). Unpickling executes code from the file, so only
# load pickles from a trusted source.
user = pd.read_pickle('datasets/created_datasets/user_rated_data_Step1.pkl')

In [None]:
# Column names, non-null counts and dtypes of the raw anime metadata.
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17562 entries, 0 to 17561
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         17562 non-null  int64 
 1   Name           17562 non-null  object
 2   Score          17562 non-null  object
 3   Genres         17562 non-null  object
 4   English name   17562 non-null  object
 5   Japanese name  17562 non-null  object
 6   Type           17562 non-null  object
 7   Episodes       17562 non-null  object
 8   Aired          17562 non-null  object
 9   Premiered      17562 non-null  object
 10  Producers      17562 non-null  object
 11  Licensors      17562 non-null  object
 12  Studios        17562 non-null  object
 13  Source         17562 non-null  object
 14  Duration       17562 non-null  object
 15  Rating         17562 non-null  object
 16  Ranked         17562 non-null  object
 17  Popularity     17562 non-null  int64 
 18  Members        17562 non-n

In [None]:
# These two columns are not used anywhere further down, so discard them.
user = user.drop(columns=['watching_status', 'watched_episodes'])

In [None]:
# Remaining columns, dtypes and memory footprint after the drop.
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61699446 entries, 0 to 61699445
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   uint32
 1   anime_id  uint16
 2   rating    uint8 
dtypes: uint16(1), uint32(1), uint8(1)
memory usage: 411.9 MB


In [None]:
# Number of titles per Type value; used to decide which types to exclude.
anime['Type'].value_counts()

Unnamed: 0_level_0,count
Type,Unnamed: 1_level_1
TV,4996
OVA,3894
Movie,3041
Special,2218
ONA,1907
Music,1469
Unknown,37


In [None]:
# MAL_IDs to exclude: every title whose Genres string contains 'Hentai', plus
# every title whose Type is 'Music' or 'Unknown'.
rem_genre = anime.loc[anime['Genres'].str.contains('Hentai'), 'MAL_ID'].tolist()
music_cat = anime.loc[anime['Type'] == 'Music', 'MAL_ID'].tolist()
unknown_cat = anime.loc[anime['Type'] == 'Unknown', 'MAL_ID'].tolist()

anime_rem = set(rem_genre) | set(music_cat) | set(unknown_cat)

# Remove the excluded titles from both tables and renumber the rows from 0.
user = user[~user['anime_id'].isin(anime_rem)].reset_index(drop=True)
anime = anime[~anime['MAL_ID'].isin(anime_rem)].reset_index(drop=True)

In [None]:
# Row count of the ratings table after the excluded titles were removed.
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60346030 entries, 0 to 60346029
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   uint32
 1   anime_id  uint16
 2   rating    uint8 
dtypes: uint16(1), uint32(1), uint8(1)
memory usage: 402.9 MB


In [None]:
# Row count of the metadata table after the excluded titles were removed.
anime.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14708 entries, 0 to 14707
Data columns (total 35 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   MAL_ID         14708 non-null  int64 
 1   Name           14708 non-null  object
 2   Score          14708 non-null  object
 3   Genres         14708 non-null  object
 4   English name   14708 non-null  object
 5   Japanese name  14708 non-null  object
 6   Type           14708 non-null  object
 7   Episodes       14708 non-null  object
 8   Aired          14708 non-null  object
 9   Premiered      14708 non-null  object
 10  Producers      14708 non-null  object
 11  Licensors      14708 non-null  object
 12  Studios        14708 non-null  object
 13  Source         14708 non-null  object
 14  Duration       14708 non-null  object
 15  Rating         14708 non-null  object
 16  Ranked         14708 non-null  object
 17  Popularity     14708 non-null  int64 
 18  Members        14708 non-n

In [None]:
# Peek at the first two metadata rows.
anime.head(2)

Unnamed: 0,MAL_ID,Name,Score,Genres,English name,Japanese name,Type,Episodes,Aired,Premiered,...,Score-10,Score-9,Score-8,Score-7,Score-6,Score-5,Score-4,Score-3,Score-2,Score-1
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",Cowboy Bebop,カウボーイビバップ,TV,26,"Apr 3, 1998 to Apr 24, 1999",Spring 1998,...,229170.0,182126.0,131625.0,62330.0,20688.0,8904.0,3184.0,1357.0,741.0,1580.0
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Cowboy Bebop:The Movie,カウボーイビバップ 天国の扉,Movie,1,"Sep 1, 2001",Unknown,...,30043.0,49201.0,49505.0,22632.0,5805.0,1877.0,577.0,221.0,109.0,379.0


In [None]:
# First five rating records.
user.head()

Unnamed: 0,user_id,anime_id,rating
0,0,67,9
1,0,6702,7
2,0,242,10
3,0,24,9
4,0,4722,8


In [None]:
# Number of ratings each title has received, most-rated first.
user['anime_id'].value_counts()

Unnamed: 0_level_0,count
anime_id,Unnamed: 1_level_1
1535,193817
16498,177096
11757,170836
6547,145391
5114,144357
...,...
35962,1
41458,1
41277,1
35153,1


In [None]:
# Titles with at least 500 ratings. The counts are computed once and then
# filtered through a callable indexer, instead of running value_counts() over
# the whole ratings table twice.
user['anime_id'].value_counts().loc[lambda counts: counts >= 500]

Unnamed: 0_level_0,count
anime_id,Unnamed: 1_level_1
1535,193817
16498,177096
11757,170836
6547,145391
5114,144357
...,...
38425,500
15979,500
1344,500
216,500


In [None]:
# Minimum number of ratings a title needs in order to be kept.
anime_threshold = 500

# anime_ids meeting the threshold. value_counts() is evaluated a single time
# and filtered via a callable indexer rather than being recomputed for the mask.
animes_thr_list = (
    user['anime_id']
    .value_counts()
    .loc[lambda counts: counts >= anime_threshold]
    .index
    .tolist()
)

In [None]:
# Keep only the titles that met the rating-count threshold, in both tables,
# and renumber the rows from 0.
user = user[user['anime_id'].isin(animes_thr_list)].reset_index(drop=True)
anime = anime[anime['MAL_ID'].isin(animes_thr_list)].reset_index(drop=True)

In [None]:
# Modelling subset: only users who have rated at least `user_threshold` titles.
user_threshold = 10
n_ratings = user['user_id'].value_counts()
user_model = user[
    user['user_id'].isin(n_ratings[n_ratings >= user_threshold].index)
]

In [None]:
# Size of the modelling subset after the per-user threshold was applied.
user_model.info()

<class 'pandas.core.frame.DataFrame'>
Index: 59406098 entries, 0 to 59516291
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   user_id   uint32
 1   anime_id  uint16
 2   rating    uint8 
dtypes: uint16(1), uint32(1), uint8(1)
memory usage: 849.8 MB


In [None]:
# Force a garbage-collection pass, presumably to release the large intermediate
# frames left behind by the filtering cells; the displayed value is the number
# of unreachable objects found.
import gc
gc.collect()

45

In [None]:
# Detach from `user` so the new columns are written to an independent frame.
user_model = user_model.copy()

# Map raw ids to dense zero-based indices (numbered in order of first
# appearance); these indices are what the embedding layers are fed later.
user_ids = user_model["user_id"].unique().tolist()
user_encoded2user = dict(enumerate(user_ids))
user2user_encoded = {raw_id: idx for idx, raw_id in user_encoded2user.items()}
user_model["user_enc"] = user_model["user_id"].map(user2user_encoded)
n_users = len(user_ids)

anime_ids = user_model["anime_id"].unique().tolist()
anime_encoded2anime = dict(enumerate(anime_ids))
anime2anime_encoded = {raw_id: idx for idx, raw_id in anime_encoded2anime.items()}
user_model["anime_enc"] = user_model["anime_id"].map(anime2anime_encoded)
n_animes = len(anime_ids)

print("Num of users: {}, Num of animes: {}".format(n_users, n_animes))

Num of users: 285270, Num of animes: 6356


In [None]:
# Attach the encoded ids to the full `user` table. Users that were filtered out
# of `user_model` have no entry in user2user_encoded, so .map() yields missing
# values for them; the nullable 'Int32' dtype keeps those as <NA>.
user['user_enc'] = user['user_id'].map(user2user_encoded).astype('Int32')
# A plain 'uint16' cannot hold missing values, so this line relies on every
# anime_id in `user` also being present in `user_model` (the cast raises otherwise).
user['anime_enc'] = user['anime_id'].map(anime2anime_encoded).astype('uint16')

In [None]:
# Verify the two new encoded columns and their dtypes.
user.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59516292 entries, 0 to 59516291
Data columns (total 5 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   user_id    uint32
 1   anime_id   uint16
 2   rating     uint8 
 3   user_enc   Int32 
 4   anime_enc  uint16
dtypes: Int32(1), uint16(2), uint32(1), uint8(1)
memory usage: 794.6 MB


In [None]:
# Spot-check the encoded columns on the first rows.
user.head()

Unnamed: 0,user_id,anime_id,rating,user_enc,anime_enc
0,0,67,9,0,0
1,0,6702,7,0,1
2,0,242,10,0,2
3,0,24,9,0,3
4,0,4722,8,0,4


In [None]:
# Split

# 90/10 split of the (user_enc, anime_enc) -> rating records. Stratifying on
# user_enc distributes each user's ratings proportionally over both parts.
X_train, X_test, y_train, y_test = train_test_split(
    user_model.drop(columns=['rating', 'user_id', 'anime_id']),
    user_model["rating"],
    test_size=0.1,
    stratify=user_model['user_enc'],
    random_state=SEED,
)

print('> Train set ratings: {}'.format(len(y_train)))
print('> Test set ratings: {}'.format(len(y_test)))

> Train set ratings: 53465488
> Test set ratings: 5940610


In [None]:
# Sanity check: number of distinct users in the full modelling set, in the
# test part and in the train part (in that order).
user_model['user_enc'].nunique(), X_test['user_enc'].nunique(), X_train['user_enc'].nunique()

(285270, 285270, 285270)

In [None]:
# Halve the held-out 10% into validation and final test parts; this second
# split is random (no stratification) and reuses the same seed.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=0.5,
    random_state=SEED,
)

print('> Validation set ratings: {}'.format(len(y_val)))
print('> Test set ratings: {}'.format(len(y_test)))

> Validation set ratings: 2970305
> Test set ratings: 2970305


In [None]:
# Force another garbage-collection pass after the splits (gc is re-imported
# here, which is harmless if it was already imported).
import gc
gc.collect()

180

In [None]:
# File that receives the best model seen during training.
checkpoint_filepath = 'model/checkpoint.model.keras'

# Write the full model (not just the weights) each time the validation loss
# reaches a new minimum.
model_checkpoints = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)

# Stop training once the validation loss has gone 5 epochs without improving.
early_stopping = EarlyStopping(monitor='val_loss', mode='min', patience=5)

my_callbacks = [model_checkpoints, early_stopping]

In [None]:
def RecommenderNet(embedding_size = 128):
    """Build and compile the embedding-based rating prediction model.

    Every user and every anime gets a learned vector of length
    ``embedding_size``. The cosine similarity of the two vectors
    (``Dot(normalize=True)``) goes through a single sigmoid unit, and the
    result is mapped linearly from [0, 1] to the rating scale [1, 10].

    The embedding table sizes are read from the module-level ``n_users`` and
    ``n_animes``, which must be defined before this function is called.

    Args:
        embedding_size: Length of both embedding vectors. Defaults to 128,
            the value that was previously hard-coded.

    Returns:
        A compiled Keras ``Model`` taking two inputs named ``'user'`` and
        ``'anime'`` (each of shape ``(1,)``) and producing one predicted
        rating per row. Loss is MSE; MAE and MSE are tracked as metrics.
    """
    # Locals are suffixed with `_input` so they do not shadow the notebook's
    # global `user` / `anime` DataFrames; the layer names are unchanged.
    user_input = Input(name = 'user', shape = (1,))
    user_embedding = Embedding(name = 'user_embedding',
                       input_dim = n_users,
                       output_dim = embedding_size)(user_input)
    # Embedding output is (batch, 1, embedding_size); drop the length-1 axis.
    user_embedding_reshaped = Reshape(name = 'user_embedding_reshaped',
                                      target_shape = (embedding_size,))(user_embedding)

    anime_input = Input(name = 'anime', shape = (1,))
    anime_embedding = Embedding(name = 'anime_embedding',
                       input_dim = n_animes,
                       output_dim = embedding_size)(anime_input)
    anime_embedding_reshaped = Reshape(name = 'anime_embedding_reshaped',
                                       target_shape = (embedding_size,))(anime_embedding)

    # normalize=True L2-normalises both vectors first, so this is cosine similarity.
    x = Dot(name = 'dot_product', normalize = True, axes = 1)(
        [user_embedding_reshaped, anime_embedding_reshaped])

    x = Dense(1, activation = 'sigmoid')(x)

    # Map the sigmoid output from [0, 1] to [1, 10]: x * 9 + 1. A single
    # Rescaling layer replaces feeding raw tf.constant tensors into
    # Multiply/Add merge layers, and serialises as an ordinary layer.
    x = layers.Rescaling(scale = 9.0, offset = 1.0)(x)

    model = Model(inputs = [user_input, anime_input], outputs = x)
    model.compile(loss = 'mse', metrics = ["mae", "mse"], optimizer = 'Adam')

    return model


In [None]:
# Build the compiled model; n_users and n_animes must already be defined.
model = RecommenderNet()

In [None]:
batch_size = 30000

# Inputs are passed as [user indices, anime indices], in the same order as the
# model's inputs. The validation split supplies the val_loss that the
# checkpoint and early-stopping callbacks monitor.
train_inputs = [X_train['user_enc'], X_train['anime_enc']]
val_inputs = [X_val['user_enc'], X_val['anime_enc']]

history = model.fit(
    x=train_inputs,
    y=y_train,
    validation_data=(val_inputs, y_val),
    batch_size=batch_size,
    epochs=20,
    callbacks=my_callbacks,
    verbose=1,
)

Epoch 1/20
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 38ms/step - loss: 3.7497 - mae: 1.5498 - mse: 3.7497 - val_loss: 1.6115 - val_mae: 0.9732 - val_mse: 1.6115
Epoch 2/20
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 36ms/step - loss: 1.5509 - mae: 0.9512 - mse: 1.5509 - val_loss: 1.4347 - val_mae: 0.9064 - val_mse: 1.4347
Epoch 3/20
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 37ms/step - loss: 1.3639 - mae: 0.8822 - mse: 1.3639 - val_loss: 1.3474 - val_mae: 0.8741 - val_mse: 1.3474
Epoch 4/20
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 37ms/step - loss: 1.2099 - mae: 0.8274 - mse: 1.2099 - val_loss: 1.3056 - val_mae: 0.8590 - val_mse: 1.3056
Epoch 5/20
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 36ms/step - loss: 1.0591 - mae: 0.7728 - mse: 1.0591 - val_loss: 1.3108 - val_mae: 0.8607 - val_mse: 1.3108
Epoch 6/20
[1m1783/1783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [None]:
# Reload the checkpoint file; it is only overwritten when val_loss improves, so
# this is the best epoch's model rather than the last epoch's.
best_model = tf.keras.models.load_model(checkpoint_filepath)

In [None]:
# Predicted ratings for the held-out test pairs, using the reloaded model.
y_pred = best_model.predict([X_test['user_enc'],X_test['anime_enc']], batch_size = 50000)

[1m60/60[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step


In [None]:
# Force a garbage-collection pass (relies on `gc` imported in an earlier cell).
gc.collect()

1246

In [None]:
# Root-mean-squared error on the test split: square root of sklearn's MSE.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)**0.5

1.1429492836661472

In [None]:
# Persist the four id <-> index lookup tables. Each file is opened in a `with`
# block so the handle is flushed and closed as soon as the dump finishes
# (the bare open() calls used before were never closed).
encoded_dict_dir = 'datasets/created_datasets/encoded_dictionary'
encoding_tables = {
    'user2user_encoded': user2user_encoded,
    'user_encoded2user': user_encoded2user,
    'anime2anime_encoded': anime2anime_encoded,
    'anime_encoded2anime': anime_encoded2anime,
}
for table_name, table in encoding_tables.items():
    with open('{}/{}.pkl'.format(encoded_dict_dir, table_name), 'wb') as pkl_file:
        pickle.dump(table, pkl_file)

In [None]:
# Save the filtered tables: `user` (including the encoded columns) and `anime`.
user.to_pickle('datasets/created_datasets/user.pkl')
anime.to_pickle('datasets/created_datasets/anime.pkl')

In [None]:
# Round-trip check: reload the pickle just written and confirm it is equal to
# the in-memory frame.
user_copy = pd.read_pickle('datasets/created_datasets/user.pkl')
user.equals(user_copy)

True

In [None]:
# Persist the test-split target ratings.
# NOTE(review): X_test is not saved in the cells shown here - confirm that is intended.
y_test.to_pickle('data_split/y_test.pkl')