#importlibrary

In [34]:
from google.colab import drive
import zipfile
import pandas as pd
import pickle
from collections import defaultdict
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.dummy import DummyRegressor
import matplotlib.pyplot as plt

Mengimpor library yang diperlukan untuk:

Google Colab: Akses Google Drive
pandas/numpy: Manipulasi data
TensorFlow: Membangun neural network
sklearn: Preprocessing dan evaluasi model

#load dataset

In [35]:
# Mount Google Drive
drive.mount('/content/drive')

# Extract dataset
zip_path = '/content/drive/MyDrive/Content based filtering/archive (4).zip'
extract_path = '/content/content_based_filtering'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)
print("Dataset extracted successfully!")

# Define file paths
csv_path = '/content/content_based_filtering/content_movie_list.csv'
pickle_path = '/content/content_based_filtering/content_user_to_genre.pickle'
header_path = '/content/content_based_filtering/content_user_train_header.txt'

# Load data files
movie_list = pd.read_csv(csv_path)

# Load user header
with open(header_path, 'r') as header:
    for line in header:
        user_header = line.strip().split(',')

# Load user-genre preferences
with open(pickle_path, 'rb') as f:
    user_to_genre = pickle.load(f)

print("Data loaded successfully!")
print(f"Number of movies: {len(movie_list)}")
print(f"Number of users: {len(user_to_genre)}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset extracted successfully!
Data loaded successfully!
Number of movies: 694
Number of users: 395


  user_to_genre = pickle.load(f)


Memuat dataset yang terdiri dari:

movie_list: Daftar film dengan ID, judul, dan genre
user_to_genre: Dictionary berisi preferensi user terhadap genre dan rating yang diberikan

#understanding data

In [36]:
user_to_genre[2]

{'glist': array([[3.9  , 5.   , 0.   , 0.   , 4.   , 4.2  , 4.   , 4.   , 0.   ,
         3.   , 4.   , 0.   , 4.25 , 3.875]]),
 'g_count': array([[5., 1., 0., 0., 5., 5., 2., 9., 0., 1., 1., 0., 2., 8.]]),
 'rating_count': 16,
 'rating_sum': np.float64(65.0),
 'movies': {np.int64(6874): np.float64(4.0),
  np.int64(8798): np.float64(3.5),
  np.int64(46970): np.float64(4.0),
  np.int64(48516): np.float64(4.0),
  np.int64(60756): np.float64(5.0),
  np.int64(71535): np.float64(3.0),
  np.int64(74458): np.float64(4.0),
  np.int64(77455): np.float64(3.0),
  np.int64(80489): np.float64(4.5),
  np.int64(80906): np.float64(5.0),
  np.int64(89774): np.float64(5.0),
  np.int64(91658): np.float64(2.5),
  np.int64(106782): np.float64(5.0),
  np.int64(112552): np.float64(4.0),
  np.int64(115713): np.float64(3.5),
  np.int64(122882): np.float64(5.0)},
 'rating_ave': np.float64(4.0625)}

Menunjukkan struktur data user yang berisi:

glist: Rating rata-rata user untuk setiap genre
g_count: Jumlah film yang dinilai per genre
rating_count: Total jumlah rating
movies: Dictionary film yang dinilai dengan rating-nya
rating_ave: Rating rata-rata user



In [37]:
movie_list

Unnamed: 0,movieId,title,genres
0,4054,Save the Last Dance (2001),Drama|Romance
1,4069,"Wedding Planner, The (2001)",Comedy|Romance
2,4148,Hannibal (2001),Horror|Thriller
3,4149,Saving Silverman (Evil Woman) (2001),Comedy|Romance
4,4153,Down to Earth (2001),Comedy|Fantasy|Romance
...,...,...,...
689,168252,Logan (2017),Action|Sci-Fi
690,176371,Blade Runner 2049 (2017),Sci-Fi
691,177765,Coco (2017),Adventure|Animation|Children
692,179819,Star Wars: The Last Jedi (2017),Action|Adventure|Fantasy|Sci-Fi


In [38]:
movie_list.isna().sum()

Unnamed: 0,0
movieId,0
title,0
genres,0


In [39]:
movie_list.duplicated().sum()

np.int64(0)

In [40]:
movie_list.head()

Unnamed: 0,movieId,title,genres
0,4054,Save the Last Dance (2001),Drama|Romance
1,4069,"Wedding Planner, The (2001)",Comedy|Romance
2,4148,Hannibal (2001),Horror|Thriller
3,4149,Saving Silverman (Evil Woman) (2001),Comedy|Romance
4,4153,Down to Earth (2001),Comedy|Fantasy|Romance


#Finding average ratings for all movies

In [41]:
#creating sum of total rating col and count of total rating col
movie_list['total_rating_sum']=0
movie_list['total_rating_count']=0

In [42]:
#Disable warnings
pd.options.mode.chained_assignment = None  # default='warn'

#Calculating total ratings for each movies by looping through all users who gave ratings
for user in user_to_genre:
    #print(user_to_genre[user])
    #print(user['movies'])
    user_movies=user_to_genre[user]['movies']
    for movie in user_movies:
        #print(movie,user_movies[movie])
        movie_list.loc[:,'total_rating_sum'][movie_list['movieId']==movie]+=user_movies[movie]
        movie_list.loc[:,'total_rating_count'][movie_list['movieId']==movie]+=1


[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  movie_list.loc[:,'total_rating_count'][movie_list['movieId']==movie]+=1
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating 

Menghitung rating rata-rata untuk setiap film dari semua user yang memberikan rating.

In [43]:
#Average rating =total_ratings/total_ratings_count
movie_list.loc[:,'avg rating']=movie_list.loc[:,'total_rating_sum'] / movie_list.loc[:,'total_rating_count']

In [44]:
#Deleting unwanted columns
movie_list.drop(['total_rating_sum','total_rating_count'],axis=1,inplace=True)

In [45]:
movie_list

Unnamed: 0,movieId,title,genres,avg rating
0,4054,Save the Last Dance (2001),Drama|Romance,2.843750
1,4069,"Wedding Planner, The (2001)",Comedy|Romance,2.909091
2,4148,Hannibal (2001),Horror|Thriller,2.935897
3,4149,Saving Silverman (Evil Woman) (2001),Comedy|Romance,2.772727
4,4153,Down to Earth (2001),Comedy|Fantasy|Romance,2.416667
...,...,...,...,...
689,168252,Logan (2017),Action|Sci-Fi,4.280000
690,176371,Blade Runner 2049 (2017),Sci-Fi,3.805556
691,177765,Coco (2017),Adventure|Animation|Children,3.538462
692,179819,Star Wars: The Last Jedi (2017),Action|Adventure|Fantasy|Sci-Fi,3.125000


In [46]:
my_item_vec=pd.DataFrame()
my_item_vec

#spliting genres

In [47]:
# Create an empty list to store the data for the new DataFrame
data_list = []

for row in movie_list.values:
    categories=row[2].split('|')
    #print(categories)
    for category in categories:
        # Append a dictionary representing the row to the list
        data_list.append({'movieId':row[0],'year':row[1][-5:-1],'ave rating':row[3],'title':row[1][:-7],'genres':category})

# Create the DataFrame from the list of dictionaries
my_item_vec = pd.DataFrame(data_list)

my_item_vec['movieId']=my_item_vec['movieId'].astype(int)

In [48]:
my_item_vec

Unnamed: 0,movieId,year,ave rating,title,genres
0,4054,2001,2.843750,Save the Last Dance,Drama
1,4054,2001,2.843750,Save the Last Dance,Romance
2,4069,2001,2.909091,"Wedding Planner, The",Comedy
3,4069,2001,2.909091,"Wedding Planner, The",Romance
4,4148,2001,2.935897,Hannibal,Horror
...,...,...,...,...,...
1878,179819,2017,3.125000,Star Wars: The Last Jedi,Fantasy
1879,179819,2017,3.125000,Star Wars: The Last Jedi,Sci-Fi
1880,187593,2018,3.875000,Deadpool 2,Action
1881,187593,2018,3.875000,Deadpool 2,Comedy


In [49]:
movie_list_seperated_genres=my_item_vec.copy()

#One Hot Encoding generes column

In [50]:
#one hot encoding generes column
from sklearn.preprocessing import OneHotEncoder
# Remove the sparse=False argument if your scikit-learn version is older
ohe_categories=OneHotEncoder(handle_unknown='ignore')
# Convert the sparse matrix output to a dense array before creating the DataFrame
my_item_vec_categories=pd.DataFrame(ohe_categories.fit_transform(my_item_vec['genres'].to_numpy().reshape(-1,1)).toarray()
                         ,columns=[col[3:] for col in ohe_categories.get_feature_names_out()]).astype(int)

Mengubah format data film:

Memisahkan film dengan multiple genre menjadi baris terpisah
Mengubah genre menjadi format one-hot encoding (0/1)

In [51]:
my_item_vec=pd.concat([my_item_vec,my_item_vec_categories],axis=1)
my_item_vec.head()

Unnamed: 0,movieId,year,ave rating,title,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,4054,2001,2.84375,Save the Last Dance,Drama,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,4054,2001,2.84375,Save the Last Dance,Romance,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,4069,2001,2.909091,"Wedding Planner, The",Comedy,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,4069,2001,2.909091,"Wedding Planner, The",Romance,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4148,2001,2.935897,Hannibal,Horror,0,0,0,0,0,0,0,0,0,1,0,0,0,0


In [52]:
#Dropping unwanted columns
my_item_vec.drop(['title','genres'],axis=1,inplace=True)

In [53]:
my_item_vec.head()

Unnamed: 0,movieId,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,4054,2001,2.84375,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,4054,2001,2.84375,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,4069,2001,2.909091,0,0,0,0,1,0,0,0,0,0,0,0,0,0
3,4069,2001,2.909091,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,4148,2001,2.935897,0,0,0,0,0,0,0,0,0,1,0,0,0,0


#Making training set : users and items

In [54]:
users=pd.DataFrame([],columns=user_header)
items=pd.DataFrame([],columns=my_item_vec.columns)
y=pd.DataFrame([]) #actual ratings given by users

In [55]:
#We are creating training data so that there is 1 user that who has given rating to 1 item

for user in user_to_genre:


    #get all movie ids in list
    movie_dict=user_to_genre[user]['movies']

    #select data of all movies with above movie_ids in my_item_vec
    all_movies=my_item_vec[my_item_vec['movieId'].isin(movie_dict)].reset_index(drop=True)

    #select the ratings that user actually gave to the movies
    user_ratings=all_movies['movieId'].apply(lambda all_movies_movie_id:movie_dict[all_movies_movie_id])  #returns ratings given to movie by this user

    #Combining to original training data
    items=pd.concat([items,all_movies],axis=0,ignore_index=True)
    y=pd.concat([y,user_ratings],axis=0,ignore_index=True)
    #glist contains avg rating of user for different categories
    #combining user id, rating count and rating ave with glist
    combined_arr=np.c_[np.array([[user,user_to_genre[user]['rating_count'],user_to_genre[user]['rating_ave']]]),user_to_genre[user]['glist']]

    #No of movies rated
    num_repeat=all_movies.shape[0]

    #Repeat user data equal to no. of movie data the user has rated
    combined_arr_repeated=np.tile(combined_arr,(num_repeat,1))


    #Combining in original training data
    users=pd.concat([users,pd.DataFrame(combined_arr_repeated,columns=user_header)],axis=0,ignore_index=True)

  items=pd.concat([items,all_movies],axis=0,ignore_index=True)
  users=pd.concat([users,pd.DataFrame(combined_arr_repeated,columns=user_header)],axis=0,ignore_index=True)


Membuat dataset training dengan format:

users: Fitur user (preferensi genre, rating count, dll)
items: Fitur film (tahun, rating rata-rata, genre one-hot)
y: Target (rating aktual yang diberikan user)

In [56]:
#Converting year from object to float
items['year']=items['year'].astype(int)

In [57]:
users.head()

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
1,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
2,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
3,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875
4,2.0,16.0,4.0625,3.9,5.0,0.0,0.0,4.0,4.2,4.0,4.0,0.0,3.0,4.0,0.0,4.25,3.875


In [58]:
items.head()

Unnamed: 0,movieId,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874,2003,3.961832,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,6874,2003,3.961832,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2,6874,2003,3.961832,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,8798,2004,3.761364,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,8798,2004,3.761364,0,0,0,0,0,1,0,0,0,0,0,0,0,0


In [59]:
y.head()

Unnamed: 0,0
0,4.0
1,4.0
2,4.0
3,3.5
4,3.5


#Processing training data

In [60]:
#In model we will not use first 3 columns from from users set i.e 'user id','rating count' and 'rating ave'
user_features=[col for col in users.columns if col not in ['user id','rating count','rating ave'] ]
num_user_features=len(user_features)
user_features_start=3   #Model takes values from 3rd column to last column

#In model we will not use first column from items set i.e col 'movieId'
item_features=[col for col in items.columns if col not in ['movieId'] ]
num_item_features=len(item_features)
item_features_start=1   #Model takes values from 1st column to last column

print('User features : ',num_user_features,' : ',user_features)
print('Item features : ',num_item_features,' : ',item_features)

User features :  14  :  ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller']
Item features :  16  :  ['year', 'ave rating', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller']


##Scaling

In [61]:
#Scaling data
from sklearn.preprocessing import StandardScaler

scaledata=True
if scaledata:
    item_train_save = items.copy()
    user_train_save = users.copy()

    scaler_items=StandardScaler()
    items=pd.DataFrame(scaler_items.fit_transform(items),columns=items.columns)

    scaler_users=StandardScaler()
    users=pd.DataFrame(scaler_users.fit_transform(users),columns=users.columns)

StandardScaler: Menormalkan fitur user dan item
MinMaxScaler: Mengubah target rating ke range [-1,1]

In [62]:
#Splitting training and test data
items_train,items_test=train_test_split(items,test_size=0.2,shuffle=True,random_state=1)
users_train,users_test=train_test_split(users,test_size=0.2,shuffle=True,random_state=1)
y_train,y_test=train_test_split(y,test_size=0.2,shuffle=True,random_state=1)

In [63]:
from sklearn.preprocessing import MinMaxScaler

y_scaler=MinMaxScaler((-1,1))
y_train_norm=y_scaler.fit_transform(y_train)
y_test_norm=y_scaler.transform(y_test)

#model

In [64]:
#Making Model
num_outputs=32
tf.random.set_seed(1)
users_NN=tf.keras.models.Sequential([
    tf.keras.layers.Dense(256,activation='relu'),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(num_outputs,activation='linear'),
])

items_NN=tf.keras.models.Sequential([
    tf.keras.layers.Dense(256,activation='relu'),
    tf.keras.layers.Dense(128,activation='relu'),
    tf.keras.layers.Dense(num_outputs,activation='linear'),
])

class L2NormalizeLayer(tf.keras.layers.Layer):
    def call(self, inputs):
        return tf.linalg.l2_normalize(inputs, axis=1)


users_input=tf.keras.layers.Input(shape=tuple([num_user_features]))
vu=users_NN(users_input)
# Apply L2 normalization using the custom layer
vu=L2NormalizeLayer()(vu)

items_input=tf.keras.layers.Input(shape=tuple([num_item_features]))
vm=items_NN(items_input)
# Apply L2 normalization using the custom layer
vm=L2NormalizeLayer()(vm)

# compute the dot product of the two vectors vu and vm
output=tf.keras.layers.Dot(axes=1)([vu,vm])

#Specify input and output of model
model=tf.keras.Model([users_input,items_input],output)

model.summary()

Sekarang, mari kita buat jaringan syaraf seperti yang dijelaskan pada gambar di atas. Ini akan memiliki dua jaringan yang digabungkan dengan dot product. Anda akan membangun dua jaringan. Dalam contoh ini, kedua jaringan tersebut akan identik. Perhatikan bahwa jaringan-jaringan ini tidak harus sama. Jika konten pengguna secara substansial lebih besar daripada konten film, Anda dapat memilih untuk meningkatkan kompleksitas jaringan pengguna relatif terhadap jaringan film. Dalam hal ini, kontennya serupa, sehingga jaringannya pun sama.

Gunakan model sekuensial Keras Lapisan pertama adalah lapisan padat dengan 256 unit dan aktivasi relu. Lapisan kedua adalah lapisan padat dengan 128 unit dan aktivasi relu. Lapisan ketiga adalah lapisan padat dengan unit num_outputs dan aktivasi linier atau tanpa aktivasi.

Translated with DeepL.com (free version)

In [65]:
tf.random.set_seed(1)
cost_fun=tf.keras.losses.MeanSquaredError()
optimizer=tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss=cost_fun,optimizer=optimizer)

In [66]:
model.fit([users_train[user_features],items_train[item_features]],y_train_norm,epochs=30)

Epoch 1/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.1294
Epoch 2/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.1173
Epoch 3/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.1152
Epoch 4/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1136
Epoch 5/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - loss: 0.1122
Epoch 6/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1109
Epoch 7/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1095
Epoch 8/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1082
Epoch 9/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1073
Epoch 10/30
[1m1455/1455[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x7b09e0faae50>

In [67]:
model.evaluate([users_test[user_features],items_test[item_features]],y_test_norm)

[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1060


0.10632720589637756

#Predictions

In [68]:
def predict_user_ratings(model,user_vec,my_item_vec,user_features_start,item_features_start,scaler_items,scaler_users,y_scaler,scaledata):

    #All the movies are in my_item_vec
    #We will populate user_vec so that its no. of rows is equal to no. of rows in my_item_vec
    user_vecs=np.tile(user_vec,(my_item_vec.shape[0],1))

    #Scaling data if scaledata = True
    if scaledata:
        user_vecs=scaler_users.transform(user_vecs)
        item_vecs=scaler_items.transform(my_item_vec)

    #predict rating
    y_p=model.predict([user_vecs[:,user_features_start:],item_vecs[:,item_features_start:]])

    #inverse transform predicted rantings to get actual ratings
    y_p=y_scaler.inverse_transform(y_p)

    return y_p

#Predictions for a new use

In [69]:
new_user_id = 5000
new_rating_ave = 1.0
new_action = 1.0
new_adventure = 1
new_animation = 1
new_childrens = 1
new_comedy = 5
new_crime = 1
new_documentary = 1
new_drama = 1
new_fantasy = 1
new_horror = 1
new_mystery = 1
new_romance = 5
new_scifi = 5
new_thriller = 1
new_rating_count = 3

user_vec = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])

In [70]:
#For new user we will predict rating for all the movies
y_p=predict_user_ratings(model,user_vec,my_item_vec,user_features_start,item_features_start,scaler_items,scaler_users,y_scaler,scaledata=scaledata)

[1m 1/59[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m6s[0m 116ms/step



[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [71]:
movie_list_seperated_genres['y_p']=y_p
movie_list_seperated_genres.sort_values(by='y_p',ascending=False)

Unnamed: 0,movieId,year,ave rating,title,genres,y_p
395,6297,2003,3.425000,Holes,Comedy,4.746114
316,5785,2002,3.500000,Jackass: The Movie,Comedy,4.736767
482,6753,2003,3.500000,Secondhand Lions,Comedy,4.734894
379,6188,2003,3.512821,Old School,Comedy,4.731544
678,8910,2004,3.452381,I Heart Huckabees,Comedy,4.727626
...,...,...,...,...,...,...
364,6016,2002,4.146667,City of God (Cidade de Deus),Adventure,3.155874
363,6016,2002,4.146667,City of God (Cidade de Deus),Action,3.143801
423,6502,2002,3.974138,28 Days Later,Action,3.139224
544,7153,2003,4.118919,"Lord of the Rings: The Return of the King, The",Fantasy,3.132066


#Prediction for existing user

In [72]:
#get ratings for given user id
user_id=36

old_user_vec=scaler_users.inverse_transform(users_train)[scaler_users.inverse_transform(users_train)[:,0]==user_id][0]
print(old_user_vec)

# predict rating for all the movies
y_p=predict_user_ratings(model,old_user_vec,my_item_vec,user_features_start,item_features_start,scaler_items,scaler_users,y_scaler,scaledata=scaledata)

[36.     4.     1.875  3.     3.     0.     0.     0.     1.75   0.
  1.5    0.     0.     0.     1.     3.     0.   ]
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step




In [73]:
movie_list_seperated_genres['y_p']=y_p
movie_list_seperated_genres['y_actual']=0

for i in user_to_genre[user_id]['movies']:
    movie_list_seperated_genres['y_actual'][movie_list_seperated_genres['movieId']==i]=user_to_genre[user_id]['movies'][i]

movie_list_seperated_genres[movie_list_seperated_genres['y_actual']!=0].sort_values(by='y_p',ascending=False)

#y_actual is the actual rating of user and y_p is predicted rating

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  movie_list_seperated_genres['y_actual'][movie_list_seperated_genres['movieId']==i]=user_to_genre[user_id]['movies'][i]
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default be

Unnamed: 0,movieId,year,ave rating,title,genres,y_p,y_actual
202,5171,2002,2.863636,"Time Machine, The",Adventure,2.954052,3.0
201,5171,2002,2.863636,"Time Machine, The",Action,2.836692,3.0
203,5171,2002,2.863636,"Time Machine, The",Sci-Fi,2.791664,3.0
166,4995,2001,4.0,"Beautiful Mind, A",Drama,1.958833,1.0
167,4995,2001,4.0,"Beautiful Mind, A",Romance,1.772538,1.0
279,5464,2002,3.520408,Road to Perdition,Crime,1.73932,1.5
351,5956,2002,3.518182,Gangs of New York,Crime,1.737621,2.0
280,5464,2002,3.520408,Road to Perdition,Drama,1.486549,1.5
352,5956,2002,3.518182,Gangs of New York,Drama,1.481373,2.0


In [74]:
movie_list_seperated_genres.drop(['y_p','y_actual'],axis=1,inplace=True)

Jaringan syaraf di atas menghasilkan dua vektor fitur, vektor fitur pengguna Vu, dan vektor fitur film Vm. Keduanya merupakan 32 vektor entri yang nilainya sulit ditafsirkan. Namun, item yang serupa akan memiliki vektor yang serupa. Informasi ini dapat digunakan untuk membuat rekomendasi. Sebagai contoh, jika seorang pengguna memberi nilai tinggi pada “Toy Story 3”, seseorang dapat merekomendasikan film yang serupa dengan memilih film yang memiliki vektor fitur film yang serupa.
Ukuran kemiripan adalah jarak kuadrat antara dua vektor

Translated with DeepL.com (free version)

#Getting feature vector for movies : Vm

Matriks jarak antar film dapat dihitung sekali saat model dilatih dan kemudian digunakan kembali untuk rekomendasi baru tanpa pelatihan ulang. Langkah pertama, setelah model dilatih, adalah mendapatkan vektor fitur film Vm, untuk masing-masing film. Untuk melakukan ini, kita akan menggunakan item_NN yang telah dilatih dan membangun model kecil untuk memungkinkan kita menjalankan vektor film melaluinya untuk menghasilkan Vm.


Translated with DeepL.com (free version)

In [75]:
input_item_m = tf.keras.layers.Input(shape=(num_item_features,))
vm_m = items_NN(input_item_m)
# Apply L2 normalization using the custom layer
vm_m = L2NormalizeLayer()(vm_m)

model_m = tf.keras.Model(input_item_m, vm_m)
model_m.summary()

In [76]:
scaled_my_item_vec=scaler_items.transform(my_item_vec)
vms=model_m.predict(scaled_my_item_vec[:,item_features_start:])
vms.shape

[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


(1883, 32)

#Recommending movies with minimum squared distance

In [77]:
dist=np.zeros((vms.shape[0],vms.shape[0]))

#Calculating squared distance from each movie to every other movie
for i in range(len(vms)):
    dist[i]=np.sum(np.square(vms[i]-vms),axis=1).reshape((1,-1))

dist

array([[0.        , 0.2229608 , 0.20450446, ..., 1.99657726, 1.91267622,
        1.9462564 ],
       [0.2229608 , 0.        , 0.07355691, ..., 2.1055007 , 1.76001656,
        2.04809237],
       [0.20450446, 0.07355691, 0.        , ..., 1.8729074 , 1.7187345 ,
        1.83981609],
       ...,
       [1.99657726, 2.1055007 , 1.8729074 , ..., 0.        , 0.25630122,
        0.00918694],
       [1.91267622, 1.76001656, 1.7187345 , ..., 0.25630122, 0.        ,
        0.20982462],
       [1.9462564 , 2.04809237, 1.83981609, ..., 0.00918694, 0.20982462,
        0.        ]])

In [78]:
#masking so that we dont consider distance from a movie to itself
masked_dist=np.ma.masked_array(dist,mask=np.identity(dist.shape[0]))
masked_dist

masked_array(
  data=[[--, 0.22296079993247986, 0.20450446009635925, ...,
         1.996577262878418, 1.912676215171814, 1.946256399154663],
        [0.22296079993247986, --, 0.07355690747499466, ...,
         2.1055006980895996, 1.7600165605545044, 2.0480923652648926],
        [0.20450446009635925, 0.07355690747499466, --, ...,
         1.8729074001312256, 1.7187345027923584, 1.8398160934448242],
        ...,
        [1.996577262878418, 2.1055006980895996, 1.8729074001312256, ...,
         --, 0.25630122423171997, 0.009186942130327225],
        [1.912676215171814, 1.7600165605545044, 1.7187345027923584, ...,
         0.25630122423171997, --, 0.20982462167739868],
        [1.946256399154663, 2.0480923652648926, 1.8398160934448242, ...,
         0.009186942130327225, 0.20982462167739868, --]],
  mask=[[ True, False, False, ..., False, False, False],
        [False,  True, False, ..., False, False, False],
        [False, False,  True, ..., False, False, False],
        ...,
        [Fal

In [79]:
#For each row, getting index of movie with min distance
min_dist_movie_index=np.argmin(masked_dist,axis=1)
min_dist_movie_index

array([ 192,    7,   20, ..., 1754, 1756, 1757])

In [80]:
recommend_df=pd.DataFrame()
recommend_df['movie 1']=movie_list_seperated_genres['title'] +' ( '+movie_list_seperated_genres['year']+' ) '
recommend_df['movie 1 genres']=movie_list_seperated_genres['genres']

recommend_df['movie 2']=( movie_list_seperated_genres.loc[min_dist_movie_index,'title'] +' ( '+movie_list_seperated_genres.loc[min_dist_movie_index,'year']+' ) ' ).values
recommend_df['movie 2 genres']=movie_list_seperated_genres.loc[min_dist_movie_index,'genres'].values

In [81]:
recommend_df.head(10)

Unnamed: 0,movie 1,movie 1 genres,movie 2,movie 2 genres
0,Save the Last Dance ( 2001 ),Drama,John Q ( 2002 ),Drama
1,Save the Last Dance ( 2001 ),Romance,Saving Silverman (Evil Woman) ( 2001 ),Romance
2,"Wedding Planner, The ( 2001 )",Comedy,Spy Kids ( 2001 ),Comedy
3,"Wedding Planner, The ( 2001 )",Romance,Save the Last Dance ( 2001 ),Romance
4,Hannibal ( 2001 ),Horror,Final Destination 2 ( 2003 ),Horror
5,Hannibal ( 2001 ),Thriller,Panic Room ( 2002 ),Thriller
6,Saving Silverman (Evil Woman) ( 2001 ),Comedy,Cats & Dogs ( 2001 ),Comedy
7,Saving Silverman (Evil Woman) ( 2001 ),Romance,Save the Last Dance ( 2001 ),Romance
8,Down to Earth ( 2001 ),Comedy,Joe Dirt ( 2001 ),Comedy
9,Down to Earth ( 2001 ),Fantasy,"Haunted Mansion, The ( 2003 )",Fantasy
