### Setup

In [None]:
from IPython.display import clear_output

# Fetch the NLTK resources used later in the notebook: the "punkt" tokenizer
# data (for word_tokenize) and the stop-word lists.
import nltk
nltk.download("punkt")
nltk.download("stopwords")

# Load the pre-trained 'glove-wiki-gigaword-100' vectors via gensim's downloader;
# `glove` is used below as a token -> vector lookup.
import gensim.downloader
glove = gensim.downloader.load('glove-wiki-gigaword-100')

# Wipe the download/progress output of this cell.
# NOTE(review): this also hides any download error messages printed above.
clear_output()

### Import Libraries

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

### Load Dataset

In [None]:
# Read the rating interactions; index_col=0 uses the first CSV column as the row index.
df_interaction = pd.read_csv("interaction.csv", index_col=0)
df_interaction

Unnamed: 0,user,item,rating,timestamp,split
0,1,1,5,874965758,train
1,1,2,3,876893171,train
2,1,3,4,878542960,train
3,1,4,3,876893119,train
4,1,5,3,889751712,train
...,...,...,...,...,...
19995,458,648,4,886395899,test
19996,458,1101,4,886397931,test
19997,459,934,3,879563639,test
19998,460,10,3,882912371,test


### Data Splitting

In [None]:
# Partition the interactions using the pre-assigned label in the `split` column.
train = df_interaction.loc[df_interaction["split"].eq("train")]
test = df_interaction.loc[df_interaction["split"].eq("test")]

In [None]:
train

Unnamed: 0,user,item,rating,timestamp,split
0,1,1,5,874965758,train
1,1,2,3,876893171,train
2,1,3,4,878542960,train
3,1,4,3,876893119,train
4,1,5,3,889751712,train
...,...,...,...,...,...
79995,943,1067,2,875501756,train
79996,943,1074,4,888640250,train
79997,943,1188,3,888640250,train
79998,943,1228,3,888640275,train


In [None]:
test

Unnamed: 0,user,item,rating,timestamp,split
1,1,10,3,875693118,test
2,1,12,5,878542960,test
4,1,17,3,875073198,test
5,1,20,4,887431883,test
6,1,23,4,875072895,test
...,...,...,...,...,...
19995,458,648,4,886395899,test
19996,458,1101,4,886397931,test
19997,459,934,3,879563639,test
19998,460,10,3,882912371,test


## Simple EDA

### Ensure there are no missing ratings or duplicates

In [None]:
train[train.duplicated(subset=["user","item"])]

Unnamed: 0,user,item,rating,timestamp,split


In [None]:
test[test.duplicated(subset=["user","item"])]

Unnamed: 0,user,item,rating,timestamp,split


In [None]:
train[train.rating.isna()]

Unnamed: 0,user,item,rating,timestamp,split


In [None]:
test[test.rating.isna()]

Unnamed: 0,user,item,rating,timestamp,split


### Check if all users in testing appear in training

In [None]:
# Distinct user ids present in the training split.
user_train = train.user.unique()

In [None]:
all(test.user.isin(user_train))

True

In [None]:
print("Number of interactions in train:",train.shape[0])
print("Number of interactions in test:",test.shape[0])

Number of interactions in train: 75443
Number of interactions in test: 18850


In [None]:
# Read the movie metadata (item id, title, overview); the first CSV column is the row index.
df_metadata = pd.read_csv("metadata.csv", index_col=0)
df_metadata

Unnamed: 0,item_id,original_title,overview
0,1,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,755,Jumanji,When siblings Judy and Peter discover an encha...
2,1028,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,1311,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,756,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
7915,1460,Sleepover,As their first year of high school looms ahead...
8012,1674,Mamma Roma,After many years working in the streets of Rom...
8702,217,Bram Stoker's Dracula,Dracula is searching for a woman who looks lik...
9783,1329,The Low Life,John came to Hollywood to get that one big bre...


In [None]:
# Keep only movies that have both a title and an overview. The explicit
# .copy() makes the result an independent frame, so the column assignments in
# the following cells do not raise SettingWithCopyWarning.
df_metadata = df_metadata.dropna(subset=["original_title", "overview"], how="any").copy()
df_metadata

Unnamed: 0,item_id,original_title,overview
0,1,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,755,Jumanji,When siblings Judy and Peter discover an encha...
2,1028,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,1311,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,756,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
7915,1460,Sleepover,As their first year of high school looms ahead...
8012,1674,Mamma Roma,After many years working in the streets of Rom...
8702,217,Bram Stoker's Dracula,Dracula is searching for a woman who looks lik...
9783,1329,The Low Life,John came to Hollywood to get that one big bre...


### Text Cleaning

In [None]:
# Build one text field per movie: title and overview joined by a single space.
df_metadata["combined"] = df_metadata.original_title + " " + df_metadata.overview
df_metadata.combined

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["combined"] = df_metadata.original_title + " " + df_metadata.overview


Unnamed: 0,combined
0,"Toy Story Led by Woody, Andy's toys live happi..."
1,Jumanji When siblings Judy and Peter discover ...
2,Grumpier Old Men A family wedding reignites th...
3,"Waiting to Exhale Cheated on, mistreated and s..."
4,Father of the Bride Part II Just when George B...
...,...
7915,Sleepover As their first year of high school l...
8012,Mamma Roma After many years working in the str...
8702,Bram Stoker's Dracula Dracula is searching for...
9783,The Low Life John came to Hollywood to get tha...


In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Lower-case the combined text.
df_metadata["combined"] = df_metadata["combined"].str.lower()
df_metadata.combined

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["combined"] = df_metadata["combined"].str.lower()


Unnamed: 0,combined
0,"toy story led by woody, andy's toys live happi..."
1,jumanji when siblings judy and peter discover ...
2,grumpier old men a family wedding reignites th...
3,"waiting to exhale cheated on, mistreated and s..."
4,father of the bride part ii just when george b...
...,...
7915,sleepover as their first year of high school l...
8012,mamma roma after many years working in the str...
8702,bram stoker's dracula dracula is searching for...
9783,the low life john came to hollywood to get tha...


In [None]:
# Split each combined text into a list of tokens.
df_metadata["tokenized"] = df_metadata["combined"].apply(word_tokenize)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["tokenized"] = df_metadata.combined.apply(lambda x: word_tokenize(x))


In [None]:
df_metadata["tokenized"]

Unnamed: 0,tokenized
0,"[toy, story, led, by, woody, ,, andy, 's, toys..."
1,"[jumanji, when, siblings, judy, and, peter, di..."
2,"[grumpier, old, men, a, family, wedding, reign..."
3,"[waiting, to, exhale, cheated, on, ,, mistreat..."
4,"[father, of, the, bride, part, ii, just, when,..."
...,...
7915,"[sleepover, as, their, first, year, of, high, ..."
8012,"[mamma, roma, after, many, years, working, in,..."
8702,"[bram, stoker, 's, dracula, dracula, is, searc..."
9783,"[the, low, life, john, came, to, hollywood, to..."


In [None]:
# Drop tokens that are not purely alphabetic (punctuation, numbers, "'s", ...)
# and English stop words.
# The stop-word list is materialised once as a set: calling
# stopwords.words("english") for every token builds a fresh list each time and
# then scans it linearly, which dominates the runtime of this cell.
english_stopwords = set(stopwords.words("english"))
df_metadata["clean_tokenized"] = df_metadata["tokenized"].apply(
    lambda tokens: [word for word in tokens if word.isalpha() and word not in english_stopwords]
)
# Rebind rather than drop in place, so no frame is mutated behind the reader's back.
df_metadata = df_metadata.drop(columns=["combined", "tokenized"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["clean_tokenized"] = df_metadata["tokenized"].apply(lambda tokens: [word for word in tokens if word.isalpha() and word not in stopwords.words("english")])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata.drop(columns=["combined","tokenized"], inplace=True)


### Create embeddings of original_title + overview

In [None]:
glove["movie"]

array([ 0.38251  ,  0.14821  ,  0.60601  , -0.51533  ,  0.43992  ,
        0.061053 , -0.62716  , -0.025385 ,  0.1643   , -0.22101  ,
        0.14423  , -0.37213  , -0.21683  , -0.08895  ,  0.097904 ,
        0.6561   ,  0.64455  ,  0.47698  ,  0.83849  ,  1.6486   ,
        0.88922  , -0.1181   , -0.012465 , -0.52082  ,  0.77854  ,
        0.48723  , -0.014991 , -0.14127  , -0.34747  , -0.29595  ,
        0.1028   ,  0.57191  , -0.045594 ,  0.026443 ,  0.53816  ,
        0.32257  ,  0.40788  , -0.043599 , -0.146    , -0.48346  ,
        0.32036  ,  0.55086  , -0.76259  ,  0.43269  ,  0.61753  ,
       -0.36503  , -0.60599  , -0.79615  ,  0.3929   , -0.23668  ,
       -0.34719  , -0.61201  ,  0.54747  ,  0.94812  ,  0.20941  ,
       -2.7771   , -0.6022   ,  0.8495   ,  1.2549   ,  0.017893 ,
       -0.041901 ,  2.1147   , -0.026618 , -0.28104  ,  0.68124  ,
       -0.14165  ,  0.99249  ,  0.49879  , -0.67538  ,  0.6417   ,
        0.42303  , -0.27913  ,  0.063403 ,  0.68909  , -0.3618

In [None]:
import numpy as np

def get_embedding(list_of_tokens, model=None):
  """Sum the word vectors of the tokens that the embedding model knows.

  Parameters
  ----------
  list_of_tokens : iterable of str
      Tokens to look up; tokens missing from the model are skipped.
  model : mapping-like, optional
      Anything supporting ``token in model`` and ``model[token]`` (for example
      gensim ``KeyedVectors`` or a dict of arrays). Defaults to the
      notebook-level ``glove`` vectors.

  Returns
  -------
  numpy.ndarray
      1-D float array holding the element-wise sum of the found vectors; all
      zeros when no token is found. Its length is ``model.vector_size`` when
      the model exposes that attribute, otherwise 100.
  """
  vectors = glove if model is None else model
  # Take the dimensionality from the model instead of hard-coding it, so a
  # different pre-trained model can be swapped in; 100 remains the fallback.
  dim = getattr(vectors, "vector_size", 100)
  embeddings = np.zeros(dim)
  for token in list_of_tokens:
    if token in vectors:
      embeddings += vectors[token]
  return embeddings

In [None]:
# One vector per movie, computed from its cleaned token list.
df_metadata["embedding"] = df_metadata["clean_tokenized"].apply(get_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["embedding"] = df_metadata["clean_tokenized"].apply(lambda x: get_embedding(x))


In [None]:
df_metadata["embedding"]

Unnamed: 0,embedding
0,"[-0.5064330138266087, 7.1165720876306295, 7.78..."
1,"[3.438274426443968, 8.522259819321334, 14.8187..."
2,"[0.8174464097246528, 7.149652984924614, 5.7175..."
3,"[2.8892899472266436, 4.592724758898839, 6.4860..."
4,"[1.576476872432977, 1.4149789679795504, 4.0744..."
...,...
7915,"[3.366053223639028, 4.492173184757121, 7.62047..."
8012,"[12.92227082606405, 12.395751975476742, 6.9112..."
8702,"[0.03374400734901428, 1.124519944190979, 3.725..."
9783,"[5.472918973304331, 5.457290678285062, 4.57585..."


### Testing

In [None]:
# Example user whose training items are collected here.
u = 1
items_of_user_1 = train.loc[train["user"] == u, "item"].tolist()

In [None]:
# Select the embeddings of the movies user 1 interacted with in train
# (only items that are present in df_metadata are returned).
embedding_of_movies_of_user = df_metadata.loc[df_metadata.item_id.isin(items_of_user_1),"embedding"]
embedding_of_movies_of_user

Unnamed: 0,embedding
0,"[-0.5064330138266087, 7.1165720876306295, 7.78..."
9,"[0.7079479657113552, -0.09113010764122009, 6.2..."
17,"[-0.9298799242824316, 1.9354404262267053, 5.74..."
20,"[6.759992748615332, 5.755311886779964, 15.7781..."
21,"[2.526728004217148, 2.1927146264351904, 1.5414..."
...,...
1563,"[0.9915840364992619, 4.225929945707321, 3.2042..."
1575,"[0.3967610336840153, 4.5755121456459165, 5.046..."
1598,"[-4.4563269801437855, 2.2408585612429306, 7.31..."
4930,"[-10.331485925707966, 4.493019976653159, 2.473..."


In [None]:
# User profile = element-wise sum of the selected movie embeddings
# (np.sum over an object array of equal-length vectors adds them together).
profile_user = np.sum(embedding_of_movies_of_user.values)
profile_user

array([ 2.47446222e+02,  4.31681953e+02,  9.15254913e+02, -6.87081894e+02,
       -7.83365201e-01,  8.83970719e+02, -2.66907208e+02,  1.03595814e+02,
       -2.57779658e+02, -2.15907103e+02, -1.32121558e+01,  1.02354088e+02,
        5.46646477e+02,  4.35317215e+02,  1.02414806e+02, -4.09786322e+02,
        7.97227825e+02,  1.92513362e+02, -9.78447504e+02,  1.27786327e+03,
        7.21928597e+02, -2.87539383e+02,  3.48229297e+02, -2.79813706e+02,
        8.57987192e+02,  2.33702620e+02, -6.98267350e+02, -1.30521045e+03,
        2.70779072e+02,  6.90000656e+01, -2.15209647e+02,  1.01696248e+03,
        1.09404271e+01, -1.44501538e+02,  3.29211721e+01,  2.62347423e+02,
       -2.42820433e+02,  1.83583035e+02,  4.07544913e+02, -3.71707072e+02,
       -1.02784087e+03, -1.14051110e+02,  3.95416709e+02, -6.29247000e+02,
        7.54639425e+01,  3.67280608e+01, -1.72709387e+02, -2.70954261e+02,
        5.08510133e+02, -1.19781072e+03, -1.11079637e+02, -4.66182935e+02,
        5.88765983e+02,  

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Score every movie against the user profile in a single call: the per-movie
# embeddings are stacked into an (n_movies, dim) matrix instead of invoking
# cosine_similarity once per row. reshape(1, -1) avoids hard-coding the
# embedding size.
embedding_matrix = np.vstack(df_metadata["embedding"].tolist())
df_metadata["cosine"] = cosine_similarity(profile_user.reshape(1, -1), embedding_matrix)[0]
df_metadata[["original_title","cosine"]].sort_values("cosine", ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["cosine"] = df_metadata.embedding.apply(lambda x: cosine_similarity(profile_user.reshape(1,100), x.reshape(1,100))).apply(lambda x: x[0][0])


Unnamed: 0,original_title,cosine
355,I Love Trouble,0.973996
1070,American Strays,0.972924
1083,Le huitième jour,0.972009
330,The Underneath,0.971454
905,An Affair to Remember,0.969749
...,...,...
1476,Schizopolis,0.579656
789,Kazaam,0.576408
800,Kaspar Hauser,0.378171
104,Keiner liebt mich,0.293170


In [None]:
# Item ids of the 50 movies with the highest cosine score.
top50item = (
    df_metadata[["item_id", "original_title", "cosine"]]
    .sort_values("cosine", ascending=False)
    .head(50)["item_id"]
    .to_numpy()
)
top50item

array([1180, 1362, 1640, 1553,  966,  397, 1337,  535,  662,  932, 1405,
        650, 1417, 1250, 1432, 1160,  460, 1638,  663, 1612, 1146,  493,
        979, 1033, 1375, 1055, 1422, 1603, 1326, 1611,  106,  123, 1467,
         38, 1679,  723,  521,  710, 1304,  486,  770,  625,  888,  150,
       1433, 1390, 1348,   37, 1181,  785])

In [None]:
# Items of user 1 in the test split, used as the relevant set for the overlap check.
relevant_items_for_user1 = test.loc[test["user"] == 1, "item"].to_numpy()
relevant_items_for_user1

array([ 10,  12,  17,  20,  23,  24,  27,  31,  33,  36,  39,  44,  47,
        49,  51,  53,  54,  56,  60,  61,  62,  64,  65,  67,  69,  70,
        72,  73,  74,  76,  78,  80,  81,  82,  85,  86,  90,  91,  92,
        96,  97,  98, 100, 102, 103, 104, 107, 108, 112, 113, 117, 118,
       120, 125, 129, 130, 132, 134, 140, 143, 145, 148, 150, 155, 157,
       159, 160, 161, 163, 164, 170, 171, 174, 175, 180, 183, 184, 185,
       186, 188, 189, 190, 193, 196, 200, 201, 202, 206, 208, 209, 210,
       212, 213, 215, 218, 219, 221, 222, 224, 225, 226, 227, 229, 230,
       232, 233, 235, 236, 241, 242, 248, 250, 252, 253, 254, 255, 258,
       259, 260, 262, 264, 265, 266, 272])

In [None]:
set(top50item).intersection(set(relevant_items_for_user1))

{150}

In [None]:
def recommend(user_id, train, test, df_metadata, top_n=50, exclude_seen=False):
    """Rank catalogue items by cosine similarity to a user's content profile.

    The profile is the element-wise sum of the embeddings of every item the
    user has in ``train``. Each row of ``df_metadata`` is then scored with the
    cosine similarity between its embedding and that profile.

    Parameters
    ----------
    user_id
        Value matched against ``train["user"]``.
    train : pandas.DataFrame
        Must contain ``user`` and ``item`` columns.
    test
        Unused; kept so existing call sites keep working.
    df_metadata : pandas.DataFrame
        Must contain ``item_id``, ``original_title`` and ``embedding`` columns,
        where every embedding is a 1-D numeric array of the same length.
        The frame is not modified.
    top_n : int, default 50
        Maximum number of rows to return.
    exclude_seen : bool, default False
        When True, items the user already has in ``train`` are removed from
        the ranking before the top ``top_n`` are taken.

    Returns
    -------
    pandas.DataFrame
        Columns ``item_id``, ``original_title`` and ``cosine``, sorted by
        ``cosine`` descending. Empty when none of the user's training items
        appears in ``df_metadata`` (no profile can be built).
    """
    # Items the user interacted with in the training split.
    items_of_user = train.loc[train["user"] == user_id, "item"].tolist()
    seen_mask = df_metadata["item_id"].isin(items_of_user).to_numpy()

    scored = df_metadata[["item_id", "original_title"]]
    if not seen_mask.any():
        # Unknown user or no overlap with the metadata: summing zero embeddings
        # would not yield a vector, so return an empty ranking instead.
        return scored.iloc[:0].assign(cosine=np.array([], dtype=float))

    # Build the user profile from the embeddings of the items they have seen.
    item_matrix = np.vstack(df_metadata["embedding"].tolist()).astype(float)
    profile_user = item_matrix[seen_mask].sum(axis=0)

    # Cosine similarity between the profile and every item, in one pass.
    # Rows with a zero-norm embedding get a similarity of 0.
    dots = item_matrix @ profile_user
    norms = np.linalg.norm(item_matrix, axis=1) * np.linalg.norm(profile_user)
    cosine = np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    # Attach the scores to a new frame so the caller's metadata stays untouched.
    scored = scored.assign(cosine=cosine)
    if exclude_seen:
        scored = scored[~seen_mask]

    # Keep the top N recommendations.
    return scored.sort_values("cosine", ascending=False).head(top_n)

In [None]:
# Recommendations for user 2 (top_n is left at its default).
result = recommend(user_id=2, train=train, test=test, df_metadata=df_metadata)

result

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata["cosine"] = df_metadata.embedding.apply(lambda x: cosine_similarity(profile_user.reshape(1,100), x.reshape(1,100))[0][0])


Unnamed: 0,item_id,original_title,cosine
330,1553,The Underneath,0.978068
1083,1640,Le huitième jour,0.975988
905,966,An Affair to Remember,0.974409
355,1180,I Love Trouble,0.972708
539,397,Striking Distance,0.972524
1471,535,Addicted to Love,0.97212
1382,1375,The Cement Garden,0.97047
923,493,The Thin Man,0.970374
1070,1362,American Strays,0.970104
1023,1638,Normal Life,0.970036
