In [19]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [20]:
# Load the user/movie ratings table. The path is relative to the notebook's
# working directory (the project root), the same convention the merged-titles
# CSV load further down uses, so the notebook is not tied to one machine's
# C: drive layout.
ratings = pd.read_csv('Datasets/ratings_small.csv')

In [21]:
# Show column dtypes and non-null counts to check for missing values.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100004 non-null  int64  
 1   movieId    100004 non-null  int64  
 2   rating     100004 non-null  float64
 3   timestamp  100004 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [22]:
# Discard every row that contains a missing value.
ratings = ratings.dropna()

In [23]:
# Count the missing values remaining in each column.
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [24]:
# Preview the cleaned ratings table.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205
...,...,...,...,...
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663


In [25]:
from sklearn.model_selection import train_test_split

In [26]:
# Hold out 20% of the ratings for validation. random_state pins the shuffle so
# the split (and every metric derived from it) is the same after a kernel
# restart and full re-run.
train_X, test_X = train_test_split(ratings, test_size=0.2, random_state=42)

In [27]:
# Preview the training split.
train_X

Unnamed: 0,userId,movieId,rating,timestamp
66504,468,5943,2.0,1296203067
47453,348,73,4.0,960838695
94105,624,3671,5.0,1019127174
94732,624,77191,2.5,1309605307
82440,562,282,3.0,1167429320
...,...,...,...,...
93295,620,97921,3.0,1455532678
80381,547,58879,4.5,1230869313
98059,654,37729,4.0,1145391509
84269,564,2919,5.0,974712184


In [28]:
# Preview the held-out split.
test_X

Unnamed: 0,userId,movieId,rating,timestamp
9496,63,3897,4.0,1078570666
70493,489,2987,3.0,944961223
62507,452,5364,2.5,1151811461
12915,83,151,0.5,1156205069
27001,198,5995,2.0,1068823183
...,...,...,...,...
84852,569,1193,3.0,965144227
94505,624,8879,3.5,1099048295
65288,463,5418,2.0,1049922505
36413,262,8783,2.5,1433944698


In [29]:
# Largest user ID and movie ID present in the ratings table.
max_userid = ratings['userId'].max()
max_movieid = ratings['movieId'].max()

print(max_userid, max_movieid, sep='\n')

671
163949


In [30]:
# Output size of each Embedding layer (length of one user / movie vector).
embedding_dimension = 32

In [31]:
# Movies network: one embedding vector per movie ID.
# Raw IDs are used directly as row indices, so the table needs
# max_movieid + 1 rows to make the largest ID a valid index.
input_movies = tf.keras.layers.Input(shape=(1,))
embed_movies = tf.keras.layers.Embedding(max_movieid + 1, embedding_dimension)(input_movies)
out_movies = tf.keras.layers.Flatten()(embed_movies)

# Users network: same structure, indexed by raw user ID.
input_users = tf.keras.layers.Input(shape=(1,))
embed_users = tf.keras.layers.Embedding(max_userid + 1, embedding_dimension)(input_users)
out_users = tf.keras.layers.Flatten()(embed_users)

# Joint head: concatenate both embeddings and regress the rating.
conc_layer = tf.keras.layers.Concatenate()([out_movies, out_users])
x = tf.keras.layers.Dense(128, activation='relu')(conc_layer)
# Linear output for regression. A ReLU here can "die": if the single unit's
# pre-activation goes negative for all inputs, its gradient is zero and the
# model stays stuck predicting 0 for every rating.
x_out = tf.keras.layers.Dense(1, activation='linear')(x)

model = tf.keras.Model([input_movies, input_users], x_out)
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='mean_squared_error')

In [32]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 32)        5246400     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 32)        21504       input_4[0][0]                    
____________________________________________________________________________________________

In [33]:
# Fit on the training split for 15 epochs, evaluating the held-out split
# after every epoch; the returned History object is kept for plotting.
hist = model.fit(
    x=[train_X['movieId'], train_X['userId']],
    y=train_X['rating'],
    batch_size=64,
    epochs=15,
    verbose=1,
    validation_data=([test_X['movieId'], test_X['userId']], test_X['rating']),
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15

KeyboardInterrupt: 

In [None]:
# Plot the per-epoch training and validation loss recorded by model.fit.
train_loss = hist.history['loss']
val_loss = hist.history['val_loss']

fig, ax = plt.subplots()
ax.plot(train_loss, color='r', label='Train Loss')
ax.plot(val_loss, color='b', label='Validation Loss')
ax.set_title("Train and Validation Loss Curve")
# Axis labels so the figure can be read on its own (loss is the compiled MSE).
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss (mean squared error)")
ax.legend()
plt.show()

In [None]:
# Extract the trained movie-embedding matrix.
# Keras auto-names layers with a per-session counter ("embedding",
# "embedding_2", ...), so looking the layer up by a fixed auto-generated name
# breaks whenever the model cell has been run more than once. Select it by
# type and vocabulary size instead: the movie table is the Embedding built
# with max_movieid + 1 rows.
movie_embedding_layers = [
    layer
    for layer in model.layers
    if isinstance(layer, tf.keras.layers.Embedding)
    and layer.input_dim == max_movieid + 1
]
if not movie_embedding_layers:
    raise ValueError(
        "No Embedding layer with input_dim == max_movieid + 1 found in model"
    )

embedd = movie_embedding_layers[0]
# get_weights()[0] is the lookup table: one row per movie ID.
embedd_weights = embedd.get_weights()[0]
embedd_weights.shape

In [None]:
# Load the merged ratings/titles table; later cells read its `id` and
# `title` columns.
merged_movies = pd.read_csv('Datasets//Rating_title_merged.csv')

In [None]:
# Preview the merged table.
merged_movies

In [None]:
# Export movie embeddings and their titles as two line-aligned TSV files:
# one embedding vector per line in vecs.tsv, the matching title on the same
# line of meta.tsv.
# NOTE(review): assumes merged_movies.id uses the same ID space as the movieId
# values the embedding was trained on — confirm against how the CSV was built.
Movie_id = list(merged_movies.id.drop_duplicates())

# Map each movie id to its title by id value (first occurrence wins). A row's
# position in merged_movies is unrelated to its id, so a positional lookup
# would pair ids with the wrong titles.
dict_map = (
    merged_movies.drop_duplicates(subset='id')
    .set_index('id')['title']
    .to_dict()
)

n_embeddings = embedd_weights.shape[0]
skipped = 0

# The context manager closes both files even if a write fails; explicit UTF-8
# keeps non-ASCII titles from failing under a narrower platform default.
with open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for movie_id in Movie_id:
        title = dict_map.get(movie_id)
        # Skip ids with no usable title (missing / NaN) or no embedding row
        # (non-integer or out-of-range id) instead of swallowing every error.
        has_title = isinstance(title, str)
        has_vector = (
            isinstance(movie_id, (int, np.integer))
            and 0 <= movie_id < n_embeddings
        )
        if not (has_title and has_vector):
            skipped += 1
            continue
        out_m.write(title + "\n")
        out_v.write('\t'.join(str(value) for value in embedd_weights[movie_id]) + "\n")

print(f"Wrote {len(Movie_id) - skipped} embeddings; skipped {skipped} ids without a title or embedding row")

In [None]:
# Score every known movie for a single user (user id 1).
movie_arr = np.array(Movie_id)  # all unique movie ids
user = np.array([1] * len(Movie_id))  # the same user id, repeated once per movie
pred = model.predict([movie_arr, user])
pred

In [None]:
# Flatten the predictions to 1-D, then take the positions of the five
# highest predicted ratings (argsort on the negated scores = descending).
pred = pred.ravel()
pred_ids = np.argsort(-pred)[:5]
pred_ids

In [None]:
# pred_ids are positions in the de-duplicated Movie_id list, not row positions
# in merged_movies. De-duplicating on id keeps first occurrences in the same
# order Movie_id was built in, so the positions line up with the right movies.
merged_movies.drop_duplicates(subset='id').iloc[pred_ids]