In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

In [2]:
ratings = pd.read_csv('C://Projects//Movie Recommender System//Datasets//ratings_small.csv')

In [3]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [4]:
ratings.dropna(inplace=True)

In [5]:
ratings.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [6]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
train_X,test_X = train_test_split(ratings, test_size=0.2)

In [9]:
train_X

Unnamed: 0,userId,movieId,rating,timestamp
19418290,201628,54256,5.0,1458698129
25800020,268409,40629,3.5,1217950047
19787934,205619,95875,3.5,1361878092
2077331,21601,27904,3.5,1463360505
10386114,107216,4027,4.0,1112072914
...,...,...,...,...
16346404,169907,1148,5.0,942418878
23997367,249169,71033,3.5,1461154064
7674543,79255,4701,4.0,1243323781
17997411,186905,4448,3.0,1124997859


In [10]:
test_X

Unnamed: 0,userId,movieId,rating,timestamp
17793701,184647,4881,5.0,1027090321
23647307,245475,5349,5.0,1111758890
24514518,254777,159849,4.0,1468088611
7072216,73022,3426,1.0,1034724782
19935115,207244,830,0.5,1284924965
...,...,...,...,...
25960545,270206,1126,3.0,995588838
15956329,165994,4675,3.0,1020890167
11210988,116153,86880,3.0,1496871775
3184682,33141,1952,2.5,1360332950


In [11]:
max_userid = ratings['userId'].drop_duplicates().max()
max_movieid = ratings['movieId'].drop_duplicates().max()

print(max_userid)
print(max_movieid)

270896
176275


In [12]:
embedding_dimension = 32

In [13]:
#Movies Network
input_movies = tf.keras.layers.Input(shape=[1])
embed_movies = tf.keras.layers.Embedding(max_movieid + 1, embedding_dimension)(input_movies)
out_movies = tf.keras.layers.Flatten()(embed_movies)
#Users Network
input_users = tf.keras.layers.Input(shape=[1])
embed_users = tf.keras.layers.Embedding(max_userid + 1, embedding_dimension)(input_users)
out_users = tf.keras.layers.Flatten()(embed_users)

conc_layer = tf.keras.layers.Concatenate()([out_movies,out_users])
x = tf.keras.layers.Dense(128, activation='relu')(conc_layer)
x_out = tf.keras.layers.Dense(1, activation='relu')(x)

model = tf.keras.Model([input_movies,input_users], x_out)
opt = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='mean_squared_error')

In [14]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 1, 32)        5640832     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 32)        8668704     input_2[0][0]                    
______________________________________________________________________________________________

In [None]:
hist = model.fit([train_X.movieId,train_X.userId], train_X.rating,
                 batch_size=64,epochs=15,
                 verbose=1,
                 validation_data= ([test_X.movieId,test_X.userId], test_X.rating))

Epoch 1/15
   336/325304 [..............................] - ETA: 15:35:43 - loss: 7.3800

In [None]:
train_loss = hist.history['loss']
val_loss = hist.history['val_loss']
plt.plot(train_loss, color='r', label='Train Loss')
plt.plot(val_loss, color='b', label='Validation Loss')
plt.title("Train and Validation Loss Curve")
plt.legend()
plt.show()

In [None]:
#Extract embeddings

embedd = model.get_layer('embedding')
embedd_weights= embedd.get_weights()[0]
embedd_weights.shape

In [24]:
merged_movies = pd.read_csv('Datasets//Rating_title_merged.csv')

In [None]:
merged_movies

In [None]:
Movie_id =list(merged_movies.id.drop_duplicates())

dict_map = {}
for i in Movie_id:
    try:
        dict_map[i] = merged_movies.iloc[i]['title']
    except:
        continue
    
out_v = open('vecs.tsv', 'w')
out_m = open('meta.tsv', 'w')
for i in Movie_id:
    try:
        movie = dict_map[i]
        embeddings = embedd_weights[i]
        out_m.write(movie + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
    except:
        continue

out_v.close()
out_m.close()

In [39]:
#Making recommendations for user 100
movie_arr = np.array(Movie_id) #get all book IDs
user = np.array([1 for i in range(len(Movie_id))])
pred = model.predict([movie_arr, user])
pred

array([[2.579938 ],
       [1.4530795],
       [3.9213607],
       ...,
       [2.7712343],
       [2.2147112],
       [2.4150655]], dtype=float32)

In [40]:
pred = pred.reshape(-1) #reshape to single dimension
pred_ids = (-pred).argsort()[0:5]
pred_ids

array([1650, 1707, 2226,  242, 1696], dtype=int64)

In [43]:
merged_movies.iloc[pred_ids]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
1650,False,,15000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",http://www.biglebowskibluray.com/splash.php,115,tt0118715,en,The Big Lebowski,"Jeffrey ""The Dude"" Lebowski, a Los Angeles sla...",...,1998-03-06,46189568.0,117.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Times like these call for a Big Lebowski.,The Big Lebowski,False,7.8,3001.0
1707,False,,0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,5910,tt0119250,ja,はなび,A police officer leaves the force in the face ...,...,1997-09-03,0.0,103.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,Fireworks,False,7.6,118.0
2226,False,,30000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,17037,tt0155753,en,I'll Be Home for Christmas,"Estranged from his father, college student Jak...",...,1998-11-12,0.0,86.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Somewhere between L.A. and N.Y. Jake found the...,I'll Be Home for Christmas,False,5.2,65.0
242,False,,700000,"[{'id': 99, 'name': 'Documentary'}]",,14275,tt0110057,en,Hoop Dreams,This documentary follows two inner-city Chicag...,...,1994-09-12,7830611.0,171.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,An Extraordinary True Story.,Hoop Dreams,False,7.7,91.0
1696,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,47452,tt0119574,en,Love and Death on Long Island,Giles De'Ath is a widower who doesn't like any...,...,1997-01-01,0.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Love and Death on Long Island,False,6.9,11.0
