Recommendation-System-Using-Neural-Collaborative-Filtering
In this project the aim is to recommend a user movies based on their previous rating on movies which was watched by the user


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
movies=pd.read_csv("movie.csv")
ratings=pd.read_csv("rating.csv")

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies.shape

(27278, 3)

In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
ratings.shape

(20000263, 4)

In [7]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [8]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 610.4+ MB


In [9]:
ratings = ratings[0:110000]
movies = movies[0:110000]

In [10]:
# from the rating dataset will retrieve all the unique usersid
user_ids = ratings['userId'].unique().tolist()
user2user = {x: i for i, x in enumerate(user_ids)}  #---> crestes dictionary 
userencodeduser = {i: x for i, x in enumerate(user_ids)}  #---> crestes dictionary  with indexes

In [11]:
ratings.shape

(110000, 4)

In [12]:
movie_ids=ratings['movieId'].unique().tolist()
movie2movie = {x: i for i, x in enumerate(movie_ids)}
movieencodemovie = {i: x for i, x in enumerate(movie_ids)}

In [13]:
movies.shape

(27278, 3)

In [14]:
#Mapping the two tables 
ratings['user'] = ratings['userId'].map(user2user)
ratings['movie'] = ratings['movieId'].map(movie2movie)

In [15]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
0,1,2,3.5,2005-04-02 23:53:47,0,0
1,1,29,3.5,2005-04-02 23:31:16,0,1
2,1,32,3.5,2005-04-02 23:33:39,0,2
3,1,47,3.5,2005-04-02 23:32:07,0,3
4,1,50,3.5,2005-04-02 23:29:40,0,4


In [16]:
# FINDING THE MINIMUM AND MAXIMUM RATING 
min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])

In [17]:
print(min_rating)
print(max_rating)

0.5
5.0


In [18]:
# FINDING THE MINIMUM AND MAXIMUM RATINGs GIVEN BY THE USERS
min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])
num_users = len(user2user)
num_movies = len(movie2movie)
ratings['rating'] = ratings['rating'].values.astype(np.float32)
print("Number of users : {}, Number of Movies: {}, Min rating: {},Max rating: {}".format(num_users,num_movies,min_rating,
                                                                                         max_rating))

Number of users : 751, Number of Movies: 8864, Min rating: 0.5,Max rating: 5.0


In [19]:
# Train-Test Split
ratings = ratings.sample(frac = 1, random_state = 1) #randomly sample the dataset
x = ratings[['user','movie']].values
# Normalize the target between 0 and 1
y = ratings['rating'].apply(lambda x: (x - min_rating)/ (max_rating - min_rating)).values

In [20]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.10,random_state=0)

In [21]:
x_train.shape

(99000, 2)

In [22]:
y_train.shape

(99000,)

In [23]:
from tensorflow.keras.models import Sequential,Model
from tensorflow.keras.layers import *
from tensorflow.keras import layers
from tensorflow import keras

In [24]:
# Defining the model
embed_size = 50 #embedding size we need
user_input = layers.Input(shape = [1])
user_embed = layers.Embedding(num_users,embed_size,embeddings_initializer ="he_normal",embeddings_regularizer=keras.regularizers.l2(1e-6))(user_input)
# Emedding Layer will create 610 vectors look-up table of 50 dimension each
user_vect = layers.Flatten()(user_embed)

movie_input = layers.Input(shape = [1])
movie_embed = layers.Embedding(num_movies,embed_size,embeddings_initializer ="he_normal",embeddings_regularizer=keras.regularizers.l2(1e-6))(movie_input)
movie_vect = layers.Flatten()(movie_embed)

prod = layers.dot(inputs = [user_vect, movie_vect], axes = 1)

dense1 = layers.Dense(150,activation='relu', kernel_initializer="he_normal")(prod)
dense2 = layers.Dense(64,activation='relu', kernel_initializer="he_normal")(dense1)
dense3 = layers.Dense(1,activation='relu')(dense2)

In [25]:
# BUILDING OUR MODEL
model = Model([user_input, movie_input], dense3)

In [26]:
# COMPILING THE MODEL
model.compile(optimizer = 'adam',loss = 'mean_squared_error')

In [30]:
%%time
history = model.fit([x_train[:,0],x_train[:,1]], y_train, batch_size = 32, epochs = 10, verbose = 1) 
# here we will be sending user_id and movie_id as inputs

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: total: 17min 31s
Wall time: 8min 1s


In [31]:
prediction = model.evaluate([x_test[:,0],x_test[:,1]])



In [32]:
pred = model.predict([x_train[4:5,0],x_train[4:5,1]])
pred



array([[0.67353195]], dtype=float32)

In [33]:
movies.shape

(27278, 3)

# getting 1 random user id and recommending that user top 10 movies based on that user ratings and the movies watched by him

In [34]:

user_id = ratings.userId.sample(1).iloc[0]  # getting 1 random user id
movies_watched_by_user = ratings[ratings.userId == user_id] #filtering out the movies watched by user
movies_not_watched = movies[~movies["movieId"].isin(movies_watched_by_user.movieId.values)]["movieId"] #getting the movies which are not watched by the user

In [35]:
user_id

739

In [36]:
movies_watched_by_user.head()

Unnamed: 0,userId,movieId,rating,timestamp,user,movie
106180,739,31410,3.0,2009-03-31 21:31:00,738,1278
105932,739,165,4.0,2009-04-27 14:52:37,738,367
106274,739,62437,3.0,2009-03-28 14:05:58,738,7660
106207,739,48780,5.0,2010-04-05 14:10:23,738,2833
106061,739,2355,3.0,2008-06-28 17:04:02,738,739


In [37]:
movies_not_watched

2             3
3             4
4             5
6             7
7             8
          ...  
27273    131254
27274    131256
27275    131258
27276    131260
27277    131262
Name: movieId, Length: 26787, dtype: int64

In [38]:
#this is an innerjoin between movies_not_watched and movie ids from rating table (to check whether the movie ids are not out of place)
movies_not_watched = list( set(movies_not_watched).intersection(set(movie2movie.keys()))) 

In [39]:
movies_not_watched

[32770,
 3,
 4,
 5,
 65538,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 20,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 65567,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 65577,
 48,
 65585,
 49,
 51,
 52,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 32853,
 92,
 93,
 94,
 95,
 96,
 97,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 65642,
 109,
 112,
 113,
 114,
 32882,
 116,
 117,
 118,
 119,
 121,
 122,
 123,
 124,
 125,
 126,
 129,
 130,
 131,
 132,
 32898,
 134,
 135,
 136,
 139,
 140,
 141,
 144,
 65682,
 147,
 146,
 149,
 148,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 162,
 163,
 164,
 166,
 168,
 169,
 170,
 171,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 32954,
 193,
 194,
 195,
 196,
 198,
 199,
 200,
 201,
 2

In [40]:
max(movies_not_watched)

128594

In [41]:
movies_not_watched_index = [[movie2movie.get(x)] for x in movies_not_watched] # get the indexes of the movies not watched by the user which we will use later for recommending
movies_not_watched_index

[[6365],
 [175],
 [1994],
 [1044],
 [6679],
 [420],
 [1965],
 [1677],
 [365],
 [386],
 [2614],
 [4834],
 [1187],
 [429],
 [387],
 [1469],
 [1678],
 [1646],
 [1679],
 [228],
 [681],
 [3537],
 [3816],
 [2021],
 [1],
 [3163],
 [1647],
 [3836],
 [1045],
 [3809],
 [1046],
 [7844],
 [3553],
 [631],
 [3538],
 [2123],
 [1680],
 [2074],
 [1681],
 [1867],
 [1682],
 [8862],
 [632],
 [3947],
 [7031],
 [8381],
 [421],
 [4837],
 [1311],
 [8382],
 [1683],
 [1868],
 [3164],
 [388],
 [1312],
 [176],
 [2615],
 [2050],
 [688],
 [3505],
 [1313],
 [1884],
 [1396],
 [177],
 [4838],
 [4149],
 [1470],
 [1314],
 [3554],
 [1315],
 [4977],
 [2124],
 [1316],
 [5579],
 [2616],
 [3165],
 [3166],
 [3167],
 [1995],
 [1317],
 [3555],
 [1318],
 [1471],
 [3902],
 [8293],
 [1319],
 [1684],
 [2075],
 [1047],
 [3533],
 [2723],
 [3539],
 [1320],
 [1321],
 [2051],
 [5580],
 [389],
 [431],
 [2988],
 [1885],
 [6480],
 [8383],
 [5],
 [4180],
 [2125],
 [6079],
 [3540],
 [4115],
 [3534],
 [3168],
 [3169],
 [432],
 [3170],
 [2126]

In [42]:
user_encoder = userencodeduser.get(user_id)
user_encoder 

740

In [43]:
# Stacking the userid with respect to the movie not watched by the user 
user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), movies_not_watched_index))
user_movie_array

array([[ 740, 6365],
       [ 740,  175],
       [ 740, 1994],
       ...,
       [ 740, 1027],
       [ 740, 5391],
       [ 740, 6532]])

In [44]:
# we will do the prediction of the user 387 and predicting the rating which the user may give based on his previous history of rating
ratings = model.predict([user_movie_array[:,0],user_movie_array[:,1]]).flatten()
ratings



array([0.5030095 , 0.39254472, 0.5625954 , ..., 0.66785014, 0.33443445,
       0.34200513], dtype=float32)

In [45]:
top_ratings_indices = ratings.argsort()[-10:][::-1] #indices of highest 10 ratings


In [46]:
# getting the id of movies which our model has predicted for the particular user
recommended_movie_ids = [movieencodemovie.get(movies_not_watched_index[x][0]) for x in top_ratings_indices]

In [48]:
print("Showing recommendations for user: {}".format(user_id))
print("====" * 9)
print("Movies with high ratings from user")
print("----" * 8)
top_movies_user = (
    movies_watched_by_user.sort_values(by="rating", ascending=False)
    .head(5)
    .movieId.values
)
movie_df_rows = movies[movies["movieId"].isin(top_movies_user)]
for row in movie_df_rows.itertuples():
    print(row.title, ":", row.genres)

Showing recommendations for user: 739
Movies with high ratings from user
--------------------------------
Godfather, The (1972) : Crime|Drama
Training Day (2001) : Crime|Drama|Thriller
Insomnia (2002) : Action|Crime|Drama|Mystery|Thriller
3-Iron (Bin-jip) (2004) : Drama|Romance
Juno (2007) : Comedy|Drama|Romance


In [49]:
print("----" * 8)
print("Top 10 movie recommendations")
print("----" * 8)
recommended_movies = movies[movies["movieId"].isin(recommended_movie_ids)]
for row in recommended_movies.itertuples():
    print(row.title, ":", row.genres)

--------------------------------
Top 10 movie recommendations
--------------------------------
Searching for Bobby Fischer (1993) : Drama
Apartment, The (1960) : Comedy|Drama|Romance
On Golden Pond (1981) : Drama
Lady Vanishes, The (1938) : Drama|Mystery|Thriller
Key Largo (1948) : Crime|Drama|Film-Noir|Thriller
One-Eyed Jacks (1961) : Western
Swimming Pool (2003) : Drama|Mystery|Thriller
Pink Panther Strikes Again, The (1976) : Comedy|Crime
Murder on the Orient Express (1974) : Crime|Mystery|Thriller
Old Boy (2003) : Mystery|Thriller
