In [18]:
from pathlib import Path
import pathlib


import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow import metrics
from keras import optimizers
from tensorflow.keras.utils import plot_model

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import linear_kernel

In [12]:
# data loading
data_dir = './Data/MovieLen'
data_path = pathlib.Path(data_dir)

ratings = pd.read_csv(data_path / 'rating.csv')
movies = pd.read_csv(data_path / 'movie.csv')

ratings = ratings[:100000]

len(ratings)

100000

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
len(ratings['movieId'].unique())

26744

In [14]:
# Data prepartion
pd.crosstab(ratings.userId, ratings.movieId).head()

movieId,1,2,3,4,5,6,7,8,9,10,...,117511,117590,118354,118696,118900,118997,119141,125916,128488,128594
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
top_k = 15

g = ratings.groupby("userId")["rating"].count()
top_k_users = g.sort_values(ascending=False)[:top_k]

g = ratings.groupby("movieId")['rating'].count()
top_k_movies = g.sort_values(ascending = False)[: top_k]

top_r = ratings.join(top_k_users, rsuffix="-r", how = "inner", on = 'userId')

top_r

Unnamed: 0,userId,movieId,rating,timestamp,rating-r
11454,104,14,1.0,1999-12-11 12:56:43,998
11455,104,17,3.0,1999-12-11 16:13:33,998
11456,104,21,3.0,1999-12-12 12:44:48,998
11457,104,25,4.0,1999-12-12 12:55:57,998
11458,104,28,3.0,1999-12-11 13:01:39,998
...,...,...,...,...,...
93784,648,8464,4.0,2004-07-07 11:05:04,904
93785,648,8492,5.0,2004-07-07 11:07:37,904
93786,648,8493,3.5,2004-07-07 11:07:13,904
93787,648,8623,3.5,2004-07-07 11:06:34,904


In [16]:
top_r = top_r.join(top_k_movies, rsuffix = '-r', how = "inner", on = "movieId")
top_r

Unnamed: 0,userId,movieId,rating,timestamp,rating-r,rating-r.1
11488,104,150,2.0,1999-12-12 12:41:03,998,235
11513,104,260,3.0,1999-12-11 15:44:20,998,264
11524,104,296,3.0,1999-12-11 12:19:51,998,350
11532,104,318,3.0,1999-12-11 16:13:33,998,305
11579,104,527,5.0,1999-12-11 16:12:22,998,247
...,...,...,...,...,...,...
93002,648,593,5.0,2000-11-20 06:13:18,904,295
93029,648,780,3.0,2000-11-20 06:01:10,904,234
93093,648,1210,4.0,2000-11-20 05:30:50,904,233
93347,648,2571,2.0,2001-04-15 09:43:04,904,253


In [17]:
pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)

  pd.crosstab(top_r.userId, top_r.movieId, top_r.rating, aggfunc=np.sum)


movieId,1,110,150,260,296,318,356,480,527,589,593,780,1210,2571,2858
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
104,,,2.0,3.0,3.0,3.0,,,5.0,,3.0,,,1.0,2.0
116,3.0,4.5,3.0,4.5,4.5,4.5,4.0,4.0,4.0,4.0,3.0,1.0,5.0,4.0,4.5
156,5.0,5.0,5.0,,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0
208,4.0,3.5,5.0,4.5,5.0,4.5,1.0,4.5,4.5,2.5,4.5,4.0,4.0,3.0,3.5
298,4.0,5.0,3.0,5.0,4.0,5.0,3.0,4.0,3.0,5.0,4.0,4.0,5.0,5.0,5.0
348,,,,,,,,,4.5,,,3.5,4.5,,4.0
359,5.0,3.5,4.0,5.0,5.0,5.0,4.5,4.0,5.0,5.0,5.0,3.0,5.0,3.0,4.0
394,,2.5,4.0,4.0,5.0,3.0,,2.0,3.0,4.0,5.0,1.0,3.0,5.0,5.0
424,,4.5,,3.5,,4.5,3.0,4.0,,3.5,,4.0,4.0,4.5,
572,5.0,,5.0,,5.0,4.0,5.0,5.0,,3.5,4.5,5.0,,4.0,4.5


In [20]:
# encoding UserId and movieId so they have continous ids

user_enc = LabelEncoder()
ratings['userId'] = user_enc.fit_transform(ratings['userId'].values)
n_users = ratings['userId'].nunique()

movie_enc = LabelEncoder()
ratings['movieId'] = user_enc.fit_transform(ratings['movieId'].values)
n_movies = ratings['movieId'].nunique()

ratings['rating'] = ratings['rating'].astype(np.float32)
min_rating = min(ratings['rating'])
max_rating = max(ratings['rating'])

min_rating, max_rating, n_users, n_movies

(0.5, 5.0, 702, 8227)

In [22]:
# Splitting Data into inputs and labels
X = ratings[["userId", "movieId"]].values
y = ratings['rating'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.9)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, train_size = 0.8)

print(f"Training Split : {X_train.shape, y_train.shape}")
print(f"Validation Split : {X_valid.shape, y_valid.shape}")
print(f"Testing Split : {X_test.shape, y_test.shape}")

Training Split : ((72000, 2), (72000,))
Validation Split : ((18000, 2), (18000,))
Testing Split : ((10000, 2), (10000,))


In [40]:
# Network Architc


#User branch
embedding_size = 50

# User Embeddings
user = keras.layers.Input(shape = (1), name = "user_id")
user_embedding = keras.layers.Embedding(n_users, embedding_size, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "user_embeddings")(user)
user_embedding = keras.layers.Reshape((embedding_size,))(user_embedding)

# User bias
user_bias = keras.layers.Embedding(n_users, 1, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "user_bias")(user)
user_bias = keras.layers.Reshape((1,))(user_bias)


#Movie branch

# Movie Embeddings
movie = keras.layers.Input(shape = (1), name = "movie_id")
movie_embedding = keras.layers.Embedding(n_movies, embedding_size, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "movie_embeddings")(movie)
movie_embedding = keras.layers.Reshape((embedding_size,))(movie_embedding)

# Movie bias
movie_bias = keras.layers.Embedding(n_movies, 1, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "movie_bias")(movie)
movie_bias = keras.layers.Reshape((1,))(movie_bias)
                       
# Calculating Ratings
rating = keras.layers.Dot(axes = 1, name = "Similarity_Measure")([user_embedding, movie_embedding])

# Adding bias
rating = keras.layers.Add()([rating, user_bias, movie_bias])
rating = keras.layers.Activation(activation="sigmoid", name = "rating_rescaling")(rating)
rating = keras.layers.Lambda(lambda x : x*(max_rating - min_rating) + min_rating)(rating)


# Model 
model_1_LF_50 = keras.models.Model([user, movie], rating)

model_1_LF_50.compile(loss = "mse", metrics=metrics.RootMeanSquaredError(), optimizer= optimizers.Adam(lr = 0.001))

print(model_1_LF_50.summary())



Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 user_id (InputLayer)        [(None, 1)]                  0         []                            
                                                                                                  
 movie_id (InputLayer)       [(None, 1)]                  0         []                            
                                                                                                  
 user_embeddings (Embedding  (None, 1, 50)                35100     ['user_id[0][0]']             
 )                                                                                                
                                                                                                  
 movie_embeddings (Embeddin  (None, 1, 50)                411350    ['movie_id[0][0]']      

In [41]:
history = model_1_LF_50.fit(x = [X_train[:, 0], X_train[: , 1]],
                    y = y_train,
                    batch_size = 64,
                    verbose= 1,
                    epochs = 20,
                    validation_data=([X_test[:,0], X_test[:,1]], y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [42]:
# trying different emdedding sizes
embedding_size = 40

# User Embeddings
user = keras.layers.Input(shape = (1), name = "user_id")
user_embedding = keras.layers.Embedding(n_users, embedding_size, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "user_embeddings")(user)
user_embedding = keras.layers.Reshape((embedding_size,))(user_embedding)

# User bias
user_bias = keras.layers.Embedding(n_users, 1, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "user_bias")(user)
user_bias = keras.layers.Reshape((1,))(user_bias)


#Movie branch

# Movie Embeddings
movie = keras.layers.Input(shape = (1), name = "movie_id")
movie_embedding = keras.layers.Embedding(n_movies, embedding_size, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "movie_embeddings")(movie)
movie_embedding = keras.layers.Reshape((embedding_size,))(movie_embedding)

# Movie bias
movie_bias = keras.layers.Embedding(n_movies, 1, embeddings_regularizer= keras.regularizers.l2(10e-6), name = "movie_bias")(movie)
movie_bias = keras.layers.Reshape((1,))(movie_bias)
                       
# Calculating Ratings
rating = keras.layers.Dot(axes = 1, name = "Similarity_Measure")([user_embedding, movie_embedding])

# Adding bias
rating = keras.layers.Add()([rating, user_bias, movie_bias])
rating = keras.layers.Activation(activation="sigmoid", name = "rating_rescaling")(rating)
rating = keras.layers.Lambda(lambda x : x*(max_rating - min_rating) + min_rating)(rating)


# Model 
model_2_LF_40 = keras.models.Model([user, movie], rating)

model_2_LF_40.compile(loss = "mse", metrics=metrics.RootMeanSquaredError(), optimizer= optimizers.Adam(lr = 0.001))



In [43]:
history = model_2_LF_40.fit(x = [X_train[:, 0], X_train[: , 1]],
                    y = y_train,
                    batch_size = 32,
                    verbose= 1,
                    epochs = 10,
                    validation_data=([X_valid[:,0], X_valid[:,1]], y_valid))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [59]:
# Model evaluation on testing data
y_preds_1 = model_1_LF_50.predict([X_test[:, 0], X_test[: , 1]])
y_preds_2 = model_2_LF_40.predict([X_test[:, 0], X_test[: , 1]])



In [71]:
user_id = pd.Series(X_test[:,0])
movie_id = pd.Series(X_test[:, 1])
actual_rating = pd.Series(y_test)
predictions_model_1_LF_50 = pd.Series(y_preds_1.reshape(10000))
predictions_model_2_LF_40 =- pd.Series(y_preds_2.reshape(10000))

In [72]:
dict = {"userId" :user_id,
        "movieId" : movie_id,
        "ratings" : actual_rating,
        "predictions_model_1_LF_50" : predictions_model_1_LF_50,
        "predictions_model_2_LF_40" : predictions_model_2_LF_40}

predictions = pd.DataFrame(dict)

In [74]:
predictions.head(40)

Unnamed: 0,userId,movieId,ratings,predictions_model_1_LF_50,predictions_model_2_LF_40
0,630,6730,3.5,3.061637,-3.525019
1,563,3645,5.0,3.494477,-3.726115
2,649,3908,3.0,3.580866,-3.610754
3,23,497,2.0,1.315119,-3.42361
4,631,389,2.0,2.116646,-2.730416
5,207,2162,4.0,3.833312,-3.703948
6,103,1054,3.0,2.576988,-1.997699
7,297,202,3.0,4.375248,-2.716926
8,257,6365,2.5,2.084016,-1.281837
9,553,2204,5.0,3.125151,-3.369891
