In [1]:
import pandas as pd
import numpy as np

In [2]:
movieData = pd.read_csv('ml-latest-small/movies.csv')

# Preprocessing

Here we will check the dataframes for null values, check the dataframe for the structure, etc..

In [3]:
movieData.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratingData = pd.read_csv('ml-latest-small/ratings.csv')

In [5]:
ratingData.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
ratingData['userId'].isnull().any()

False

In [7]:
ratingData['movieId'].isnull().any()

False

In [8]:
ratingData['rating'].isnull().any()

False

In [9]:
noOfUniqueUsers = len(ratingData['userId'].unique())
noOfUniqueMovies = len(ratingData['movieId'].unique())
print(noOfUniqueUsers, noOfUniqueMovies)

610 9724


In [10]:
userId = sorted(list(ratingData['userId'].unique()))
movieId = sorted(list(ratingData['movieId'].unique()))

# Readying the data for model training and testing

Spitting into training and testing

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
train, test = train_test_split(ratingData, test_size = 0.2)

# Coding the Model

In [13]:
from keras.models import Sequential, Model
from keras.layers import Embedding, Flatten, Dense, Dropout, concatenate, multiply, Input, Reshape, dot

Using TensorFlow backend.


In [14]:
d_embeddings = 30
bias = 1

In [15]:
movie = Input(shape = [1], name = 'movie')
movieEmbedding = Embedding(noOfUniqueMovies+1, d_embeddings, name = 'Movie_Embedding')(movie)
movieBias = Embedding(noOfUniqueMovies + 1, bias, name="Movie_Bias")(movie)





In [16]:
user = Input(shape = [1], name = 'user')
userEmbedding = Embedding(noOfUniqueUsers+1, d_embeddings, name = 'User_Embedding')(user)
userBias = Embedding(noOfUniqueUsers + 1, bias, name="User_Bias")(user)

In [17]:
utilityMatrix = multiply([movieEmbedding, userEmbedding])




In [18]:
inputs = concatenate([utilityMatrix, userBias, movieBias])
inputs = Flatten()(inputs)

In [19]:
layer_1 = Dense(50, activation="relu", name = "Dense_layer1")(inputs)
layer_1 = Dropout(0.2)(layer_1)
#layer_2 = Dense(100, activation="relu", name = "Dense_layer2")(layer_1)
#layer_2 = Dropout(0.2)(layer_2)
output = Dense(1, activation="relu", name = "output")(layer_1)


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [20]:
recModel = Model(inputs = [movie, user], outputs = output)

In [21]:
recModel.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
movie (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
user (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie_Embedding (Embedding)     (None, 1, 30)        291750      movie[0][0]                      
__________________________________________________________________________________________________
User_Embedding (Embedding)      (None, 1, 30)        18330       user[0][0]                       
__________________________________________________________________________________________________
multiply_1

In [22]:
from keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adamax, Adagrad
adam = Adam(lr = 0.001)
sgd = SGD(lr = 0.001)
rms = RMSprop(lr = 0.001)
adadelta = Adadelta(lr = 0.001)
adamax = Adamax(lr = 0.001)
adagrad = Adagrad(lr = 0.001)

In [23]:
recModel.compile(optimizer = adam, loss = ['mse'], metrics = ['mean_absolute_error'])
trainingHistory = recModel.fit([train['userId'], train['movieId']],
                          train['rating'],
                          batch_size = 256,
                          validation_split = 0.005,
                          epochs = 4,
                          verbose = 1)



Train on 80264 samples, validate on 404 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Testing

In [24]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [25]:
def getItem(itemlist):
    return np.array([[item] for item in itemlist])

In [26]:
recModel.save("mrs.h5")

In [27]:
predictions = recModel.predict([getItem(test['userId']), getItem(test['movieId'])])

In [28]:
print("MSE for test data :", mean_squared_error(test["rating"], predictions))

MSE for test data : 0.871057681393455


In [29]:
from keras.models import load_model
model = load_model("mrs.h5")

In [30]:
def predictRating(userId, movieId):
    return model.predict(([np.array([userId]), np.array([movieId])]))[0][0]

In [31]:
def topPredictions(userId, ratingData, movieData):
    userRatings = ratingData[ratingData['userId'] == userId][['userId', 'movieId', 'rating']]
    userRatings['prediction'] = userRatings.apply(lambda x: predictRating(userId, int(x['movieId'])), axis=1)
    return userRatings.sort_values(by='prediction', 
                         ascending=False).merge(movieData, 
                                                on='movieId', 
                                                how='inner',
                                               suffixes = ['_u', '_m']).head(20)                                       

In [32]:
predictRating(1, 2)

3.347455

In [33]:
topPredictions(1,ratingData,movieData)

Unnamed: 0,userId,movieId,rating,prediction,title,genres
0,1,50,5.0,5.192434,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
1,1,608,5.0,5.099771,Fargo (1996),Comedy|Crime|Drama|Thriller
2,1,356,4.0,4.989652,Forrest Gump (1994),Comedy|Drama|Romance|War
3,1,296,3.0,4.91747,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
4,1,101,5.0,4.915381,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance
5,1,333,5.0,4.87558,Tommy Boy (1995),Comedy
6,1,260,5.0,4.799741,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
7,1,362,5.0,4.786876,"Jungle Book, The (1994)",Adventure|Children|Romance
8,1,235,4.0,4.779713,Ed Wood (1994),Comedy|Drama
9,1,596,5.0,4.74925,Pinocchio (1940),Animation|Children|Fantasy|Musical
