# Movie Reccomender System

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from numpy import loadtxt

Load Ratings and Rated Movies Indicator

In [2]:
file = open('./data2/small_movies_Y.csv', 'rb')
Y = loadtxt(file,delimiter = ",")
file = open('./data2/small_movies_R.csv', 'rb')
R = loadtxt(file,delimiter = ",")

In [3]:
# Getting number of movies and users
num_movies = Y.shape[0]
num_users = Y.shape[1]

Load the Movie List

In [4]:
dfML = pd.read_csv('./data2/small_movie_list.csv', header=0, index_col=0, delimiter=',', quotechar='"')
mlist = dfML["title"].to_list()

Getting New User Input

In [16]:
# Initializing new user ratings
new_user_ratings = np.zeros(num_movies)

new_user_ratings[24] = 3
new_user_ratings[46] = 2
new_user_ratings[70] = 4
new_user_ratings[79] = 2
new_user_ratings[3689] = 4

new_user_ratings[3711] = 3
new_user_ratings[3714] = 4
new_user_ratings[3802] = 2
new_user_ratings[2647] = 3
new_user_ratings[2653] = 2

new_user_ratings[2672] = 4
new_user_ratings[2716] = 5
new_user_ratings[2717] = 3
new_user_ratings[1444] = 4
new_user_ratings[1832] = 1

new_user_rated = [i for i in range(len(new_user_ratings)) if new_user_ratings[i] > 0]

print('\n New User Ratings:\n')
for i in range(len(new_user_ratings)):
    if new_user_ratings[i] > 0 :
        print(f'Rated {new_user_ratings[i]} for {dfML.loc[i,"title"]}')


 New User Ratings:

Rated 3.0 for Final Destination (2000)
Rated 2.0 for Flintstones in Viva Rock Vegas, The (2000)
Rated 4.0 for Chicken Run (2000)
Rated 2.0 for X-Men (2000)
Rated 4.0 for 40-Year-Old Virgin, The (2005)
Rated 1.0 for Ghost Rider (2007)
Rated 3.0 for Hot Tub Time Machine (2010)
Rated 2.0 for Kick-Ass (2010)
Rated 4.0 for MacGruber (2010)
Rated 5.0 for Inception (2010)
Rated 3.0 for Grown Ups (2010)
Rated 4.0 for 22 Jump Street (2014)
Rated 3.0 for Sex Tape (2014)
Rated 4.0 for Guardians of the Galaxy (2014)
Rated 2.0 for The Imitation Game (2014)


Add New User Ratings to the Already Rated Data and Normalize Them

In [17]:
Y = np.c_[new_user_ratings, Y]

# Add new user ratings indicators to R
R = np.c_[(new_user_ratings != 0).astype(int), R]

# Normalize the Dataset
Ymean = (np.sum(Y*R,axis=1)/(np.sum(R, axis=1)+1e-12)).reshape(-1,1)
Ynorm = Y - np.multiply(Ymean, R)

Creating the Cost Function with Regularization

In [18]:
def cost_func(X, W, b, Y, R, lambda_):
    nm, nu = Y.shape
    J = 0
    Regt1 = 0
    Regt2 = 0
 
    for i in range(nm):
        for j in range(nu):
            J += (R[i][j]*np.square((np.dot(W[j],X[i])+b[0][j]-Y[i][j])))/2
            
        Regt2 += (np.sum(np.square(X[i])))*(lambda_/2)
        
    for j in range(nu):
        Regt1 += (np.sum(np.square(W[j])))*(lambda_/2)
    
    J += Regt1 + Regt2

    return J

Vectorized Cost Function Implementation

In [19]:
def cost_func_v(X, W, b, Y, R, lambda_):
    j = (tf.linalg.matmul(X, tf.transpose(W)) + b - Y)*R
    J = 0.5 * tf.reduce_sum(j**2) + (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
    return J

Preparing to Train the Model by Initializing Prameters and Selecting the Optimizer

In [20]:
# 500 features are used
num_movies, num_users = Y.shape
num_features = 200

# Setting initial learnable parameters (W,X)
tf.random.set_seed(1234)
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1, num_users),   dtype=tf.float64),  name='b')

# Adam Optimizer is used with learning rate of 0.1
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

Training the Collaborative Filtering Model

In [21]:
print(f'shape of Y {Y.shape}')
print(f'shape of X {X.shape}')
print(f'shape of W {W.shape}')
print(f'shape of R {R.shape}')

shape of Y (4778, 445)
shape of X (4778, 200)
shape of W (445, 200)
shape of R (4778, 445)


In [23]:
# Initializing number of iterations and the regualarization parameter
iterations = 1000
lambda_ = 1

# Custom Training Loop
for iter in range(iterations):
    with tf.GradientTape() as tape:
        # computing the cost
        cost_value = cost_func_v(X, W, b, Ynorm, R, lambda_)

    # Using gradient tape to retrieve the gradients
    grads = tape.gradient( cost_value, [X,W,b])

    # update the values of varibles to minimize the loss
    optimizer.apply_gradients( zip(grads, [X,W,b]))

    #logging the process
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 4540906.4
Training loss at iteration 20: 318068.1
Training loss at iteration 40: 137926.3
Training loss at iteration 60: 71411.6
Training loss at iteration 80: 41396.0
Training loss at iteration 100: 25923.7
Training loss at iteration 120: 17171.5
Training loss at iteration 140: 11906.8
Training loss at iteration 160: 8602.2
Training loss at iteration 180: 6462.7
Training loss at iteration 200: 5045.2
Training loss at iteration 220: 4089.5
Training loss at iteration 240: 3436.1
Training loss at iteration 260: 2984.0
Training loss at iteration 280: 2667.6
Training loss at iteration 300: 2443.6
Training loss at iteration 320: 2282.9
Training loss at iteration 340: 2166.3
Training loss at iteration 360: 2080.5
Training loss at iteration 380: 2016.4
Training loss at iteration 400: 1968.0
Training loss at iteration 420: 1930.8
Training loss at iteration 440: 1902.0
Training loss at iteration 460: 1879.3
Training loss at iteration 480: 1861.3
Training loss at it

Making Reccommendations to New User Based on Movies with Highest Predicted Ratings

In [12]:
print(f'shape of Y {Y.shape}')
print(f'shape of X {X.shape}')
print(f'shape of W {W.shape}')
print(f'shape of R {R.shape}')

shape of Y (4778, 444)
shape of X (4778, 200)
shape of W (444, 200)
shape of R (4778, 444)


In [24]:
# Making predictions with trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

# Restoring the mean that we substracted during regularization process
pm = p + Ymean

new_predictions = pm[:,0]

# sort predictions
sp = tf.argsort(new_predictions, direction='DESCENDING')

for i in range(20):
    j = sp[i]
    if new_predictions[j] > 5:
        new_predictions[j] = 5
    if j not in new_user_rated:
        print(f'Predicting rating {new_predictions[j]:0.2f} for movie {mlist[j]}')

print('\n\nOriginal vs Predicted ratings:\n')
for i in range(len(new_user_ratings)):
    if new_predictions[i] > 5:
        new_predictions[i] = 5
    if new_user_ratings[i] > 0:
        print(f'Original {new_user_ratings[i]}, Predicted {new_predictions[i]:0.2f} for {mlist[i]}')

Predicting rating 4.80 for movie Odd Life of Timothy Green, The (2012)
Predicting rating 4.75 for movie Into the Forest of Fireflies' Light (2011)
Predicting rating 4.73 for movie Dragons: Gift of the Night Fury (2011)
Predicting rating 4.73 for movie 'Salem's Lot (2004)
Predicting rating 4.73 for movie Satin Rouge (2002)
Predicting rating 4.72 for movie The Girl with All the Gifts (2016)
Predicting rating 4.72 for movie 9/11 (2002)
Predicting rating 4.72 for movie Black Tar Heroin: The Dark End of the Street (2000)
Predicting rating 4.72 for movie Idiots and Angels (2008)
Predicting rating 4.72 for movie Wonder Woman (2009)
Predicting rating 4.72 for movie Faster (2010)
Predicting rating 4.72 for movie Justice League: Doom (2012) 
Predicting rating 4.72 for movie A Detective Story (2003)
Predicting rating 4.72 for movie Superman/Batman: Public Enemies (2009)
Predicting rating 4.72 for movie Open Hearts (Elsker dig for evigt) (2002)
Predicting rating 4.72 for movie Palindromes (2004)
P