In [1]:
# Read the movies database
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_distances

ratings_df = pd.read_csv('./ml-100k/u.data', sep='\t', index_col=False, names=['userId', 'movieId', 'rating'])
movies_df = pd.read_csv('./ml-100k/u.item', sep='|', header=0, index_col=False, encoding='ISO-8859-1')

# Add a column with the centered ratings
average_df = ratings_df[['userId', 'rating']].groupby(['userId'], as_index=False).mean().rename(columns={'rating': 'average'})
ratings_df = pd.merge(ratings_df, average_df, on='userId', how='left')
ratings_df['rating_centered'] = ratings_df['rating'] - ratings_df['average']
ratings_df

Unnamed: 0,userId,movieId,rating,average,rating_centered
0,196,242,3,3.615385,-0.615385
1,186,302,3,3.413043,-0.413043
2,22,377,1,3.351562,-2.351562
3,244,51,2,3.651261,-1.651261
4,166,346,1,3.550000,-2.550000
...,...,...,...,...,...
99995,880,476,3,3.426630,-0.426630
99996,716,204,5,3.888476,1.111524
99997,276,1090,1,3.465251,-2.465251
99998,13,225,2,3.097484,-1.097484


In [6]:
# Build the matrix of user's scores to movies
# Remember: The rows are the users and the movies the columns
# We can use the ID's as the indexes of the array but the first column and row will be 0
nUsers = ratings_df.userId.unique().size
nMovies = ratings_df.movieId.unique().size

ratingsM = np.zeros((nUsers+1, nMovies+1))
for index, rating in ratings_df.iterrows():
    ratingsM[int(rating.userId), int(rating.movieId)] = rating.rating_centered
    
    

In [4]:
# Collaborative filtering - Memory based
# --------------------------------------

# Item view: I will recommend movies based on user choice

# Let's say that user's chooses movie 102 (check 50/500/449)
# What movies can we recommend him?

# Get the distances between movies (the products, so we have to deal with columns)
distances = cosine_distances(ratingsM.T)

myMovie = 10
print("My movie is: ", movies_df[movies_df.movieId == myMovie].title.iloc[0])
print()

# Print the 10 movies closer to my movie
distancesSortedIx = np.argsort(distances[myMovie])
for i in range(1, 10):
    movieId = distancesSortedIx[i]
    title = movies_df[movies_df.movieId == movieId].title.iloc[0]
    distance = distances[myMovie][movieId]
    print(movieId, title, distance)


My movie is:  Richard III (1995)

190 Henry V (1989) 0.7884503002263261
1606 Deceiver (1997) 0.7909474313965291
1244 Metro (1997) 0.8116567310124835
533 Daytrippers, The (1996) 0.821703015377508
718 In the Bleak Midwinter (1995) 0.8483484695518608
652 Rosencrantz and Guildenstern Are Dead (1990) 0.8588836566343084
1103 Trust (1990) 0.8601843961350373
137 Big Night (1996) 0.8606897205808655
921 Farewell My Concubine (1993) 0.860734093840797


In [8]:
# Collaborative filtering - Memory based
# --------------------------------------

# User view: I will recommend movies for a specific user based on user choice

# Let's say that user's ID is 65
# What movies can we recommend him?

# Get the distances between movies (the products, so we have to deal with columns)
distances = cosine_distances(ratingsM)

#userId2 is the given user:
userId2 = 65
print("The recommended movies for the User with ID number", ratings_df[ratings_df.userId == userId2].userId.iloc[0], "are:")
print()

# Print the 10 movies recommended for the given userId2
distancesSortedIx = np.argsort(distances[userId2])
for i in range(1, 10):
    movieId = distancesSortedIx[i]
    title = movies_df[movies_df.movieId == movieId].title.iloc[0]
    distance = distances[userId2][movieId]
    print(movieId, title, distance)


The recommended movies for the User with ID number 65 are:

224 Ridicule (1996) 0.8318524691929073
473 James and the Giant Peach (1996) 0.8384992134696969
311 Wings of the Dove, The (1997) 0.8407771290629572
533 Daytrippers, The (1996) 0.8476224685397103
908 Half Baked (1998) 0.8539336457679558
719 Canadian Bacon (1994) 0.8584491731204407
554 Waterworld (1995) 0.858504416571082
72 Mask, The (1994) 0.8619424080919561
372 Jeffrey (1995) 0.8623119243950776


In [10]:
# Collaborative filtering - Model based
# --------------------------------------
# First calculate the latent factors matrix
# Then make recommendations based on similarity

# Metaparameters
k = 100        # number of latent factors
l = 0.1        # lambda. The same value for x and y
accuracy = 0.999

# X and Y initialization
np.random.seed(42)
X = np.random.normal(size=(ratingsM.shape[0], k))
Y = np.random.normal(size=(k, ratingsM.shape[1]))

converged = False
pL = np.Inf
while not converged:
    y = Y.T
    inv = np.linalg.inv(y.T.dot(y) + l*np.eye(k))
    for u in range(0, X.shape[0]):
        X[u] = ratingsM[u,:].dot(y).dot(inv)
    
    inv = np.linalg.inv(X.T.dot(X) + l*np.eye(k))    
    for i in range(0, Y.shape[1]):
        Y[:,i] = ratingsM[:,i].dot(X).dot(inv)
        
    L = np.square(ratingsM - X.dot(Y)).sum()
    L = L + l * (np.square(np.linalg.norm(X)) + np.square(np.linalg.norm(Y)))
                     
    # Improvement stop criteria
    converged = (L / pL) > accuracy
    
    pL = L
    

In [11]:
# Let's make predictions
# Get the similarity matrix with the items latent factors
myMovie = 50
print("My movie is: ", movies_df[movies_df.movieId == myMovie].title.iloc[0])
print()

distances = cosine_distances(Y.T)

# Print the 10 movies closest to my movie
distancesSortedIx = np.argsort(distances[myMovie])
for i in range(1, 10):
    movieId = distancesSortedIx[i]
    title = movies_df[movies_df.movieId == movieId].title.iloc[0]
    distance = distances[myMovie][movieId]
    print(movieId, title, distance)


My movie is:  Star Wars (1977)

181 Return of the Jedi (1983) 0.18603439683861156
172 Empire Strikes Back, The (1980) 0.18889215672240423
174 Raiders of the Lost Ark (1981) 0.32541851743227035
210 Indiana Jones and the Last Crusade (1989) 0.5464874755017518
173 Princess Bride, The (1987) 0.553666172176376
12 Usual Suspects, The (1995) 0.5570510242096587
127 Godfather, The (1972) 0.559892711705227
89 Blade Runner (1982) 0.5806055948332075
64 Shawshank Redemption, The (1994) 0.5999363886678504


In [12]:
# Let's make predictions
# Get the similarity matrix with the items latent factors
# Trying to emulate the above user recommendation given a user ID

userId2 = 65
print("The recommended movies for the User with ID number", ratings_df[ratings_df.userId == userId2].userId.iloc[0], "are:")
print()

distances = cosine_distances(Y)

# Print the 10 movies recommended for the given userId2:
distancesSortedIx = np.argsort(distances[userId2])
for i in range(1, 10):
    movieId = distancesSortedIx[i]
    title = movies_df[movies_df.movieId == movieId].title.iloc[0]
    distance = distances[userId2][movieId]
    print(movieId, title, distance)

The recommended movies for the User with ID number 65 are:

92 True Romance (1993) 0.7722888318873056
49 I.Q. (1994) 0.8198997763989053
61 Three Colors: White (1994) 0.834577122331332
68 Crow, The (1994) 0.8365462210623218
6 Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) 0.8439391527099369
71 Lion King, The (1994) 0.8455285154569063
25 Birdcage, The (1996) 0.8671939231077801
39 Strange Days (1995) 0.8732283463382994
98 Silence of the Lambs, The (1991) 0.8734536376692685
