In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras


100,000 ratings and 3,600 tag applications applied to 9,000 movies by 600 users. Last updated 9/2018.

In [2]:
movies = pd.read_csv('sample_data/movies.csv')
ratings = pd.read_csv('sample_data/ratings.csv')
tags = pd.read_csv('sample_data/tags.csv')
links = pd.read_csv('sample_data/links.csv')

In [3]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
# movies.to_csv('movies.csv')

In [5]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


# USING SVD

In [6]:
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357244 sha256=f7a182009ef281981b790c52ced02d7271248098c7816842795fa034eb331a11
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.4


In [7]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8767  0.8720  0.8770  0.8660  0.8746  0.8733  0.0041  
MAE (testset)     0.6746  0.6697  0.6716  0.6667  0.6717  0.6709  0.0026  
Fit time          1.76    2.73    3.92    3.41    3.52    3.07    0.76    
Test time         0.15    0.31    0.31    0.32    0.49    0.32    0.11    


{'test_rmse': array([0.87674045, 0.87204352, 0.87704206, 0.86595636, 0.87463593]),
 'test_mae': array([0.67457367, 0.66972284, 0.67162973, 0.66673054, 0.67173498]),
 'fit_time': (1.7562613487243652,
  2.7313153743743896,
  3.9205384254455566,
  3.4142978191375732,
  3.5214011669158936),
 'test_time': (0.14789390563964844,
  0.3075220584869385,
  0.31229734420776367,
  0.31693553924560547,
  0.49043917655944824)}

In [8]:
user_610 = ratings[ratings['userId'] == 610]
user_610 = pd.merge(user_610, movies,how='inner', on='movieId')
user_610 = user_610[['movieId', 'rating', 'title']]
user_610

Unnamed: 0,movieId,rating,title
0,1,5.0,Toy Story (1995)
1,6,5.0,Heat (1995)
2,16,4.5,Casino (1995)
3,32,4.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995)
4,47,5.0,Seven (a.k.a. Se7en) (1995)
...,...,...,...
1297,166534,4.0,Split (2017)
1298,168248,5.0,John Wick: Chapter Two (2017)
1299,168250,5.0,Get Out (2017)
1300,168252,5.0,Logan (2017)


In [9]:
user_610['Estimated_rating'] = user_610['movieId'].apply(lambda x: svd.predict(610, x).est)
user_610

Unnamed: 0,movieId,rating,title,Estimated_rating
0,1,5.0,Toy Story (1995),4.173044
1,6,5.0,Heat (1995),3.940405
2,16,4.5,Casino (1995),4.584075
3,32,4.5,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),4.425063
4,47,5.0,Seven (a.k.a. Se7en) (1995),4.567119
...,...,...,...,...
1297,166534,4.0,Split (2017),3.610536
1298,168248,5.0,John Wick: Chapter Two (2017),4.156150
1299,168250,5.0,Get Out (2017),3.936959
1300,168252,5.0,Logan (2017),4.151754


In [10]:
user_610_sort = user_610.sort_values('Estimated_rating', ascending=False)
user_610_sort

Unnamed: 0,movieId,rating,title,Estimated_rating
18,296,5.0,Pulp Fiction (1994),5.000000
52,858,5.0,"Godfather, The (1972)",5.000000
79,1208,5.0,Apocalypse Now (1979),4.970478
75,1198,5.0,Raiders of the Lost Ark (Indiana Jones and the...,4.966640
20,318,3.0,"Shawshank Redemption, The (1994)",4.944370
...,...,...,...,...
36,519,2.0,RoboCop 3 (1993),2.449905
644,43928,2.0,Ultraviolet (2006),2.387981
432,6503,1.5,Charlie's Angels: Full Throttle (2003),2.258891
207,3646,1.0,Big Momma's House (2000),2.242579


In [11]:
user_ratings = pd.merge(ratings, movies,how='inner', on='movieId')
user_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [12]:

user_movie_table = user_ratings.pivot_table(index='movieId', columns='userId', values='rating')


#The goal of collaborative filtering is to generate two vectors: For each user, a 'parameter vector' that embodies the movie tastes of a user. The dot produxt of the two vectors plus the bias term should produce an estimate of the rating the user might give to the movie.

In [13]:
user_movie_table

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [14]:
Y = user_movie_table.to_numpy()
Y

array([[4. , nan, nan, ..., 2.5, 3. , 5. ],
       [nan, nan, nan, ..., 2. , nan, nan],
       [4. , nan, nan, ..., 2. , nan, nan],
       ...,
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan, nan, nan, ..., nan, nan, nan]])

In [15]:

R = np.zeros(Y.shape)
R[Y > 0] = 1
R

array([[1., 0., 0., ..., 1., 1., 1.],
       [0., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:

Y = np.nan_to_num(Y)
Y

array([[4. , 0. , 0. , ..., 2.5, 3. , 5. ],
       [0. , 0. , 0. , ..., 2. , 0. , 0. ],
       [4. , 0. , 0. , ..., 2. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [17]:

num_movies = R.shape[0]
num_users = R.shape[1]
num_features = 10

# Initialize X and W with random values
X = np.random.rand(num_movies, num_features)
W = np.random.rand(num_users, num_features)

# Initialize b with zeros
b = np.zeros((1, num_users))


In [18]:

print("Y", Y.shape, "R", R.shape)
print("X", X.shape)
print("W", W.shape)
print("b", b.shape)
print("num_features", num_features)
print("num_movies",   num_movies)
print("num_users",    num_users)

Y (9724, 610) R (9724, 610)
X (9724, 10)
W (610, 10)
b (1, 610)
num_features 10
num_movies 9724
num_users 610


#The function cofi_cost_func computes the collaborative filtering objective function

In [19]:
def cofi_cost_func(X,W,b,Y,R,lambda_):
  nu,nm = Y.shape
  j = (tf.linalg.matmul(X,tf.transpose(W))+b-Y)*R
  return 0.5*tf.reduce_sum(j**2)+lambda_/2*(tf.reduce_sum(X**2)+tf.reduce_sum(W**2))
cost = cofi_cost_func(X, W, b, Y, R, 0).numpy()
print(f"Cost for lambda = 0: {cost}")


Cost for lambda = 0: 134277.60420349488


In [20]:
my_ratings = np.zeros(R.shape[0])
my_ratings[0] = 5
my_ratings[1] = 4
my_ratings[9191] = 1
my_ratings[8508] = 1
my_ratings[8408] = 1
my_ratings[9584] = 3
my_ratings[9582] = 2
my_ratings[9591] = 2
my_ratings[9602] = 3
my_ratings[9615] = 3
my_ratings[9678] = 3
my_ratings[9682] = 3
my_ratings[9693] = 4
my_ratings[9699] = 4
my_ratings[9708] = 5
my_ratings[9713] = 4

In [21]:
my_rated = [i for i in range(len(my_ratings)) if my_ratings[i] > 0]

my_rated

[0,
 1,
 8408,
 8508,
 9191,
 9582,
 9584,
 9591,
 9602,
 9615,
 9678,
 9682,
 9693,
 9699,
 9708,
 9713]

In [22]:

print('\nNew user ratings:\n')
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        print(f'Rated {my_ratings[i]} for  {movies.loc[i,"title"]}');


New user ratings:

Rated 5.0 for  Toy Story (1995)
Rated 4.0 for  Jumanji (1995)
Rated 1.0 for  Oculus (2013)
Rated 1.0 for  Honeymoon (2014)
Rated 1.0 for  The Devil's Candy (2015)
Rated 2.0 for  Annabelle: Creation (2017)
Rated 3.0 for  It (2017)
Rated 2.0 for  Death Note (2017)
Rated 3.0 for  Kingsman: The Golden Circle (2017)
Rated 3.0 for  Geostorm (2017)
Rated 3.0 for  Insidious: The Last Key (2018)
Rated 3.0 for  Maze Runner: The Death Cure (2018)
Rated 4.0 for  Fullmetal Alchemist 2018 (2017)
Rated 4.0 for  A Quiet Place (2018)
Rated 5.0 for  Incredibles 2 (2018)
Rated 4.0 for  Ant-Man and the Wasp (2018)


In [23]:
Y = np.c_[my_ratings, Y]
R = np.c_[my_ratings > 0, R]

Ymean = np.zeros((num_movies, 1))
Ynorm = np.zeros(Y.shape)

for i in range(num_movies):
    idx = (R[i,:] == 1)
    Ymean[i] = np.mean(Y[i, idx])
    Ynorm[i, idx] = Y[i, idx] - Ymean[i]



In [24]:
num_movies, num_users = Y.shape
num_features = 100
tf.random.set_seed(1234)
W = tf.Variable(tf.random.normal((num_users,  num_features),dtype=tf.float64),  name='W')
X = tf.Variable(tf.random.normal((num_movies, num_features),dtype=tf.float64),  name='X')
b = tf.Variable(tf.random.normal((1,          num_users),   dtype=tf.float64),  name='b')

# Instantiate an optimizer.
optimizer = keras.optimizers.Adam(learning_rate=1e-1)

In [25]:
iterations = 200
lambda_ = 1
for iter in range(iterations):
    # Use TensorFlow’s GradientTape
    # to record the operations used to compute the cost
    with tf.GradientTape() as tape:

        # Compute the cost (forward pass included in cost)
        cost_value = cofi_cost_func(X, W, b, Ynorm, R, lambda_)

    # Use the gradient tape to automatically retrieve
    # the gradients of the trainable variables with respect to the loss
    grads = tape.gradient( cost_value, [X,W,b] )

    # Run one step of gradient descent by updating
    # the value of the variables to minimize the loss.
    optimizer.apply_gradients( zip(grads, [X,W,b]) )

    # Log periodically.
    if iter % 20 == 0:
        print(f"Training loss at iteration {iter}: {cost_value:0.1f}")

Training loss at iteration 0: 5541079.9
Training loss at iteration 20: 279415.8
Training loss at iteration 40: 108071.2
Training loss at iteration 60: 53041.0
Training loss at iteration 80: 30325.4
Training loss at iteration 100: 19367.5
Training loss at iteration 120: 13491.8
Training loss at iteration 140: 10121.0
Training loss at iteration 160: 8098.3
Training loss at iteration 180: 6842.0


In [26]:
X

<tf.Variable 'X:0' shape=(9724, 100) dtype=float64, numpy=
array([[-6.19911720e-02, -2.02862579e-01, -1.81358613e-01, ...,
        -2.74341712e-02, -3.13125992e-01,  1.63109497e-01],
       [ 7.75778064e-03,  6.22179708e-02,  3.29799664e-01, ...,
         1.57233961e-01,  3.48157897e-01, -3.88746230e-01],
       [ 1.56015637e-01, -2.19073049e-01,  9.18433082e-02, ...,
         5.42910456e-02,  4.13040640e-01, -3.60480937e-01],
       ...,
       [ 5.81473347e-03, -5.46099404e-04, -1.81460281e-03, ...,
        -1.42700438e-02, -1.63192511e-03,  4.52443435e-03],
       [-3.03184860e-02, -3.99011407e-04, -1.99666696e-03, ...,
        -3.91254700e-04, -1.34436096e-03,  2.80376782e-03],
       [-1.62830784e-03, -7.17914638e-04,  2.57815185e-03, ...,
         4.08094201e-03, -3.43221923e-03, -5.78465381e-03]])>

In [27]:
# Make a prediction using trained weights and biases
p = np.matmul(X.numpy(), np.transpose(W.numpy())) + b.numpy()

pm = p + Ymean
my_predictions = pm[:,0]
my_predictions

array([4.91619073, 3.88270425, 2.5957909 , ..., 2.99525391, 3.01349576,
       3.48871051])

In [28]:
ix = my_predictions.argsort()[::-1]
ix

array([   0,  520, 9708, ..., 4768, 3695, 4994])

In [29]:
for i in range(17):
  j = ix[i]
  print(f'Rated {my_predictions[j]:0.2f} for  {movies.loc[j,"title"]}');

Rated 4.92 for  Toy Story (1995)
Rated 4.65 for  Fargo (1996)
Rated 4.64 for  Incredibles 2 (2018)
Rated 4.55 for  Get Him to the Greek (2010)
Rated 4.54 for  Thin Line Between Love and Hate, A (1996)
Rated 4.53 for  Crow: Salvation, The (2000)
Rated 4.53 for  48 Hrs. (1982)
Rated 4.53 for  Good Girl, The (2002)
Rated 4.52 for  Europa (Zentropa) (1991)
Rated 4.52 for  Batman/Superman Movie, The (1998)
Rated 4.52 for  Three O'Clock High (1987)
Rated 4.52 for  Eyewitness (Janitor, The) (1981)
Rated 4.52 for  Rory Scovel Tries Stand-Up for the First Time (2017)
Rated 4.52 for  Bernie (2011)
Rated 4.51 for  Scooby-Doo! Abracadabra-Doo (2010)
Rated 4.51 for  Twelve Tasks of Asterix, The (Les douze travaux d'Astérix) (1976)
Rated 4.51 for  Return of Martin Guerre, The (Retour de Martin Guerre, Le) (1982)


In [30]:
for i in range(len(my_ratings)):
  if my_ratings[i] > 0 :
    print(f'Original {my_ratings[i]}, Predicted {my_predictions[i]:0.2f} for {movies.loc[i,"title"]}');

Original 5.0, Predicted 4.92 for Toy Story (1995)
Original 4.0, Predicted 3.88 for Jumanji (1995)
Original 1.0, Predicted 1.32 for Oculus (2013)
Original 1.0, Predicted 1.16 for Honeymoon (2014)
Original 1.0, Predicted 1.33 for The Devil's Candy (2015)
Original 2.0, Predicted 1.98 for Annabelle: Creation (2017)
Original 3.0, Predicted 2.83 for It (2017)
Original 2.0, Predicted 2.16 for Death Note (2017)
Original 3.0, Predicted 2.71 for Kingsman: The Golden Circle (2017)
Original 3.0, Predicted 3.17 for Geostorm (2017)
Original 3.0, Predicted 3.02 for Insidious: The Last Key (2018)
Original 3.0, Predicted 3.08 for Maze Runner: The Death Cure (2018)
Original 4.0, Predicted 3.99 for Fullmetal Alchemist 2018 (2017)
Original 4.0, Predicted 3.91 for A Quiet Place (2018)
Original 5.0, Predicted 4.64 for Incredibles 2 (2018)
Original 4.0, Predicted 3.91 for Ant-Man and the Wasp (2018)


In [31]:
user_610_pred = pm[:,-1]
user_610_pred

array([4.70027894, 3.71037565, 3.6150755 , ..., 3.750863  , 3.69076006,
       4.23410894])

In [32]:
ix = user_610_pred.argsort()[::-1]
ix

array([1217,  201, 1430, ..., 7172, 9103, 6832])

In [33]:
for i in range(17):
  j = ix[i]
  print(f'Rated {user_610_pred[j]:0.2f} for  {movies.loc[j,"title"]}');

Rated 5.96 for  Peacemaker, The (1997)
Rated 5.87 for  Ed Wood (1994)
Rated 5.81 for  French Connection, The (1971)
Rated 5.78 for  Passion Fish (1992)
Rated 5.61 for  Cape Fear (1962)
Rated 5.50 for  In the Name of the Father (1993)
Rated 5.47 for  To Catch a Thief (1955)
Rated 5.46 for  Bambi (1942)
Rated 5.44 for  American President, The (1995)
Rated 5.44 for  North by Northwest (1959)
Rated 5.43 for  City of Lost Children, The (Cité des enfants perdus, La) (1995)
Rated 5.43 for  Madness of King George, The (1994)
Rated 5.42 for  Vampires (1998)
Rated 5.42 for  If These Walls Could Talk 2 (2000)
Rated 5.40 for  Philadelphia Story, The (1940)
Rated 5.40 for  Scream 3 (2000)
Rated 5.35 for  Fog, The (2005)


In [34]:
user_610 = user_ratings[user_ratings['userId'] == 610][['userId','movieId','rating','title','genres']]
user_610

Unnamed: 0,userId,movieId,rating,title,genres
214,610,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
368,610,6,5.0,Heat (1995),Action|Crime|Thriller
571,610,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
775,610,50,4.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
830,610,70,4.0,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller
...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller


In [35]:
user_610_actual = Y[:,-1]

# Find movies that user 610 has rated
rated_movies = R[:,-1] > 0
rated_movie_indices = np.where(rated_movies)[0]
rated_movie_indices

array([   0,    5,   15, ..., 9444, 9445, 9485])

In [36]:
user_610['Pred_Rating'] = user_610_pred[rated_movie_indices]
user_610

Unnamed: 0,userId,movieId,rating,title,genres,Pred_Rating
214,610,1,5.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.700279
368,610,6,5.0,Heat (1995),Action|Crime|Thriller,4.799086
571,610,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,4.353576
775,610,50,4.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,4.713619
830,610,70,4.0,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller,5.284158
...,...,...,...,...,...,...
100831,610,160341,2.5,Bloodmoon (1997),Action|Thriller,3.951905
100832,610,160527,4.5,Sympathy for the Underdog (1971),Action|Crime|Drama,4.937108
100833,610,160836,3.0,Hazard (2005),Action|Drama|Thriller,4.807269
100834,610,163937,3.5,Blair Witch (2016),Horror|Thriller,4.997250
