In [None]:
from __future__ import print_function, division
from builtins import range, input
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from collections import Counter
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.models import Model
import keras.backend as K
from sortedcontainers import SortedList
from datetime import datetime
from copy import deepcopy

In [None]:
# Mount Google Drive in the Colab runtime; the ratings CSV is read from under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the ratings table from the mounted Drive and preview the first rows.
# NOTE(review): the path is hard-coded to one Drive layout; adjust it when running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/recommender system/rating.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Make the user ids start at 0 (0...N-1 when the raw ids are contiguous).
# Subtracting the current minimum rather than a hard-coded 1 keeps this cell
# idempotent: re-running it after the shift subtracts 0 instead of shifting
# every id a second time.
df['userId'] = df['userId'] - df['userId'].min()

In [None]:
# Map every raw movieId to a compact index 0..(number of distinct movies - 1).
# The index follows the iteration order of the set of unique ids, exactly as
# the previous counter-based loop did.
unique_movie_ids = set(df.movieId.values)
movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}

# Series.map performs the dictionary lookup for the whole column in one pass,
# instead of calling a Python lambda once per row via DataFrame.apply(axis=1).
df['movie_idx'] = df['movieId'].map(movie2idx)

In [None]:
# Drop the timestamp column since it is not required to make recommendations.
# errors='ignore' keeps the cell re-runnable: a second execution finds the
# column already gone and leaves the frame unchanged instead of raising KeyError.
df = df.drop(columns=['timestamp'], errors='ignore')

In [None]:
# Preview the frame after adding movie_idx and dropping timestamp.
df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [None]:
# Report how many rating rows the full frame holds.
print("original dataframe size:", df.shape[0])

original dataframe size: 20000263


In [None]:
# Both id columns are 0-based, so the largest id plus one bounds each dimension.
N = df['userId'].max() + 1  # number of users
M = df['movie_idx'].max() + 1  # number of movies
N, M

(138493, 26744)

In [None]:
# Count how many rows (ratings) each user id has in the userId column.
user_ids_count = Counter(df['userId'])

# Count how many rows (ratings) each movie index has in the movie_idx column.
movie_ids_count = Counter(df['movie_idx'])

In [None]:
# Build df_small: the rows whose user is among the n most frequent users AND
# whose movie is among the m most frequent movie indices.
n = 10000
m = 2000

user_ids = [user for user, _ in user_ids_count.most_common(n)]
movie_ids = [movie for movie, _ in movie_ids_count.most_common(m)]
keep_rows = df['userId'].isin(user_ids) & df['movie_idx'].isin(movie_ids)
df_small = df[keep_rows].copy()

In [None]:
# Map each kept (old) user id to a new id 0, 1, 2, ... in the order of user_ids.
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
i = len(user_ids)
print("i:", i)

i: 10000


In [None]:
# Map each kept (old) movie index to a new index 0, 1, 2, ... in the order of movie_ids.
new_movie_id_map = {old: new for new, old in enumerate(movie_ids)}
j = len(movie_ids)
print("j:", j)

j: 2000


In [None]:
# Replace the user ids and movie indices in df_small with their compact
# counterparts. Series.map looks every value up in the mapping dict in a single
# vectorized pass; the previous DataFrame.apply(axis=1) built a row Series and
# called a Python lambda for each row of df_small, twice.
# NOTE: like the original, this cell must run exactly once per df_small,
# because it overwrites the old ids it maps from.
print("Setting new ids")
df_small['userId'] = df_small['userId'].map(new_user_id_map)
df_small['movie_idx'] = df_small['movie_idx'].map(new_movie_id_map)

Setting new ids


In [None]:
# Sanity check: the largest remapped ids should be one below the number kept.
print("max user id:", df_small['userId'].max())
print("max movie id:", df_small['movie_idx'].max())

max user id: 9999
max movie id: 1999


In [None]:
# Report how many rating rows survive the user/movie filtering.
print("small dataframe size:", df_small.shape[0])

small dataframe size: 5392025


In [None]:
# split into train and test
# NOTE(review): the split is taken from the full `df`, not from the `df_small`
# subset built earlier in the notebook; confirm which frame is intended.

# Shuffle the rows with a fixed seed so that the split, and therefore the
# losses reported later, can be reproduced from a fresh kernel.
df = shuffle(df, random_state=42)

# the first 80% of the shuffled rows form the training set
cutoff = int(0.8*len(df))
df_train = df.iloc[:cutoff]

# the remaining 20% of the shuffled rows form the test set
df_test = df.iloc[cutoff:]

In [None]:
# user id -> list of movie indices that user rated (training rows only)
user2movie = {}
# movie index -> list of user ids that rated that movie (training rows only)
movie2user = {}
# (user id, movie index) -> rating, e.g. usermovie2rating[(123, 456)]
usermovie2rating = {}
# number of rows processed so far, used only for the progress message
count = 0

def update_user2movie_and_movie2user(row):
  """Record one training rating in the three lookup dictionaries.

  `row` must expose `userId`, `movie_idx` and `rating` attributes; the two ids
  are converted with int() before use. Every 100,000 calls the fraction of
  `cutoff` rows handled so far is printed.
  """
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/cutoff))

  user_idx, movie_idx = int(row.userId), int(row.movie_idx)

  # setdefault creates the empty list the first time a key is seen
  user2movie.setdefault(user_idx, []).append(movie_idx)
  movie2user.setdefault(movie_idx, []).append(user_idx)

  usermovie2rating[(user_idx, movie_idx)] = row.rating
# Feed every training row to the updater. itertuples() yields lightweight
# namedtuples whose fields are the column names, so the updater's attribute
# access keeps working, while avoiding the per-row Series construction of
# DataFrame.apply(axis=1) and the Series of None results it returned.
for row in df_train.itertuples(index=False):
  update_user2movie_and_movie2user(row)

processed: 0.006
processed: 0.012
processed: 0.019
processed: 0.025
processed: 0.031
processed: 0.037
processed: 0.044
processed: 0.050
processed: 0.056
processed: 0.062
processed: 0.069
processed: 0.075
processed: 0.081
processed: 0.087
processed: 0.094
processed: 0.100
processed: 0.106
processed: 0.112
processed: 0.119
processed: 0.125
processed: 0.131
processed: 0.137
processed: 0.144
processed: 0.150
processed: 0.156
processed: 0.162
processed: 0.169
processed: 0.175
processed: 0.181
processed: 0.187
processed: 0.194
processed: 0.200
processed: 0.206
processed: 0.212
processed: 0.219
processed: 0.225
processed: 0.231
processed: 0.237
processed: 0.244
processed: 0.250
processed: 0.256
processed: 0.262
processed: 0.269
processed: 0.275
processed: 0.281
processed: 0.287
processed: 0.294
processed: 0.300
processed: 0.306
processed: 0.312
processed: 0.319
processed: 0.325
processed: 0.331
processed: 0.337
processed: 0.344
processed: 0.350
processed: 0.356
processed: 0.362
processed: 0.3

9785531     None
15042412    None
9934001     None
1746776     None
7262890     None
            ... 
11848396    None
7357382     None
1163341     None
7059161     None
283425      None
Length: 16000210, dtype: object

In [None]:
# (user id, movie index) -> rating for the held-out test rows
usermovie2rating_test = {}
print("Calling: update_usermovie2rating_test")
# number of test rows processed so far, used only for the progress message
count = 0
def update_usermovie2rating_test(row):
  """Record one test rating in usermovie2rating_test.

  `row` must expose `userId`, `movie_idx` and `rating` attributes; the two ids
  are converted with int() before use. Every 100,000 calls the fraction of
  df_test handled so far is printed.
  """
  global count
  count += 1
  if count % 100000 == 0:
    print("processed: %.3f" % (float(count)/len(df_test)))

  key = (int(row.userId), int(row.movie_idx))
  usermovie2rating_test[key] = row.rating
# Feed every test row to the updater. itertuples() yields lightweight
# namedtuples whose fields are the column names, so the updater's attribute
# access keeps working, while avoiding the per-row Series construction of
# DataFrame.apply(axis=1) and the Series of None results it returned.
for row in df_test.itertuples(index=False):
  update_usermovie2rating_test(row)

Calling: update_usermovie2rating_test
processed: 0.025
processed: 0.050
processed: 0.075
processed: 0.100
processed: 0.125
processed: 0.150
processed: 0.175
processed: 0.200
processed: 0.225
processed: 0.250
processed: 0.275
processed: 0.300
processed: 0.325
processed: 0.350
processed: 0.375
processed: 0.400
processed: 0.425
processed: 0.450
processed: 0.475
processed: 0.500
processed: 0.525
processed: 0.550
processed: 0.575
processed: 0.600
processed: 0.625
processed: 0.650
processed: 0.675
processed: 0.700
processed: 0.725
processed: 0.750
processed: 0.775
processed: 0.800
processed: 0.825
processed: 0.850
processed: 0.875
processed: 0.900
processed: 0.925
processed: 0.950
processed: 0.975
processed: 1.000


1318198     None
14683386    None
7025402     None
18832912    None
12901092    None
            ... 
19473210    None
636571      None
19444431    None
10354187    None
9241031     None
Length: 4000053, dtype: object

In [None]:

# Work out the dimensions of the user-item rating matrix from the dictionaries.
# N comes from the users seen in the training data only; M covers the movies
# seen in either the training or the test data.

N = np.max(list(user2movie)) + 1
# the test set may contain movies the train set doesn't have data on
m1 = np.max(list(movie2user))
m2 = np.max([movie for (_, movie) in usermovie2rating_test])
M = max(m1, m2) + 1
print("N:", N, "M:", M)

N: 138493 M: 26744


In [None]:
# Pair every id list with an array of the matching training ratings:
#   user2movierating[user]  = (movie indices, ratings aligned with them)
#   movie2userrating[movie] = (user ids, ratings aligned with them)
print("converting...")
user2movierating = {
  user: (movies, np.array([usermovie2rating[(user, movie)] for movie in movies]))
  for user, movies in user2movie.items()
}
movie2userrating = {
  movie: (users, np.array([usermovie2rating[(user, movie)] for user in users]))
  for movie, users in movie2user.items()
}

converting...


In [None]:
# Group the test ratings by movie, since the loss is computed movie by movie:
#   movie2userrating_test[movie] = [user ids, ratings aligned with them]
movie2userrating_test = {}
for (user, movie), rating in usermovie2rating_test.items():
  users, ratings = movie2userrating_test.setdefault(movie, [[], []])
  users.append(user)
  ratings.append(rating)

# turn each rating list into an array once all ratings have been collected
for entry in movie2userrating_test.values():
  entry[1] = np.array(entry[1])
print("conversion done")

conversion done


In [None]:
# Initialize the matrix factorization parameters:
#   W (N x K) and b (length N): per-user latent factors and biases
#   U (M x K) and c (length M): per-movie latent factors and biases
#   mu: mean of all training ratings
# NOTE(review): K is rebound here to the integer 10, which replaces the
# `keras.backend as K` alias from the import cell for the rest of the notebook.
# NOTE(review): W and U are drawn without a fixed seed, so every run starts
# from different factors.


# initialize variables
K = 10 # latent dimensionality
W = np.random.randn(N, K)
b = np.zeros(N)
U = np.random.randn(M, K)
c = np.zeros(M)
mu = np.mean(list(usermovie2rating.values()))


def get_loss(m2u):
  """Return the mean squared error of the current model over a ratings lookup.

  m2u maps movie_id -> (user_ids, ratings). The prediction for each rating is
  W[user].dot(U[movie]) + b[user] + c[movie] + mu, read from the module-level
  W, U, b, c and mu. The squared errors are summed over every movie and
  divided by the total number of ratings.
  """
  n_ratings = 0.
  sq_err_total = 0
  for movie, (users, ratings) in m2u.items():
    residual = W[users].dot(U[movie]) + b[users] + c[movie] + mu - ratings
    sq_err_total += residual.dot(residual)
    n_ratings += len(ratings)
  return sq_err_total / n_ratings


In [None]:
# User-side half of the training loop: for every epoch, each user's latent
# vector W[i] is obtained by solving a regularized least-squares system built
# from the factors of the movies that user rated, and the bias b[i] is set to
# the regularized sum of that user's residuals. This is a closed-form solve
# per user, not a gradient step.
# NOTE(review): this cell repeats only the W/b part of the full training cell
# that follows it; it never updates U or c and never appends to the loss
# lists. Its recorded output is a NameError for `datetime`. Consider removing
# the cell so a full run does not spend time on these extra user-only updates.

# train the parameters
epochs = 25
reg = 20. # regularization penalty
train_losses = []
test_losses = []
for epoch in range(epochs):
  print("epoch:", epoch)
  epoch_start = datetime.now()
  # perform updates

  # update W and b
  t0 = datetime.now()
  for i in range(N):
    m_ids, r = user2movierating[i]
    # K x K normal-equation matrix with the ridge term reg * I
    matrix = U[m_ids].T.dot(U[m_ids]) + np.eye(K) * reg
    vector = (r - b[i] - c[m_ids] - mu).dot(U[m_ids])
    # residual sum uses the old W[i], so it is computed before W[i] is replaced
    bi = (r - U[m_ids].dot(W[i]) - c[m_ids] - mu).sum()

    # set the updates
    W[i] = np.linalg.solve(matrix, vector)
    b[i] = bi / (len(user2movie[i]) + reg)

    # progress message roughly every tenth of the users
    if i % (N//10) == 0:
      print("i:", i, "N:", N)
  print("updated W and b:", datetime.now() - t0)

epoch: 0


NameError: name 'datetime' is not defined

In [None]:
# Train the matrix factorization model with alternating least squares:
# within each epoch the user parameters (W, b) are solved for with the movie
# parameters held fixed, then the movie parameters (U, c) are solved for with
# the user parameters held fixed. Each solve is a K x K regularized
# least-squares system, not a gradient step. After every epoch the mean
# squared error on the training and test ratings is recorded and printed
# together with timing information.

# train the parameters
epochs = 25
reg = 20. # regularization penalty
train_losses = []
test_losses = []

# loop-invariant pieces, computed once instead of on every iteration
reg_eye = np.eye(K) * reg
user_log_every = max(1, N // 10)   # max() avoids a modulo by zero when N < 10
movie_log_every = max(1, M // 10)  # max() avoids a modulo by zero when M < 10

for epoch in range(epochs):
  print("epoch:", epoch)
  epoch_start = datetime.now()
  # perform updates

  # update W and b
  t0 = datetime.now()
  for i in range(N):
    if i not in user2movierating:
      # possible not to have any training ratings for a user
      continue
    m_ids, r = user2movierating[i]
    U_rated = U[m_ids]  # factors of the movies this user rated
    matrix = U_rated.T.dot(U_rated) + reg_eye
    vector = (r - b[i] - c[m_ids] - mu).dot(U_rated)
    # the bias residual uses the old W[i], so compute it before W[i] changes
    bi = (r - U_rated.dot(W[i]) - c[m_ids] - mu).sum()

    # set the updates
    W[i] = np.linalg.solve(matrix, vector)
    b[i] = bi / (len(m_ids) + reg)

    if i % user_log_every == 0:
      print("i:", i, "N:", N)
  print("updated W and b:", datetime.now() - t0)


  # update U and c
  t0 = datetime.now()
  for j in range(M):
    if j not in movie2userrating:
      # possible not to have any training ratings for a movie; an explicit
      # membership test replaces the former try/except KeyError so that an
      # unrelated KeyError raised inside the update is no longer swallowed
      continue
    u_ids, r = movie2userrating[j]
    W_raters = W[u_ids]  # factors of the users who rated this movie
    matrix = W_raters.T.dot(W_raters) + reg_eye
    vector = (r - b[u_ids] - c[j] - mu).dot(W_raters)
    # the bias residual uses the old U[j], so compute it before U[j] changes
    cj = (r - W_raters.dot(U[j]) - b[u_ids] - mu).sum()

    # set the updates
    U[j] = np.linalg.solve(matrix, vector)
    c[j] = cj / (len(u_ids) + reg)

    if j % movie_log_every == 0:
      print("j:", j, "M:", M)
  print("updated U and c:", datetime.now() - t0)
  print("epoch duration:", datetime.now() - epoch_start)


  # store train loss
  t0 = datetime.now()
  train_losses.append(get_loss(movie2userrating))

  # store test loss
  test_losses.append(get_loss(movie2userrating_test))
  print("calculate cost:", datetime.now() - t0)
  print("train loss:", train_losses[-1])
  print("test loss:", test_losses[-1])


In [None]:
print("train losses:", train_losses)
print("test losses:", test_losses)

# Plot the per-epoch losses on an explicit Axes, with axis labels and a title
# so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(train_losses, label="train loss")
ax.plot(test_losses, label="test loss")
ax.set_xlabel("epoch")
ax.set_ylabel("mean squared error")
ax.set_title("Training and test loss per epoch")
ax.legend()
plt.show()