In [None]:
from __future__ import print_function, division
from builtins import range, input
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from collections import Counter
from scipy.sparse import lil_matrix, csr_matrix, save_npz, load_npz
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.models import Model
import keras.backend as K
from sortedcontainers import SortedList

In [None]:
# Mount Google Drive at /content/drive; the ratings CSV is read from there below.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the ratings table from the mounted Drive and preview the first rows.
# NOTE(review): the path is hard-coded to one Drive layout.
df = pd.read_csv('/content/drive/MyDrive/recommender system/rating.csv')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [None]:
# Shift every user id down by one so the ids go from 0...N-1.
# NOTE: not idempotent -- running this cell a second time shifts the ids again.
df['userId'] = df['userId'] - 1

In [None]:
# Map every distinct movieId onto a dense index 0..M-1.
# Indices are handed out in the iteration order of the id set, exactly as the
# original counter loop did.
unique_movie_ids = set(df.movieId.values)
movie2idx = {movie_id: idx for idx, movie_id in enumerate(unique_movie_ids)}
count = len(movie2idx)  # the original loop left the number of movies in `count`

# Series.map performs the dictionary lookup column-wise; a row-wise
# df.apply(..., axis=1) builds a Series object per row and is far slower on a
# table of this size. Every movieId is a key of movie2idx, so no NaN can appear.
df['movie_idx'] = df.movieId.map(movie2idx)

In [None]:
# The timestamp column is not needed to make recommendations, so drop it.
df = df.drop('timestamp', axis=1)

In [None]:
# Preview the frame: movie_idx has been added and timestamp removed.
df.head()

Unnamed: 0,userId,movieId,rating,movie_idx
0,0,2,3.5,2
1,0,29,3.5,29
2,0,32,3.5,32
3,0,47,3.5,47
4,0,50,3.5,50


In [None]:
# Row count of the full ratings table, before any sub-sampling.
print("original dataframe size:", len(df))

original dataframe size: 20000263


In [None]:
# Ids are 0-based, so (largest id + 1) gives the size of each axis.
N = df['userId'].max() + 1     # number of users
M = df['movie_idx'].max() + 1  # number of movies
N, M

(138493, 26744)

In [None]:
# Tally how many rating rows each user id and each movie index appears in.
# most_common() on these counters is used to pick the busiest users and movies.
user_ids_count = Counter(df['userId'])
movie_ids_count = Counter(df['movie_idx'])

In [None]:
# Shrink the data set: keep only rows whose user is among the n users with the
# most ratings AND whose movie is among the m movies with the most ratings.
n = 10000
m = 2000

user_ids = [user for user, _ in user_ids_count.most_common(n)]
movie_ids = [movie for movie, _ in movie_ids_count.most_common(m)]
keep_rows = df.userId.isin(user_ids) & df.movie_idx.isin(movie_ids)
df_small = df[keep_rows].copy()

In [None]:
# Renumber the kept users: old user id -> new id, where new ids start at 0 and
# increase by 1 in the order of user_ids (most ratings first).
new_user_id_map = {old: new for new, old in enumerate(user_ids)}
i = len(user_ids)  # number of ids handed out
print("i:", i)

i: 10000


In [None]:
# Renumber the kept movies: old movie index -> new index, where new indices
# start at 0 and increase by 1 in the order of movie_ids (most ratings first).
new_movie_id_map = {old: new for new, old in enumerate(movie_ids)}
j = len(movie_ids)  # number of indices handed out
print("j:", j)

j: 2000


In [None]:
# Overwrite userId / movie_idx in df_small with the compact ids.
# Series.map does the dictionary lookup column-wise, which avoids building one
# Series per row as df.apply(..., axis=1) does over millions of rows.
# df_small was filtered with isin(user_ids) / isin(movie_ids), so every value
# is a key of its map and no NaN is introduced.
print("Setting new ids")
df_small.loc[:, 'userId'] = df_small['userId'].map(new_user_id_map)
df_small.loc[:, 'movie_idx'] = df_small['movie_idx'].map(new_movie_id_map)

Setting new ids


In [None]:
# Sanity check: after renumbering, the largest ids should be n-1 and m-1.
print("max user id:", df_small.userId.max())
print("max movie id:", df_small.movie_idx.max())

max user id: 9999
max movie id: 1999


In [None]:
# Row count of the reduced table.
print("small dataframe size:", len(df_small))

small dataframe size: 5392025


In [None]:
# From here on `df` refers to the reduced frame; refresh the axis sizes to match.
df = df_small
N = df['userId'].max() + 1     # number of users
M = df['movie_idx'].max() + 1  # number of movies
N, M

(10000, 2000)

In [None]:
# split into train and test

# Shuffle the rows with a fixed seed: without one, every run produces a
# different split, so the neighbour lists and reported errors are not reproducible.
SPLIT_SEED = 42
df = shuffle(df, random_state=SPLIT_SEED)

# Row position that separates the first 80% of the shuffled rows from the rest.
cutoff = int(0.8*len(df))

# Training set: the first 80% of the shuffled rows.
df_train = df.iloc[:cutoff]

# Test set: the remaining 20%.
df_test = df.iloc[cutoff:]

In [None]:
# user2movie: user index -> list of the movie indices that user rated
user2movie = {}
# movie2user: movie index -> list of the user indices who rated that movie
movie2user = {}
# usermovie2rating: (user index, movie index) -> rating
usermovie2rating = {}
# running row counter, used only for the progress printout
count = 0

# The function below is applied to every row of df_train and fills the three
# dictionaries. It prints the processed fraction every 100,000 rows.
# Example: a rating by user 123 for movie 456 is stored in usermovie2rating
# under the tuple key (123, 456).

def update_user2movie_and_movie2user(row):
  """Record one training rating in the three lookup dictionaries.

  `row` must expose `userId`, `movie_idx` and `rating` attributes. The ids are
  converted to int before being used as keys. Returns None; the work is done
  through side effects on user2movie, movie2user, usermovie2rating and the
  module-level progress counter `count`.
  """
  global count
  count += 1
  if count % 100000 == 0:
    # fraction of the training rows handled so far
    print("processed: %.3f" % (float(count)/cutoff))

  user = int(row.userId)
  movie = int(row.movie_idx)

  # setdefault creates the list the first time a key is seen
  user2movie.setdefault(user, []).append(movie)
  movie2user.setdefault(movie, []).append(user)

  usermovie2rating[(user, movie)] = row.rating
# Run the updater over every training row for its side effects only.
# The trailing semicolon stops the notebook from echoing the all-None Series
# that apply() returns (one entry per training row).
df_train.apply(update_user2movie_and_movie2user, axis=1);

processed: 0.023
processed: 0.046
processed: 0.070
processed: 0.093
processed: 0.116
processed: 0.139
processed: 0.162
processed: 0.185
processed: 0.209
processed: 0.232
processed: 0.255
processed: 0.278
processed: 0.301
processed: 0.325
processed: 0.348
processed: 0.371
processed: 0.394
processed: 0.417
processed: 0.440
processed: 0.464
processed: 0.487
processed: 0.510
processed: 0.533
processed: 0.556
processed: 0.580
processed: 0.603
processed: 0.626
processed: 0.649
processed: 0.672
processed: 0.695
processed: 0.719
processed: 0.742
processed: 0.765
processed: 0.788
processed: 0.811
processed: 0.835
processed: 0.858
processed: 0.881
processed: 0.904
processed: 0.927
processed: 0.950
processed: 0.974
processed: 0.997


4385213     None
12142257    None
1563558     None
13456684    None
6942226     None
            ... 
1040777     None
16135021    None
6324069     None
1598959     None
5296429     None
Length: 4313620, dtype: object

In [None]:
# usermovie2rating_test: (user index, movie index) -> rating for the test rows.
# It is filled by applying update_usermovie2rating_test (defined below) to every
# row of df_test; progress is printed every 100,000 rows.
usermovie2rating_test = {}
print("Calling: update_usermovie2rating_test")
# reset the shared progress counter before the test pass
count = 0
def update_usermovie2rating_test(row):
  """Record one test rating under its (user index, movie index) key.

  `row` must expose `userId`, `movie_idx` and `rating` attributes. Returns
  None; updates usermovie2rating_test and the module-level counter `count`.
  """
  global count
  count += 1
  if count % 100000 == 0:
    # fraction of the test rows handled so far
    print("processed: %.3f" % (float(count)/len(df_test)))

  key = (int(row.userId), int(row.movie_idx))
  usermovie2rating_test[key] = row.rating
# Run the updater over every test row for its side effects only.
# The trailing semicolon stops the notebook from echoing the all-None Series
# that apply() returns (one entry per test row).
df_test.apply(update_usermovie2rating_test, axis=1);

Calling: update_usermovie2rating_test
processed: 0.093
processed: 0.185
processed: 0.278
processed: 0.371
processed: 0.464
processed: 0.556
processed: 0.649
processed: 0.742
processed: 0.835
processed: 0.927


11387636    None
14444481    None
13385681    None
11007076    None
2625745     None
            ... 
679037      None
2927040     None
5561850     None
9511326     None
15193624    None
Length: 1078405, dtype: object

In [None]:
# Derive the number of users (N) and movies (M) from the rating dictionaries.
# Indices are 0-based, so each size is (largest index + 1).

# Largest user index that appears in the training data, plus one.
N = np.max(list(user2movie)) + 1

# Largest movie index that appears in the training data.
m1 = np.max(list(movie2user))

# Largest movie index that appears in the test data; the movie index is the
# second element of each (user, movie) key.
m2 = np.max([movie for (_, movie) in usermovie2rating_test])

# A movie index may appear in only one of the two splits, so take the larger.
M = max(m1, m2) + 1
print("N:", N, "M:", M)


N: 10000 M: 2000


In [None]:
 #implementing a user-based collaborative filtering algorithm to find similar users based on their ratings



# NOTE: this rebinds `K`, which the import cell bound to keras.backend.
K = 25 # number of neighbors we'd like to consider
limit = 5 # two users are compared only if they share MORE than this many movies
neighbors = [] # neighbors[i]: SortedList of (-weight, other_user) for user i
averages = [] # averages[i]: user i's mean training rating
deviations = [] # deviations[i]: {movie: rating - averages[i]} for user i

# Pass 1: per-user statistics, computed once per user.
# (Recomputing user j's mean and deviations inside the pairwise loop repeats
# the same work for every i.)
movie_sets = [] # movie_sets[i]: set of movies user i rated in training
sigmas = [] # sigmas[i]: sqrt of the sum of user i's squared deviations
for i in range(N):
  ratings_i = { movie:usermovie2rating[(i, movie)] for movie in user2movie[i] }
  avg_i = np.mean(list(ratings_i.values()))
  dev_i = { movie:(rating - avg_i) for movie, rating in ratings_i.items() }
  dev_i_values = np.array(list(dev_i.values()))

  averages.append(avg_i)
  deviations.append(dev_i)
  movie_sets.append(set(dev_i))
  sigmas.append(np.sqrt(dev_i_values.dot(dev_i_values)))

# Pass 2: for every user i keep the K users with the highest weight w_ij,
# where w_ij = sum over common movies of dev_i*dev_j, divided by sigma_i*sigma_j.
for i in range(N):
  movies_i_set = movie_sets[i]
  dev_i = deviations[i]
  sigma_i = sigmas[i]

  sl = SortedList()
  for j in range(N):
    # don't include yourself
    if j == i:
      continue

    common_movies = movies_i_set & movie_sets[j] # intersection
    if len(common_movies) <= limit:
      continue

    # A user whose ratings are all identical has sigma == 0. The weight is
    # then undefined (0/0 -> NaN), and a NaN key would corrupt the ordering
    # of the SortedList, so such pairs are skipped.
    denominator = sigma_i * sigmas[j]
    if denominator == 0:
      continue

    dev_j = deviations[j]
    numerator = sum(dev_i[m]*dev_j[m] for m in common_movies)
    w_ij = numerator / denominator

    # SortedList sorts ascending, so the weight is stored negated: the
    # largest weights come first and the last element is the weakest one.
    sl.add((-w_ij, j))
    if len(sl) > K:
      del sl[-1]

  # Store user i's neighbours. Without this, `neighbors` stays empty and
  # predict() fails on neighbors[i].
  neighbors.append(sl)

In [None]:
def predict(i, m):
  """Predict the rating user `i` would give movie `m`.

  The prediction is user i's average rating plus the weighted mean of the
  neighbours' deviations for movie m. Neighbours that have no deviation stored
  for m are skipped. If no neighbour contributes, the user's average is used.
  The result is clamped to the range [0.5, 5].
  """
  weighted_dev_sum = 0
  weight_total = 0
  for neg_w, j in neighbors[i]:
    # one dictionary lookup; None means neighbour j has no entry for movie m
    dev = deviations[j].get(m)
    if dev is None:
      continue
    # weights are stored negated, so -neg_w is the actual weight
    weighted_dev_sum += -neg_w * dev
    weight_total += abs(neg_w)

  baseline = averages[i]
  if weight_total == 0:
    prediction = baseline
  else:
    prediction = weighted_dev_sum / weight_total + baseline

  # cap at 5 first, then floor at 0.5 (the minimum rating)
  return max(0.5, min(5, prediction))


In [None]:
# Generate a prediction for every known rating in the training and the test
# dictionaries and keep it next to the true rating, ready for evaluation.
# One helper serves both splits. Its loop variables are local, so the
# module-level `i` and `m` set in earlier cells are no longer overwritten.

def _collect_predictions(ratings):
  """Return (predictions, targets) for a {(user, movie): rating} dictionary.

  Both lists follow the iteration order of `ratings`; predictions[k] is
  predict(user, movie) for the k-th key and targets[k] is its stored rating.
  """
  predictions = []
  targets = []
  for (user, movie), target in ratings.items():
    predictions.append(predict(user, movie))
    targets.append(target)
  return predictions, targets

train_predictions, train_targets = _collect_predictions(usermovie2rating)
test_predictions, test_targets = _collect_predictions(usermovie2rating_test)

In [None]:
# mean squared error (lower is better)
def mse(p, t):
  """Return the mean of the squared differences between predictions `p` and targets `t`."""
  errors = np.array(p) - np.array(t)
  return np.mean(np.square(errors))

# Mean squared error on the training ratings and on the test ratings.
print('train mse:', mse(train_predictions, train_targets))
print('test mse:', mse(test_predictions, test_targets))

In [None]:
import pickle

# Bundle everything predict() reads, plus the rating lookups, into one
# dictionary and write it to disk.
# NOTE(review): the file name says "item_item", but the neighbour lists above
# are built per user -- confirm the intended name before other code depends on it.
# Only unpickle this file from a trusted source: pickle.load can execute code.
model_state = {
    'user2movie': user2movie,
    'movie2user': movie2user,
    'usermovie2rating': usermovie2rating,
    'neighbors': neighbors,
    'averages': averages,
    'deviations': deviations,
}

# Save the model
with open('collaborative_filtering_model_item_item.pkl', 'wb') as model_file:
    pickle.dump(model_state, model_file)
