<a href="https://colab.research.google.com/github/SreedevSB/Recommender-Systems-and-Deep-Learning/blob/main/user_user_collaborative_filtering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Import dataset

In [None]:
from google.colab import files
files.upload()

In [None]:
!mkdir /root/.kaggle
!chmod 777 kaggle.json
!mv kaggle.json /root/.kaggle/


In [None]:
!chmod 600 /root/.kaggle/kaggle.json

In [None]:
!kaggle datasets download grouplens/movielens-20m-dataset

Downloading movielens-20m-dataset.zip to /content
 95% 186M/195M [00:01<00:00, 92.1MB/s]
100% 195M/195M [00:01<00:00, 104MB/s] 


In [None]:
!unzip movielens-20m-dataset.zip

Archive:  movielens-20m-dataset.zip
  inflating: genome_scores.csv       
  inflating: genome_tags.csv         
  inflating: link.csv                
  inflating: movie.csv               
  inflating: rating.csv              
  inflating: tag.csv                 


##Preprocessing

In [None]:
import numpy as np
import pandas as pd

In [None]:
ratings = pd.read_csv('rating.csv')

In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138493,68954,4.5,2009-11-13 15:42:00
20000259,138493,69526,4.5,2009-12-03 18:31:48
20000260,138493,69644,3.0,2009-12-07 18:10:57
20000261,138493,70286,5.0,2009-11-13 15:42:24


In [None]:
print(len(ratings.userId.unique()))
print(max(ratings.userId))

138493
138493


In [None]:
print(len(ratings.movieId.unique()))
print(max(ratings.movieId))

26744
131262


In [None]:
ratings.userId = ratings.userId-1

In [None]:
#creating mapping for movieId
count = 0
d = dict()
for id in list(ratings.movieId.unique()):
  d[id] = count
  count+=1
print(d)
ratings["movieId"] = ratings.apply(lambda x : d[x["movieId"]], axis=1)

In [None]:
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,3.5,2005-04-02 23:53:47
1,0,1,3.5,2005-04-02 23:31:16
2,0,2,3.5,2005-04-02 23:33:39
3,0,3,3.5,2005-04-02 23:32:07
4,0,4,3.5,2005-04-02 23:29:40
...,...,...,...,...
20000258,138492,1814,4.5,2009-11-13 15:42:00
20000259,138492,1037,4.5,2009-12-03 18:31:48
20000260,138492,3950,3.0,2009-12-07 18:10:57
20000261,138492,1818,5.0,2009-11-13 15:42:24


## Create a smaller version of the dataset

In [None]:
from collections import Counter

In [None]:
userId_counts = Counter(ratings.userId)
movieId_counts = Counter(ratings.movieId)

In [None]:
N = ratings.userId.max() + 1 # Number of users
M = ratings.movieId.max() + 1 # Number of movies

In [None]:
n = 1000 #10000 # Number of users to be chosen
m = 200 #2000 # Number of movies to be chosen

In [None]:
userIds = [u for (u,count) in userId_counts.most_common(n)]
movieIds = [m for (m,count) in movieId_counts.most_common(m)]

In [None]:
ratings_small = ratings[ratings.userId.isin(userIds) & ratings.movieId.isin(movieIds)].copy()

In [None]:
ratings_small

Unnamed: 0,userId,movieId,rating,timestamp
19846,155,227,5.0,2002-11-19 20:54:26
19847,155,0,5.0,2002-12-26 21:20:49
19851,155,364,4.0,2002-11-20 19:55:17
19854,155,365,4.0,2002-12-26 23:13:17
19855,155,386,5.0,2002-12-06 19:49:14
...,...,...,...,...
19979054,138324,855,4.5,2009-01-24 18:20:00
19979058,138324,163,4.0,2009-01-24 18:23:03
19979183,138324,173,4.5,2009-01-24 18:20:25
19979387,138324,913,4.5,2009-01-24 18:16:58


In [None]:
#reassign consecutive userIds and movieIds

userId_map = dict()
count=0
for id in list(ratings_small.userId.unique()):
  userId_map[id] = count
  count+=1
ratings_small['userId'] = ratings_small.apply(lambda x : userId_map[x['userId']], axis=1)

movieId_map = dict()
count=0
for id in list(ratings_small.movieId.unique()):
  movieId_map[id] = count
  count+=1
ratings_small['movieId'] = ratings_small.apply(lambda x : movieId_map[x['movieId']], axis=1)

In [None]:
ratings_small

Unnamed: 0,userId,movieId,rating,timestamp
19846,0,0,5.0,2002-11-19 20:54:26
19847,0,1,5.0,2002-12-26 21:20:49
19851,0,2,4.0,2002-11-20 19:55:17
19854,0,3,4.0,2002-12-26 23:13:17
19855,0,4,5.0,2002-12-06 19:49:14
...,...,...,...,...
19979054,999,190,4.5,2009-01-24 18:20:00
19979058,999,191,4.0,2009-01-24 18:23:03
19979183,999,192,4.5,2009-01-24 18:20:25
19979387,999,199,4.5,2009-01-24 18:16:58


## Create lookup dictionaries user2movie , movie2user, usermovie_rating

In [None]:
import pickle
from sklearn.utils import shuffle

In [None]:
#split small dataset into train and test set
ratings_small = shuffle(ratings_small)
cutoff = int(0.8*len(ratings_small))
ratings_train = ratings_small.iloc[:cutoff,:]
ratings_test = ratings_small.iloc[cutoff:,:]

In [None]:
user2movie = dict()
movie2user = dict()
usermovie2rating = dict()

In [None]:
count = 0
def create_lookups_dicts(row):
  userId = int(row.userId)
  movieId = int(row.movieId)

  if userId in user2movie:
    user2movie[userId].append(movieId)
  else:
    user2movie[userId] = [movieId]
  
  if movieId in movie2user:
    movie2user[movieId].append(userId)
  else:
    movie2user[movieId] = [userId]
  
  usermovie2rating[(userId,movieId)] = row['rating']
  global count
  count+=1
 # print("Processed %.3f".format(float(count)/cutoff))

ratings_train = ratings_train.apply(create_lookups_dicts, axis=1)


In [None]:
usermovie2rating_test = dict()
count = 0
def create_lookups_dicts2(row):

  userId = int(row.userId)
  movieId = int(row.movieId)
 
  usermovie2rating_test[(userId,movieId)] = row['rating']
  global count
  count+=1
  #print("Processed %.3f".format(float(count)/ratings_test))

ratings_test = ratings_test.apply(create_lookups_dicts2, axis=1)

In [None]:
print(movie2user)

In [None]:
with open("user2movie.json", 'wb') as f:
  pickle.dump(user2movie, f)
with open("movie2user.json", 'wb') as f:
  pickle.dump(movie2user, f)
with open("usermovie2rating.json", 'wb') as f:
  pickle.dump(usermovie2rating, f)
with open("usermovie2rating_test.json", 'wb') as f:
  pickle.dump(usermovie2rating_test, f)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!cp user2movie.json /content/drive/MyDrive/colab-mount/collaborative-filtering/
!cp movie2user.json /content/drive/MyDrive/colab-mount/collaborative-filtering/
!cp usermovie2rating.json /content/drive/MyDrive/colab-mount/collaborative-filtering/
!cp usermovie2rating_test.json /content/drive/MyDrive/colab-mount/collaborative-filtering/

## User-User collaborative filtering

In [None]:
from datetime import datetime
from sortedcontainers import SortedList

folder = "/content/drive/MyDrive/colab-mount/collaborative-filtering/"

In [15]:
with open(folder+"user2movie.json",'rb') as f:
  user2movie = pickle.load(f)
with open(folder+"movie2user.json",'rb') as f:
  movie2user = pickle.load(f)
with open(folder+"usermovie2rating.json",'rb') as f:
  usermovie2rating = pickle.load(f)
with open(folder+"usermovie2rating_test.json",'rb') as f:
  usermovie2rating_test = pickle.load(f)

In [None]:
N  = np.max(list(user2movie.keys())) + 1 #Number of users

m1 = max(list(movie2user.keys()))  #max of movieId in train set
m2 = max([m for (u,m),r in usermovie2rating_test.items()]) #max of movieId in test set
M = max(m1,m2) + 1  #Number of movies
print("Number of users ", N)
print("Number of movies ", M)

Number of users  1000
Number of movies  200


In [None]:
from google.colab import output
K = 25
limit = 5

neighbours = []
averages = []
deviations = []

for i in range(N):
  #find the 25 closest users to user i
  movies_i = user2movie[i]
  movies_i_set = set(movies_i)

  #calculate average and deivation
  ratings_i = {movie:usermovie2rating[i,movie] for movie in movies_i}
  average_i = np.mean(list(ratings_i.values()))
  deviation_i = {movie:(rating - average_i) for movie,rating in ratings_i.items()}
  deviation_i_values = np.array(list(deviation_i.values()))
  sigma_i = np.sqrt(deviation_i_values.dot(deviation_i_values))

  averages.append(average_i)
  deviations.append(deviation_i)

  l_sorted = SortedList()
  for j in range(N):
    if j!=i:
      movies_j = user2movie[j]
      movies_j_set = set(movies_j)
      common_movies = (movies_i_set & movies_j_set)

      if len(common_movies) > limit :
          ratings_j = {movie:usermovie2rating[j,movie] for movie in movies_j}
          average_j = np.mean(list(ratings_j.values()))
          deviation_j = {movie:(rating - average_j) for movie,rating in ratings_j.items()}
          deviation_j_values = np.array(list(deviation_j.values()))
          sigma_j = np.sqrt(deviation_j_values.dot(deviation_j_values))

          numerator = sum(deviation_i[m]*deviation_j[m] for m in common_movies)
          w_ij = numerator / (sigma_i * sigma_j)


          l_sorted.add((-w_ij, j))
          if len(l_sorted) > K:
            del l_sorted[-1]
      output.clear()
      print(i,j)

  neighbours.append(l_sorted)

In [None]:
with open("neighbours.json", 'wb') as f:
  pickle.dump(neighbours, f)


!cp neighbours.json  /content/drive/MyDrive/colab-mount/collaborative-filtering/

with open("averages.json", 'wb') as f:
  pickle.dump(averages, f)

!cp averages.json  /content/drive/MyDrive/colab-mount/collaborative-filtering/

with open("deviations.json", 'wb') as f:
  pickle.dump(deviations, f)

!cp deviations.json  /content/drive/MyDrive/colab-mount/collaborative-filtering/





In [None]:
with open(folder+"averages.json", 'rb') as f:
  averages = pickle.load(f)

with open(folder+"deviations.json", 'rb') as f:
  deviations = pickle.load(f)

with open(folder+"neighbours.json", 'rb') as f:
  neighbours = pickle.load(f)


In [None]:
def predict(i,m):
  numerator = 0
  denominator = 0
  for neg_w, j in neighbours[i]:
    try :
      numerator += -neg_w * deviations[j][m]
      denominator += abs(neg_w)
    except KeyError:
      pass
  if denominator == 0:
    prediction = averages[i]
  else:
    prediction  = numerator/denominator + averages[i]
  
  prediction = min(5,prediction)
  prediction = max(0.5, prediction)
  return prediction

In [None]:
neighbours[0]

In [21]:
predict(0,0)

4.752034287815787

In [20]:
usermovie2rating[0,1]

5.0

In [24]:
train_predictions = []
train_targets = []

for (i,m), target in usermovie2rating.items():
  train_predictions.append(predict(i,m))
  train_targets.append(target)

test_predictions = []
test_targets = []

for (i,m), target in usermovie2rating_test.items():
  test_predictions.append(predict(i,m))
  test_targets.append(target)



In [40]:
sumsquare_errors = 0.0
for i in range(len(train_predictions)):
  error = train_predictions[i] - train_targets[i] 
  sumsquare_errors += error*error
mse = sumsquare_errors/len(train_predictions)
train_rmse = np.sqrt(sumsquare_errors/len(train_predictions))
train_rmse

0.6782824715942573

In [41]:
sumsquare_errors = 0.0
for i in range(len(test_predictions)):
  error = test_predictions[i] - test_targets[i] 
  sumsquare_errors += error*error
mse = sumsquare_errors/len(test_predictions)
test_rmse = np.sqrt(sumsquare_errors/len(test_predictions))
test_rmse

0.7697824262119587

In [43]:
#Alternative method to find rmse
def mse(p, t):
  p = np.array(p)
  t = np.array(t)
  return np.mean((p-t)**2)**0.5
print("Train rmse", rmse(train_predictions,train_targets))
print("Test rmse", rmse(test_predictions,test_targets))

Train mse 0.6782824715942573
Test mse 0.7697824262119608
