# DSA4212 Assignment 2
**CAA: 5 April 2023**

## Imports

In [None]:
!pip install jaxopt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import os

# Suppress warnings
import warnings
# Swap warnings.warn for a no-op: any call made through the `warnings.warn`
# attribute after this point is silently discarded.
def warn(*args, **kwargs): pass
warnings.warn = warn
# Also install a blanket 'ignore' filter for the standard warnings machinery.
warnings.filterwarnings('ignore')

import numpy as onp
from tqdm.auto import tqdm
import pylab as plt
import copy
import time
from random import sample
import random
from collections import defaultdict
import pandas as pd
import jax, jaxopt
import jax.numpy as jnp
import scipy as sp
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
import operator

In [None]:
# Only when running on Google Colab: import the Drive helper (used by the data
# loading cell) and print the GPU list, the CPU model name and the total memory.
if 'google.colab' in str(get_ipython()):
    from google.colab import drive

    !nvidia-smi -L
    !lscpu |grep 'Model name'
    !free -h --si | awk  '/Mem:/{print $2}'

GPU 0: Tesla T4 (UUID: GPU-4c2988b9-0b23-5740-fcb9-1af34dfc33d1)
Model name:                      Intel(R) Xeon(R) CPU @ 2.00GHz
12G


## Load Data

In [None]:
# On Colab, mount Google Drive and change into the assignment's dataset folder
# so that the relative CSV paths below resolve. Outside Colab the three files
# are read from the current working directory.
# NOTE(review): the Drive path is specific to one account's folder layout.
if 'google.colab' in str(get_ipython()):
    drive.mount("/content/drive")
    %cd /content/drive/MyDrive/dataset/DSA4212/assignment_2/

# Anime metadata plus the train / test rating tables.
anime = pd.read_csv("assignment_2_anime.csv")
train = pd.read_csv("assignment_2_ratings_train.csv")
test = pd.read_csv("assignment_2_ratings_test.csv")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/dataset/DSA4212/assignment_2


In [None]:
# Preview the anime metadata table.
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [None]:
# Preview the training ratings (user_id, anime_id, rating).
train.head()

Unnamed: 0,user_id,anime_id,rating
0,20170,10794,6
1,24592,21995,5
2,18358,7054,10
3,59267,488,7
4,69313,30544,4


In [None]:
# Preview the test ratings (same columns as the training table).
test.head()

Unnamed: 0,user_id,anime_id,rating
0,44017,13161,4
1,14307,14993,7
2,55155,268,9
3,63515,2889,9
4,54059,2581,7


## Baseline

### Train data overall mean

In [None]:
# Mean of all training ratings; serves as the constant baseline prediction and
# as the last-resort fallback in the predictors further down.
overall_train_mean = train['rating'].mean()
overall_train_mean

7.808651715888936

In [None]:
# Test MSE obtained by predicting the overall training mean for every test row.
constant_prediction = [overall_train_mean for _ in range(test.shape[0])]
mean_squared_error(test['rating'], constant_prediction)

2.472703843572946

### Regularized train data mean

In [None]:
# Per-anime rating count, sum and mean over the training set.
# "sum" / "mean" are passed as strings because handing the NumPy callables to
# .agg() is deprecated in recent pandas; the resulting column keys are the same.
# `len` is kept as-is so the count column stays ('rating', 'len'), which the
# cells below rely on.
grouped_train = train.groupby("anime_id", as_index = False).agg({"rating": [len, "sum", "mean"]})
grouped_train.head()

Unnamed: 0_level_0,anime_id,rating,rating,rating
Unnamed: 0_level_1,Unnamed: 1_level_1,len,sum,mean
0,1,9314,82597,8.868048
1,5,4058,34234,8.436175
2,6,6629,55853,8.425554
3,7,1488,11142,7.487903
4,8,224,1601,7.147321


In [None]:
# Attach the per-anime training statistics to every test row. The join is a
# left join, so test rows whose anime is absent from training get NaN
# statistics. Rows are then ordered by anime_id with a fresh 0..N-1 index.
# NOTE(review): grouped_train has two column levels while test has one; in the
# output below the statistics end up under tuple keys such as ('rating', 'mean').
# Newer pandas releases may refuse to merge frames with different numbers of
# column levels — confirm against the pandas version in use.
merged = test.merge(grouped_train, how = "left", on = "anime_id").sort_values(by='anime_id').reset_index(drop=True)
merged.head()

Unnamed: 0,user_id,anime_id,rating,"(rating, len)","(rating, sum)","(rating, mean)"
0,6466,1,7,9314.0,82597.0,8.868048
1,60544,1,8,9314.0,82597.0,8.868048
2,66429,1,6,9314.0,82597.0,8.868048
3,51420,1,9,9314.0,82597.0,8.868048
4,50310,1,6,9314.0,82597.0,8.868048


In [None]:
# Test rows with no per-anime training statistics (NaN after the left join).
merged[merged[('rating', 'mean')].isna()]

Unnamed: 0,user_id,anime_id,rating,"(rating, len)","(rating, sum)","(rating, mean)"
450365,26638,1093,9,,,
544507,50440,1656,6,,,
565321,59026,1739,5,,,
602246,7561,2024,7,,,
611266,65836,2101,5,,,
...,...,...,...,...,...,...
1900428,53492,33906,3,,,
1900431,53492,33914,5,,,
1900738,1344,34136,8,,,
1900747,30573,34239,7,,,


In [None]:
# Use overall training mean
# Shrunk per-anime mean: (alpha*mu + rating_sum) / (alpha + rating_count), where
# mu is the overall training mean. Animes without training statistics are
# filled with rating_sum = mu and rating_count = 1.
alphas = [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 20, 50]
for alpha in alphas:
  numerator = alpha*overall_train_mean + merged[('rating', 'sum')].fillna(overall_train_mean)
  denominator = alpha + merged[('rating', 'len')].fillna(1)
  regularized = numerator/denominator
  mse = mean_squared_error(merged['rating'], regularized)
  print(f"alpha = {alpha}, MSE = {mse}")

alpha = 0, MSE = 2.062684726555174
alpha = 0.0001, MSE = 2.0626844685302634
alpha = 0.001, MSE = 2.0626821504808044
alpha = 0.01, MSE = 2.0626593777461815
alpha = 0.1, MSE = 2.062467526136466
alpha = 0.5, MSE = 2.0620792660686242
alpha = 1, MSE = 2.062032057511382
alpha = 2, MSE = 2.0623410436565743
alpha = 5, MSE = 2.063838196758722
alpha = 10, MSE = 2.0664177062310487
alpha = 20, MSE = 2.071031627720659
alpha = 50, MSE = 2.0817441554411467


In [None]:
# Use grouped training mean
# Same shrinkage formula as the previous sweep; the only difference is the value
# used for animes without training statistics: their rating sum is filled with
# the mean of the per-anime means (the alpha term still uses the overall mean).
mean_of_anime_means = merged[('rating', 'mean')].mean()
alphas = [0, 0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 20, 50]
for alpha in alphas:
  numerator = alpha*overall_train_mean + merged[('rating', 'sum')].fillna(mean_of_anime_means)
  denominator = alpha + merged[('rating', 'len')].fillna(1)
  regularized = numerator/denominator
  mse = mean_squared_error(merged['rating'], regularized)
  print(f"alpha = {alpha}, MSE = {mse}")

alpha = 0, MSE = 2.062685622632537
alpha = 0.0001, MSE = 2.06268536451801
alpha = 0.001, MSE = 2.0626830456628205
alpha = 0.01, MSE = 2.0626602649498746
alpha = 0.1, MSE = 2.062468340738613
alpha = 0.5, MSE = 2.0620798634168653
alpha = 1, MSE = 2.062032505508811
alpha = 2, MSE = 2.06234134231236
alpha = 5, MSE = 2.063838346082031
alpha = 10, MSE = 2.0664177876789895
alpha = 20, MSE = 2.0710316703835256
alpha = 50, MSE = 2.08174417300812


## Collaborative filtering
https://www.kaggle.com/code/ajmichelutti/collaborative-filtering-on-anime-data

#### Using cosine similarity

In [None]:
# Training triples as plain lists, plus dense 0-based indices for users / animes
# (indices follow the sorted order of the raw ids).
train_uid = list(train['user_id'])
train_aid = list(train['anime_id'])
train_rating = list(train['rating'])
train_uid_to_index = {raw: pos for pos, raw in enumerate(sorted(set(train_uid)))}
train_aid_to_index = {raw: pos for pos, raw in enumerate(sorted(set(train_aid)))}

# piv[user_index][anime_index] = rating
piv = {}
for uid, aid, r in zip(train_uid, train_aid, train_rating):
  uid, aid = train_uid_to_index[uid], train_aid_to_index[aid]
  piv.setdefault(uid, {})[aid] = r

# Raw per-user mean rating, taken before any normalization or filtering.
means = {user: sum(ratings.values())/len(ratings.values()) for user, ratings in piv.items()}

# Normalize every user's ratings to (rating - mean) / (max - min).
# Users whose ratings are all identical (including users with a single rating)
# have a zero range and are removed from piv.
for uid in list(piv):
  vals = piv[uid].values()
  lo, hi = min(vals), max(vals)
  if lo == hi:
    del piv[uid]
    continue
  m = sum(vals)/len(vals)
  ratings = piv[uid]
  for aid in ratings:
    ratings[aid] = (ratings[aid]-m)/(hi-lo)

# Draw a reproducible sample of 23,000 user indices and re-key the surviving
# users by their position in the sorted sample (ids: user index -> position).
random.seed(4212)
sampled_uids = sorted(sample(range(len(train_uid_to_index)), 23_000))
set_sampled_uids = set(sampled_uids)
ids = {e: i for i, e in enumerate(sampled_uids)}
piv2 = {ids[user]: ratings for user, ratings in piv.items() if user in ids}
piv = piv2

In [None]:
# Flatten the nested piv dict into COO-style triplets and build a CSR matrix
# with one row per sampled user and one column per anime index.
row, col, data = [], [], []
for uid, ratings in piv.items():
  for aid, value in ratings.items():
    row.append(uid)
    col.append(aid)
    data.append(value)
R = len(sampled_uids)
C = max(col)+1
piv_sparse = sp.sparse.csr_matrix((data, (row, col)), shape=(R, C))
piv_sparse

<23000x9632 sparse matrix of type '<class 'numpy.float64'>'
	with 1490630 stored elements in Compressed Sparse Row format>

In [None]:
# Pairwise cosine similarity between the rows (sampled users) of piv_sparse.
# NOTE(review): the result is 23000 x 23000 (see the shape printed below);
# presumably a dense float64 array of roughly 4 GB — confirm it fits in memory.
user_similarity = cosine_similarity(piv_sparse)

In [None]:
# Sanity check: one row and one column per sampled user.
user_similarity.shape

(23000, 23000)

In [None]:
# Inserting the similarity matrices into dataframe objects
df_cols = list(ids)
user_sim_df = pd.DataFrame(user_similarity, index=df_cols, columns=df_cols)

In [None]:
# Preview the labelled user-user similarity table.
user_sim_df.head()

Unnamed: 0,1,2,8,12,13,17,19,20,21,27,...,68393,68396,68398,68400,68403,68406,68408,68409,68411,68420
1,1.0,0.029567,0.092245,0.185719,0.019414,0.0,0.014588,0.049979,0.0,0.0,...,0.0,-0.067423,0.017397,0.0,0.085685,0.005399,0.003968,0.0,0.018302,0.0
2,0.029567,1.0,0.101509,0.079492,-0.007926,0.0,0.020983,0.071524,0.0,0.001534,...,0.0,0.06466,-0.025106,0.0,0.047333,-0.007505,0.0195,0.0,0.006088,0.0
8,0.092245,0.101509,1.0,0.072417,0.050201,0.0,0.09105,0.172464,0.0,0.0,...,0.012516,0.225282,0.006823,0.0,0.001194,0.079899,0.047636,0.0,-0.048197,0.0
12,0.185719,0.079492,0.072417,1.0,0.004038,0.0,-0.00732,0.080789,0.0,-0.045353,...,0.0,0.0,0.032541,0.0,0.128351,0.052206,0.02088,0.0,0.024412,0.0
13,0.019414,-0.007926,0.050201,0.004038,1.0,0.0,0.026413,0.025076,0.0,-0.046909,...,0.0,0.0,0.048487,0.0,-0.011949,0.03369,0.004722,0.0,0.009759,0.0


In [None]:
# anime_id -> the `rating` column of the anime metadata table; used by
# predicted_rating as the fallback for users that are absent from training.
anime_mapper = dict(zip(anime.anime_id, anime.rating))

In [None]:
# For every sampled user, sort the other users by decreasing similarity and keep
# positions 1..1999 of that ordering (position 0 is skipped).
# sorted_sim_u holds the neighbour labels, sorted_sim_v the matching similarities.
sorted_sim_u, sorted_sim_v = {}, {}
for uid in tqdm(sampled_uids):
  s = user_sim_df[uid].sort_values(ascending=False).iloc[1:2000]
  sorted_sim_u[uid] = s.index.tolist()
  sorted_sim_v[uid] = s.tolist()

  0%|          | 0/23000 [00:00<?, ?it/s]

In [None]:
def predicted_rating(user_id, anime_id):
    """Predict a rating with similarity-weighted user-based collaborative filtering.

    Parameters
    ----------
    user_id : raw user id as found in the rating tables.
    anime_id : raw anime id as found in the rating tables.

    Returns
    -------
    * user unknown to the training set: the anime's entry in ``anime_mapper`` if
      present, otherwise ``overall_train_mean`` (after printing 'Fallback option');
    * user known but not part of the similarity sample: the user's raw mean rating;
    * otherwise: the similarity-weighted average of the neighbours' values for
      this anime, or the user's raw mean when no neighbour has a value for it
      (or the weights sum to zero).

    NOTE(review): ``piv`` stores ratings after the per-user
    ``(rating - mean) / (max - min)`` scaling, so the weighted average is on that
    normalized scale while every fallback is on the raw rating scale. Mapping it
    back needs the per-user range, which is not kept anywhere — confirm the
    intended scale before comparing these predictions with raw ratings.
    """
    if user_id not in train_uid_to_index:
        if anime_id in anime_mapper:
            return anime_mapper[anime_id]
        print('Fallback option')
        return overall_train_mean

    user = train_uid_to_index[user_id]
    if user not in set_sampled_uids:
        return means[user]

    # An anime never seen in training cannot have neighbour ratings.
    anime_idx = train_aid_to_index.get(anime_id)
    if anime_idx is None:
        return means[user]

    rating_list = []
    weight_list = []
    # Neighbours of *this* user (keyed by the user's training index).
    for neighbour, similarity in zip(sorted_sim_u[user], sorted_sim_v[user]):
        # Neighbour labels are training indices, whereas piv is keyed by the
        # position inside the sample; translate through `ids`. Users dropped
        # during normalization have no piv entry.
        neighbour_ratings = piv.get(ids.get(neighbour))
        if neighbour_ratings is None or anime_idx not in neighbour_ratings:
            continue
        rating_list.append(neighbour_ratings[anime_idx]*similarity)
        weight_list.append(similarity)

    total_weight = sum(weight_list)
    if total_weight:
        return sum(rating_list)/total_weight
    return means[user]

In [None]:
# Test MSE of the cosine-similarity user-based predictor.
cf_predictions = [
    predicted_rating(uid, aid)
    for uid, aid in tqdm(zip(test.user_id, test.anime_id))
]
mean_squared_error(cf_predictions, list(test.rating))

0it [00:00, ?it/s]

1.8634768475837056

In [None]:
# Show the first test rows again, for comparison with the spot-check predictions.
test.head()

Unnamed: 0,user_id,anime_id,rating
0,44017,13161,4
1,14307,14993,7
2,55155,268,9
3,63515,2889,9
4,54059,2581,7


In [None]:
# Spot-check predictions for (user_id, anime_id) pairs taken from the test table.
# NOTE(review): the third pair uses anime 288, whereas the third test row shown
# above is (55155, 268) — confirm whether 288 is intentional or a typo.
predicted_rating(44017, 13161), predicted_rating(14307, 14993), predicted_rating(55155, 288)

(7.691011235955056, 7.419919246298789, 9.015151515151516)

#### Using Annoy

In [None]:
!pip install annoy
from annoy import AnnoyIndex

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


#### User-user based

In [None]:
# Per-user raw ratings (piv[user_id][anime_id] = rating) and per-user mean rating,
# both keyed by the raw user id. Sum and count are tracked per training row.
rating_sum, rating_count, piv = {}, {}, {}
for uid, aid, rating in zip(train.user_id, train.anime_id, train.rating):
  if uid not in piv:
    rating_sum[uid], rating_count[uid], piv[uid] = 0, 0, {}
  rating_sum[uid] += rating
  rating_count[uid] += 1
  piv[uid][aid] = rating
means = {user: rating_sum[user]/rating_count[user] for user in rating_sum}

# Known ids, and a dense column index for every anime seen in training.
train_uids = set(train.user_id)
train_aids = set(train.anime_id)
aid2idx = {raw: pos for pos, raw in enumerate(sorted(train_aids))}

In [None]:
# Angular-distance Annoy index over users: each user is a dense vector with one
# slot per training anime (raw rating, 0 where the user has no rating), stored
# under the raw user id. The index is built with 10 trees.
n_animes = len(train_aids)
nearest_neighbours = AnnoyIndex(n_animes, 'angular')
for uid in tqdm(piv):
  rows = [0]*n_animes
  for aid, value in piv[uid].items():
    rows[aid2idx[aid]] = value
  nearest_neighbours.add_item(uid, rows)
nearest_neighbours.build(10)

  0%|          | 0/68421 [00:00<?, ?it/s]

True

In [None]:
# For every training user, fetch num_neighbours+1 nearest items from the index:
# sorted_u[user] holds the neighbour ids, sorted_v[user] the matching distances.
num_neighbours = 200
sorted_u, sorted_v = {}, {}
for i in tqdm(piv):
  neighbour_ids, distances = nearest_neighbours.get_nns_by_item(i, num_neighbours+1, include_distances=True)
  sorted_u[i] = neighbour_ids
  sorted_v[i] = distances

  0%|          | 0/68421 [00:00<?, ?it/s]

In [None]:
# anime_id -> the `rating` column of the anime metadata table (fallback for
# users that are absent from the training set).
anime_mapper = dict(zip(anime.anime_id, anime.rating))

In [None]:
def predicted_rating(user_id, anime_id):
    """Predict a rating from the user's Annoy neighbours (user-user CF).

    Users absent from training get the anime's ``anime_mapper`` entry, or
    ``overall_train_mean`` (after printing 'Fallback option') when the anime has
    no entry either. For known users the prediction is the similarity-weighted
    average of the ratings that the user's stored neighbours gave to
    ``anime_id``, with similarity = 1 - distance**2 / 2. When no neighbour rated
    the anime (or the weights sum to zero) the user's mean rating is returned.
    """
    if user_id not in train_uids:
        if anime_id in anime_mapper:
            return anime_mapper[anime_id]
        print('Fallback option')
        return overall_train_mean

    # (rating, similarity) for every neighbour that rated this anime.
    votes = [
        (piv[neighbour][anime_id], 1-distance**2/2)
        for neighbour, distance in zip(sorted_u[user_id], sorted_v[user_id])
        if anime_id in piv[neighbour]
    ]
    weighted_ratings = [rating*similarity for rating, similarity in votes]
    weights = [similarity for _, similarity in votes]

    if sum(weights):
        return sum(weighted_ratings)/sum(weights)
    return means[user_id]

In [None]:
# Test MSE of the Annoy user-user predictor.
user_user_predictions = [
    predicted_rating(uid, aid)
    for uid, aid in tqdm(zip(test.user_id, test.anime_id))
]
mean_squared_error(user_user_predictions, list(test.rating))

0it [00:00, ?it/s]

2.0410265477544822

#### Item-item based

In [None]:
# Per-anime raw ratings (piv[anime_id][user_id] = rating) and per-anime mean
# rating, both keyed by the raw anime id. Sum and count are tracked per row.
rating_sum, rating_count, piv = {}, {}, {}
for uid, aid, rating in zip(train.user_id, train.anime_id, train.rating):
  if aid not in piv:
    rating_sum[aid], rating_count[aid], piv[aid] = 0, 0, {}
  rating_sum[aid] += rating
  rating_count[aid] += 1
  piv[aid][uid] = rating
means = {item: rating_sum[item]/rating_count[item] for item in rating_sum}

# Known ids, and a dense column index for every user seen in training.
train_uids = set(train.user_id)
train_aids = set(train.anime_id)
uid2idx = {raw: pos for pos, raw in enumerate(sorted(train_uids))}

In [None]:
# Angular-distance Annoy index over animes: each anime is a dense vector with
# one slot per training user (raw rating, 0 where the user has no rating),
# stored under the raw anime id. The index is built with 10 trees.
n_users = len(train_uids)
nearest_neighbours = AnnoyIndex(n_users, 'angular')
for aid in tqdm(piv):
  rows = [0]*n_users
  for uid, value in piv[aid].items():
    rows[uid2idx[uid]] = value
  nearest_neighbours.add_item(aid, rows)
nearest_neighbours.build(10)

  0%|          | 0/9632 [00:00<?, ?it/s]

True

In [None]:
# For every training anime, fetch num_neighbours+1 nearest items from the index:
# sorted_u[anime] holds the neighbour ids, sorted_v[anime] the matching distances.
num_neighbours = 20
sorted_u, sorted_v = {}, {}
for i in tqdm(piv):
  neighbour_ids, distances = nearest_neighbours.get_nns_by_item(i, num_neighbours+1, include_distances=True)
  sorted_u[i] = neighbour_ids
  sorted_v[i] = distances

  0%|          | 0/9632 [00:00<?, ?it/s]

In [None]:
# Fallback values for the item-item predictor: anime_id -> metadata rating, and
# the overall training mean. Both repeat definitions from earlier cells,
# presumably so this section can be run on its own.
anime_mapper = dict(zip(anime.anime_id, anime.rating))
overall_train_mean = train['rating'].mean()
overall_train_mean

7.808651715888936

In [None]:
def predicted_rating(user_id, anime_id):
    """Predict a rating from the anime's Annoy neighbours (item-item CF).

    Parameters
    ----------
    user_id : raw user id as found in the rating tables.
    anime_id : raw anime id as found in the rating tables.

    Returns
    -------
    * user or anime unknown to the training set: the anime's ``anime_mapper``
      entry if present, otherwise ``overall_train_mean`` (after printing
      'Fallback option');
    * otherwise: the similarity-weighted average of the ratings this user gave
      to the anime's stored neighbours, with similarity = 1 - distance**2 / 2;
    * the anime's mean training rating when the user rated none of the
      neighbours (or the weights sum to zero).
    """
    if user_id not in train_uids or anime_id not in train_aids:
        if anime_id in anime_mapper:
            return anime_mapper[anime_id]
        print('Fallback option')
        return overall_train_mean

    rating_list = []
    weight_list = []
    # Walk the neighbours of this anime and use the user's rating of each one;
    # the lookup must go through the neighbour's id, not a leftover global.
    for neighbour_aid, distance in zip(sorted_u[anime_id], sorted_v[anime_id]):
        neighbour_ratings = piv[neighbour_aid]
        if user_id not in neighbour_ratings:
            continue
        similarity = 1-distance**2/2
        rating_list.append(neighbour_ratings[user_id]*similarity)
        weight_list.append(similarity)

    total_weight = sum(weight_list)
    if total_weight:
        return sum(rating_list)/total_weight
    return means[anime_id]

In [None]:
# Test MSE of the Annoy item-item predictor.
item_item_predictions = [
    predicted_rating(uid, aid)
    for uid, aid in tqdm(zip(test.user_id, test.anime_id))
]
mean_squared_error(item_item_predictions, list(test.rating))

0it [00:00, ?it/s]

Fallback option
Fallback option


2.0608980665449486