<a href="https://colab.research.google.com/github/PragunSaini/vnrec_notebooks/blob/master/vndb_collaborative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Database Setup (for cloud notebook)


In [None]:
# For postgresql setup on colab

# Install postgresql server
!sudo apt-get -y -qq update
!sudo apt-get -y -qq install postgresql
!sudo service postgresql start

# # Setup a new user `vndb`
!sudo -u postgres createuser --superuser vndb
!sudo -u postgres createdb vndb
!sudo -u postgres psql -c "ALTER USER vndb PASSWORD 'vndb'"

In [None]:
# Download vndb database dump
!curl -L https://dl.vndb.org/dump/vndb-db-latest.tar.zst -O

In [None]:
# Extract and Load data in postgresql
!sudo apt-get install zstd
!tar -I zstd -xvf vndb-db-latest.tar.zst
!PGPASSWORD=vndb psql -U vndb -h 127.0.0.1 vndb -f import.sql

## Setting up environment

In [1]:
# PostgreSQL connection
import sqlalchemy

# Data and math
import numpy as np
import pandas as pd
import scipy

# Plotting and viz.
import matplotlib as plt
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12, 8)
import seaborn as sns
sns.set_style('whitegrid')
from tqdm.auto import tqdm

  import pandas.util.testing as tm


In [2]:
# Create PostgreSQL engine
engine = sqlalchemy.create_engine(f'postgresql://vndb:vndb@localhost:5432/vndb')

  """)


## Load user, vn and ratings data

In [3]:
# Load users table
users = pd.read_sql_table("users", con=engine)
users.set_index('id', inplace=True)
users.head()

Unnamed: 0_level_0,username,ign_votes,perm_imgvote,perm_tag
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,yorhel,False,True,True
4,hillie,False,True,True
5,vatina,False,True,True
6,3db,False,True,True
7,fuku,False,True,True


In [4]:
# Load vn table
vn = pd.read_sql_table("vn", con=engine)
vn.set_index('id', inplace=True)
vn.head()

Unnamed: 0_level_0,title,original,alias,length,image,desc,l_wp,l_encubed,l_renai,c_popularity,c_rating,c_votecount,l_wikidata
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,Minna de Nyan Nyan,みんなでニャンニャン,,3,cv20339,A young man named Ibuki is a college student l...,,,,0.04621,58.2219,624,
2,Kana ~Imouto~,加奈～いもうと～,Kana Little Sister,3,cv29271,"You have a sister by the name of Kana, who is ...",Kana:_Little_Sister,,,0.154525,74.1949,1538,1274585.0
3,Utawarerumono,うたわれるもの,Uta (うた)\nUta1\nThe One Being Sung\nThe One of...,3,cv21565,"[url=/c411]Hakuoro[/url], a man who wakes up i...",Utawarerumono,utawarerumono,,0.342757,78.5946,3303,773981.0
4,Clannad,,クラナド,5,cv24252,Okazaki Tomoya is a third year high school stu...,Clannad_(visual_novel),clannad,clannad,0.640244,86.5038,6658,110607.0
5,Little Busters!,リトルバスターズ！,LB!\nリトバス！\nritobasu\nLB-EX,5,cv42017,"Riki was a child when his parents died, leavin...",Little_Busters!,little-busters,,0.586695,85.667,5829,683502.0


In [5]:
# Read ratings table (only those user entries who have voted)
ratings = pd.read_sql('SELECT uv.vid, uv.uid, uv.vote, uv.lastmod FROM ulist_vns uv INNER JOIN ulist_vns_labels uvl ON uv.vid = uvl.vid AND uv.uid = uvl.uid WHERE uvl.lbl = 7', con=engine)
ratings.head()

Unnamed: 0,vid,uid,vote,lastmod
0,61,2,60,2008-08-06 00:00:00+00:00
1,898,2,70,2008-08-28 00:00:00+00:00
2,1290,2,50,2008-12-18 00:00:00+00:00
3,2,4,70,2008-12-28 00:00:00+00:00
4,10,4,70,2008-12-28 00:00:00+00:00


## Data Overview

In [6]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 897125 entries, 0 to 897124
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype              
---  ------   --------------   -----              
 0   vid      897125 non-null  int64              
 1   uid      897125 non-null  int64              
 2   vote     897125 non-null  int64              
 3   lastmod  897125 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), int64(3)
memory usage: 27.4 MB


In [7]:
print("In ratings data :")
uniq_user_cnt = len(ratings['uid'].unique())
uniq_vn_cnt = len(ratings['vid'].unique())
print(f"Distinct users: {uniq_user_cnt}")
print(f"Distinct VN: {uniq_vn_cnt}")
sparsity = len(ratings) / (uniq_user_cnt * uniq_vn_cnt)
print(f"Sparsity: {(1 - sparsity) * 100}")

In ratings data :
Distinct users: 44656
Distinct VN: 21312
Sparsity: 99.90573533410021


In [8]:
# Total users and vn in database
print(f"Total users: {len(users)}")
print(f"Total VN: {len(vn)}")

Total users: 63572
Total VN: 27678


So it's clear that not all users have voted on VNs and moreover some VNs have never been voted on.
We are going to ignore those users and VNs for now.

In [9]:
# Ratings vary from 10 to 100
ratings['vote'].describe()

count    897125.000000
mean         72.646870
std          18.271053
min          10.000000
25%          60.000000
50%          75.000000
75%          90.000000
max         100.000000
Name: vote, dtype: float64

## Building a collaborative filtering model

In [10]:
def clean_and_filter_ratings(df):
  # Drop lastmod timestamp from ratings
  data = df.drop('lastmod', axis='columns')

  # For significance, we are gonna drop some insignificant users and VNs
  # Here I am dropping VNs rated by less than 10 people
  # And users who have rated less then 8 VN

  vn_to_drop = data.groupby('vid').count()['uid']
  vn_to_drop = vn_to_drop[vn_to_drop < 10].index
  data = data[~data['vid'].isin(vn_to_drop)]

  users_to_drop = data.groupby('uid').count()['vid']
  users_to_drop = users_to_drop[users_to_drop < 8].index
  data = data[~data['uid'].isin(users_to_drop)]

  return data

In [11]:
data = clean_and_filter_ratings(ratings)

In [12]:
# Let's split it into training and test data
# Stratify on uid so that same proportions of users are present in both sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.10, stratify=data['uid'])

In [13]:
# Since each user has a different way of rating
# So we will adjust the ratings of each user by his own mean

def adjust_user_wise_ratings(df):
  means = df.groupby('uid')['vote'].mean()
  df = pd.merge(df, means, on='uid', suffixes=('', '_avg'))
  df['vote_adj'] = df['vote'] - df['vote_avg']
  return df

In [14]:
train = adjust_user_wise_ratings(train)
train.head()

Unnamed: 0,vid,uid,vote,vote_avg,vote_adj
0,16221,171415,50,72.5,-22.5
1,44,171415,100,72.5,27.5
2,3126,171415,80,72.5,7.5
3,945,171415,90,72.5,17.5
4,97,171415,80,72.5,7.5


In [15]:
# Creating a (sparse) user-item matrix 

from scipy.sparse import csr_matrix

def ratings_to_user_item(df):
  ''' Returns (sparse_matrix, row categories, column categories) '''
  uid_c = df['uid'].astype('category')
  vid_c = df['vid'].astype('category')
  sparse = csr_matrix((df['vote_adj'], (uid_c.cat.codes, vid_c.cat.codes)),
                            shape=(uid_c.dtype.categories.size, vid_c.dtype.categories.size))
  return sparse, uid_c, vid_c

In [16]:
# Get the user-item matrix and user/VN indexes
user_item, uid_c, vid_c = ratings_to_user_item(train)

In [17]:
# Looking good
user_item.shape

(20931, 8217)

In [18]:
# For indexing ease, convert to Dataframe
user_item_df = pd.DataFrame(user_item.todense(), index=uid_c.dtype.categories, columns=vid_c.dtype.categories)

In [19]:
user_item_df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,...,27772,27781,27791,27823,27829,27831,27835,27842,27912,27913,27938,27940,27959,27978,28014,28015,28016,28065,28080,28130,28151,28187,28202,28205,28218,28228,28258,28278,28327,28332,28345,28354,28400,28402,28460,28462,28482,28513,28556,28619
2,-29.166667,0.0,30.833333,0.0,0.833333,0.0,30.833333,0.0,0.0,10.833333,10.833333,0.0,0.0,0.0,0.0,0.0,40.833333,0.0,0.0,0.0,0.0,-19.166667,0.0,-19.166667,0.0,0.0,0.0,0.833333,0.0,0.833333,0.0,0.0,20.833333,20.833333,0.0,0.0,30.833333,0.0,-19.166667,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,-9.0,1.0,0.0,0.0,0.0,11.0,0.0,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,14.509804,0.0,0.0,4.509804,14.509804,0.0,0.0,0.0,0.0,0.0,14.509804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.490196,0.0,-5.490196,0.0,14.509804,0.0,0.0,0.0,0.0,0.0,-5.490196,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,20.666667,0.0,0.0,0.666667,0.0,0.0,-9.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,-19.333333,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,-10.357143,-0.357143,19.642857,0.0,-0.357143,9.642857,0.0,0.0,9.642857,9.642857,0.0,29.642857,9.642857,0.0,-10.357143,29.642857,0.0,0.0,-10.357143,29.642857,0.0,0.0,0.0,0.0,0.0,0.0,19.642857,0.0,-0.357143,-0.357143,0.0,9.642857,0.0,0.0,9.642857,19.642857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# We can use cosine, pearson, adjusted cosine and other similarity metrics
# Cosine is pretty fast so we are gonna use it
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import cosine_similarity

### User - User Collaborative Filtering

First let's try out user-user collaborative filtering

In [21]:
# Creating a user-user correlation matrix

def get_user_corr(mat, index):
  # user_corr = 1 - pairwise_distances(user_item, metric='correlation')
  user_corr = cosine_similarity(mat)
  user_corr = pd.DataFrame(user_corr, index=index, columns=index, dtype=np.float32)
  np.fill_diagonal(user_corr.values, 0.0)
  return user_corr

In [22]:
# Here we will pass the sparse user-item matrix (sklearn has optimized cosine_similarity for sparse matrices)
user_corr = get_user_corr(user_item, uid_c.dtype.categories)

In [23]:
user_corr.head()

Unnamed: 0,2,4,5,6,8,9,12,13,14,15,18,25,28,29,30,31,37,38,50,55,59,62,64,67,71,79,82,86,89,90,93,97,100,101,105,107,109,114,118,127,...,180143,180145,180150,180151,180154,180163,180181,180183,180186,180191,180192,180193,180195,180197,180210,180211,180216,180223,180236,180241,180245,180247,180253,180256,180258,180260,180261,180262,180270,180301,180305,180306,180312,180325,180332,180338,180339,180356,180365,180368
2,0.0,0.214864,0.102438,-0.067137,0.153576,0.098933,0.169499,0.155431,-0.025802,0.023167,0.069494,0.076118,0.054914,0.077459,0.021395,0.110089,0.066446,0.039218,0.08945,0.087513,0.101313,0.184599,0.085922,0.046396,-0.003743,0.097977,0.079071,0.068448,0.051585,0.012284,0.055125,0.065861,0.126284,0.089712,0.197515,0.102644,0.158687,-0.068934,0.129402,0.06748,...,0.076334,0.055461,0.001391,-0.018318,0.01624,0.011096,0.0,0.0,0.0,-0.120664,0.10194,-0.069938,0.013063,0.0,0.0,0.008796,0.0396,0.034237,0.040009,0.052046,0.008136,-0.0119,0.031718,-0.049749,-0.044131,0.077923,0.016739,0.0,-0.028376,0.001072,-0.02287,5e-05,-0.001299,0.038305,0.0,0.058297,0.0,0.025054,0.054853,0.0
4,0.214864,0.0,0.099119,0.0782,0.181207,0.069616,0.067443,0.042955,-0.028392,0.0,0.0,-0.053566,-0.126238,0.036449,-0.041963,0.207803,-0.027276,0.040139,-0.110158,0.151155,-0.01402,0.099483,0.138917,0.007571,0.0,0.086431,0.058517,0.218976,0.054606,0.001671,-0.0677,-0.033819,0.277959,0.060502,-0.084844,0.006543,0.020402,0.0,0.052434,0.135233,...,0.242037,0.0,0.0,-0.005881,0.0,0.003897,0.0,0.0,0.0,-0.235616,0.233586,-0.01204,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.067577,0.168265,0.0,-0.049322,0.222027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.121004,0.0,0.0,0.034763,0.0
5,0.102438,0.099119,0.0,-0.051727,0.079407,0.082008,0.068602,0.084261,-0.048103,-0.026589,-0.002614,0.013242,0.06097,0.115889,0.003453,0.194406,0.086698,0.0776,0.182165,0.118983,0.023996,0.126109,0.175546,0.090286,0.0,0.161437,0.162087,0.098784,0.02648,0.055117,0.015666,0.042585,0.122278,0.006357,0.097431,0.10297,0.090626,-0.009057,0.06701,0.089989,...,0.013573,-0.04637,0.007148,-0.02779,0.085606,0.035006,0.0,0.0,0.0,-0.032385,0.080586,0.078524,0.003458,-0.033949,0.0,0.0,0.0,0.087227,0.008576,0.139148,0.025583,0.055126,0.035655,0.03876,0.073173,0.042895,0.097116,0.0,0.0,0.006492,0.0,0.05574,0.040875,0.218204,0.003359,0.029669,0.0,0.030024,0.042189,-0.001629
6,-0.067137,0.0782,-0.051727,0.0,0.144144,-0.040085,0.064389,0.110314,-0.039953,0.071403,0.184832,0.22833,-0.060626,-0.028353,-0.358932,0.063578,0.058383,-0.059575,-0.016268,0.20998,0.002606,0.019479,-0.020105,0.067089,0.0,-0.007206,-0.045959,0.140883,0.010817,0.087832,-0.057571,0.044401,0.23105,0.004074,0.003968,0.140078,0.263234,0.238268,0.078354,0.069974,...,0.0,-0.040503,0.0,-0.006711,0.062048,0.030285,0.0,0.0,0.0,-0.12423,0.134992,0.033123,0.107997,0.092261,0.0,0.0,0.0,0.0,0.140367,0.0,0.0,-0.010227,0.007745,0.113988,0.066202,0.122848,0.147807,0.0,0.0,0.0,0.0,0.119359,0.00444,0.0,-0.048603,0.097198,0.0,-0.009517,0.10081,-0.036651
8,0.153576,0.181207,0.079407,0.144144,0.0,0.071596,0.110672,0.254678,-0.129189,0.034415,0.15469,0.094892,0.100195,0.142739,-0.111442,0.092761,0.142002,-0.063625,0.072375,0.063259,0.088903,0.145981,0.109295,0.042121,0.0,0.13513,0.089388,0.08046,0.115878,0.067301,0.022296,-0.002565,0.181586,0.162799,-0.016064,0.130908,0.078638,0.087274,0.145962,0.14383,...,0.143161,0.022793,0.0,0.014146,0.046169,0.041436,0.0,0.0,0.0,-0.087654,0.24609,0.027341,-0.00892,0.017496,0.0,0.0,0.0,0.040914,0.042496,0.031097,0.009722,0.012916,0.058259,0.012566,-0.002737,0.155996,0.058938,0.0,0.0,0.0,0.114379,0.013512,0.079146,0.0,-0.04062,0.050388,0.0,0.0,0.048587,0.0


In order to make predictions, we will take a uid and find the nearest k neighbours (in terms of similarity).
Then by doing a weighted average of the ratings of those users, we can predict the ratings for this user.

In [24]:
# To make predictions we need to find the most similar users for a given  user
def get_nearest_users(uid, k=10):
  nearest = user_corr.loc[uid].sort_values(ascending=False)
  return nearest.iloc[:k].index

In [25]:
get_nearest_users(165683)

Int64Index([58323, 5714, 21452, 89675, 35739, 62688, 68167, 87635, 124292,
            17332],
           dtype='int64')

In [26]:
def predict_user_ratings(ratings, uid, k=10):
  # Get user's data
  user_mean = ratings[ratings['uid'] == uid]['vote_avg'].iloc[0]
  user_played = ratings[ratings['uid'] == uid]['vid']

  # Get k nearest neighbours and their ratings and similarities
  similar_users = get_nearest_users(uid, k)
  similar_users_ratings = user_item_df.loc[similar_users].drop(user_played, axis='columns')
  similarity = user_corr.loc[uid, similar_users].copy()
  similarity_denom = np.abs(similarity).sum()

  # In the case that there is no similar users, we will just return the average item ratings
  if similarity_denom == 0:
    return user_item_df.drop(user_played, axis='columns').mean(axis=0) + user_mean

  # Else compute the weighted average
  item_scores = similarity.values[np.newaxis].dot(similar_users_ratings.values)
  item_scores = item_scores / similarity_denom
  predicted_items_scores = pd.Series(item_scores[0] + user_mean, index=similar_users_ratings.columns)
  return predicted_items_scores

In [27]:
# Lets try to predict ratings that a user will give to not played VNs
predict_user_ratings(train, 165683).sort_values(ascending=False)

2002     86.113205
4        85.453281
92       84.532285
57       83.564251
24       83.331650
           ...    
17694    79.797675
66       79.762831
7849     79.624649
49       79.624649
93       79.470496
Length: 8209, dtype: float64

We can now predict scores that users will give to VNs. To build a recommendation system, we can just return the top n items from this list, sorted by predicted scores.

In [28]:
def get_user_recommendations(uid, n=10, k=10):
  predictions = predict_user_ratings(train, uid, k).sort_values(ascending=False).index
  return vn.loc[predictions[:n], ['title']]

In [29]:
get_user_recommendations(165683, 10, 10)

Unnamed: 0,title
2002,Steins;Gate
4,Clannad
92,Muv-Luv Alternative
57,"Sharin no Kuni, Himawari no Shoujo"
24,Umineko no Naku Koro ni
2153,Umineko no Naku Koro ni Chiru
12402,Fata morgana no Yakata
716,Gyakuten Saiban 3
9093,"Kono Oozora ni, Tsubasa o Hirogete"
17716,Gaokao Lianai Yibai Tian


### Item - Item Collaborative Filtering

Next, let's try item-item collaborative filtering.

In [30]:
# Instead of computing adjusted cosine for users, let's do it for items
def get_item_corr(mat, index):
  item_corr = cosine_similarity(mat.T)
  item_corr = pd.DataFrame(item_corr, index=index, columns=index, dtype=np.float32)
  np.fill_diagonal(item_corr.values, 0.0)
  return item_corr

In [31]:
item_corr = get_item_corr(user_item, vid_c.dtype.categories)

In [32]:
item_corr.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,28,29,30,31,32,33,34,35,36,37,38,39,40,41,...,27772,27781,27791,27823,27829,27831,27835,27842,27912,27913,27938,27940,27959,27978,28014,28015,28016,28065,28080,28130,28151,28187,28202,28205,28218,28228,28258,28278,28327,28332,28345,28354,28400,28402,28460,28462,28482,28513,28556,28619
1,0.0,-0.048231,-0.10709,-0.072853,-0.0555,0.01683,-0.107719,0.017217,-0.000644,-0.028043,-0.106995,-0.005745,-0.022645,-0.000755,0.006035,0.007317,-0.088215,-0.002314,0.02317,-0.025016,-0.027326,-0.055408,-0.03658,-0.041784,-0.020634,-0.02646,0.001877,-0.079962,0.014753,0.162143,-0.01272,-0.054867,-0.069436,0.005993,-0.027579,-0.024757,-0.067165,0.001479,0.025676,0.016722,...,0.001615,-0.005295,0.0,0.010077,0.001328,0.003969,0.001346,-0.000498,0.002921,0.0,-0.004092,0.012896,0.026763,-0.001979,0.002792,-0.001,-0.006261,-0.003836,0.018724,0.0,-0.021688,0.0,0.0,0.0,-0.006058,0.0,-0.011105,0.0,0.0,-0.001311,0.003386,0.0,0.007864,0.0,0.018766,0.0,-0.028015,0.0,0.0,0.0
2,-0.048231,0.0,0.042814,0.023208,0.002539,-0.048886,0.035824,-0.005297,-0.003024,0.033867,0.030463,0.017602,-0.00913,0.042261,0.022564,0.057959,0.025977,0.108631,-0.005492,0.029981,-0.001457,0.049242,0.016531,-0.021651,0.013519,0.016611,0.04549,0.169933,0.00919,-0.054805,-0.044391,0.047872,0.037992,0.014712,0.025459,0.000933,-0.002333,0.018368,-0.012121,-0.084241,...,0.0,0.00018,0.0,0.0,0.002811,0.0,-0.099906,0.0,-0.002297,0.0,-0.014497,0.0,-0.000176,0.0,0.0,0.003042,0.0,0.0,0.0,0.0,0.00637,-0.100904,0.032003,-0.005416,0.0,-0.008442,0.0,0.0,0.002962,0.0,0.001809,0.0,0.009496,-0.028684,0.0,0.0,0.0,0.0,0.002626,-0.027321
3,-0.10709,0.042814,0.0,0.090927,0.102514,-0.045023,0.157282,-0.005607,-0.007559,0.05313,0.143834,0.042369,0.080693,-0.027112,0.034852,-0.012186,0.120294,-0.039658,-0.012297,0.029664,0.03667,-0.00293,0.022562,0.058339,0.017264,0.032838,-0.028799,0.056093,0.002309,-0.120263,0.009864,0.108657,0.106407,-0.012882,0.067537,0.03626,0.085009,0.007696,-0.030612,-0.018235,...,0.0,-0.011509,0.00055,0.0,-0.012936,-0.001824,0.035552,0.0,0.000458,0.00018,0.000558,-0.010824,-0.004,0.012085,0.001155,0.00279,0.002176,0.0,0.0,0.0,-0.001804,0.039817,-0.000379,0.004326,-0.005574,0.01004,-0.000833,-0.000832,0.0,0.003619,-0.000264,-0.000593,-0.008344,0.0,0.0,-0.004684,0.0,0.002422,0.0,-0.013501
4,-0.072853,0.023208,0.090927,0.0,0.316042,-0.032362,0.121069,-0.030312,-0.057595,0.054411,0.227515,0.078422,0.057205,-0.089164,-0.034162,0.009022,0.204903,-0.035804,-1.7e-05,0.016822,0.019621,0.003547,-0.01133,0.109549,-0.00546,0.030657,-0.067758,0.006022,-0.015505,-0.095175,0.024245,0.198937,0.146087,-0.069134,0.15965,0.009188,0.135282,-0.030514,-0.001464,-0.006905,...,0.00038,-0.006054,0.001106,-0.002162,-0.005386,-0.002493,0.033295,-0.000527,-0.005928,-0.002893,-0.00177,-0.008556,-0.019484,-0.013526,-0.007363,-0.007109,0.0,-0.009032,-0.008723,0.007693,0.000474,0.022598,-0.001345,0.0,0.005616,0.0,-0.004705,-0.015485,0.009025,0.003074,-0.005111,0.0,-0.008897,0.0,-0.010066,0.000911,0.011579,0.001265,-0.001314,-0.004315
5,-0.0555,0.002539,0.102514,0.316042,0.0,-0.028123,0.098761,0.001272,-0.047528,0.02747,0.190774,0.070591,0.076385,-0.110386,-0.005792,0.014283,0.188026,-0.022754,-0.00264,0.028696,0.00198,0.013856,0.02014,0.135827,0.001521,0.025247,-0.08693,-0.008098,-0.001351,-0.081249,0.026485,0.147847,0.163669,-0.044257,0.141054,0.013795,0.142989,-0.015078,-0.011775,-0.007087,...,-0.006239,-0.008598,0.0,-0.002042,-0.002262,-0.007018,0.039028,0.000288,-0.009608,0.002514,0.003583,-0.007642,-0.005192,0.001908,0.005796,-0.00866,0.0,-0.009037,-0.002211,0.0,-0.001648,0.031803,0.004627,0.001157,0.003068,0.0,-0.001305,0.001948,0.0,0.000759,-0.006549,0.0,-0.005925,0.0,-0.001267,-0.001866,0.010545,0.001406,0.000408,-0.00877


Now to make predictions we can use many approaches

One approach is to use the VNs rated by a user, and using those ratings and similarity between rated VNs and other VNs, find scores for other VNs.

In [33]:
def predict_by_user_ratings(ratings, uid):
  # Get users ratings
  user_mean = ratings[ratings['uid'] == uid]['vote_avg'].iloc[0]
  user_played = ratings[ratings['uid'] == uid]['vid']
  user_ratings = user_item_df.loc[uid, user_played].copy()

  # Get similariies for these items (also drop already rated VNs)
  similarities = item_corr.loc[user_played].copy()
  similarities.drop(user_played, axis=1, inplace=True)
  
  # Multiply ratings by similarities and calculate weighted averages
  predicted_ratings = user_ratings.T.dot(similarities)
  predicted_ratings = predicted_ratings / np.abs(similarities).sum(axis=0) + user_mean

  # Return list of predictions
  return predicted_ratings

In [34]:
predict_by_user_ratings(train, 165683).sort_values(ascending=False)

12911    100.390065
6435      97.559024
17835     97.218667
4055      96.064498
14936     95.833459
            ...    
25342     67.606616
1922      67.527041
818       65.097222
26254     63.741287
18895           NaN
Length: 8209, dtype: float64

The problem with this approach is that VNs that are similar to only some VNs rated by user get higher scores than others. And these are the VNs that in general have low number of votes and just get high scores due to some similarity to user rated VNs.

A better approach is to find k similar VNs for each VN user has rated, find their union and calculate weighted average for these VNs.
For the rest, we can try using the general VN rating mean.

In [35]:
# This procedure finds the k closest VNs to a VN
def predict_closest_vn(vid, k=5):
  similarities = item_corr.loc[vid].sort_values(ascending=False)
  return vn.loc[similarities[:k].index, ['title']]

In [36]:
predict_closest_vn(4, 10)

Unnamed: 0,title
5,Little Busters!
11,Fate/Stay Night
751,Rewrite
2002,Steins;Gate
17,Ever17 -The Out of Infinity-
33,Kanon
211,G-senjou no Maou
57,"Sharin no Kuni, Himawari no Shoujo"
92,Muv-Luv Alternative
5154,Grisaia no Kajitsu -Le Fruit de la Grisaia-


Looks like similarity can indeed find similar VNs

In [37]:
# Predict the rating the user will give to a VN

def predict_item_rating(ratings, uid, vid, k=5):
  # Get user data  
  user_mean = ratings[ratings['uid'] == uid]['vote_avg'].iloc[0]
  played = ratings[ratings['uid'] == uid]['vid']
  # If already played, return current rating
  if vid in played.values:
    return user_item_df.loc[uid, vid] + user_mean

  # Get k most similar VNs and find ones rated by user
  similarity = item_corr.loc[vid].sort_values(ascending=False)[:k]
  common = set(played).intersection(set(similarity.index))
  # If no common VN found, return mean
  if len(common) == 0:
    return user_item_df[vid].mean() + user_mean
  # Else compute weighted average
  common = pd.Index(common)
  similarity = similarity[common]
  scores = user_item_df.loc[uid][common]
  score = scores.values.dot(similarity.values) / np.abs(similarity).sum()
  score += user_mean
  return score

In [38]:
predict_item_rating(train, 165683, 93, 10)

70.17004679985439

In [39]:
# A function to recommend new VNs to users

def get_item_recommendations(ratings, uid, k=5):
  user_mean = ratings[ratings['uid'] == uid]['vote_avg'].iloc[0]
  already_played = set(ratings[ratings['uid'] == uid]['vid'])
  closest_vns = set()
  for vid in already_played:
    closest_vns |= set(predict_closest_vn(vid, k).index)
  closest_vns -= already_played
  closest_vns = pd.Index(closest_vns)
  already_played = pd.Index(already_played)

  user_ratings = user_item_df.loc[uid]
  user_ratings = user_ratings[already_played]
  similarities = item_corr.loc[already_played]
  similarities = similarities[closest_vns]

  predictions = user_ratings.values[np.newaxis].dot(similarities.values)
  predictions = predictions[0] / np.abs(similarities).sum(axis=0)
  predictions = pd.Series(predictions + user_mean, index=closest_vns, name='predicted_vote')
  predictions = pd.merge(vn.loc[closest_vns, 'title'], predictions, left_index=True, right_index=True)
  return predictions.sort_values(by='predicted_vote', ascending=False)

In [40]:
get_item_recommendations(train, 165683)

Unnamed: 0,title,predicted_vote
751,Rewrite,84.909757
50,Fate/Hollow Ataraxia,84.838165
4,Clannad,84.622987
7014,Dangan Ronpa Kibou no Gakuen to Zetsubou no Ko...,84.279014
16743,Lucy - Geunyeoga Baladeon Geos -,84.165605
2002,Steins;Gate,84.063803
92,Muv-Luv Alternative,84.05815
5154,Grisaia no Kajitsu -Le Fruit de la Grisaia-,83.409159
1143,Maji de Watashi ni Koishinasai!!,83.057005
7,Tsukihime,82.879107


## Evaluation

In [42]:
# Testing User based CF
mae, rmse = 0, 0
grpd_uid = test.groupby('uid')
for uid, grp in tqdm(grpd_uid, total=len(grpd_uid)):
  user_ratings = predict_user_ratings(train, uid, 30)
  diffs = user_ratings.loc[grp.vid.values] - grp.vote.values
  rmse += np.sum(np.square(diffs))
  mae += np.sum(np.abs(diffs))
mae /= len(test)
rmse = np.sqrt(rmse / len(test))
print(f"USER BASED CF: k = 30")
print(f"RMSE : {rmse}")
print(f"MAE : {mae}")

HBox(children=(FloatProgress(value=0.0, max=20931.0), HTML(value='')))


USER BASED CF: k = 30
RMSE : 14.984420411734487
MAE : 11.309418404884813


In [44]:
# Testing item based CF
mae, rmse = 0, 0
for row in tqdm(test.itertuples(index=False), total=len(test)):
  diff = row.vote - predict_item_rating(train, row.uid, row.vid, 30)
  mae += np.abs(diff)
  rmse += np.square(diff)
mae /= len(test)
rmse = np.sqrt(rmse / len(test))
print(f"ITEM BASED CF: k = 30")
print(f"RMSE : {rmse}")
print(f"MAE : {mae}")

HBox(children=(FloatProgress(value=0.0, max=79311.0), HTML(value='')))


ITEM BASED CF: k = 30
RMSE : 13.87273615906075
MAE : 9.922747471973373


We can clearly see that item based CF performs better than user based CF.