## Matrix Factorization - https://arena.kakao.com/forum/topics/200

In [None]:
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR
import numpy as np
import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'
from sklearn.utils import shuffle
from scipy.sparse import *

import pandas as pd
# Load the train and validation playlists as DataFrames.
tr = pd.read_json("./train.json", encoding="utf-8")
te = pd.read_json("./val.json", encoding="utf-8")

# Flatten every train playlist's tag list into one long list, then rank the
# distinct tags from most to least frequent (ties keep first-seen order
# because the sort is stable).
from collections import Counter
ret = [tag for tags in tr.tags.tolist() for tag in tags]
r = sorted(Counter(ret).items(), key=lambda pair: -pair[1])
top_tags = [name for name, _ in r[:1000]]

# Plain-list copies of the columns used by the encoding steps below.
tr_songs, tr_tags = tr.songs.tolist(), tr.tags.tolist()
te_songs, te_tags = te.songs.tolist(), te.tags.tolist()
te_ids = te.id.tolist()

from itertools import groupby
tr = []  # rebind tr: from here on it holds the index-encoded train playlists
iid_to_idx = {}
tag_to_idx = {}

# Give every distinct song id a dense index in first-seen order, then
# re-express each playlist's songs through those indices.
for songs in tr_songs:
    for item_id in songs:
        if item_id not in iid_to_idx:
            # The current size of the mapping is the next free index.
            iid_to_idx[item_id] = len(iid_to_idx)
    tr.append([iid_to_idx[item_id] for item_id in songs])

n_items = len(iid_to_idx)

# Tags are numbered after the songs (their indices start at n_items), so
# songs and tags share a single index space.
for i, tags in enumerate(tr_tags):
    for tag in tags:
        if tag not in tag_to_idx:
            tag_to_idx[tag] = n_items + len(tag_to_idx)
    # tr[i] becomes the i-th playlist's song indices followed by its tag indices.
    tr[i].extend(tag_to_idx[tag] for tag in tags)

n_tags = len(tag_to_idx)

# The test set goes through the same encoding as the train set, except that
# songs and tags absent from the train mappings are skipped instead of being
# assigned new indices.
from itertools import groupby
te = [
    [iid_to_idx[item_id] for item_id in songs if item_id in iid_to_idx]
    for songs in te_songs
]
idx = 0
for i, tags in enumerate(te_tags):
    # Append the known tags' indices after the i-th playlist's song indices.
    te[i].extend(tag_to_idx[tag] for tag in tags if tag in tag_to_idx)

# Randomise the order of the encoded train playlists.
tr = shuffle(tr)

# Inverse lookups, kept in dictionaries so that indices can be matched back
# to the raw data. Tag keys are shifted down by n_items so they start at 0.
idx_to_iid = {index: item_id for item_id, index in iid_to_idx.items()}
idx_to_tag = {index - n_items: tag for tag, index in tag_to_idx.items()}



def lil_to_csr_matrix(lil, shape = None):
    """Convert a list of per-row column-index lists into a CSR matrix.

    Args:
        lil: Sequence whose k-th element is a list of the column indices
            that are set in row k.
        shape: Optional ``(n_rows, n_cols)`` passed through to
            ``csr_matrix``; when ``None`` scipy infers it from the indices.

    Returns:
        A ``csr_matrix`` holding a 1 for every listed (row, column) pair.
        A column listed more than once in the same row accumulates, since
        scipy sums duplicate coordinates.
    """
    row_indices = []
    col_indices = []
    for row_number, columns in enumerate(lil):
        # One row entry per listed column keeps both lists aligned.
        row_indices.extend([row_number] * len(columns))
        col_indices.extend(columns)

    ones = np.repeat(1, len(row_indices))

    return csr_matrix((ones, (row_indices, col_indices)), shape=shape)

import json

# Imported explicitly: the bare name `scipy` is never bound in this file
# (only `from scipy.sparse import *`), so `scipy.sparse.vstack` would raise
# NameError.
from scipy.sparse import vstack
from tqdm.auto import tqdm

# One row per playlist; the first n_items columns are songs and the remaining
# n_tags columns are tags.
tr_csr_mat = lil_to_csr_matrix(tr, (len(tr), n_items + n_tags))
te_csr_mat = lil_to_csr_matrix(te, (len(te), n_items + n_tags))

# Train rows come first and test rows follow, so test playlist `u` sits at
# row `te_offset + u` of `r` (and of the learned user factors).
r = vstack([tr_csr_mat, te_csr_mat], format="csr")
te_offset = tr_csr_mat.shape[0]

als_model = ALS(factors=128, regularization=0.08)
# NOTE(review): fitting on the transposed matrix and reading (index, score)
# pairs from `recommend` below matches the pre-0.5 `implicit` API; confirm
# the installed version.
als_model.fit(r.T * 15.0)

# Two views over the same factorisation: one ranks only songs, the other
# only tags. Both share the user factors learned above.
item_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)

item_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

item_model.item_factors = als_model.item_factors[:n_items]
tag_model.item_factors = als_model.item_factors[n_items:]

# Slice the full (train + test) matrix so that the rows looked up by
# `recommend` are the test playlists' own interactions. Slicing only the
# train matrix made the loop below score train playlists instead.
item_rec_csr = r[:, :n_items]
tag_rec_csr = r[:, n_items:]

item_ret = []
tag_ret = []
for u in tqdm(range(te_csr_mat.shape[0])):
    user = te_offset + u  # row of the u-th test playlist inside `r`
    item_rec = item_model.recommend(user, item_rec_csr, N=100)
    item_rec = [idx_to_iid[x[0]] for x in item_rec]
    tag_rec = tag_model.recommend(user, tag_rec_csr, N=100)
    tag_rec = [idx_to_tag[x[0]] for x in tag_rec if x[0] in idx_to_tag]
    item_ret.append(item_rec)
    tag_ret.append(tag_rec)

# One record per test playlist: up to 100 songs and up to 10 tags.
returnval = [
    {
        "id": _id,
        "songs": rec[:100],
        "tags": tag_rec[:10]
    }
    for _id, rec, tag_rec in zip(te_ids, item_ret, tag_ret)
]

with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(returnval, f, ensure_ascii=False)