In [1]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.sparse as sparse
import implicit
import itertools
import copy
import pickle

from tqdm import tqdm
from gensim.models import Word2Vec
from gensim.models.keyedvectors import WordEmbeddingsKeyedVectors
from subprocess import call
from sklearn.metrics import mean_squared_error
from itertools import groupby
from sklearn.utils import shuffle
from implicit.evaluation import  *
from implicit.als import AlternatingLeastSquares as ALS
from implicit.bpr import BayesianPersonalizedRanking as BPR

%matplotlib inline
%load_ext watermark
%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'
%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,sklearn

# from arena_util import write_json
# from arena_util import remove_seen

Ethen 2020-06-22 15:18:38 

CPython 3.6.8
IPython 7.2.0

numpy 1.18.0
pandas 1.0.1
matplotlib 3.0.2
sklearn 0.21.3


In [2]:
# Load the train and validation playlist dumps into DataFrames.
# NOTE: both paths are absolute and machine-specific.
tr, te = (
    pd.read_json(json_path, encoding="utf-8")
    for json_path in (
        "/Users/minki/pythonworkspace/kakao_melon/dataset/train.json",
        "/Users/minki/pythonworkspace/kakao_melon/dataset/val.json",
    )
)

In [3]:
# Pull the song lists, tag lists and playlist ids out of each frame as plain
# Python lists (one entry per playlist, in frame order).
tr_songs, tr_tags, tr_ids = (tr[column].tolist() for column in ("songs", "tags", "id"))
te_songs, te_tags, te_ids = (te[column].tolist() for column in ("songs", "tags", "id"))

In [4]:
from collections import Counter

# Usage count per song over all training playlists.
song_ret = [song_id for playlist_songs in tr_songs for song_id in playlist_songs]
song_num = dict(Counter(song_ret))
# Most used first; the sort is stable, so ties keep first-seen order.
most_song = sorted(song_num.items(), key=lambda pair: pair[1], reverse=True)
top100_most_song = [song_id for song_id, _ in most_song[:100]]
top200_most_song = [song_id for song_id, _ in most_song[:200]]
set_top200_song = set(top200_most_song)

# Usage count per tag over all training playlists.
tag_ret = [tag_name for playlist_tags in tr_tags for tag_name in playlist_tags]
tag_num = dict(Counter(tag_ret))
most_tag = sorted(tag_num.items(), key=lambda pair: pair[1], reverse=True)
top10_most_tag = [tag_name for tag_name, _ in most_tag[:10]]
top30_most_tag = [tag_name for tag_name, _ in most_tag[:30]]
set_top30_tag = set(top30_most_tag)

In [5]:
# Build the per-playlist rows of column indices that feed the matrix
# factorisation step: song columns first, tag columns after them.
new_tr = []
song_to_idx = {}  # key: raw song id, value: column index
song_idx = 0
song_minimum_number = 3  # a song is kept only if it is used MORE than this many times

# Assign a column index to every sufficiently frequent song and translate
# each playlist's songs into those indices.
# tqdm wraps the list itself (not the enumerate object) so the bar knows its total.
for song_list in tqdm(tr_songs):
    new_song_list = []
    for song_id in song_list:
        if song_num[song_id] > song_minimum_number:
            if song_id not in song_to_idx:
                song_to_idx[song_id] = song_idx
                song_idx += 1
            new_song_list.append(song_to_idx[song_id])
    new_tr.append(new_song_list)

n_songs = len(song_to_idx)

# Same for tags; tag columns are offset by n_songs so they follow the song columns.
tag_to_idx = {}  # key: tag, value: column index (already offset by n_songs)
tag_idx = 0
tag_minimum_number = 3  # a tag is kept only if it is used MORE than this many times

for i, tag_list in enumerate(tqdm(tr_tags)):
    new_tag_list = []
    for tag_id in tag_list:
        if tag_num[tag_id] > tag_minimum_number:
            if tag_id not in tag_to_idx:
                tag_to_idx[tag_id] = n_songs + tag_idx
                tag_idx += 1
            new_tag_list.append(tag_to_idx[tag_id])
    new_tr[i].extend(new_tag_list)

n_tags = len(tag_to_idx)

# Shuffle the rows. The playlist ids are shuffled with the SAME permutation
# (and re-read from `tr`, so re-running this cell stays consistent); otherwise
# row i of new_tr would no longer belong to tr_ids[i] when the two are zipped
# together later. A fixed random_state makes the order reproducible.
new_tr, tr_ids = shuffle(new_tr, tr.id.tolist(), random_state=42)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [6]:
# Preprocess the test data the same way as the training data.
# The test rows must live in the training column space, so the TRAIN index
# mappings are reused. (The previous version looked ids up in two dicts that
# were never filled, which silently dropped every song and tag.)
# Songs/tags without a training column are skipped.
te_song_to_idx = song_to_idx
te_tag_to_idx = tag_to_idx

new_te = []
for songs, tags in zip(te_songs, te_tags):
    row = [te_song_to_idx[song_id] for song_id in songs if song_id in te_song_to_idx]
    row.extend(te_tag_to_idx[tag] for tag in tags if tag in te_tag_to_idx)
    new_te.append(row)

In [7]:
# Reverse lookups used to map matrix columns back to the raw data.
# idx_to_song: column index -> raw song id.
idx_to_song = dict(zip(song_to_idx.values(), song_to_idx.keys()))
# idx_to_tag: tag column index with the n_songs offset removed -> tag.
idx_to_tag = {column - n_songs: tag for tag, column in tag_to_idx.items()}

# make csr_matrix
def lil_to_csr_matrix(lil, shape = None):
    """Convert a list of per-row column-index lists into a binary CSR matrix.

    Parameters
    ----------
    lil : sequence of sequences of int
        ``lil[r]`` holds the column indices that are set in row ``r``.
    shape : tuple of (int, int), optional
        Shape of the resulting matrix; when omitted it is left to scipy to
        infer from the indices.

    Returns
    -------
    scipy.sparse.csr_matrix
        Matrix with a 1 contributed by every listed (row, column) pair.
    """
    row = []
    col = []
    for row_idx, col_indices in enumerate(lil):
        row.extend([row_idx] * len(col_indices))
        col.extend(col_indices)

    # One unit entry per (row, col) pair collected above.
    data = np.ones(len(row), dtype=int)

    # Use the explicitly imported scipy.sparse namespace rather than a bare
    # `csr_matrix` name that is not imported anywhere by name.
    return sparse.csr_matrix((data, (row, col)), shape = shape)

# Train and test matrices share one column space:
# n_songs song columns followed by n_tags tag columns, one row per playlist.
tr_csr_mat, te_csr_mat = (
    lil_to_csr_matrix(rows, (len(rows), n_songs + n_tags))
    for rows in (new_tr, new_te)
)

In [16]:
# Create and train the ALS model.
# A single model is fitted on the joint song+tag matrix; it is then split into
# a song-only and a tag-only model that share the same user (playlist) factors.
import implicit
als_model = ALS(factors=128, regularization=0.08, calculate_training_loss = True)
# fit() receives the transpose of the playlist x item matrix, scaled by 15.0.
# NOTE(review): assumes the installed `implicit` expects an item x user matrix
# here, and that 15.0 acts as a confidence weight — confirm against the
# installed version.
als_model.fit(tr_csr_mat.T * 15.0)

# These two models are never fitted; their factors are copied in below.
song_model = ALS(use_gpu=False)
tag_model = ALS(use_gpu=False)

# item_factor = Array of latent factors for each item in the training set
# user_factor = Array of latent factors for each user in the training set

song_model.user_factors = als_model.user_factors
tag_model.user_factors = als_model.user_factors

# Columns [0, n_songs) are songs, columns [n_songs, ...) are tags, so the tag
# model's item indices start again at 0 (matching the keys of idx_to_tag).
song_model.item_factors = als_model.item_factors[:n_songs]
tag_model.item_factors = als_model.item_factors[n_songs:]

# Playlist x song and playlist x tag views of the training matrix, passed to
# recommend() in the next cell.
song_rec_csr = tr_csr_mat[:, :n_songs]
tag_rec_csr = tr_csr_mat[:, n_songs:]

HBox(children=(IntProgress(value=0, max=15), HTML(value='')))

In [48]:
# Recommend songs and tags for every row of the training matrix with the
# split ALS models, translating the returned indices back to raw ids.
song_ret, tag_ret, total = [], [], []
from tqdm.auto import tqdm
for row_idx in tqdm(range(tr_csr_mat.shape[0])):
    # Each hit is indexed at position 0 to obtain the recommended item index.
    song_hits = song_model.recommend(row_idx, song_rec_csr, N=500)
    playlist_songs = [idx_to_song[hit[0]] for hit in song_hits]

    tag_hits = tag_model.recommend(row_idx, tag_rec_csr, N=50)
    # Hits whose index has no entry in idx_to_tag are dropped.
    playlist_tags = [idx_to_tag[hit[0]] for hit in tag_hits if hit[0] in idx_to_tag]

    song_ret.append(playlist_songs)
    tag_ret.append(playlist_tags)

HBox(children=(IntProgress(value=0, max=115071), HTML(value='')))

In [73]:
# One record per playlist: its id plus at most 500 recommended songs and
# at most 50 recommended tags.
returnval = [
    {"id": playlist_id, "songs": songs[:500], "tags": tags[:50]}
    for playlist_id, songs, tags in zip(tr_ids, song_ret, tag_ret)
]

In [75]:
# One token list per playlist for Word2Vec: song ids converted to strings,
# followed by the playlist's tags.
total = [
    [str(song_id) for song_id in entry['songs']] + entry['tags']
    for entry in tqdm(returnval)
]

HBox(children=(IntProgress(value=0, max=115071), HTML(value='')))

In [77]:
# Make Word2Vec Model
# Each "sentence" in `total` is one playlist's tokens (stringified song ids + tags).

min_count = 3   # tokens occurring fewer times than this are ignored
size = 100      # embedding dimensionality (also reused for the playlist vectors later)
window = 210    # context window; wider than the 150 tokens (100 songs + 50 tags) a sentence may hold
sg = 1          # documented values are 0 (CBOW) and 1 (skip-gram); the former
                # value 5 only worked because any truthy value selects skip-gram

# NOTE(review): `size=` is the gensim 3.x keyword name — confirm if gensim is upgraded.
w2v_model = Word2Vec(total, min_count = min_count, size = size, window = window, sg = sg)

KeyboardInterrupt: 

In [None]:
# Lookup tables for the test playlists, keyed by the playlist id as a string:
#   te_ply_song  -> the playlist's song ids
#   te_ply_tag   -> the playlist's tags
#   total_te_ply -> stringified song ids followed by tags (Word2Vec tokens)
te_ply_song = {}
te_ply_tag = {}
total_te_ply = {}

# Iterate the three columns directly instead of DataFrame.iterrows(): the
# progress bar gets a real total and no row object is mutated as scratch space.
for ply_id, ply_songs, ply_tags in tqdm(zip(te["id"], te["songs"], te["tags"]), total=len(te)):
    ply_key = str(ply_id)
    te_ply_song[ply_key] = ply_songs
    te_ply_tag[ply_key] = ply_tags
    total_te_ply[ply_key] = [str(song_id) for song_id in ply_songs] + ply_tags

In [8]:
# Build a playlist-level embedding model: every test playlist is represented
# by the sum of the Word2Vec vectors of its tokens.
songs_list = song_num.keys()
tags_list = tag_num.keys()
total_key = list(total_te_ply.keys())
total_list = list(total_te_ply.values())
p2v_model = WordEmbeddingsKeyedVectors(size)

ID, vec = [], []

# total_key and total_list come from the same dict, so they are aligned.
for tokens, ply_key in zip(tqdm(total_list), total_key):
    summed = 0
    for token in tokens:
        try:
            summed += w2v_model.wv.get_vector(token)
        except KeyError:
            # presumably the token is missing from the Word2Vec vocabulary
            continue
    if isinstance(summed, int):
        # No token contributed a vector, so this playlist gets no embedding.
        continue
    ID.append(ply_key)
    vec.append(summed)
p2v_model.add(ID, vec)

100%|██████████| 23015/23015 [00:01<00:00, 13699.57it/s]


In [17]:
# get_result
# For every test playlist, collect the songs/tags of its 200 most similar
# playlists and answer with the 100 most frequent songs and 10 most frequent
# tags, padding short lists with globally popular items.


def _rank_and_pad(candidates, k, fallback):
    """Return the ``k`` most frequent items of ``candidates``, padded from ``fallback``.

    Parameters
    ----------
    candidates : list
        Items (with repetitions) to rank by frequency; ties keep first-seen order.
    k : int
        Desired length of the result.
    fallback : list
        Ordered items used to fill the result when fewer than ``k`` distinct
        candidates exist; items already selected are skipped.

    Returns
    -------
    list
        At most ``k`` distinct items (exactly ``k`` when ``fallback`` is long enough).
    """
    ranked = [item for item, _ in Counter(candidates).most_common()][:k]
    missing = k - len(ranked)
    if missing > 0:
        already = set(ranked)
        # Pad from an ordered list: a set cannot be sliced, and padding in
        # popularity order keeps the result deterministic.
        ranked += [item for item in fallback if item not in already][:missing]
    return ranked


no_idx = 0  # number of playlists answered purely from the popularity fallback

answer = []
for ply_id in tqdm(total_key):
    try:
        neighbours = p2v_model.most_similar(str(ply_id), topn=200)
    except KeyError:
        # The playlist has no vector in p2v_model (none of its tokens had an
        # embedding), so fall back to the globally most used songs and tags.
        answer.append({
            "id": int(ply_id),
            "songs": top100_most_song,
            "tags": top10_most_tag
        })
        no_idx += 1
        continue

    get_song = []
    get_tag = []
    for neighbour in neighbours:
        neighbour_id = neighbour[0]
        get_song += te_ply_song[neighbour_id]
        get_tag += te_ply_tag[neighbour_id]

    # Songs and tags are padded independently (100 songs, 10 tags).
    answer.append({
        "id": int(ply_id),
        "songs": _rank_and_pad(get_song, 100, top200_most_song),
        "tags": _rank_and_pad(get_tag, 10, top30_most_tag)
    })

100%|██████████| 23015/23015 [01:13<00:00, 311.95it/s]


In [18]:
import json
# Serialise the answers to results.json as UTF-8; ensure_ascii=False keeps
# non-ASCII characters unescaped in the file.
with open('results.json', 'w', encoding='utf-8') as out_file:
    json.dump(answer, out_file, ensure_ascii=False)