In [1]:
import glob
import json
import os
import re
import numpy as np
import pandas as pd
import logging
import torch
from torch.autograd import Variable
from itertools import repeat
from functools import reduce
from operator import add
from tqdm import tqdm_notebook as tqdm
from math import log
from collections import Counter
from arena_util import evaluate, remove_seen, write_json
from models import NearestNeighbor
from scipy import sparse
from IPython.display import display
from ipyparallel import Client
from multiprocessing import log_to_stderr, RawArray, Pool, Array, Value

# Data import

In [3]:
metas_df = pd.read_json('data/song_meta.json')
train_df = pd.read_json('data/train.json')
valid_df = pd.read_json('data/valid.json')

# 1. KNN 

In [54]:
N_SONG = len(metas_df)
N_TRACK = max(*train_df['id'], *valid_df['id'])
N_VALID = len(valid_df)
N_TRAIN = len(train_df)
logger = log_to_stderr()
logger.setLevel(20)
N_SONG_TRAIN = len(np.concatenate(train_df['songs']))
N_SONG_VALID = len(np.concatenate(valid_df['songs']))
print(f"{'N_SONG':<15} {N_SONG}")
print(f"{'N_TRACK':<15} {N_SONG_VALID}")
print(f"{'N_SONG_VALID':<15} {N_SONG_VALID}")
print(f"{'N_SONG_TRAIN':<15} {N_SONG_TRAIN}")

N_SONG          707989
N_TRACK         421199
N_SONG_VALID    421199
N_SONG_TRAIN    5285871


In [140]:
def create_track_song_csr(df):
    df = df.reset_index(drop=True)
    display(df)
    track_song_csr = sparse.csr_matrix((N_TRACK, N_SONG), dtype=np.bool)

    n_song_in_track = len(np.concatenate(df['songs']))
    logger.info(f"total of {n_song_in_track}")
    for i, (track, song) in enumerate(zip(np.repeat(df['id'], list(map(len, df['songs']))), np.concatenate(df['songs']))):
        if i % (n_song_in_track // 10) == 0:
            logger.info(f"\t {i * 100 // n_song_in_track}% done")
        track_song_csr[(track, song)] = True

    return track_song_csr

In [200]:
song_cnt = Counter()
for songs in train_df['songs']:
    song_cnt.update(songs)

mp_songs = [k for k, v in song_cnt.most_common(150)]

tag_cnt = Counter()
for tags in train_df['tags']:
    tag_cnt.update(tags)

mp_tags = [k for k, v in tag_cnt.most_common(150)]

try:
    train_csr = sparse.load_npz("process/train.npz")
except:
    logger.info(f"Creating train.npz")
    with Pool() as pool:
        csr_li = pool.map(create_track_song_csr, [train_df[chunk * N_TRAIN // 10: (chunk + 1) * N_TRAIN // 10] for chunk in range(10)])
    train_csr = reduce(add, csr_li)
    sparse.save_npz("process/train.npz", tot)

try:
    valid_csr = sparse.load_npz("process/valid.npz")
except:
    logger.info(f"Creating valid.npz")
    with Pool(1) as pool:
        csr_li = pool.map(create_track_song_csr, [valid_df[chunk * N_VALID // 10: (chunk + 1) * N_VALID // 10] for chunk in range(10)])
    valid_csr = reduce(add, csr_li)
    sparse.save_npz("process/valid.npz", tot)

In [203]:
results = []
count = 0
for _, row in valid_df.iterrows():
    count += 1
    if count % 1000 == 0:
        print(count)
    col_vec = train_csr.astype(int) * valid_csr.getrow(row['id']).astype(int).H
    track2sim = Counter()
    
    for r in col_vec.nonzero()[0]:
        track2sim[r] = col_vec[(r, 0)]

    sim_tracks = [track for track, cnt in track2sim.most_common(10)]
    candidates = [song for track in sim_tracks for song in train_csr.getrow(track).nonzero()[1]] + mp_songs
    
    results.append({
        "id": row['id'],
        "songs": remove_seen(row['songs'], candidates)[:100],
        "tags": remove_seen(row['tags'], mp_tags)[:10]
    })
results

'인디', 'bgm', '밤새벽', '감성', '혼자', '불금', '겨울감성', '시원한', '봄', '따뜻한', '아침', '이별', '봄노래', '어쿠스틱', '파티']}, {'id': 131409, 'songs': [169984, 501764, 540678, 469000, 364554, 215053, 8208, 69648, 426003, 460823, 133143, 129059, 188452, 208933, 573478, 348200, 118827, 505900, 509998, 317488, 442419, 649267, 575544, 30780, 583742, 67651, 399426, 75842, 67655, 180302, 47186, 647251, 372821, 628826, 581723, 168031, 557152, 436320, 49250, 200802, 360546, 350309, 624738, 612462, 651374, 510064, 454768, 634998, 211063, 438392, 559232, 16513, 55427, 426116, 649347, 276612, 20617, 510089, 508043, 530575, 157840, 426129, 512148, 272536, 409754, 465051, 575649, 637093, 39077, 4263, 581799, 168104, 157861, 542896, 592049, 563379, 645301, 225461, 528567, 159928, 690359, 503995, 489660, 422077, 411838, 590012, 493762, 366786, 49348, 157892, 71878, 88261, 362695, 667849, 704707, 329934, 153807, 303312, 544974, 147667], 'tags': ['비', '댄스', 'RnB', '연말', '카페', '인디', 'bgm', '밤새벽', '감성', '혼자', '불금', '겨울감성', '시원한', 

In [222]:
new = []
for row in results:
    new.append({
        "id": row['id'],
        "songs": row['songs'],
        "tags": row['tags'][:10]
    })

with open("session2.json", "w", encoding="utf8") as f:
    json_str = json.dumps(new, ensure_ascii=False, default=int)
    f.write(json_str)

# 2. Item-based CF

In [10]:
try:
    train_csr = sparse.load_npz("process/train.npz")
except:
    logger.info(f"Creating train.npz")
    with Pool() as pool:
        csr_li = pool.map(create_track_song_csr, [train_df[chunk * N_TRAIN // 10: (chunk + 1) * N_TRAIN // 10] for chunk in range(10)])
    train_csr = reduce(add, csr_li)
    sparse.save_npz("process/train.npz", tot)

In [9]:
class MatrixFactorization(torch.nn.Module):
    def __init__(self, n_users, n_items, n_factors=20):
        super().__init__()
        self.user_factors = torch.nn.Embedding(n_users, n_factors, sparse=True)
        self.item_factors = torch.nn.Embedding(n_items, n_factors, sparse=True)

    def forward(self, user, item):
        return (self.user_factors(user) * self.item_factors(item)).sum(1)

    def predict(self, user, item):
        return self.forward(user, item)

# Feature Generation