In [2]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc

from tqdm import tqdm
from scipy.sparse import csr_matrix

%matplotlib inline

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Input

from tensorflow import keras

"""mpl.rcParams['axes.unicode_minus'] = False
plt.rcParams["figure.figsize"] = (8,3)
plt.rcParams["font.family"] = "Malgun Gothic"
plt.rcParams["font.size"] = 10"""

os.chdir("./arena_data/")

train = pd.read_json("train.json", encoding="utf-8")
test = pd.read_json("test.json", encoding="utf-8")
val = pd.read_json("val.json", encoding="utf-8")
song_meta = pd.read_json("song_meta.json", encoding="utf-8")
genre_gn_all = pd.read_json("genre_gn_all.json", typ = 'series', encoding="utf-8") 

### Song FIltering

In [5]:
songs_in_plylst = np.dstack(
    (
        np.concatenate(train.songs.values),
        np.repeat(train.id.values, list(map(len, train.songs))), 
    )
)

songs_in_plylst = pd.DataFrame(songs_in_plylst[0], columns = ['song_id', 'plylst_id'])
songs_in_plylst.song_id.value_counts().quantile([.1, .25, .5, .75, .8, .9, .95, .99])

song_cnt = songs_in_plylst.song_id.value_counts().reset_index(name = "counts_in_plylst")

song_cnt[song_cnt.counts_in_plylst >= 13].head()

Unnamed: 0,index,counts_in_plylst
0,144663,2175
1,116573,2121
2,357367,1981
3,366786,1919
4,654757,1647


In [6]:
major_song_lst = song_cnt[song_cnt.counts_in_plylst >= 13]['index'].tolist()

### train playlist 수정

In [14]:
# fsongs = train.songs.apply(lambda x : list(set(x).intersection(set(major_song_lst))))

In [3]:
import pickle

"""
with open("fsongs.pickle", "wb") as f:
    pickle.dump(fsongs, f, protocol = pickle.HIGHEST_PROTOCOL)
"""

with open("fsongs.pickle", 'rb') as f:
    fsongs = pickle.load(f)

train['fsongs'] = fsongs

# va_fsongs = val.songs.apply(lambda x : list(set(x).intersection(set(major_song_lst))))

"""
with open("va_fsongs.pickle", "wb") as f:
    pickle.dump(va_fsongs, f, protocol = pickle.HIGHEST_PROTOCOL)
"""

with open("va_fsongs.pickle", 'rb') as f:
    va_fsongs = pickle.load(f)

val['fsongs'] = va_fsongs

In [5]:
tr_songs = train.fsongs.tolist()
tr_tags = train.tags.tolist()

va_songs = val.fsongs.tolist()
va_tags = val.tags.tolist()

In [6]:
tr = [] # tr 초기화
iid_to_idx = {}
tag_to_idx = {} 
idx = 0

for i, l in enumerate(tr_songs):
    view = l
    for item_id in view:
        if item_id not in iid_to_idx:
            iid_to_idx[item_id] = idx
            idx += 1
    view = [iid_to_idx[x] for x in view] 
    tr.append(view) # => song_id 재설정

n_items = len(iid_to_idx)

In [7]:
idx_to_iid = {x:y for(y,x) in iid_to_idx.items()}
idx_to_tag = {(x - n_items):y for(y,x) in tag_to_idx.items()}

In [8]:
del [tr_songs, tr_tags]

In [48]:
"""idx = 0
for i, tags in enumerate(tr_tags):
    for tag in tags:
        if tag not in tag_to_idx:
            tag_to_idx[tag] = n_items + idx
            idx += 1 # => tags_id 설정
    tr[i].extend([tag_to_idx[x] for x in tags]) # tr[i] => 'i'th row의 songs_list + tags_list 

n_tags = len(tag_to_idx)"""

In [9]:
train['fsong_reindex'] = tr
val['fsong_reindex'] = val.fsongs.apply(lambda x : [iid_to_idx.get(i) for i in x])

### to csr matrix

In [10]:
train_songs = train.fsong_reindex.tolist()

rows = []
cols = []

for row, songs_lst in tqdm(enumerate(train_songs)):
    for col in songs_lst:
        rows.append(row)
        cols.append(col)

data = np.repeat(1, len(rows))

X = csr_matrix((data, (rows, cols)))

115071it [00:00, 126276.46it/s]


In [11]:
val_songs = val.fsong_reindex.tolist()

rows = []
cols = []

for row, songs_lst in tqdm(enumerate(val_songs)):
    for col in songs_lst:
        rows.append(row)
        cols.append(col)

data = np.repeat(1, len(rows))

va_X = csr_matrix((data, (rows, cols)))

23015it [00:00, 303941.77it/s]


In [12]:
X.shape, va_X.shape

((115071, 64869), (23015, 64869))

In [13]:
train_X = np.asarray(X[:50000].toarray()).astype('float16')
val_X = np.asarray(va_X.toarray()).astype('float16')

In [21]:
def AE_model(input_shape=None, enc_dim = None, n = 128):
    model = Sequential()
    model.add(Input(shape=input_shape, sparse=False))
    model.add(Dropout(.75))
    model.add(Dense(n, activation="relu"))
    model.add(Dense(input_shape[0], activation="sigmoid"))
    return model

In [22]:
input_shape = train_X[0].shape
enc_dim = len(major_song_lst)

In [23]:
ae = AE_model(input_shape=input_shape, enc_dim=enc_dim)
ae.compile(optimizer='adam', loss = "mse")

W0701 17:42:03.158991 18924 nn_ops.py:4283] Large dropout rate: 0.75 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [14]:
import h5py
from tensorflow.keras.models import load_model

# ae.save("auto_encoder.h5")
ae = load_model("auto_encoder.h5")

W0702 17:37:54.470636  4608 nn_ops.py:4283] Large dropout rate: 0.75 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


In [22]:
ae.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 64869)             0         
_________________________________________________________________
dense (Dense)                (None, 128)               8303360   
_________________________________________________________________
dense_1 (Dense)              (None, 64869)             8368101   
Total params: 16,671,461
Trainable params: 16,671,461
Non-trainable params: 0
_________________________________________________________________


In [24]:
ae.fit(x = train_X, y = train_X, epochs=5, batch_size=128, verbose= 1)

W0701 17:42:13.401512 18924 nn_ops.py:4283] Large dropout rate: 0.75 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Train on 50000 samples
Epoch 1/5


W0701 17:42:14.198100 18924 nn_ops.py:4283] Large dropout rate: 0.75 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x1a756694240>

In [15]:
val_res = np.argsort(ae.predict(va_X), axis=1)[:, -100:]

W0702 17:38:03.096058  4608 training.py:504] Falling back from v2 loop because of error: Failed to find data adapter that can handle input: <class 'scipy.sparse.csr.csr_matrix'>, <class 'NoneType'>


In [18]:
pd.DataFrame(val_res)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,6665,2842,2005,3835,14086,4757,19044,4465,2486,22058,...,4389,4390,1441,4387,4382,4384,4383,4385,3947,97
1,4757,420,2486,6665,11617,6157,14552,5922,198,639,...,9561,4390,1441,4387,4382,4384,4383,4385,3947,97
2,4757,14552,3812,508,544,3264,2486,5922,639,6157,...,4386,9561,1441,4387,4382,4384,4383,4385,3947,97
3,9312,508,7096,22058,2842,2656,6665,3813,785,2486,...,4390,9561,4387,1441,4382,4384,4383,4385,3947,97
4,7756,16772,3812,22058,639,4465,2011,14552,508,198,...,4390,9561,1441,4387,4382,4384,4383,4385,3947,97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23010,420,5922,19044,5394,937,2486,6665,21705,3835,2842,...,4389,9561,1441,4387,4382,4383,4384,4385,3947,97
23011,2708,4283,41069,639,4011,6248,5738,53246,937,6991,...,4381,4389,4387,4382,1441,4384,4383,4385,3947,97
23012,198,5351,14552,937,3812,508,5987,785,708,11617,...,4389,9561,4387,1441,4382,4384,4383,4385,3947,97
23013,21705,2101,7756,9312,420,14552,2656,2486,5394,6665,...,4389,9561,1441,4387,4382,4384,4383,4385,3947,97


In [17]:
val_res.shape

(23015, 100)

In [37]:
from sklearn.metrics import mean_squared_error

In [39]:
mse = mean_squared_error

### csr matrix

In [7]:
train_songs = train.songs.tolist()

rows = []
cols = []

for row, songs_lst in tqdm(enumerate(train_songs)):
    for col in songs_lst:
        rows.append(row)
        cols.append(col)

data = np.repeat(1, len(rows))

115071it [00:01, 97580.19it/s]


In [8]:
X = csr_matrix((data, (rows, cols)))

### eda

In [9]:
tmp_df = pd.DataFrame(
    np.dstack(
        (
            np.concatenate(train.songs.values),
            np.repeat(train.id.values, list(map(len,train.songs.values)))
        )
    )[0], columns = ["song_id", "plylst_id"]
)

In [126]:
uniq_song_in_plylst = tmp_df[["song_id"]].drop_duplicates("song_id", keep = "first")

In [131]:
uniq_song_in_plylst["cnt_in_plylst"] = uniq_song_in_plylst.song_id.map(tmp_df.groupby("song_id").size().to_dict())

In [134]:
uniq_song_in_plylst.loc[uniq_song_in_plylst.cnt_in_plylst > 1].cnt_in_plylst.describe()

count    316114.000000
mean         15.775458
std          54.104228
min           2.000000
25%           2.000000
50%           4.000000
75%          10.000000
max        2175.000000
Name: cnt_in_plylst, dtype: float64

In [None]:
###

In [139]:
total_songs = song_meta.shape[0]
total_plylsts = train.shape[0]
mean_songs_in_plylst = train.songs.apply(lambda x: len(x)).sum() / total_plylsts

In [121]:
(tmp_df.groupby("song_id").size() == 1).values

array([False, False, False, ..., False,  True,  True])

In [17]:
### 장르별 수록곡 수

In [47]:
gnr_song_cnt = np.dstack(
    (
        np.concatenate(song_meta.song_gn_gnr_basket.values),
        np.repeat(song_meta.id, list(map(len, song_meta.song_gn_gnr_basket.values)))
    )
)

In [48]:
gnr_song_cnt = pd.DataFrame(gnr_song_cnt[0], columns = ["gnr_code", "song_id"])

In [60]:
gnr_song_cnt["song_id"] = gnr_song_cnt["song_id"].astype(int)

In [96]:
tmp_cnt = gnr_song_cnt.merge(
    tmp_df.groupby("song_id").size().reset_index(name="cnt_in_plylst"),
    how = "left"
).fillna(0)

In [110]:
tmp_cnt.rename(columns = {"song_id" : "song_cnt"}, inplace=True)

In [None]:
tmp_cnt = tmp_cnt.groupby("gnr_code").aggregate({"song_id" : "count", "cnt_in_plylst" : "sum"})
tmp_cnt["mean_cnt_in_plylst"] = tmp_cnt.cnt_in_plylst / train.shape[0]
tmp_cnt.reset_index(inplace=True)
tmp_cnt['gnr_name'] = tmp_cnt.gnr_code.map(genre_gn_all.to_dict())

In [113]:
tmp_cnt["mean_cnt_in_plylst_2"] = tmp_cnt.cnt_in_plylst / tmp_cnt.song_cnt # 수록곡 대비 playlist내의 선호도

In [118]:
tmp_cnt.head()

Unnamed: 0,gnr_code,song_cnt,cnt_in_plylst,mean_cnt_in_plylst,gnr_name,mean_cnt_in_plylst_2
0,GN0100,56820,1194574.0,10.381191,발라드,21.02383
1,GN0200,19017,438447.0,3.81023,댄스,23.055529
2,GN0300,27784,464016.0,4.032432,랩/힙합,16.700835
3,GN0400,14476,348061.0,3.02475,R&B/Soul,24.044004
4,GN0500,42468,546943.0,4.753092,인디음악,12.878944


In [117]:
multi_table(
    [tmp_cnt.loc[:, ["gnr_name", "mean_cnt_in_plylst"]].sort_values( by = "mean_cnt_in_plylst", ascending = False),
     tmp_cnt.loc[:, ["gnr_name", "mean_cnt_in_plylst_2"]].sort_values( by = "mean_cnt_in_plylst_2", ascending = False)]
)
# 

Unnamed: 0_level_0,gnr_name,mean_cnt_in_plylst
Unnamed: 0_level_1,gnr_name,mean_cnt_in_plylst_2
0,발라드,10.381191
4,인디음악,4.753092
8,POP,4.410112
24,아이돌,4.321375
2,랩/힙합,4.032432
1,댄스,3.81023
14,OST,3.302552
3,R&B/Soul,3.02475
5,록/메탈,2.869237
10,일렉트로니카,2.829931

Unnamed: 0,gnr_name,mean_cnt_in_plylst
0,발라드,10.381191
4,인디음악,4.753092
8,POP,4.410112
24,아이돌,4.321375
2,랩/힙합,4.032432
1,댄스,3.81023
14,OST,3.302552
3,R&B/Soul,3.02475
5,록/메탈,2.869237
10,일렉트로니카,2.829931

Unnamed: 0,gnr_name,mean_cnt_in_plylst_2
24,아이돌,41.212084
3,R&B/Soul,24.044004
1,댄스,23.055529
0,발라드,21.02383
2,랩/힙합,16.700835
7,포크/블루스,15.133263
4,인디음악,12.878944
5,록/메탈,11.120445
26,EDM,9.950175
14,OST,8.583936


### Matrix Factorization

In [9]:
X.shape

(115071, 707989)

In [211]:
from implicit.als import AlternatingLeastSquares as ALS

In [212]:
als_model = ALS(factors=64, regularization=0.001, iterations=20)

W0619 19:28:50.150537 13952 utils.py:26] OpenBLAS detected. Its highly recommend to set the environment variable 'export OPENBLAS_NUM_THREADS=1' to disable its internal multithreading


In [213]:
als_model.fit(X)

HBox(children=(IntProgress(value=0, max=20), HTML(value='')))




In [225]:
np.dot(als_model.item_factors[0, :] , als_model.user_factors.T)

array([ 1.8191078e-06,  0.0000000e+00,  0.0000000e+00, ...,
       -1.1320270e-06,  1.7471382e-07,  1.7454251e-08], dtype=float32)

In [233]:
np.dot(als_model.item_factors[0, :] , als_model.user_factors.T)[np.argmax(np.dot(als_model.item_factors[0, :] , als_model.user_factors.T), axis=0)]

0.0038509155

In [232]:
np.argmax(np.dot(als_model.item_factors[0, :] , als_model.user_factors.T), axis=0)

177460

In [236]:
def predicted_plylst(plylst_id, als_model):
    return np.dot(als_model.item_factors[plylst_id, :] , als_model.user_factors.T)

In [237]:
predicted_plylst(0, als_model)

array([ 1.8191078e-06,  0.0000000e+00,  0.0000000e+00, ...,
       -1.1320270e-06,  1.7471382e-07,  1.7454251e-08], dtype=float32)

In [None]:
als_model.item_factors[:10, :]

In [255]:
np.dot(als_model.item_factors[:1000, :], als_model.user_factors.T)

array([[ 1.8191075e-06,  0.0000000e+00,  0.0000000e+00, ...,
        -1.1320270e-06,  1.7471376e-07,  1.7454253e-08],
       [ 3.2180768e-08,  0.0000000e+00,  0.0000000e+00, ...,
         6.7979454e-05,  1.9930592e-07,  3.7079524e-09],
       [ 8.4202848e-06,  0.0000000e+00,  0.0000000e+00, ...,
        -2.8618017e-05, -1.4543695e-06,  3.5031871e-08],
       ...,
       [-1.3842008e-05,  0.0000000e+00,  0.0000000e+00, ...,
         2.5216303e-05,  1.3588677e-05, -2.7517572e-07],
       [-3.2946493e-05,  0.0000000e+00,  0.0000000e+00, ...,
         2.0403902e-04, -6.2212248e-06, -6.6444635e-08],
       [ 4.5192527e-07,  0.0000000e+00,  0.0000000e+00, ...,
        -2.5285169e-06,  2.4576269e-07,  1.2485127e-08]], dtype=float32)

In [217]:
als_model.user_factors.shape

(707989, 64)

In [219]:
als_model.user_factors[:, 0]

array([ 2.7422048e-04,  0.0000000e+00,  0.0000000e+00, ...,
        7.4929837e-04,  1.0349140e-04, -6.9245093e-06], dtype=float32)

In [None]:
als_model.

### AE

In [27]:
class DataGenerator(keras.utils.Sequence):
    def __init__(self, R, batch_size = 64, shuffle = True):
        self.R = R
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.on_epoch_end()
        
    def __len__(self):
        return int(np.floor(self.R.shape[0] / self.batch_size))
    
    def on_epoch_end(self):
        self.indices = np.arange(self.R.shape[0])
        if self.shuffle == True:
            np.random.shuffle(self.indices)
            
    def __getitem__(self, index):
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
        X = self.__data_generation(indices)
        y = self.__data_generation(indices)
        return X, y
            
    def __data_generation(self, indices):
        X = self.R[indices, :].toarray()
        return X

In [12]:
class fast_sparse_matrix(object):
    """
    Adds fast columnar reads and updates to
    a scipy.sparse.csr_matrix, at the cost
    of keeping a csc_matrix of equal size as
    a column-wise index into the same raw data.
    It is updateable in the sense that you can
    change the values of all the existing non-
    zero entries in a given column.  Trying to
    set other entries will result in an error.
    For other functionality you are expected to
    call methods on the underlying csr_matrix:
    >>> fsm = fast_sparse_matrix(data) # data is a csr_matrix
    >>> col = fsm.fast_get_col(2)      # get a column quickly
    >>> row = fsm.X[1]                 # get a row as usual
    """
    def __init__(self,X,col_view=None):
        """
        Create a fast_sparse_matrix from a csr_matrix X. Note
        that X is not copied and its values will be modified by
        any subsequent call to fast_update_col().
        Parameters
        ----------
        X : scipy sparse matrix
            The sparse matrix to wrap.
        col_view : scipy.csc_matrix, optional
            The corresponding index matrix to provide fast columnar access,
            created if not supplied here.
        """
        self.X = X.tocsr()
        if col_view is not None:
            self.col_view = col_view
        else:
            # create the columnar index matrix
            ind = self.X.copy()
            ind.data = np.arange(self.X.nnz)
            self.col_view = ind.tocsc()

    @property
    def shape(self):
        """
        Return the shape of the underlying matrix.
        """
        return self.X.shape

    def fast_get_col(self,j):
        """
        Return column j of the underlying matrix.
        Parameters
        ----------
        j : int
            Index of column to get.
        Returns
        -------
        col : scipy.sparse.csc_matrix
            Copy of column j of the matrix.
        """
        col = self.col_view[:,j].copy()
        col.data = self.X.data[col.data]
        return col

    def fast_update_col(self,j,vals):
        """
        Update values of existing non-zeros in column
        of the underlying matrix.
        Parameters
        ----------
        j : int
            Index of the column to update.
        vals : array like
            The new values to be assigned, must satisfy
            len(vals) == X[:,j].nnz i.e. this method can
            only change the value of existing non-zero entries
            of column j, it cannot add new ones.
        """
        dataptr = self.col_view[:,j].data
        self.X.data[dataptr] = vals

    def ensure_sparse_cols(self,max_density,remove_lowest=True):
        """
        Ensure that no column of the matrix excess the specified
        density, setting excess entries to zero where necessary.
        This can be useful to avoid popularity bias in collaborative
        filtering, by pruning the number of users for popular items:
        >>> num_users,num_items = train.shape
        >>> f = fast_sparse_matrix(train)
        >>> f.ensure_sparse_cols(max_density=0.01)
        Now any item in train has non-zero ratings from at most 1% of users.
        Parameters
        ----------
        max_density : float
            The highest allowable column-wise density. A value of one
            or more is treated as an absolute limit on the number of
            non-zero entries in a column, while a value of less than
            one is treated as a density i.e. a proportion of the overall
            number of rows.
        remove_lowest : boolean (default: True)
            If true then excess entries to be set to zero in a column are
            chosen lowest first, otherwise they are selected randomly.
        """
        if max_density >= 1:
            max_nnz = int(max_density)
        else:
            max_nnz = int(max_density*self.shape[0])
        for j in xrange(self.shape[1]):
            col = self.fast_get_col(j)
            excess = col.nnz - max_nnz
            if excess > 0:
                if remove_lowest:
                    zero_entries = np.argsort(col.data)[:excess]
                else:
                    zero_entries = random.sample(xrange(col.nnz),excess)
                col.data[zero_entries] = 0
                self.fast_update_col(j,col.data)

    def save(self,filepath):
        """
        Save to file as arrays in numpy binary format.
        Parameters
        ----------
        filepath : str
            The filepath to write to.
        """
        d = self.X.tocoo(copy=False)
        v = self.col_view.tocoo(copy=False)
        np.savez(filepath,row=d.row,col=d.col,data=d.data,shape=d.shape,
                 v_row=v.row,v_col=v.col,v_data=v.data,v_shape=v.shape)

    @staticmethod
    def load(filepath):
        """
        Load a fast_sparse_matrix from file written by fast_sparse_matrix.save().
        Parameters
        ----------
        filepath : str
            The filepath to load.
        """
        y = np.load(filepath,mmap_mode='r')
        X = coo_matrix((y['data'],(y['row'],y['col'])),shape=y['shape'])
        col_view = coo_matrix((y['v_data'],(y['v_row'],y['v_col'])),shape=y['v_shape'])
        return fast_sparse_matrix(X,col_view.tocsc())

    @staticmethod
    def loadtxt(filepath,comments='#',delimiter=None,skiprows=0,usecols=None,index_offset=1):
        """
        Create a fast_sparse_matrix from simply formatted data such as TSV, handles
        similar input to numpy.loadtxt().
        Parameters
        ----------
        filepath : file or str
            File containing simply formatted row,col,val sparse matrix data.
        comments : str, optional
            The character used to indicate the start of a comment (default: #).
        delimiter : str, optional
            The string used to separate values. By default, this is any whitespace.
        skiprows : int, optional
            Skip the first skiprows lines; default: 0.
        usecols : sequence, optional
            Which columns to read, with 0 being the first. For example, usecols = (1,4,5)
            will extract the 2nd, 5th and 6th columns. The default, None, results in all
            columns being read.
        index_offset : int, optional
            Offset applied to the row and col indices in the input data (default: 1).
            The default offset is chosen so that 1-indexed data on file results in a
            fast_sparse_matrix holding 0-indexed matrices.
        Returns
        -------
        mat : mrec.sparse.fast_sparse_matrix
            A fast_sparse_matrix holding the data in the file.
        """
        X = loadtxt(filepath,comments=comments,delimiter=delimiter,skiprows=skiprows,usecols=usecols)
        return fast_sparse_matrix(X)

    @staticmethodq
    def loadmm(filepath):
        """
        Create a fast_sparse_matrix from matrixmarket data.
        Parameters
        ----------
        filepath : file or str
            The matrixmarket file to read.
        Returns
        -------
        mat : mrec.sparse.fast_sparse_matrix
            A fast_sparse_matrix holding the data in the file.
        """
        X = mmread(filepath)
        return fast_sparse_matrix(X)