In [1]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# считываем данные

In [2]:
# User/song interaction tables.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Song metadata: the main table plus the extra-info table.
songs = pd.read_csv('songs.csv')
songs_extra = pd.read_csv('song_extra_info.csv')

# Member table; the two date columns are parsed into datetimes on load.
member_date_columns = ['registration_init_time', 'expiration_date']
members = pd.read_csv('members.csv', parse_dates=member_date_columns)

# составим датасет всех песен

In [3]:
# Attach the song name from the extra-info table to every row of `songs`.
# how='right' keeps all rows of `songs`, including those with no extra info.
# One join key is enough: listing 'song_id' twice only repeats the same key.
sngs = pd.merge(
    songs_extra.loc[:, ['song_id', 'name']],
    songs.loc[:, ['song_id', 'artist_name', 'genre_ids']],
    on='song_id',
    how='right',
)

# Cast both text columns to str so the concatenation below always works.
# NOTE(review): missing values presumably become the literal string 'nan' here,
# so unnamed songs by one artist would share a single 'nan | <artist>' key --
# confirm that this is acceptable for the de-duplication done later.
sngs['name'] = sngs['name'].astype(str)
sngs['artist_name'] = sngs['artist_name'].astype(str)

# "<song name> | <artist>" key, built column-wise instead of one Python call per row.
sngs['full_name'] = sngs['name'] + ' | ' + sngs['artist_name']

# почему-то у одной песни могут быть разные хеши, учтем это

In [4]:
# Example of one song/artist pair that appears under several different song_id hashes.
sngs.loc[sngs['full_name'].eq('Lose Yourself | Eminem')]

Unnamed: 0,song_id,name,artist_name,genre_ids,full_name
1381,p+WxS35GIo9lHYpW17Bz/PRW+UcGH9R+bq+wU2caYnY=,Lose Yourself,Eminem,1259,Lose Yourself | Eminem
111600,ARjGxZVGrJXEEqIXk/5+67G/nqC1FdH3AuwSQQ1m5G8=,Lose Yourself,Eminem,1259,Lose Yourself | Eminem
1447109,MSCKTsjyD7Urq/vbunDlHmc/xcFrpZ9noEbPrewKxms=,Lose Yourself,Eminem,1259,Lose Yourself | Eminem
2231614,H5vEv5ogrztWon/cj1ZacsNIJzFnQmAytb+ARVjLrA8=,Lose Yourself,Eminem,1259,Lose Yourself | Eminem


# утвердим единый хеш для пары песня-исполнитель

In [5]:
# Choose one canonical hash per "name | artist" key.
# When several rows share a key, later rows overwrite earlier ones, so the hash
# of the last such row (in `sngs` row order) wins -- same rule as the explicit loop,
# but without depending on the exact number and order of columns in `sngs`.
songs_dict = dict(zip(sngs['full_name'], sngs['song_id']))

# соберем тренировочный датасет для эмбеддингов

In [6]:
# Keep only positive interactions (target == 1) as (user, song) pairs.
usr_songs = train.loc[train['target'] == 1, ['msno', 'song_id']].dropna()

# Attach song metadata; a single join key replaces the duplicated ['song_id', 'song_id'].
usr_songs = usr_songs.merge(sngs, on='song_id')

# Replace each hash with the canonical hash of its "name | artist" key.
# Every full_name here comes from `sngs`, which is what `songs_dict` was built from.
usr_songs['song_id'] = usr_songs['full_name'].map(songs_dict)

# Dense integer ids, numbered in order of first appearance.
# These are built BEFORE the dropna below, exactly as before, so the ids stay the same.
song_dict = {song_hash: idx for idx, song_hash in enumerate(usr_songs['song_id'].unique())}
user_dict = {user_hash: idx for idx, user_hash in enumerate(usr_songs['msno'].unique())}

usr_songs = usr_songs.loc[
    :, ['msno', 'song_id', 'name', 'artist_name', 'genre_ids', 'full_name']
].dropna()

# Swap the string hashes for their integer ids.
usr_songs['msno'] = usr_songs['msno'].map(user_dict)
usr_songs['song_id'] = usr_songs['song_id'].map(song_dict)

# исправили множественные хеши

In [7]:
# Sanity check: after the remapping this song/artist pair should map to a single song_id.
is_lose_yourself = usr_songs['name'].eq('Lose Yourself') & usr_songs['artist_name'].eq('Eminem')
usr_songs.loc[is_lose_yourself, 'song_id'].value_counts()

10816    210
Name: song_id, dtype: int64

# сделаем плейлисты юзеров

In [8]:
# One playlist (list of song ids) per user.
# Group once instead of scanning the whole frame for every user.
# sort=False keeps users in order of first appearance (the same order as
# `usr_songs.msno.unique()`), and rows inside a group keep their original order.
# The comprehension also avoids rebinding the global `songs`, which is the
# DataFrame loaded from songs.csv and is needed when earlier cells are re-run.
songs_by_user = usr_songs.groupby('msno', sort=False)['song_id']
play_lists = [
    user_songs.tolist()
    for _, user_songs in tqdm(
        songs_by_user, total=songs_by_user.ngroups, position=0, leave=False
    )
]

                                                      

# обучим модель

In [9]:
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import logging
from time import time
import multiprocessing

In [10]:
# Record the length of every playlist, then convert its song ids to strings in place;
# the playlists are later passed to the Word2Vec model as its sentences.
lengths = [len(user_playlist) for user_playlist in play_lists]
for pos, user_playlist in enumerate(play_lists):
    play_lists[pos] = [str(song) for song in user_playlist]

In [11]:
# mean sentence (playlist) length
np.asarray(lengths).mean()

135.04594474811643

In [12]:
# Log records at INFO level and above, formatted as "<time> : <level> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s : %(levelname)s : %(message)s",
)

class Callback(CallbackAny2Vec):
    """Print and record the training loss of every epoch.

    The value read from the model is treated as a running total, so the loss of
    one epoch is the difference between two consecutive readings.

    Attributes:
        epoch: 1-based number of the epoch that is about to finish.
        training_loss: per-epoch losses, in epoch order.
        loss_previous_step: running total read at the end of the previous epoch.
    """

    def __init__(self):
        self.epoch = 1
        self.training_loss = []
        # Start the running total at zero: the first epoch's loss is then simply
        # the first reading, and the attribute exists before on_epoch_end reads it.
        self.loss_previous_step = 0.0

    def on_epoch_end(self, model):
        """Compute the loss of the epoch that just ended, print it and store it."""
        loss = model.get_latest_training_loss()
        current_loss = loss - self.loss_previous_step
        print(f"Loss after epoch {self.epoch}: {current_loss}")
        self.training_loss.append(current_loss)
        self.epoch += 1
        self.loss_previous_step = loss

In [13]:
# Set up the word2vec model; the vocabulary is built and the model trained in later cells.
w2v_params = dict(
    size=256,
    window=10,
    min_count=1,
    sg=0,
    negative=20,
    workers=multiprocessing.cpu_count(),  # one worker per available CPU
)
model = Word2Vec(**w2v_params)
print(model)

Word2Vec(vocab=0, size=256, alpha=0.025)


In [14]:
logging.disable(logging.NOTSET)  # enable logging
t = time()

# Build the vocabulary from the playlists (each playlist is one sentence).
model.build_vocab(play_lists)

elapsed = round(time() - t, 2)
print(f"Time to build vocab: {elapsed} seconds")

2020-10-25 22:55:23,022 : INFO : collecting all words and their counts
2020-10-25 22:55:23,025 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-10-25 22:55:23,513 : INFO : PROGRESS: at sentence #10000, processed 2531600 words, keeping 137261 word types
2020-10-25 22:55:23,801 : INFO : PROGRESS: at sentence #20000, processed 3486968 words, keeping 176209 word types
2020-10-25 22:55:23,862 : INFO : collected 191425 word types from a corpus of 3656504 raw words and 27076 sentences
2020-10-25 22:55:23,863 : INFO : Loading a fresh vocabulary
2020-10-25 22:55:24,337 : INFO : effective_min_count=1 retains 191425 unique words (100% of original 191425, drops 0)
2020-10-25 22:55:24,338 : INFO : effective_min_count=1 leaves 3656504 word corpus (100% of original 3656504, drops 0)
2020-10-25 22:55:24,868 : INFO : deleting the raw counts dictionary of 191425 items
2020-10-25 22:55:24,872 : INFO : sample=0.001 downsamples 4 most-common words
2020-10-25 22:55:24,873 : IN

Time to build vocab: 35.29 seconds


In [15]:
logging.disable(logging.INFO)  # disable logging
callback = Callback()  # instead, print out loss for each epoch
t = time()

# compute_loss=True is what lets the callback read the loss after each epoch.
model.train(
    play_lists,
    epochs=100,
    total_examples=model.corpus_count,
    callbacks=[callback],
    compute_loss=True,
)

elapsed = round(time() - t, 2)
print(f"Time to train the model: {elapsed} seconds")

Loss after epoch 1: 2894918.75
Loss after epoch 2: 1648220.25
Loss after epoch 3: 1428155.0
Loss after epoch 4: 1328166.0
Loss after epoch 5: 1290266.0
Loss after epoch 6: 1202057.0
Loss after epoch 7: 1174462.0
Loss after epoch 8: 1158547.0
Loss after epoch 9: 1160841.0
Loss after epoch 10: 1110437.0
Loss after epoch 11: 1091062.0
Loss after epoch 12: 1069668.0
Loss after epoch 13: 971928.0
Loss after epoch 14: 949088.0
Loss after epoch 15: 933218.0
Loss after epoch 16: 923592.0
Loss after epoch 17: 922830.0
Loss after epoch 18: 903334.0
Loss after epoch 19: 896566.0
Loss after epoch 20: 892130.0
Loss after epoch 21: 873648.0
Loss after epoch 22: 861804.0
Loss after epoch 23: 858724.0
Loss after epoch 24: 853104.0
Loss after epoch 25: 858686.0
Loss after epoch 26: 844330.0
Loss after epoch 27: 853000.0
Loss after epoch 28: 844394.0
Loss after epoch 29: 829708.0
Loss after epoch 30: 842164.0
Loss after epoch 31: 818884.0
Loss after epoch 32: 686892.0
Loss after epoch 33: 616724.0
Loss 

In [16]:
# Give every row the canonical hash of its "name | artist" key.
# A column-wise dictionary lookup replaces the row-wise apply over the whole song table;
# `songs_dict` was built from these same full_name values, so every key is present.
sngs['song_id'] = sngs['full_name'].map(songs_dict)

In [17]:
# Swap each hash for its integer id when it has one in `song_dict`;
# hashes without an entry are left unchanged.
sngs['song_id'] = sngs['song_id'].map(lambda song_hash: song_dict.get(song_hash, song_hash))

In [18]:
# Keep the first of each group of fully identical rows.
sngs = sngs[~sngs.duplicated(keep='first')]

In [19]:
# Look up the example song again, now in the de-duplicated table.
is_lose_yourself = sngs['name'].eq('Lose Yourself') & sngs['artist_name'].eq('Eminem')
sngs.loc[is_lose_yourself]

Unnamed: 0,song_id,name,artist_name,genre_ids,full_name
1381,10816,Lose Yourself,Eminem,1259,Lose Yourself | Eminem


In [20]:
# Show the five songs closest to song id '10816', with genre, artist and name.
# The model was trained on string ids, so compare against a string view of
# song_id -- computed once here rather than once per neighbour.
song_id_as_str = sngs['song_id'].astype(str)

for ms in model.wv.most_similar('10816')[:5]:
    print(ms, end='\n\t')
    found = sngs.loc[song_id_as_str == ms[0]]
    if found.empty:
        # No metadata row for this id: report it instead of raising.
        print('<no matching row in sngs>')
    else:
        # More than one row may share an id (rows that differ in another column
        # survive drop_duplicates), so take the first match instead of `.item()`,
        # which raises unless exactly one row matches.
        print(
            found['genre_ids'].iloc[0],
            found['artist_name'].iloc[0],
            found['name'].iloc[0],
        )
    print()

('11274', 0.5260806083679199)
	1259 Eminem Not Afraid

('10023', 0.48443353176116943)
	1259 Eminem Space Bound

('10301', 0.4646959900856018)
	1259 Flo Rida I Cry

('10881', 0.4475036859512329)
	465 Air Supply Goodbye

('10699', 0.41769152879714966)
	1259 Flo Rida Who's With Me

