In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.auto import tqdm, trange
from sklearn.model_selection import GroupKFold, KFold

In [2]:
from itertools import chain
from tqdm.notebook import tqdm

In [3]:
from gensim.models import Word2Vec

In [4]:
import metrics 

In [5]:
import warnings

# pandas 1.5+ exposes SettingWithCopyWarning from ``pandas.errors``; importing it
# from ``pandas.core.common`` fails on current releases. Try the public location
# first and fall back to the legacy one so the notebook runs on old and new pandas.
try:
    from pandas.errors import SettingWithCopyWarning
except ImportError:
    try:
        from pandas.core.common import SettingWithCopyWarning
    except ImportError:
        # The warning class is not available at either location: nothing to silence.
        SettingWithCopyWarning = None

if SettingWithCopyWarning is not None:
    warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

Here I simply score the dataset with Word2Vec: each (user, song) row gets the dot product of the user's embedding and the song's embedding.

In [6]:
# Load the training interactions from the local data directory.
train = pd.read_csv('./data/train.csv')

In [7]:
# train = train.iloc[:25_000]

In [8]:
# Preview the first rows of the training interactions.
train.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


In [9]:
# Load the per-song metadata table from the local data directory.
songs = pd.read_csv('./data/songs.csv')

In [10]:
# Preview the first rows of the song metadata.
songs.head()

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,,,31.0
3,dKMBWoZyScdxSkihKG+Vf47nc18N9q4m58+b4e7dSSE=,273554,465,S.H.E,湯小康,徐世珍,3.0
4,W3bqWd3T+VeHFzHAUfARgW9AvVRaF4N5Yzm4Mr6Eo/o=,140329,726,貴族精選,Traditional,Traditional,52.0


In [11]:
# Attach song metadata to every interaction row.
# The join key and join type are spelled out instead of relying on merge()'s
# defaults (all shared columns, inner join): `song_id` is the only column the two
# frames have in common, so the result is the same, but it no longer changes
# silently if either CSV gains another overlapping column.
# Being an inner join, interactions whose song is missing from `songs` are dropped.
data = train.merge(songs, on='song_id', how='inner')

In [12]:
# Preview the first rows of the merged interactions + song metadata frame.
data.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,lyricist,language
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471,359,Bastille,Dan Smith| Mark Crew,,52.0
1,e5Ezre9HPuPos+CXQXtmo32E/hHIZTMmo6jG3yRf6UA=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,my library,Local playlist more,local-playlist,1,206471,359,Bastille,Dan Smith| Mark Crew,,52.0
2,pouJqjNRmZOnRNzzMWWkamTKkIGHyvhl/jo4HgbncnM=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,Online playlist more,online-playlist,0,206471,359,Bastille,Dan Smith| Mark Crew,,52.0
3,sSexP400TJOZRhx3JB+0s9cqrCnqrlV51B9njoKR1II=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,Online playlist more,online-playlist,0,206471,359,Bastille,Dan Smith| Mark Crew,,52.0
4,hKdGiUKHVqKkXGHLrc+EzdSW6q0ERAJ2Cs7/L1N0Ae4=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,discover,,online-playlist,0,206471,359,Bastille,Dan Smith| Mark Crew,,52.0


In [13]:
def check_song_id_in_vocab(song_id, songs_embeddings):
    """Tell whether ``song_id`` is a known key of the embedding model.

    Membership is tested against ``songs_embeddings.wv``.
    """
    vocabulary = songs_embeddings.wv
    is_known = song_id in vocabulary
    return is_known

In [14]:
def get_user_embedding(sub_data, songs_embeddings):
    """Build one embedding per user as the mean vector of the songs they liked.

    Parameters
    ----------
    sub_data : pandas.DataFrame
        Needs ``msno``, ``song_id`` and ``target`` columns. Only rows with
        ``target == 1`` contribute.
    songs_embeddings
        Object whose ``.wv`` supports ``key in wv`` and ``wv[list_of_keys]``
        (the notebook passes a trained gensim ``Word2Vec`` model).

    Returns
    -------
    user_embeddings : dict
        Maps ``msno`` to the mean vector of that user's positive songs that are
        present in the vocabulary. Users whose positive songs are all
        out-of-vocabulary get no entry.
    user_set : set
        Every user with at least one positive row (can be a superset of the
        keys of ``user_embeddings``).
    """
    positives = sub_data[sub_data.target == 1]
    positive_sessions = dict(positives.groupby("msno").song_id.apply(list))
    user_set = set(positive_sessions)

    vectors = songs_embeddings.wv
    user_embeddings = {}
    for user, user_positives in positive_sessions.items():
        # Drop songs the model has no vector for: looking them up would raise
        # KeyError and abort the whole computation.
        known_positives = [song for song in user_positives if song in vectors]
        if known_positives:
            user_embeddings[user] = vectors[known_positives].mean(axis=0)

    return user_embeddings, user_set

In [15]:
# Order the rows by user id so each user's interactions sit next to each other.
data = data.sort_values(by=['msno'])
# Per-row user ids, in the sorted row order of `data`.
groups = data['msno'].to_numpy()

In [16]:
with tqdm(total=1.0, desc='Training...') as pbar:
    # Songs that occur more than 5 times keep their own id.
    songs_ids, id_cnts = np.unique(data.song_id, return_counts=True)
    songs_ids = set(songs_ids[id_cnts > 5])
    pbar.update(0.1)

    # Every other song is collapsed into one shared placeholder token.
    # Vectorised membership test + scalar assignment replace the per-row Python
    # loop and the throw-away array of repeated strings.
    # The token spelling is left as-is: the prediction cell looks up this exact key.
    mask = ~data.song_id.isin(songs_ids).to_numpy()
    data.loc[mask, 'song_id'] = 'unknonw'
    pbar.update(0.1)

    # One "sentence" per user: the list of song ids in that user's rows.
    sessions = dict(data.groupby("msno").song_id.apply(list))
    sentences = [values for values in sessions.values() if len(values) > 0]
    # After the filter above every real song id occurs at least 6 times, so
    # min_count=5 can only drop the placeholder token (if it is rare).
    # NOTE(review): gensim documents that `seed` alone does not make training
    # reproducible with several worker threads (workers=1 and a fixed
    # PYTHONHASHSEED are required); left unchanged to keep training time.
    songs_embeddings = Word2Vec(sentences=sentences, vector_size=64, window=5, min_count=5, seed=0)
    pbar.update(0.3)

    # Mean song vector of each user's positive (target == 1) rows.
    user_embeddings, user_set = get_user_embedding(data, songs_embeddings)
    pbar.update(0.5)

Training...:   0%|          | 0/1.0 [00:00<?, ?it/s]

In [17]:
with tqdm(total=1.0, desc='Prediction...') as pbar:
    users_names = data.msno.to_numpy()
    # Explicit copy: this array is edited in place below. Without the copy the
    # edit can write through into `data` (to_numpy() may return a view) or fail
    # on a read-only array when pandas Copy-on-Write is active.
    songs_ids = data.song_id.to_numpy(copy=True)
    pbar.update(0.1)

    # Songs without a vector fall back to the shared placeholder token
    # (same spelling as in the training cell).
    unknown_songs_mask = np.array(
        [not check_song_id_in_vocab(song, songs_embeddings) for song in songs_ids],
        dtype=bool,
    )
    songs_ids[unknown_songs_mask] = 'unknonw'
    pbar.update(0.1)

    # Only rows whose user has an embedding can be scored; the rest keep score 0.
    mask = np.array([user in user_embeddings for user in users_names], dtype=bool)
    sub_embs_user = np.array([user_embeddings[el] for el in users_names[mask]])
    sub_embs_song = np.array([songs_embeddings.wv[el] for el in songs_ids[mask]])
    pbar.update(0.2)

    scores = np.zeros(len(data))
    # Apologies for the data leak: the user embeddings were built from the
    # positive targets of the very rows that are scored here.
    # Row-wise dot product of user and song vectors; skipped when no row is
    # scorable, because the empty arrays are 1-D and axis=1 would raise.
    if mask.any():
        scores[mask] = np.sum(sub_embs_user * sub_embs_song, axis=1)
    pbar.update(0.6)

Prediction...:   0%|          | 0/1.0 [00:00<?, ?it/s]

In [19]:
# Persist the scores as a single column, without the index.
# NOTE(review): rows follow the order of `data` (merged with `songs`, then sorted
# by msno), not the row order of train.csv, and no key columns are written —
# confirm the consumer of scores.csv expects this.
pd.Series(scores).to_csv('scores.csv', index=False)