In [4]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
from tqdm import tqdm
import optuna
import pickle
import pyarrow.parquet as pq
import pyarrow as pa
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

def isrc_to_year(isrc):
    if type(isrc) == str:
        if int(isrc[5:7]) > 17:
            return 1900 + int(isrc[5:7])
        else:
            return 2000 + int(isrc[5:7])
    else:
        return np.nan

# ПРЕПРОЦЕССИНГ

### загрузим все данные

In [32]:
train = pd.read_csv('train.csv')
                                    
songs = pd.read_csv('songs.csv')
members = pd.read_csv('members.csv',
                     parse_dates=['registration_init_time','expiration_date'])

songs_extra = pd.read_csv('song_extra_info.csv')

### составим датасеты

In [33]:
songs_mi = songs.loc[:, ['song_id', 'artist_name', 'genre_ids', 'song_length', 'language']]
train = train.merge(songs_mi, on='song_id', how='left')


members['registration_year'] = members.registration_init_time.apply(lambda x: x.year)
members['registration_month'] = members.registration_init_time.apply(lambda x: x.month)
members['registration_day'] = members.registration_init_time.apply(lambda x: x.day)

members['expiration_year'] = members.expiration_date.apply(lambda x: x.year)
members['expiration_month'] = members.expiration_date.apply(lambda x: x.month)
members['expiration_day'] = members.expiration_date.apply(lambda x: x.day)

members = members.drop(['registration_init_time'], axis=1)
members = members.drop(['expiration_date'], axis=1)


songs_extra['song_year'] = songs_extra['isrc'].apply(isrc_to_year)
songs_extra.drop(['isrc'], axis = 1, inplace = True)

train = train.merge(members, on='msno', how='left')
train = train.merge(songs_extra, on = 'song_id', how = 'left')

# заменим хеши на инты
song_dict = dict([(j, i) for i, j in enumerate(train.song_id.unique())])
user_dict = dict([(j, i) for i, j in enumerate(train.msno.unique())])

train.msno = train.msno.apply(lambda x: user_dict[x])
train.song_id = train.song_id.apply(lambda x: song_dict[x])


# посмотрим, сколько песен слушали юзеры и сколько юзеров слушали песню
train['counter'] = 1
user_cnt = train.loc[:, ['msno', 'counter']].groupby('msno').sum()
song_cnt = train.loc[:, ['song_id', 'counter']].groupby('song_id').sum()

train = train.merge(user_cnt, left_on='msno', right_on=user_cnt.index, suffixes=('', '_user'))
train = train.merge(song_cnt, left_on='song_id', right_on=song_cnt.index, suffixes=('', '_song'))
train.drop(columns=['counter'], inplace=True)

train['song_name_length'] = train.name.apply(lambda x: len(str(x)))
train['song_words_count'] = train.name.apply(lambda x: len(str(x).split()))

In [4]:
# посмотрим, что у нас по nan
train.isna().sum().sort_values(ascending=False)

gender                2961479
song_year              577858
source_screen_name     414804
genre_ids              118455
source_system_tab       24849
source_type             21539
name                     1457
language                  150
artist_name               114
song_length               114
bd                          0
song_id                     0
target                      0
city                        0
song_words_count            0
song_name_length            0
registered_via              0
registration_year           0
registration_month          0
registration_day            0
expiration_year             0
expiration_month            0
expiration_day              0
counter_user                0
counter_song                0
msno                        0
dtype: int64

In [5]:
# language, song_length просто дропнем, их не много
train = train[train['song_length'].notna() & train['language'].notna()]

# неизвестные имена сделаем неизвестными
train.loc[train['name'].isna(), ['name']] = '<unknown>'
train.loc[train['artist_name'].isna(), ['artist_name']] = '<unknown>'

# все остальное заменим на самые частовстречаемые
train = train.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [6]:
# нанов больше нет
train.isna().sum().sort_values(ascending=False)

song_words_count      0
song_name_length      0
song_id               0
source_system_tab     0
source_screen_name    0
source_type           0
target                0
artist_name           0
genre_ids             0
song_length           0
language              0
city                  0
bd                    0
gender                0
registered_via        0
registration_year     0
registration_month    0
registration_day      0
expiration_year       0
expiration_month      0
expiration_day        0
name                  0
song_year             0
counter_user          0
counter_song          0
msno                  0
dtype: int64

In [8]:
# добавим полное название песни
train['full_name'] = train.loc[:, ['name', 'artist_name']].apply(lambda x: x['name'] + ' | ' +  x['artist_name'], axis = 1)

In [9]:
train.to_parquet('dataset.parquet')

# Часть 2

In [16]:
all_df = pq.read_table('dataset.parquet').to_pandas()
df = all_df.loc[:, ['msno', 'song_id', 'artist_name', 'genre_ids', 'name', 'target', 'full_name']]

In [17]:
# выделим только таргет единица, как понравившееся пользователю
df = df.loc[df.target == 1].drop('target', axis = 1)

In [18]:
# разные айдишники на одну песню - супер странно
df.loc[(df.artist_name == 'Eminem') & (df.name == 'Lose Yourself')].song_id.unique()

array([ 14917, 167154, 149139,  62953])

In [19]:
# будем идентифицировать песню по исполнителю и названию
# составим дикты соответсвия
full_name_to_int_dict = dict([(j, i) for i, j in enumerate(df.full_name.unique())])
int_to_full_name_dict = {j:i for i, j in full_name_to_int_dict.items()}
df.full_name = df.full_name.apply(lambda x: full_name_to_int_dict[x])

In [20]:
# составим плейлисты юзеров 
play_lists = []
for u_id in tqdm(df.msno.unique(), position=0,leave=False):
    songs = df.loc[df.msno == u_id].full_name.values.tolist()
    play_lists.append(songs)

                                                      

In [21]:
lengths = []
for i in range(len(play_lists)):
    lengths.append(len(play_lists[i]))
    play_lists[i] = list(map(str, play_lists[i]))

In [22]:
# средняя длина предложения
np.mean(lengths)

137.00409397705897

In [23]:
# теперь займемся обучением модели
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import logging
from time import time
import multiprocessing

In [24]:
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

class Callback(CallbackAny2Vec):
    def __init__(self):
        self.epoch = 1
        self.training_loss = []

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 1:
            current_loss = loss
        else:
            current_loss = loss - self.loss_previous_step
        print(f"Loss after epoch {self.epoch}: {current_loss}")
        self.training_loss.append(current_loss)
        self.epoch += 1
        self.loss_previous_step = loss
        
# обучим word2vec
model = Word2Vec(
    size = 128,
    window = 10,
    min_count = 1,
    sg = 0,
    negative = 20,
    workers = multiprocessing.cpu_count())
print(model)

Word2Vec(vocab=0, size=128, alpha=0.025)


In [25]:
logging.disable(logging.NOTSET) # enable logging
t = time()

model.build_vocab(play_lists)

print(f"Time to build vocab: {round((time() - t), 2)} seconds")

2020-11-03 13:37:08,060 : INFO : collecting all words and their counts
2020-11-03 13:37:08,062 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-11-03 13:37:08,487 : INFO : PROGRESS: at sentence #10000, processed 2384931 words, keeping 145453 word types
2020-11-03 13:37:08,691 : INFO : PROGRESS: at sentence #20000, processed 3550266 words, keeping 181877 word types
2020-11-03 13:37:08,727 : INFO : collected 194949 word types from a corpus of 3714592 raw words and 27113 sentences
2020-11-03 13:37:08,727 : INFO : Loading a fresh vocabulary
2020-11-03 13:37:09,481 : INFO : effective_min_count=1 retains 194949 unique words (100% of original 194949, drops 0)
2020-11-03 13:37:09,482 : INFO : effective_min_count=1 leaves 3714592 word corpus (100% of original 3714592, drops 0)
2020-11-03 13:37:09,928 : INFO : deleting the raw counts dictionary of 194949 items
2020-11-03 13:37:09,931 : INFO : sample=0.001 downsamples 4 most-common words
2020-11-03 13:37:09,932 : IN

Time to build vocab: 37.92 seconds


In [26]:
logging.disable(logging.INFO) # disable logging
callback = Callback() # instead, print out loss for each epoch
t = time()

model.train(play_lists,
            total_examples = model.corpus_count,
            epochs = 100,
            compute_loss = True,
            callbacks = [callback]) 

print(f"Time to train the model: {round((time() - t), 2)} seconds")

Loss after epoch 1: 2961812.75
Loss after epoch 2: 1639997.75
Loss after epoch 3: 1411400.5
Loss after epoch 4: 1359397.0
Loss after epoch 5: 1291124.0
Loss after epoch 6: 1223655.0
Loss after epoch 7: 1209704.0
Loss after epoch 8: 1182948.0
Loss after epoch 9: 1152730.0
Loss after epoch 10: 1146496.0
Loss after epoch 11: 1131422.0
Loss after epoch 12: 1096061.0
Loss after epoch 13: 990844.0
Loss after epoch 14: 999838.0
Loss after epoch 15: 978050.0
Loss after epoch 16: 990216.0
Loss after epoch 17: 985506.0
Loss after epoch 18: 970248.0
Loss after epoch 19: 960458.0
Loss after epoch 20: 962980.0
Loss after epoch 21: 954052.0
Loss after epoch 22: 948720.0
Loss after epoch 23: 942266.0
Loss after epoch 24: 934652.0
Loss after epoch 25: 928194.0
Loss after epoch 26: 949374.0
Loss after epoch 27: 934416.0
Loss after epoch 28: 924682.0
Loss after epoch 29: 924436.0
Loss after epoch 30: 828348.0
Loss after epoch 31: 682964.0
Loss after epoch 32: 696460.0
Loss after epoch 33: 676736.0
Loss 

In [27]:
full_name_to_int_dict['W.T.P. | Eminem']

6424

In [28]:
# посмотрим на симиляры
for ms in model.wv.most_similar('6424')[:5]:
    print(ms, end='\n\t')
    found = df.loc[(df.artist_name == 'Eminem') & (df.full_name == int(ms[0])), ['artist_name', 'genre_ids', 'name']].drop_duplicates()
    print(found)
    print()

('6427', 0.7944604158401489)
	        artist_name genre_ids             name
3444582      Eminem      1259  Won't Back Down

('6426', 0.7898609042167664)
	        artist_name genre_ids    name
3444534      Eminem      1259  So Bad

('6422', 0.7023847103118896)
	        artist_name genre_ids     name
3444269      Eminem      1259  No Love

('6421', 0.6940373778343201)
	        artist_name genre_ids        name
3444201      Eminem      1259  25 To Life

('6428', 0.6697537302970886)
	        artist_name genre_ids     name
3444625      Eminem      1259  On Fire



In [93]:
### эмбединги песен у нас есть, ембединг юзера будет средним эмбедингом его песен из плейлиста
user_emb = dict()
counter = 0
for user_id in tqdm(df.msno.unique(), position=0, leave=False):
    songs_embeds = [model.wv[str(i)] for i in play_lists[counter]]
    user_emb[user_id] = np.array(songs_embeds).mean(axis = 0)
    counter += 1

                                                        

In [94]:
# посчитаем скалярное произведение между эмбедингом юзера и песней, которую он слушал 
new_feature = []
for row in tqdm(all_df[['msno', 'full_name']].to_numpy(), position=0,leave=False):
    user, song = row
    

    u_emb = user_emb[user] if user in user_emb else np.zeros(128)
        
    if song in full_name_to_int_dict and str(full_name_to_int_dict[song]) in model.wv:
        s_emb = model.wv[str(full_name_to_int_dict[song])]
    else:
        s_emb = np.zeros(128)
        
    new_feature.append((u_emb * s_emb).sum())

                                                             

In [96]:
import pickle
with open("emb_dists.txt", "wb") as fp:  
       pickle.dump(new_feature, fp)

# ЧАСТЬ 1

In [2]:
all_df = pq.read_table('dataset.parquet').to_pandas()

In [3]:
all_df.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'artist_name', 'genre_ids', 'song_length',
       'language', 'city', 'bd', 'gender', 'registered_via',
       'registration_year', 'registration_month', 'registration_day',
       'expiration_year', 'expiration_month', 'expiration_day', 'name',
       'song_year', 'counter_user', 'counter_song', 'song_name_length',
       'song_words_count', 'full_name'],
      dtype='object')

In [4]:
all_df.drop(columns = ['msno', 'song_id', 'artist_name', 'name', 'full_name'], axis=1, inplace=True)

In [5]:
for col in all_df.columns:
    if col not in ['target', 'counter_song', 'counter_user', 'song_name_length', 'song_words_count']:
        all_df[col] = all_df[col].astype('category')

In [6]:
X = all_df.drop(['target'], axis=1)
y = all_df['target']
kf = KFold(n_splits=5)

### сделаем небольшой серч оптимальных параметров

In [7]:
def objective(trial):
    param = {
        'objective': 'binary',
        'boosting': "dart",
        'learning_rate': trial.suggest_float('learning_rate', 0.15, 0.35, step = 0.05),
        'max_depth': trial.suggest_int('max_depth', 4, 8),
        'boosting_type': 'dart',
        "lambda_l1":trial.suggest_float('lambda_l1', 0.0, 1.0, step = 0.1),
        "lambda_l2":trial.suggest_float('lambda_l2', 0.0, 1.0, step = 0.1),
        "metric":'auc',
        "bagging_freq": 1,
        "seed": 0,
        "nthread": -1
        
    }
    
    lgbnumround = 50
    evals = {}
    gbm = lgb.train(param, 
                    lgbtrain, 
                    lgbnumround, 
                    valid_sets = [lgbtrain, lgbval], 
                    verbose_eval = None,
                    evals_result=evals)
    auc = np.array(evals['valid_1']['auc'])
    return np.max(auc)

In [8]:
search_params = []
for train_index, test_index in kf.split(X):
    print("Start iteration")
    # test
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    
    XX = X.iloc[train_index]
    yy = y.iloc[train_index]
    
    X_train, X_val, y_train, y_val = train_test_split(
                                     XX, yy, test_size=0.2, random_state=42, stratify=yy)
    
    print('Training LGBM model...')
    
    print('prepare model')
    lgbtrain = lgb.Dataset(X_train, y_train)
    lgbval = lgb.Dataset(X_val, y_val)
    
    print('optimizing')
    
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)
    
    search_params.append(study.best_params)

Start iteration


[32m[I 2020-11-03 12:20:30,296][0m A new study created in memory with name: no-name-177524b8-e562-4f0c-8273-a09e559b5f99[0m


Training LGBM model...
prepare model
optimizing


[32m[I 2020-11-03 12:21:45,427][0m Trial 0 finished with value: 0.7098320505279004 and parameters: {'learning_rate': 0.35, 'max_depth': 5, 'lambda_l1': 0.7000000000000001, 'lambda_l2': 0.7000000000000001}. Best is trial 0 with value: 0.7098320505279004.[0m
[32m[I 2020-11-03 12:23:01,137][0m Trial 1 finished with value: 0.7117915954433376 and parameters: {'learning_rate': 0.25, 'max_depth': 8, 'lambda_l1': 0.5, 'lambda_l2': 0.8}. Best is trial 1 with value: 0.7117915954433376.[0m
[32m[I 2020-11-03 12:24:14,474][0m Trial 2 finished with value: 0.7038394460991109 and parameters: {'learning_rate': 0.15, 'max_depth': 6, 'lambda_l1': 0.9, 'lambda_l2': 0.0}. Best is trial 1 with value: 0.7117915954433376.[0m
[32m[I 2020-11-03 12:25:33,450][0m Trial 3 finished with value: 0.7076576996093258 and parameters: {'learning_rate': 0.2, 'max_depth': 7, 'lambda_l1': 0.7000000000000001, 'lambda_l2': 0.4}. Best is trial 1 with value: 0.7117915954433376.[0m
[32m[I 2020-11-03 12:26:43,723][0m

Start iteration


[32m[I 2020-11-03 12:32:16,821][0m A new study created in memory with name: no-name-7c49b9d2-21a9-413e-accc-79498e398dfd[0m


Training LGBM model...
prepare model
optimizing


[32m[I 2020-11-03 12:33:13,644][0m Trial 0 finished with value: 0.7111628875307786 and parameters: {'learning_rate': 0.30000000000000004, 'max_depth': 4, 'lambda_l1': 0.8, 'lambda_l2': 0.7000000000000001}. Best is trial 0 with value: 0.7111628875307786.[0m
[32m[I 2020-11-03 12:34:20,134][0m Trial 1 finished with value: 0.7187925249025774 and parameters: {'learning_rate': 0.30000000000000004, 'max_depth': 6, 'lambda_l1': 0.6000000000000001, 'lambda_l2': 0.4}. Best is trial 1 with value: 0.7187925249025774.[0m
[32m[I 2020-11-03 12:35:29,361][0m Trial 2 finished with value: 0.7149347393682349 and parameters: {'learning_rate': 0.2, 'max_depth': 7, 'lambda_l1': 0.7000000000000001, 'lambda_l2': 0.1}. Best is trial 1 with value: 0.7187925249025774.[0m
[32m[I 2020-11-03 12:36:40,795][0m Trial 3 finished with value: 0.7164348792910947 and parameters: {'learning_rate': 0.2, 'max_depth': 8, 'lambda_l1': 0.7000000000000001, 'lambda_l2': 0.2}. Best is trial 1 with value: 0.71879252490257

Start iteration


[32m[I 2020-11-03 12:43:00,029][0m A new study created in memory with name: no-name-53f96dd1-05f9-4dd0-84fd-ff6e062bcc6a[0m


Training LGBM model...
prepare model
optimizing


[32m[I 2020-11-03 12:43:58,040][0m Trial 0 finished with value: 0.7074574337498178 and parameters: {'learning_rate': 0.2, 'max_depth': 4, 'lambda_l1': 0.6000000000000001, 'lambda_l2': 0.5}. Best is trial 0 with value: 0.7074574337498178.[0m
[32m[I 2020-11-03 12:45:07,505][0m Trial 1 finished with value: 0.7163038086937794 and parameters: {'learning_rate': 0.25, 'max_depth': 6, 'lambda_l1': 0.1, 'lambda_l2': 0.6000000000000001}. Best is trial 1 with value: 0.7163038086937794.[0m
[32m[I 2020-11-03 12:46:01,439][0m Trial 2 finished with value: 0.7117599102253058 and parameters: {'learning_rate': 0.35, 'max_depth': 4, 'lambda_l1': 0.2, 'lambda_l2': 0.1}. Best is trial 1 with value: 0.7163038086937794.[0m
[32m[I 2020-11-03 12:47:10,408][0m Trial 3 finished with value: 0.7119215852529962 and parameters: {'learning_rate': 0.15, 'max_depth': 7, 'lambda_l1': 0.7000000000000001, 'lambda_l2': 0.6000000000000001}. Best is trial 1 with value: 0.7163038086937794.[0m
[32m[I 2020-11-03 12

Start iteration


[32m[I 2020-11-03 12:54:02,048][0m A new study created in memory with name: no-name-ec922830-687b-4bcb-bfb5-50f1ac0be9b6[0m


Training LGBM model...
prepare model
optimizing


[32m[I 2020-11-03 12:55:01,775][0m Trial 0 finished with value: 0.7118640948508781 and parameters: {'learning_rate': 0.35, 'max_depth': 4, 'lambda_l1': 0.1, 'lambda_l2': 0.7000000000000001}. Best is trial 0 with value: 0.7118640948508781.[0m
[32m[I 2020-11-03 12:56:18,686][0m Trial 1 finished with value: 0.7173261030419887 and parameters: {'learning_rate': 0.25, 'max_depth': 8, 'lambda_l1': 0.7000000000000001, 'lambda_l2': 0.5}. Best is trial 1 with value: 0.7173261030419887.[0m
[32m[I 2020-11-03 12:57:25,774][0m Trial 2 finished with value: 0.7206094553990315 and parameters: {'learning_rate': 0.35, 'max_depth': 7, 'lambda_l1': 1.0, 'lambda_l2': 0.0}. Best is trial 2 with value: 0.7206094553990315.[0m
[32m[I 2020-11-03 12:58:26,490][0m Trial 3 finished with value: 0.715244856966806 and parameters: {'learning_rate': 0.30000000000000004, 'max_depth': 5, 'lambda_l1': 0.5, 'lambda_l2': 1.0}. Best is trial 2 with value: 0.7206094553990315.[0m
[32m[I 2020-11-03 12:59:33,429][0m

Start iteration


[32m[I 2020-11-03 13:04:52,237][0m A new study created in memory with name: no-name-16122028-8caa-4c58-be9f-2ac2110d355b[0m


Training LGBM model...
prepare model
optimizing


[32m[I 2020-11-03 13:06:02,978][0m Trial 0 finished with value: 0.7090396956551177 and parameters: {'learning_rate': 0.2, 'max_depth': 6, 'lambda_l1': 0.0, 'lambda_l2': 0.7000000000000001}. Best is trial 0 with value: 0.7090396956551177.[0m
[32m[I 2020-11-03 13:07:08,555][0m Trial 1 finished with value: 0.7134012943408412 and parameters: {'learning_rate': 0.30000000000000004, 'max_depth': 6, 'lambda_l1': 0.8, 'lambda_l2': 0.2}. Best is trial 1 with value: 0.7134012943408412.[0m
[32m[I 2020-11-03 13:08:21,502][0m Trial 2 finished with value: 0.7155996961718344 and parameters: {'learning_rate': 0.30000000000000004, 'max_depth': 7, 'lambda_l1': 0.4, 'lambda_l2': 1.0}. Best is trial 2 with value: 0.7155996961718344.[0m
[32m[I 2020-11-03 13:09:46,448][0m Trial 3 finished with value: 0.7072535467037864 and parameters: {'learning_rate': 0.15, 'max_depth': 8, 'lambda_l1': 0.6000000000000001, 'lambda_l2': 0.5}. Best is trial 2 with value: 0.7155996961718344.[0m
[32m[I 2020-11-03 13

In [9]:
from collections import Counter
lr, md, l1, l2 = [], [], [], []
for par in search_params:
    lr.append(par['learning_rate'])
    md.append(par['max_depth'])
    l1.append(par['lambda_l1'])
    l2.append(par['lambda_l2'])
    
# получили параметры
Counter(lr).most_common()[0][0], Counter(md).most_common()[0][0], Counter(l1).most_common()[0][0], Counter(l2).most_common()[0][0]

(0.30000000000000004, 8, 0.5, 1.0)

### теперь обучим модель, посмотрим на скор

In [10]:
k_fold_results = []
for train_index, test_index in kf.split(X):
    print("Start iteration")
    # test
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    
    XX = X.iloc[train_index]
    yy = y.iloc[train_index]
    
    X_train, X_val, y_train, y_val = train_test_split(
                                     XX, yy, test_size=0.2, random_state=42, stratify=yy)
    
    print('Training LGBM model...')
    
    print('prepare model')
    lgbtrain = lgb.Dataset(X_train, y_train)
    lgbval = lgb.Dataset(X_val, y_val)


    params = {}
    params['learning_rate'] = 0.3
    params['application'] = 'binary'
    params['max_depth'] = 8
    params['num_leaves'] = 2**8
    params['lambda_l1'] = 0.5
    params['lambda_l2'] = 1.0
    params['verbosity'] = 0
    params['metric'] = 'auc'
    
    print('learning...')
    evals = {}
    gbm = lgb.train(params, 
                      train_set=lgbtrain, 
                      num_boost_round=50, 
                      valid_sets=[lgbtrain, lgbval],
                      verbose_eval=10,
                      evals_result=evals,)
    
    y_pred = gbm.predict(X_test)
    scr = roc_auc_score(y_test, y_pred)
    
    df_feat = pd.DataFrame(data=gbm.feature_importance(importance_type='gain'), index=gbm.feature_name(), columns=['importance'])
    df_feat = df_feat.sort_values(by='importance', ascending=False)
    
    
    k_fold_results.append((gbm, scr, df_feat))

Start iteration
Training LGBM model...
prepare model
learning...
[10]	training's auc: 0.727915	valid_1's auc: 0.722392
[20]	training's auc: 0.73816	valid_1's auc: 0.729779
[30]	training's auc: 0.743872	valid_1's auc: 0.733293
[40]	training's auc: 0.748675	valid_1's auc: 0.736091
[50]	training's auc: 0.753476	valid_1's auc: 0.738577
Start iteration
Training LGBM model...
prepare model
learning...
[10]	training's auc: 0.730258	valid_1's auc: 0.726342
[20]	training's auc: 0.739902	valid_1's auc: 0.73352
[30]	training's auc: 0.742915	valid_1's auc: 0.734844
[40]	training's auc: 0.751306	valid_1's auc: 0.741302
[50]	training's auc: 0.753944	valid_1's auc: 0.742281
Start iteration
Training LGBM model...
prepare model
learning...
[10]	training's auc: 0.729907	valid_1's auc: 0.724619
[20]	training's auc: 0.741004	valid_1's auc: 0.733134
[30]	training's auc: 0.748524	valid_1's auc: 0.738783
[40]	training's auc: 0.753681	valid_1's auc: 0.741869
[50]	training's auc: 0.757773	valid_1's auc: 0.7438

In [11]:
imps = []
auc_score = []
models = []
for rez in k_fold_results:
    imps.append(rez[-1].sort_values('importance', ascending=False))
    auc_score.append(rez[1])
    models.append(rez[0])

In [12]:
(sum(imps) / len(imps)).sort_values('importance', ascending=False)

Unnamed: 0,importance
source_type,586850.746804
counter_song,338298.725428
song_length,169590.544187
counter_user,141293.571653
registration_day,124443.04859
expiration_day,118182.034976
bd,115117.126807
source_screen_name,107842.496414
source_system_tab,40406.107944
registration_year,31605.048764


In [13]:
one_auc = np.mean(auc_score)
one_auc

0.7130743127788632

# ЧАСТЬ 3

In [97]:
all_df = pq.read_table('dataset.parquet').to_pandas()
with open("emb_dists.txt", "rb") as fp:   
       new_feature = pickle.load(fp)

In [98]:
all_df.drop(columns = ['msno', 'song_id', 'artist_name', 'name', 'full_name'], axis=1, inplace=True)

In [99]:
for col in all_df.columns:
    if col not in ['target', 'counter_song', 'counter_user', 'song_name_length', 'song_words_count']:
        all_df[col] = all_df[col].astype('category')

In [100]:
# добавим фичу из эмбедингов
all_df['emb_dists'] = new_feature

In [101]:
X = all_df.drop(['target'], axis=1)
y = all_df['target']
kf = KFold(n_splits=5)

In [102]:
k_fold_results = []
for train_index, test_index in kf.split(X):
    print("Start iteration")
    # test
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]
    
    XX = X.iloc[train_index]
    yy = y.iloc[train_index]
    
    X_train, X_val, y_train, y_val = train_test_split(
                                     XX, yy, test_size=0.2, random_state=42, stratify=yy)
    
    print('Training LGBM model...')
    
    print('prepare model')
    lgbtrain = lgb.Dataset(X_train, y_train)
    lgbval = lgb.Dataset(X_val, y_val)


    params = {}
    params['learning_rate'] = 0.3
    params['application'] = 'binary'
    params['max_depth'] = 8
    params['num_leaves'] = 2**8
    params['lambda_l1'] = 0.5
    params['lambda_l2'] = 1.0
    params['verbosity'] = 0
    params['metric'] = 'auc'
    
    print('learning...')
    evals = {}
    gbm = lgb.train(params, 
                      train_set=lgbtrain, 
                      num_boost_round=50, 
                      valid_sets=[lgbtrain, lgbval],
                      verbose_eval=10,
                      evals_result=evals,)
    
    y_pred = gbm.predict(X_test)
    scr = roc_auc_score(y_test, y_pred)
    
    df_feat = pd.DataFrame(data=gbm.feature_importance(importance_type='gain'), index=gbm.feature_name(), columns=['importance'])
    df_feat = df_feat.sort_values(by='importance', ascending=False)
    
    
    k_fold_results.append((gbm, scr, df_feat))

Start iteration
Training LGBM model...
prepare model
learning...
[10]	training's auc: 0.772108	valid_1's auc: 0.767193
[20]	training's auc: 0.784129	valid_1's auc: 0.77667
[30]	training's auc: 0.790766	valid_1's auc: 0.781378
[40]	training's auc: 0.795266	valid_1's auc: 0.784036
[50]	training's auc: 0.800229	valid_1's auc: 0.786863
Start iteration
Training LGBM model...
prepare model
learning...
[10]	training's auc: 0.777946	valid_1's auc: 0.774984
[20]	training's auc: 0.791456	valid_1's auc: 0.78567
[30]	training's auc: 0.796954	valid_1's auc: 0.789271
[40]	training's auc: 0.800277	valid_1's auc: 0.790995
[50]	training's auc: 0.804188	valid_1's auc: 0.793286
Start iteration
Training LGBM model...
prepare model
learning...
[10]	training's auc: 0.779132	valid_1's auc: 0.774382
[20]	training's auc: 0.789768	valid_1's auc: 0.782577
[30]	training's auc: 0.797951	valid_1's auc: 0.788759
[40]	training's auc: 0.800483	valid_1's auc: 0.789803
[50]	training's auc: 0.803698	valid_1's auc: 0.7914

In [103]:
imps = []
auc_score = []
models = []
for rez in k_fold_results:
    imps.append(rez[-1].sort_values('importance', ascending=False))
    auc_score.append(rez[1])
    models.append(rez[0])

In [104]:
(sum(imps) / len(imps)).sort_values('importance', ascending=False)

Unnamed: 0,importance
emb_dists,933222.758434
source_type,510738.988739
counter_song,300752.419416
song_length,274674.080472
counter_user,207224.594504
source_screen_name,118807.610065
registration_day,89490.949817
expiration_day,85655.890956
bd,82546.804894
source_system_tab,56707.701243


In [105]:
two_auc = np.mean(auc_score)
two_auc

0.7543445448173667

# ВЫВОД

Эмбединги дали больший скор и залетели в топ по импортансу. Я считал испортанс по гейну суммарному, а не по числу сплитов, тк он более релевантный