# Music Recommendation

In [24]:
from load_data import *
from embeddings import EmbeddingsBuilder
from create_features.song_features import create_song_features
from create_features.user_features import create_user_features
from catboost import CatBoostRanker, Pool, cv
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.manifold import TSNE
from sklearn.preprocessing import LabelEncoder

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Подготовка данных

In [2]:
random_state = 42
data_dir = "./data/"

In [3]:
%%time
song_features_df = create_song_features(data_dir)

CPU times: user 50.3 s, sys: 1.24 s, total: 51.5 s
Wall time: 51.7 s


In [4]:
song_features_df.head(3)

Unnamed: 0,song_id,song_length,genre_ids,artist_name,composer,lyricist,language,artist_name_count,composer_count,lyricists_count,isrc_year,genres_count
0,CXoTN1eb7AI+DntdU1vbcwGRV4SCIDxZu+YD8JP8r4E=,247640,465,張信哲 (Jeff Chang),董貞,何啟弘,3.0,1,1,1,2014,1
1,o0kFgae9QtnYgRkVPqLJwa05zIhRlUjfF7O1tDw0ZDU=,197328,444,BLACKPINK,TEDDY| FUTURE BOUNCE| Bekuh BOOM,TEDDY,31.0,1,3,1,-1,1
2,DwVvVurfpuz+XPuFvucclVQEyPqcpUkHR0ne1RQzPs0=,231781,465,SUPER JUNIOR,<UNK>,<UNK>,31.0,1,0,0,-1,0


In [5]:
%%time
user_features_df = create_user_features(data_dir)

CPU times: user 321 ms, sys: 5.45 ms, total: 327 ms
Wall time: 326 ms


In [6]:
user_features_df.head(3)

Unnamed: 0,msno,city,gender,registered_via,bd_category,registration_init_year,expiration_date_year
0,XQxgAYj3klVKjR3oxPPXYYFp4soD4TuBghkhMTD4oTw=,1,<UNK>,7,<UNK>,2011,2017
1,UizsfmJb9mV54qE9hCYyU07Va97c0lCRLEQX3ae+ztM=,1,<UNK>,7,<UNK>,2015,2017
2,D8nEhsIOBSoE6VthTaqDX8U6lqjJ7dLdr72mOyLya2A=,1,<UNK>,4,<UNK>,2016,2017


In [7]:
train_df = load_train(data_dir + "train.csv")
train_df.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1


In [8]:
train_df = train_df.merge(song_features_df, on="song_id", how="left")
train_df = train_df.merge(user_features_df, on="msno", how="left")

In [9]:
train_df.head(3)

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,song_length,genre_ids,artist_name,composer,...,composer_count,lyricists_count,isrc_year,genres_count,city,gender,registered_via,bd_category,registration_init_year,expiration_date_year
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,206471.0,359,Bastille,Dan Smith| Mark Crew,...,2.0,0.0,2016,0.0,1,<UNK>,7,<UNK>,2012,2017
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,284584.0,1259,Various Artists,<UNK>,...,0.0,0.0,1999,0.0,13,female,9,young,2011,2017
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,225396.0,1259,Nas,N. Jones、W. Adams、J. Lordan、D. Ingle,...,1.0,0.0,2006,0.0,13,female,9,young,2011,2017


In [10]:
train_df.source_system_tab = train_df.source_system_tab.cat.add_categories("<UNK>").fillna(value="<UNK>")
train_df.source_screen_name = train_df.source_screen_name.cat.add_categories("<UNK>").fillna(value="<UNK>")
train_df.source_type = train_df.source_type.cat.add_categories("<UNK>").fillna(value="<UNK>")

train_df.msno = train_df.msno.astype("category")
train_df.song_id = train_df.song_id.astype("category")

In [11]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7377418 entries, 0 to 7377417
Data columns (total 23 columns):
 #   Column                  Dtype   
---  ------                  -----   
 0   msno                    category
 1   song_id                 category
 2   source_system_tab       category
 3   source_screen_name      category
 4   source_type             category
 5   target                  uint8   
 6   song_length             float64 
 7   genre_ids               category
 8   artist_name             category
 9   composer                category
 10  lyricist                category
 11  language                category
 12  artist_name_count       float64 
 13  composer_count          float64 
 14  lyricists_count         float64 
 15  isrc_year               category
 16  genres_count            float64 
 17  city                    category
 18  gender                  category
 19  registered_via          category
 20  bd_category             category
 21  registra

In [12]:
na_mask = train_df.isna().any(axis=1)
print(f"Количество записей, для которых не нашлось пользователя или песни: {sum(na_mask)}")

Количество записей, для которых не нашлось пользователя или песни: 114


Их оказалось не так много, уберем из нашего набора данных. 

In [13]:
train_df = train_df[~na_mask]

## Задание 1

Построить рекомендации для каждого пользователя, произвести оценку качества с помощью 5-fold CV с метриками NDCG, ROC AUC.

При разработке признаков я умеренно не стал брать признаки, полученные с помощью матричных факторизаций, чтобы потом добавить неклассические эмбеддинги, полученные в задании 2. 

В качестве модели машинного обучения я решил использовать **CatBoost**, так как она хорошо работает с категориальными признаками. 

In [45]:
cat_columns = train_df.select_dtypes(include=["category"]).columns.values
data = train_df.sort_values(by="msno")

In [46]:
params = {
    "loss_function": "YetiRank", 
    "iterations": 100, 
    "custom_metric": ["NDCG", "QueryAUC:type=Ranking"], 
    "random_seed": random_state
}

data_pool = Pool(data=data.drop("target", axis=1), 
                 label=data.target.values, 
                 group_id=data.msno.cat.codes.values, 
                 cat_features=cat_columns, 
                 has_header=True)

cv_data = cv(params=params, pool=data_pool, fold_count=2, 
             shuffle=True, partition_random_seed=random_state, verbose=False)

0:	test: 0.4938031	best: 0.4938031 (0)	total: 30.4s	remaining: 50m 14s
1:	test: 0.4914225	best: 0.4938031 (0)	total: 57.2s	remaining: 46m 40s
2:	test: 0.4911426	best: 0.4938031 (0)	total: 1m 20s	remaining: 43m 23s
3:	test: 0.4963642	best: 0.4963642 (3)	total: 1m 44s	remaining: 41m 44s
4:	test: 0.5020549	best: 0.5020549 (4)	total: 2m 7s	remaining: 40m 29s
5:	test: 0.5015923	best: 0.5020549 (4)	total: 2m 30s	remaining: 39m 21s
6:	test: 0.5006870	best: 0.5020549 (4)	total: 2m 54s	remaining: 38m 33s
7:	test: 0.5565855	best: 0.5565855 (7)	total: 3m 17s	remaining: 37m 46s
8:	test: 0.6379155	best: 0.6379155 (8)	total: 3m 40s	remaining: 37m 4s
9:	test: 0.6904783	best: 0.6904783 (9)	total: 4m 3s	remaining: 36m 27s
10:	test: 0.6900035	best: 0.6904783 (9)	total: 4m 25s	remaining: 35m 47s
11:	test: 0.6920859	best: 0.6920859 (11)	total: 4m 50s	remaining: 35m 26s
12:	test: 0.6915712	best: 0.6920859 (11)	total: 5m 13s	remaining: 34m 58s
13:	test: 0.7174606	best: 0.7174606 (13)	total: 5m 37s	remaining

In [48]:
cv_data

Unnamed: 0,iterations,test-PFound-mean,test-PFound-std,test-NDCG:type=Base-mean,test-NDCG:type=Base-std,test-QueryAUC:type=Ranking-mean,test-QueryAUC:type=Ranking-std
0,0,0.493803,0.005849,0.791047,0.000026,0.528079,0.000300
1,1,0.491422,0.008484,0.791139,0.000394,0.528147,0.000020
2,2,0.491143,0.008955,0.791034,0.000519,0.528186,0.000069
3,3,0.496364,0.015481,0.792818,0.002780,0.529192,0.001614
4,4,0.502055,0.006850,0.794471,0.000271,0.530278,0.000459
...,...,...,...,...,...,...,...
95,95,0.769591,0.000044,0.848015,0.001024,0.552037,0.001075
96,96,0.769621,0.000120,0.848075,0.000964,0.552099,0.001021
97,97,0.769655,0.000152,0.848071,0.000953,0.552133,0.000981
98,98,0.769898,0.000441,0.848187,0.000814,0.552218,0.000922


## Задание 2

Построить неклассические эмбеддинги и исследовать их полезность. 

In [20]:
%%time
embeddings = EmbeddingsBuilder(embedding_dim=100, random_state=random_state)
embeddings.fit(train_df)

Fitting users: 100%|██████████| 30755/30755 [15:54<00:00, 32.23it/s]


CPU times: user 17min 18s, sys: 10.5 s, total: 17min 28s
Wall time: 16min 33s


<embeddings.EmbeddingsBuilder at 0x7ff7529a2460>

Посмотрим на похожие песни и пользователей, визуализировав их с помощью t-SNE.

In [25]:
t_sne = TSNE(n_components=2)

In [None]:
song_features_df.groupby(["genre_ids", "artist_name"]).count()