In [106]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from catboost import CatBoostClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler


# Загрузка данных

Скачать датасет можно по ссылке: https://www.kaggle.com/datasets/saurabhshahane/music-dataset-1950-to-2019.

In [107]:
data = pd.read_csv('tcc_ceds_music.csv', header=0)
data


Unnamed: 0.1,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,violence,world/life,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
0,0,mukesh,mohabbat bhi jhoothi,1950,pop,hold time feel break feel untrue convince spea...,95,0.000598,0.063746,0.000598,...,0.380299,0.117175,0.357739,0.454119,0.997992,0.901822,0.339448,0.137110,sadness,1.000000
1,4,frankie laine,i believe,1950,pop,believe drop rain fall grow believe darkest ni...,51,0.035537,0.096777,0.443435,...,0.001284,0.001284,0.331745,0.647540,0.954819,0.000002,0.325021,0.263240,world/life,1.000000
2,6,johnnie ray,cry,1950,pop,sweetheart send letter goodbye secret feel bet...,24,0.002770,0.002770,0.002770,...,0.002770,0.225422,0.456298,0.585288,0.840361,0.000000,0.351814,0.139112,music,1.000000
3,10,pérez prado,patricia,1950,pop,kiss lips want stroll charm mambo chacha merin...,54,0.048249,0.001548,0.001548,...,0.225889,0.001548,0.686992,0.744404,0.083935,0.199393,0.775350,0.743736,romantic,1.000000
4,12,giorgos papadopoulos,apopse eida oneiro,1950,pop,till darling till matter know till dream live ...,48,0.001350,0.001350,0.417772,...,0.068800,0.001350,0.291671,0.646489,0.975904,0.000246,0.597073,0.394375,romantic,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28367,82447,mack 10,10 million ways,2019,hip hop,cause fuck leave scar tick tock clock come kno...,78,0.001350,0.001350,0.001350,...,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,obscene,0.014286
28368,82448,m.o.p.,ante up (robbin hoodz theory),2019,hip hop,minks things chain ring braclets yap fame come...,67,0.001284,0.001284,0.035338,...,0.001284,0.001284,0.662082,0.789580,0.004607,0.000002,0.922712,0.797791,obscene,0.014286
28369,82449,nine,whutcha want?,2019,hip hop,get ban get ban stick crack relax plan attack ...,77,0.001504,0.154302,0.168988,...,0.001504,0.001504,0.663165,0.726970,0.104417,0.000001,0.838211,0.767761,obscene,0.014286
28370,82450,will smith,switch,2019,hip hop,check check yeah yeah hear thing call switch g...,67,0.001196,0.001196,0.001196,...,0.001196,0.001196,0.883028,0.786888,0.007027,0.000503,0.508450,0.885882,obscene,0.014286


# Первичный анализ данных

In [108]:
names_counts = data.artist_name.value_counts()
names_counts.describe()

count    5426.000000
mean        5.228898
std         9.209810
min         1.000000
25%         1.000000
50%         2.000000
75%         5.000000
max       190.000000
Name: count, dtype: float64

In [109]:
# удалим всех артистов, у которых меньше 5 песен, они будут портить предсказание

names_counts_idx = names_counts[names_counts>=5].index

data['artist_name'] = data['artist_name'].apply(lambda x : x if x in names_counts_idx else None)

In [110]:
data = data.dropna()

In [111]:
# создаем датафрейм признаков, куда не включаем текстовые признаки
X = data.drop(['Unnamed: 0', 'track_name', 'lyrics', 'dating', 'topic'], axis=1)

In [112]:
X

Unnamed: 0,artist_name,release_date,genre,len,violence,world/life,night/time,shake the audience,family/gospel,romantic,...,like/girls,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,age
1,frankie laine,1950,pop,51,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,...,0.001284,0.001284,0.001284,0.331745,0.647540,0.954819,0.000002,0.325021,0.263240,1.000000
5,perry como,1950,pop,98,0.420685,0.001053,0.074078,0.001053,0.001053,0.001053,...,0.001053,0.128292,0.001053,0.689158,0.685588,0.898594,0.000000,0.768137,0.549535,1.000000
7,johnny mathis,1950,pop,21,0.002506,0.336056,0.002506,0.002506,0.002506,0.176861,...,0.002506,0.062602,0.002506,0.379400,0.529421,0.925703,0.000072,0.373454,0.192167,1.000000
11,the chordettes,1951,pop,38,0.001645,0.001645,0.397490,0.001645,0.001645,0.001645,...,0.001645,0.001645,0.001645,0.265677,0.622260,0.988956,0.000000,0.058739,0.111083,0.985714
12,frankie laine,1951,pop,173,0.244358,0.083570,0.000627,0.013375,0.022181,0.000627,...,0.000627,0.000627,0.032581,0.522365,0.528729,0.852409,0.000000,0.408491,0.153127,0.985714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28360,nappy roots,2019,hip hop,74,0.125810,0.353286,0.001196,0.001196,0.001196,0.001196,...,0.029702,0.001196,0.001196,0.546193,0.764967,0.182730,0.000000,0.751649,0.698689,0.014286
28364,nipsey hussle,2019,hip hop,88,0.001096,0.001096,0.001096,0.033829,0.001096,0.001096,...,0.001096,0.001096,0.001096,0.635005,0.851755,0.014156,0.000000,0.735161,0.913911,0.014286
28365,nappy roots,2019,hip hop,109,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,...,0.000810,0.000810,0.000810,0.766057,0.773325,0.037348,0.000000,0.682605,0.855851,0.014286
28367,mack 10,2019,hip hop,78,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,...,0.001350,0.065664,0.001350,0.889527,0.759711,0.062549,0.000000,0.751649,0.695686,0.014286


In [113]:
# One-Hot кодируем жанр и имя артиста
X_t = pd.concat([X.drop(['artist_name', 'genre'], axis=1), pd.get_dummies(X['artist_name']), pd.get_dummies(X['genre'])], axis=1)

In [114]:
X_t

Unnamed: 0,release_date,len,violence,world/life,night/time,shake the audience,family/gospel,romantic,communication,obscene,...,ziggy marley,ziggy marley & the melody makers,zz top,blues,country,hip hop,jazz,pop,reggae,rock
1,1950,51,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,0.001284,0.001284,...,False,False,False,False,False,False,False,True,False,False
5,1950,98,0.420685,0.001053,0.074078,0.001053,0.001053,0.001053,0.001053,0.001053,...,False,False,False,False,False,False,False,True,False,False
7,1950,21,0.002506,0.336056,0.002506,0.002506,0.002506,0.176861,0.002506,0.002506,...,False,False,False,False,False,False,False,True,False,False
11,1951,38,0.001645,0.001645,0.397490,0.001645,0.001645,0.001645,0.066201,0.001645,...,False,False,False,False,False,False,False,True,False,False
12,1951,173,0.244358,0.083570,0.000627,0.013375,0.022181,0.000627,0.109129,0.458984,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28360,2019,74,0.125810,0.353286,0.001196,0.001196,0.001196,0.001196,0.001196,0.040241,...,False,False,False,False,False,True,False,False,False,False
28364,2019,88,0.001096,0.001096,0.001096,0.033829,0.001096,0.001096,0.001096,0.733959,...,False,False,False,False,False,True,False,False,False,False
28365,2019,109,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,0.394272,...,False,False,False,False,False,True,False,False,False,False
28367,2019,78,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,0.391651,...,False,False,False,False,False,True,False,False,False,False


In [115]:
X_t = X_t.reset_index()
data = data.reset_index()
X_t = X_t.reset_index()
data = data.reset_index()

In [116]:
# нормируем дату и длину песни
X_t[['release_date', 'len']]= MinMaxScaler().fit(X_t[['release_date', 'len']].to_numpy()).set_output(transform='pandas').transform(X_t[['release_date', 'len']].to_numpy())


In [117]:
X_t

Unnamed: 0,level_0,index,release_date,len,violence,world/life,night/time,shake the audience,family/gospel,romantic,...,ziggy marley,ziggy marley & the melody makers,zz top,blues,country,hip hop,jazz,pop,reggae,rock
0,0,1,0.000000,0.252525,0.096777,0.443435,0.001284,0.001284,0.027007,0.001284,...,False,False,False,False,False,False,False,True,False,False
1,1,5,0.000000,0.489899,0.420685,0.001053,0.074078,0.001053,0.001053,0.001053,...,False,False,False,False,False,False,False,True,False,False
2,2,7,0.000000,0.101010,0.002506,0.336056,0.002506,0.002506,0.002506,0.176861,...,False,False,False,False,False,False,False,True,False,False
3,3,11,0.014493,0.186869,0.001645,0.001645,0.397490,0.001645,0.001645,0.001645,...,False,False,False,False,False,False,False,True,False,False
4,4,12,0.014493,0.868687,0.244358,0.083570,0.000627,0.013375,0.022181,0.000627,...,False,False,False,False,False,False,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21531,21531,28360,1.000000,0.368687,0.125810,0.353286,0.001196,0.001196,0.001196,0.001196,...,False,False,False,False,False,True,False,False,False,False
21532,21532,28364,1.000000,0.439394,0.001096,0.001096,0.001096,0.033829,0.001096,0.001096,...,False,False,False,False,False,True,False,False,False,False
21533,21533,28365,1.000000,0.545455,0.000810,0.000810,0.000810,0.000810,0.000810,0.000810,...,False,False,False,False,False,True,False,False,False,False
21534,21534,28367,1.000000,0.388889,0.001350,0.001350,0.001350,0.001350,0.001350,0.001350,...,False,False,False,False,False,True,False,False,False,False


# Создание модели

In [118]:
knn = NearestNeighbors( algorithm='brute')
knn.fit(X_t)

# Тестирование

In [157]:
# функция для тестирования
def predict(artist_name, track_name, n_neighb):
    
    i = data[data['track_name']==track_name].iloc[0,0]
    d, ind = knn.kneighbors(X_t.iloc[i, :].to_numpy().reshape(1,-1), n_neighbors=n_neighb+1)
    print(data.iloc[i, 3:7])
    print('='*30)
    for i in ind[0][1:]:
        print(data.iloc[i, 3:7])
        print('-'*30)

In [158]:
data[data['artist_name']=='the beatles']

Unnamed: 0.1,level_0,index,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
18256,18256,23629,70849,the beatles,not a second time,1963,rock,know wonder change mind reason change give lin...,24,0.002105,...,0.002105,0.002105,0.443301,0.657847,0.213855,0.000000,0.965993,0.817812,world/life,0.814286
18257,18257,23630,70854,the beatles,all i've got to do,1963,rock,want yeah phone come run home yeah want kiss y...,29,0.001815,...,0.001815,0.038473,0.463880,0.650104,0.217871,0.000000,0.881492,0.579566,music,0.814286
18258,18258,23631,70858,the beatles,you really got a hold on me,1963,rock,like think treat badly madly hold hold baby wa...,45,0.100844,...,0.001316,0.001316,0.337160,0.675512,0.527108,0.000000,0.683636,0.388369,romantic,0.814286
18259,18259,23632,70861,the beatles,till there was you,1963,rock,bell hear ring hear till bird wing till music ...,11,0.004785,...,0.004785,0.004785,0.720568,0.558162,0.793172,0.000000,0.641385,0.338318,music,0.814286
18260,18260,23633,70862,the beatles,all my loving,1963,rock,close eye kiss tomorrow miss remember true awa...,22,0.002288,...,0.041517,0.002288,0.383732,0.690537,0.207831,0.000000,0.903133,0.563550,romantic,0.814286
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19530,19530,25093,75088,the beatles,long tall sally,1988,rock,go tell aunt uncle say miss baby yeeeeh baby w...,51,0.319836,...,0.001316,0.094508,0.347991,0.732277,0.209839,0.004372,0.920651,0.770764,night/time,0.457143
19553,19553,25123,75170,the beatles,i call your name,1988,rock,blame unfair sleep night go weep night know kn...,23,0.002632,...,0.002632,0.147428,0.680494,0.733277,0.314256,0.000123,0.967024,0.691682,night/time,0.457143
19560,19560,25131,75191,the beatles,this boy,1988,rock,take away regret someday want good want want t...,15,0.003289,...,0.388454,0.124520,0.648002,0.618747,0.213855,0.000016,0.700124,0.337317,sadness,0.457143
19568,19568,25142,75232,the beatles,yes it is,1988,rock,wear tonight remember say tonight color baby w...,45,0.030718,...,0.001196,0.001196,0.381566,0.704151,0.640562,0.000000,0.414674,0.329308,world/life,0.457143


In [159]:
predict('the beatles', "and i love her", 10)

artist_name        the beatles
track_name      and i love her
release_date              1964
genre                     rock
Name: 18284, dtype: object
artist_name             the beatles
track_name      i'll follow the sun
release_date                   1964
genre                          rock
Name: 18283, dtype: object
------------------------------
artist_name                        the kinks
track_name      all day and all of the night
release_date                            1964
genre                                   rock
Name: 18285, dtype: object
------------------------------
artist_name              the beatles
track_name      things we said today
release_date                    1964
genre                           rock
Name: 18286, dtype: object
------------------------------
artist_name     the rolling stones
track_name           not fade away
release_date                  1964
genre                         rock
Name: 18282, dtype: object
------------------------------
artis



In [160]:
predict('mose allison', "i don't worry about a thing", 10)

artist_name                    mose allison
track_name      i don't worry about a thing
release_date                           1962
genre                                 blues
Name: 10003, dtype: object
artist_name     chuck berry
track_name      thirty days
release_date           1962
genre                 blues
Name: 10004, dtype: object
------------------------------
artist_name     howlin' wolf
track_name          spoonful
release_date            1962
genre                  blues
Name: 10002, dtype: object
------------------------------
artist_name                   aretha franklin
track_name      ac-cent-tchu-ate the positive
release_date                             1962
genre                                   blues
Name: 10005, dtype: object
------------------------------
artist_name                    dinah washington
track_name      lover man (oh where can you be)
release_date                               1962
genre                                     blues
Name: 10006, dtype:



In [161]:
predict('marilyn manson', 'deformography', 7)	

artist_name     marilyn manson
track_name       deformography
release_date              1996
genre                     rock
Name: 20006, dtype: object
artist_name     rage against the machine
track_name         year of tha boomerang
release_date                        1996
genre                               rock
Name: 20005, dtype: object
------------------------------
artist_name       modest mouse
track_name      novocain stain
release_date              1996
genre                     rock
Name: 20007, dtype: object
------------------------------
artist_name          george michael
track_name      you have been loved
release_date                   1996
genre                          rock
Name: 20004, dtype: object
------------------------------
artist_name     modest mouse
track_name             might
release_date            1996
genre                   rock
Name: 20008, dtype: object
------------------------------
artist_name         korn
track_name      lowrider
release_date       



In [163]:
# здесь можно посмотерть разные части датасета, чтобы выбрать примеры
start_row = 20000
end_row = 20030
data.iloc[start_row:end_row, :]

Unnamed: 0.1,level_0,index,Unnamed: 0,artist_name,track_name,release_date,genre,lyrics,len,dating,...,sadness,feelings,danceability,loudness,acousticness,instrumentalness,valence,energy,topic,age
20000,20000,25675,76527,rage against the machine,wind below,1996,rock,flip capital eclipse vocal tone sweat apocalyp...,123,0.000957,...,0.000957,0.000957,0.352323,0.667051,0.002579,0.198381,0.432193,0.744737,violence,0.342857
20001,20001,25676,76530,rod stewart,when i need you,1996,rock,need close eye want heartbeat away need hold h...,69,0.025777,...,0.123891,0.000849,0.36532,0.62185,0.578313,0.0,0.176628,0.310289,romantic,0.342857
20002,20002,25678,76541,weezer,i just threw out the love of my dreams,1996,rock,tall things throw dream eye ears blood tear br...,44,0.001548,...,0.496256,0.001548,0.377234,0.813117,0.002218,2e-06,0.340478,0.963963,sadness,0.342857
20003,20003,25679,76545,korn,lowrider,1996,rock,shit friends drive lowrider lowrider little lo...,13,0.004785,...,0.004785,0.004785,0.74981,0.729405,0.000774,0.491903,0.976298,0.793787,obscene,0.342857
20004,20004,25680,76549,george michael,you have been loved,1996,rock,take road past school change time think young ...,77,0.000822,...,0.000822,0.000822,0.588433,0.514191,0.243975,0.072065,0.141591,0.24322,world/life,0.342857
20005,20005,25681,76550,rage against the machine,year of tha boomerang,1996,rock,sisters check line spend haiti state mind cast...,176,0.000578,...,0.000578,0.000578,0.316582,0.654386,0.0005,0.000428,0.509481,0.711703,obscene,0.342857
20006,20006,25682,76551,marilyn manson,deformography,1996,rock,wish fall fall hard fell insect decay little t...,113,0.000591,...,0.092487,0.000591,0.476876,0.66959,0.011144,6.2e-05,0.32296,0.851847,violence,0.342857
20007,20007,25684,76553,modest mouse,novocain stain,1996,rock,work tell stain memories yeah shoe remember so...,37,0.002288,...,0.002288,0.002288,0.355572,0.69187,0.093573,0.009028,0.44765,0.816811,world/life,0.342857
20008,20008,25685,76561,modest mouse,might,1996,rock,break bone goddamn yesterday night break bone ...,11,0.004785,...,0.317103,0.004785,0.476876,0.733918,0.069879,0.804656,0.210635,0.955955,sadness,0.342857
20009,20009,25686,76563,soundgarden,ty cobb,1996,rock,sit magic smoke mirror tire rubber fire watch ...,114,0.000532,...,0.000532,0.000532,0.447633,0.676768,0.001174,0.431174,0.172506,0.98999,violence,0.342857
