# Task 1

In [1]:
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC

In [2]:
music_genre = pd.read_csv("../data/music_genre_preprocessed.csv")
music_genre.pop('Unnamed: 0')
music_genre.head()

Unnamed: 0,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,valence,music_genre
0,27.0,0.00468,0.652,245503.541466,0.941,0.792,11,0.115,-5.201,0,0.0748,100.889,0.759,Electronic
1,31.0,0.0127,0.622,218293.0,0.89,0.95,3,0.124,-7.043,0,0.03,115.002,0.531,Electronic
2,28.0,0.00306,0.62,215613.0,0.755,0.0118,9,0.534,-4.617,1,0.0345,127.994,0.333,Electronic
3,34.0,0.0254,0.774,166875.0,0.7,0.00253,2,0.157,-4.498,1,0.239,128.014,0.27,Electronic
4,32.0,0.00465,0.638,222369.0,0.587,0.909,7,0.157,-6.266,1,0.0413,145.036,0.323,Electronic


In [3]:
# Перекодируем целевой признак

music_genre_mapping = {
    "Electronic":1,
    "Rap":2,
    "Classical":3,
    "Rock":4,
    "Hip-Hop":5,
    "Anime":6,
    "Blues":7,
    "Country":8,
    "Jazz":9,
    "Alternative":10
}
music_genre['music_genre'] = music_genre['music_genre'].map(music_genre_mapping)

In [4]:
# Уберем аномалии

outliner_columns = ['duration_ms']

for col in outliner_columns:
    # Первая и третья квантиль
    first_quartile = music_genre[col].describe()['25%']
    third_quartile = music_genre[col].describe()['75%']

    # Интерквантильный размах
    iqr = third_quartile - first_quartile

    # Убираем аномалии
    music_genre = music_genre[(music_genre[col] > (first_quartile - 3 * iqr)) & 
                              (music_genre[col] < (third_quartile + 3 * iqr))]

In [5]:
# Возьмем только 5000 объектов из перемешанной выборки
music_genre = shuffle(music_genre, random_state=0)
music_genre = music_genre.iloc[:5000]
music_genre.reset_index(drop=True, inplace=True)

In [6]:
X = music_genre.drop(['music_genre'], 1)
y = music_genre['music_genre']

In [7]:
# Разбиваем выборку на обучающую и тестовую
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [35]:
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', SVC(kernel = 'linear', C = 1))])

param_grid = dict(clf__C=np.logspace(-4, 1, 6),
                  clf__kernel=['rbf','linear'])

grid = GridSearchCV(pipe, 
                    param_grid=param_grid, 
                    cv=3, 
                    n_jobs=1, 
                    verbose=2, scoring= 'accuracy')
grid.fit(X_train, y_train)
print(grid.best_score_)
print(grid.cv_results_)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   3.7s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   4.5s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   3.8s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   1.9s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   1.8s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   3.0s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   5.4s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   8.3s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   6.1s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.3s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.6s
[CV] END ...................clf__C=0.001, clf__k

In [36]:
y_pred = grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 75   2   3   5   3  17   6   7  12   5]
 [  0  45   0  19  68   0   0   3   1   8]
 [  6   0  99   0   0   6   3   0  13   2]
 [  3  10   0 107   2   0   1   6   3   9]
 [  2  45   0  13  88   0   0   5   2  11]
 [  8   0  18   0   1  93  10   9   3   5]
 [  8   0   3   3   1  16  79  26  10   1]
 [  6   1   1  38   5   2  21  82   8   8]
 [ 33   0  15   6   4   6  20   5  66   3]
 [  8  12   0  32  15   1   6  26   5  56]]
              precision    recall  f1-score   support

           1       0.50      0.56      0.53       135
           2       0.39      0.31      0.35       144
           3       0.71      0.77      0.74       129
           4       0.48      0.76      0.59       141
           5       0.47      0.53      0.50       166
           6       0.66      0.63      0.65       147
           7       0.54      0.54      0.54       147
           8       0.49      0.48      0.48       172
           9       0.54      0.42      0.47       158
          10       0.52     

In [10]:
from catboost import CatBoostClassifier

CatBCl = CatBoostClassifier()
CatBCl.fit(X_train, y_train, verbose=False)
y_cat_pred = CatBCl.predict(X_test)

print(confusion_matrix(y_test, y_cat_pred))
print(classification_report(y_test, y_cat_pred))

[[ 85   0   2   4   1   8  10   2  14   9]
 [  1  54   0  12  69   0   0   1   0   7]
 [  2   0  97   0   0  12   6   0  11   1]
 [  1  15   0  96   5   0   1   7   1  15]
 [  2  60   0   9  76   0   0   5   0  14]
 [ 10   1  10   2   0 102   9   6   1   6]
 [  8   0   3   4   0  17  69  21  20   5]
 [  2   4   1  32   3   2  10  94  11  13]
 [ 27   1  12   8   3   4  17   9  72   5]
 [ 14  14   0  29  11   1   3  19   9  61]]
              precision    recall  f1-score   support

           1       0.56      0.63      0.59       135
           2       0.36      0.38      0.37       144
           3       0.78      0.75      0.76       129
           4       0.49      0.68      0.57       141
           5       0.45      0.46      0.46       166
           6       0.70      0.69      0.70       147
           7       0.55      0.47      0.51       147
           8       0.57      0.55      0.56       172
           9       0.52      0.46      0.48       158
          10       0.45     

Такие вот результаты

In [11]:
def pipe_grid_svc(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
    pipe = Pipeline([('scale', StandardScaler()),
                     ('clf', SVC(kernel = 'linear', C = 1))])

    param_grid = dict(clf__C=np.logspace(-4, 1, 6),
                      clf__kernel=['rbf','linear'])

    grid = GridSearchCV(pipe, 
                        param_grid=param_grid, 
                        cv=3, 
                        n_jobs=1, 
                        verbose=2, scoring= 'accuracy')
    grid.fit(X_train, y_train)
    print(grid.best_score_)
    print(grid.cv_results_)

    y_pred = grid.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))

In [12]:
def cat_boo(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
    
    CatBCl = CatBoostClassifier()
    CatBCl.fit(X_train, y_train, verbose=False)
    y_cat_pred = CatBCl.predict(X_test)

    print(confusion_matrix(y_test, y_cat_pred))
    print(classification_report(y_test, y_cat_pred))

### Отбор признаков

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import ExtraTreesClassifier

##### Одномерный отбор признаков (SelectKBest)

In [14]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
X_t = scaler.transform(X)

# feature extraction
test = SelectKBest(score_func=chi2, k=5)
fit = test.fit(X_t, y)

# summarize scores
print(fit.scores_)
features = fit.transform(X_t)

# summarize selected features
print(features[0:5,:])

[ 173.20385714  716.21765529  125.35200286    8.61306356  191.62030162
 1073.47287141    2.13038609   10.44310536   50.65409125   85.74333614
  233.47521877   13.26155625  102.6330807 ]
[[7.91666667e-01 3.78513189e-01 8.89772932e-01 1.90332326e-01
  1.87866928e-01]
 [5.62500000e-01 8.48380802e-02 5.86147464e-01 5.05538771e-03
  1.80528376e-01]
 [3.85416667e-01 7.06814340e-02 9.04803896e-01 0.00000000e+00
  2.97211350e-02]
 [4.27083333e-01 1.00400351e-01 2.75507546e-01 1.80261833e-03
  2.50733855e-02]
 [5.93750000e-01 4.28701502e-02 6.61302283e-01 9.10372608e-06
  3.00391389e-01]]


In [15]:
# индексы наиболее важных признаков
skb = [0, 1, 4, 5, 10]

In [16]:
X_skb = X.iloc[:, [0, 1, 4, 5, 10]]

In [17]:
X_skb.head()

Unnamed: 0,popularity,acousticness,energy,instrumentalness,speechiness
0,76.0,0.377,0.889,0.189,0.176
1,54.0,0.0845,0.586,0.00502,0.17
2,37.0,0.0704,0.904,0.0,0.0467
3,41.0,0.1,0.276,0.00179,0.0429
4,57.0,0.0427,0.661,9e-06,0.268


In [18]:
pipe_grid_svc(X_skb, y)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   3.0s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   2.7s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   3.0s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.0s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.1s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.0s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   2.9s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   2.9s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   2.8s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   1.3s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   1.4s
[CV] END ...................clf__C=0.001, clf__k

In [19]:
cat_boo(X_skb, y)

[[ 63   0   1   6   2  15  30   6  19  14]
 [  1  63   0   8  52   0   0   2   2  10]
 [  2   0 105   1   0   8   7   1  10   2]
 [  1  17   0  78   7   0   0  11   6  10]
 [  3  47   0  14  96   0   1   6   1   4]
 [ 14   1  15   0   0  87  19   6   3   3]
 [  9   1   4   4   4  14  72  19  15   5]
 [  2   7   0  29   3   3  13  83  11  14]
 [ 26   1  11   6   1   2  18  22  56  12]
 [  6  13   3  23   8   1   4  23  10  62]]
              precision    recall  f1-score   support

           1       0.50      0.40      0.45       156
           2       0.42      0.46      0.44       138
           3       0.76      0.77      0.76       136
           4       0.46      0.60      0.52       130
           5       0.55      0.56      0.56       172
           6       0.67      0.59      0.63       148
           7       0.44      0.49      0.46       147
           8       0.46      0.50      0.48       165
           9       0.42      0.36      0.39       155
          10       0.46     

##### Рекурсивное исключение признаков (recursive feature elimination, RFE)

In [20]:
# feature extraction
scaler = StandardScaler()
scaler.fit(X)
X_s = scaler.transform(X)

model = LogisticRegression()
rfe = RFE(model, 5)
fit = rfe.fit(X_s, y)
print(fit.n_features_)
print(fit.support_)
print(fit.ranking_)



5
[ True False  True False False  True False False  True False  True False
 False]
[1 3 1 5 4 1 9 8 1 6 1 7 2]


In [21]:
# индексы наиболее важных признаков
rfe = [0, 2, 5, 8, 10]

In [22]:
X_rfe = X.iloc[:, [0, 2, 5, 8, 10]]

In [23]:
X_rfe.head()

Unnamed: 0,popularity,danceability,instrumentalness,loudness,speechiness
0,76.0,0.527,0.189,-6.823,0.176
1,54.0,0.816,0.00502,-9.014,0.17
2,37.0,0.373,0.0,-6.348,0.0467
3,41.0,0.37,0.00179,-17.107,0.0429
4,57.0,0.746,9e-06,-7.658,0.268


In [24]:
pipe_grid_svc(X_rfe, y)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   6.1s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   6.4s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   8.7s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   4.3s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   4.8s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   4.8s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   7.5s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   5.8s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   6.9s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.7s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   4.1s
[CV] END ...................clf__C=0.001, clf__k

In [25]:
cat_boo(X_rfe, y)

[[ 70   0   3   2   0   6  16   8  36   8]
 [  0  52   0  12  64   0   0   4   1   8]
 [  4   0 118   0   0   6   5   1   6   2]
 [  1   4   0  85   7   0   1  18   3  16]
 [  2  56   0   8  81   0   1   5   1   7]
 [ 14   0  12   0   0  93  18   3   2   5]
 [  8   1   4   4   2  16  70  14  20  10]
 [  1   3   0  30   5   5  13  83  21  20]
 [ 26   0  12   5   1   2  21  15  56   8]
 [  4  11   0  30  13   2   4  27  10  48]]
              precision    recall  f1-score   support

           1       0.54      0.47      0.50       149
           2       0.41      0.37      0.39       141
           3       0.79      0.83      0.81       142
           4       0.48      0.63      0.55       135
           5       0.47      0.50      0.49       161
           6       0.72      0.63      0.67       147
           7       0.47      0.47      0.47       149
           8       0.47      0.46      0.46       181
           9       0.36      0.38      0.37       146
          10       0.36     

##### Отбор на основе важности признаков (ExtraTreesClassifier)

In [26]:
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, y)
print(model.feature_importances_)

[0.15326256 0.08782352 0.09268602 0.06547848 0.08068753 0.09192268
 0.05220557 0.05714365 0.07965995 0.01846845 0.08945481 0.06016855
 0.07103823]


In [27]:
# индексы наиболее важных признаков
etc = [0, 1, 2, 5, 10]

In [28]:
X_etc = X.iloc[:, [0, 1, 2, 5, 10]]

In [29]:
X_etc.head()

Unnamed: 0,popularity,acousticness,danceability,instrumentalness,speechiness
0,76.0,0.377,0.527,0.189,0.176
1,54.0,0.0845,0.816,0.00502,0.17
2,37.0,0.0704,0.373,0.0,0.0467
3,41.0,0.1,0.37,0.00179,0.0429
4,57.0,0.0427,0.746,9e-06,0.268


In [30]:
pipe_grid_svc(X_etc, y)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   4.1s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   3.4s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   3.4s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.2s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   4.6s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   3.9s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   4.6s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   9.6s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   5.6s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.6s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.3s
[CV] END ...................clf__C=0.001, clf__k

In [31]:
cat_boo(X_etc, y)

[[ 76   1   2   2   3   6  14   6  20   8]
 [  4  53   0  17  73   1   0   0   0   4]
 [  1   0 107   0   0   9   7   1   8   3]
 [  0  12   0  86   6   0   1   9   5  13]
 [  1  58   0  15  84   0   0   7   1   5]
 [ 14   0  14   1   0  91  16   8   2   3]
 [ 17   1   5   7   1  14  62  21  14   9]
 [  3   5   1  33   3   1  12  87   8  13]
 [ 24   0   7   9   2   3  32  12  45   9]
 [ 10   6   0  25  17   0   8  23  10  63]]
              precision    recall  f1-score   support

           1       0.51      0.55      0.53       138
           2       0.39      0.35      0.37       152
           3       0.79      0.79      0.79       136
           4       0.44      0.65      0.53       132
           5       0.44      0.49      0.47       171
           6       0.73      0.61      0.66       149
           7       0.41      0.41      0.41       151
           8       0.50      0.52      0.51       166
           9       0.40      0.31      0.35       143
          10       0.48     

### Понижение размерности

##### Метод главных компонент (principal component analysis, PCA)

In [37]:
from sklearn.decomposition import PCA

# feature extraction
pca = PCA(n_components=5)
fit = pca.fit(X)
features = fit.transform(X)

# summarize components
print(f'Explained Variance: {fit.explained_variance_ratio_}')
print(features[0:5,:])

Explained Variance: [9.99999772e-01 1.70492521e-07 4.88052575e-08 6.18726737e-09
 2.52507328e-09]
[[-5.24786827e+04  5.82305051e+01  2.91827810e+01  4.75954839e+00
   9.54959285e-01]
 [-2.39706827e+04 -1.71513533e+01  9.71175968e+00  2.92226459e-01
   5.85943296e+00]
 [-6.87868240e+03  1.52237829e+01 -7.59515925e+00 -2.83565674e+00
  -3.29500485e+00]
 [ 1.14414317e+05 -1.03189734e+00 -3.35144584e+00  7.65008138e+00
   2.01019835e+00]
 [-8.66746827e+04 -5.47324231e-01  1.17081658e+01  7.21789372e-02
   4.87327414e+00]]


In [38]:
pipe_grid_svc(features[:,:], y)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   4.3s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   4.2s
[CV] END .....................clf__C=0.0001, clf__kernel=rbf; total time=   4.3s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.0s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.6s
[CV] END ..................clf__C=0.0001, clf__kernel=linear; total time=   2.2s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   4.1s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   4.3s
[CV] END ......................clf__C=0.001, clf__kernel=rbf; total time=   4.1s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.2s
[CV] END ...................clf__C=0.001, clf__kernel=linear; total time=   2.2s
[CV] END ...................clf__C=0.001, clf__k

In [39]:
cat_boo(features[:,:], y)

[[ 38   7   4   4   0  19  29  13  15  19]
 [  3  40   0  28  54   0   0   3   1  13]
 [  4   0 106   2   0   6   9   4  12   1]
 [  4  37   0  51  47   0   0   7   4   9]
 [  2  42   0  32  44   0   2   6   8  15]
 [ 10   0  19   0   1  93  17   3   4   4]
 [ 30   2   5   6   1  25  41  12  13   5]
 [  8   9   1  18  17   2  11  54  12  27]
 [ 17   2   9   5   6   6  15  19  55  11]
 [  7  12   0  28  25   2   2  36  14  35]]
              precision    recall  f1-score   support

           1       0.31      0.26      0.28       148
           2       0.26      0.28      0.27       142
           3       0.74      0.74      0.74       144
           4       0.29      0.32      0.31       159
           5       0.23      0.29      0.25       151
           6       0.61      0.62      0.61       151
           7       0.33      0.29      0.31       140
           8       0.34      0.34      0.34       159
           9       0.40      0.38      0.39       145
          10       0.25     

Отбор признаков до 5 самых важных, не улучшил качество моделей. Это произошло в силу множества причин:
1. Прикладная область — музыка. Даже человек легко путает жанры «Хип-хоп» и «Рэп».
2. Понижение количества признаков в данном случае, скорее уменьшает кол-во «полезных» данных, чем избавляет от шума.
3. Такому кол-ву признаков не хватает данных, чтобы восстановить зависимости классов.