# Задание из четвёртой лабы

Обучите один из алгоритмов классификации из библиотеки sklearn. Объяснять принцип работы классификатора в этой лабораторной не требуется.

1. Выберите один из датасетов первой лабораторной


2. Найдите в данных признак, который будете предсказывать
    
    Если это категория, то у неё должно быть от 2 до 10 значений (например, колонка species в датасете миграции птиц содержит 3 категории).
    
    Если признак количественный, то его нужно дискретизировать (например, для популярности трека в датасете spotify можно разделить треки на 3 категории: 0–20 — непопулярные, 21–50 — обычные, 51–100 — популярные). Полученную категорию нужно записать в датафрейм как отдельную колонку.


3. Выделите признаки для предсказания
    
    Используйте от 2 до 5 количественных признаков для предсказания целевой переменной


4. Проверьте нормальность количественных признаков
    
    Используйте статистический тест из лекции. Если распределение признака является лог-нормальным, логарифмируйте признак и проверьте результат на нормальность.
    
    Как минимум 1 из признаков должен быть нормальным


5. Покажите качество предсказания модели с помощью метрик

    Метрика должна учитывать баланс классов в датасете

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
# Single seed reused as `random_state` further down so sampling and model fitting are repeatable.
seed = 0

In [None]:
# Load the Spotify features table; the path is relative to the current working directory.
data = pd.read_csv('data/SpotifyFeatures.csv')

In [None]:
# Print column dtypes and non-null counts, then preview the first five rows.
data.info()
data.head(5)

In [None]:
def score(popularity: int) -> str:
    """Map a numeric popularity value to a coarse category label.

    A larger ``popularity`` means a more popular track, so the labels grow
    with the value:

    * below 15                -> "Almost unknown"
    * from 15, below 35       -> "Regular"
    * from 35, below 65       -> "Popular"
    * 65 and above            -> "Very popular"

    Args:
        popularity: Popularity value of one track.

    Returns:
        The category label for that value.
    """
    # Guard clauses ordered from the least to the most popular bin.
    if popularity < 15:
        return "Almost unknown"
    if popularity < 35:
        return "Regular"
    if popularity < 65:
        return "Popular"
    return "Very popular"

# Store the label as a new column at position 5; the numeric `popularity` column is left in place.
data.insert(5, "categorized popularity", data.popularity.apply(score))

In [None]:
# Inspect the frame again (it now carries the label column) and look at five random rows.
data.info()
data.sample(5)

In [None]:
# Predictors: five feature columns. Target: the popularity label column.
X = data[['liveness', 'danceability', 'energy', 'speechiness', 'tempo']]
y = data['categorized popularity']
# One-column frame, built so the target can be summarised with the DataFrame API.
Y = y.to_frame()

# Structure of the predictors and of the target, separated by blank lines.
X.info()
print()
Y.info()
print()
Y.describe()

In [None]:
# Histogram of the target labels. The bin edges are -0.25, 0.75, 1.75, 2.75, 3.75
# (four unit-wide bins) and every bar is drawn 0.5 wide.
# NOTE(review): presumably matplotlib places the four string labels at 0..3, so each
# bin holds exactly one class — confirm on the rendered plot.
plt.hist(y, bins = np.arange(5) - 0.25, width = 0.5) 
plt.show()

In [None]:
def count_mean_p(sample):
    """Return the p-value of ``scipy.stats.normaltest`` for ``sample``.

    The test is deterministic for a given sample, so repeating it and
    averaging the results yields the same number; a single call is made.

    Args:
        sample: One-dimensional array-like of observations.

    Returns:
        The two-sided p-value of the normality test. Small values mean the
        sample is unlikely to come from a normal distribution.
    """
    _, p_value = stats.normaltest(sample)
    return p_value

# Normality check for every feature column: test a seeded random subsample of
# 100 rows, and when normality is rejected, test the logarithm of the same
# subsample to see whether the feature looks log-normal instead.
alpha = 0.05
print('Assume alpha is 0.05. If label is normally distributed, then its p-value must be greater than alpha', end = '\n\n')
for label in X:
    series = data[label]
    sample = series.sample(100, random_state = seed)
    p = count_mean_p(sample)
    print('norm test for ' + label + ' : ' + str(p))
    if p < alpha:
        # The logarithm is only defined for strictly positive values, so the
        # whole column must be positive for the log-normal check to make sense.
        if series.min() <= 0:
            print('cannot lognorm here')
        else:
            # A variable is log-normal exactly when its logarithm is normal, so
            # the log is taken of the raw values. Do not shift by the minimum
            # first: that maps the smallest value to 0 and log(0) is -inf.
            p = count_mean_p(np.log(sample))
            print('lognorm test for ' + label + ' : ' + str(p))
    print()

In [None]:
from sklearn.kernel_approximation import RBFSampler, Nystroem
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import cross_val_score

In [None]:
def pretty_print(vals_with_counts):
    """Print ``value :  count; `` pairs on one line, followed by a newline.

    Args:
        vals_with_counts: A pair ``(values, counts)`` of equally long
            sequences, e.g. the result of
            ``np.unique(..., return_counts=True)``.
    """
    vals = vals_with_counts[0]
    counts = vals_with_counts[1]
    # Walk both sequences in lockstep instead of indexing by position.
    for val, count in zip(vals, counts):
        print(val, ": ", count, end = '; ')
    print()

In [None]:
# Hold out 20% of the rows, stratified by the target so both parts keep the
# same class proportions. The split is seeded so the metrics below do not
# change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, stratify=y, test_size=0.2, random_state=seed
)
# NOTE(review): SGD-trained models are sensitive to feature scale and the
# features are passed in unscaled here — consider standardising them.
sgd_clf = SGDClassifier(random_state = seed)
sgd_clf.fit(X_train, y_train)
sgd_pred = sgd_clf.predict(X_test)

# Per-class scores averaged with weights proportional to class support, so
# class imbalance is reflected in the reported numbers.
precision = metrics.precision_score(y_test, sgd_pred, average = 'weighted')
recall = metrics.recall_score(y_test, sgd_pred, average = 'weighted')
f1 = metrics.f1_score(y_test, sgd_pred, average = 'weighted')
fbeta = metrics.fbeta_score(y_test, sgd_pred, average = 'weighted', beta = 2)

In [None]:
# Class counts among the predictions, then among the true test labels.
pretty_print(np.unique(sgd_pred, return_counts = True))
pretty_print(np.unique(y_test, return_counts = True))

# precision: share of predicted positives that are correct
# recall:    share of actual positives that were found
# f1:        harmonic mean of precision and recall
# fbeta:     like f1, but recall weighs beta times as much as precision
for metric_label, metric_value in (
    ('precision: ', precision),
    ('recall: ', recall),
    ('f1: ', f1),
    ('fbeta: ', fbeta),
):
    print(metric_label, metric_value)

# Mean weighted F1 over 5 cross-validation folds on the full data.
cv_f1_scores = cross_val_score(sgd_clf, X, y, cv = 5, scoring = 'f1_weighted')
print('cross_val_f1: ', cv_f1_scores.mean())

In [None]:
# Two kernel feature maps of the same predictors, each seeded independently.
# NOTE(review): both transformers are fitted on all of X, i.e. before the
# train/test split performed in the following cells.
nystr_feature = Nystroem(random_state = seed)
X_features_nystr = nystr_feature.fit_transform(X)

rbf_feature = RBFSampler(gamma = 1, random_state = seed)
X_features_rbf = rbf_feature.fit_transform(X)

In [None]:
# Same experiment on the Nystroem-transformed features: stratified 80/20
# split, seeded so the metrics below do not change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_features_nystr, y, shuffle=True, stratify=y, test_size=0.2,
    random_state=seed
)
sgd_clf = SGDClassifier(random_state = seed)

sgd_clf.fit(X_train, y_train)
sgd_pred = sgd_clf.predict(X_test)

# Per-class scores averaged with weights proportional to class support.
precision = metrics.precision_score(y_test, sgd_pred, average = 'weighted')
recall = metrics.recall_score(y_test, sgd_pred, average = 'weighted')
f1 = metrics.f1_score(y_test, sgd_pred, average = 'weighted')
fbeta = metrics.fbeta_score(y_test, sgd_pred, average = 'weighted', beta = 2)

In [None]:
# Class counts among the predictions, then among the true test labels.
pretty_print(np.unique(sgd_pred, return_counts = True))
pretty_print(np.unique(y_test, return_counts = True))

# Weighted scores computed in the training cell, printed one per line.
for metric_label, metric_value in (
    ('precision: ', precision),
    ('recall: ', recall),
    ('f1: ', f1),
    ('fbeta: ', fbeta),
):
    print(metric_label, metric_value)

# Mean weighted F1 over 5 cross-validation folds on the Nystroem features.
cv_f1_scores = cross_val_score(sgd_clf, X_features_nystr, y, cv = 5, scoring = 'f1_weighted')
print('cross_val_f1: ', cv_f1_scores.mean())

In [None]:
# Same experiment on the RBFSampler-transformed features: stratified 80/20
# split, seeded so the metrics below do not change from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_features_rbf, y, shuffle=True, stratify=y, test_size=0.2,
    random_state=seed
)
sgd_clf = SGDClassifier(random_state = seed)
sgd_clf.fit(X_train, y_train)
sgd_pred = sgd_clf.predict(X_test)

# Per-class scores averaged with weights proportional to class support.
precision = metrics.precision_score(y_test, sgd_pred, average = 'weighted')
recall = metrics.recall_score(y_test, sgd_pred, average = 'weighted')
f1 = metrics.f1_score(y_test, sgd_pred, average = 'weighted')
fbeta = metrics.fbeta_score(y_test, sgd_pred, average = 'weighted', beta = 2)

In [None]:
# Class counts among the predictions, then among the true test labels.
pretty_print(np.unique(sgd_pred, return_counts = True))
pretty_print(np.unique(y_test, return_counts = True))

# Weighted scores computed in the training cell, printed one per line.
for metric_label, metric_value in (
    ('precision: ', precision),
    ('recall: ', recall),
    ('f1: ', f1),
    ('fbeta: ', fbeta),
):
    print(metric_label, metric_value)

# Mean weighted F1 over 5 cross-validation folds on the RBF features.
cv_f1_scores = cross_val_score(sgd_clf, X_features_rbf, y, cv = 5, scoring = 'f1_weighted')
print('cross_val_f1: ', cv_f1_scores.mean())