# Разделимость данных

Примените алгоритмы уменьшения размерности к данным из предыдущей лабораторной работы и объясните полученные результаты.

1. С помощью любого из рассмотренных на лекциях алгоритмов снижения размерности спроецируйте тренировочные данные в двумерное (2D) пространство. Визуализируйте результаты снижения размерности. Цветом обозначьте категорию объекта.
1. Примените алгоритм кластеризации к спроецированным данным. Рассчитайте средние внутрикластерное и межкластерное расстояния получившейся проекции.
1. Опишите результаты 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.manifold import TSNE
# Shared seed, passed below as random_state to the row sampling, t-SNE and KMeans.
seed = 0

In [None]:
# Load the Spotify track table; the path is relative to the working directory.
data = pd.read_csv('data/SpotifyFeatures.csv')

In [None]:
# Print the column summary of the loaded frame.
data.info()
# Preview five rows. The draw is seeded with the notebook-wide seed so the
# preview shows the same rows on every run, like the 30000-row working sample.
data.sample(5, random_state=seed)

In [None]:
def score(popularity: int) -> int:
    """Bucket a popularity value into one of four integer category codes.

    Returns:
        3 if ``popularity < 15``, 2 if ``popularity < 35``,
        1 if ``popularity < 65``, otherwise 0.

    NOTE(review): higher codes mean *lower* popularity here (code 3 is the
    least popular band). Earlier inline labels called code 3 "Very popular"
    and code 0 "Almost unknown", which contradicts the thresholds -- confirm
    which of the two was intended before relying on the code's meaning.
    """
    # (exclusive upper bound, category code), checked in ascending order.
    bands = ((15, 3), (35, 2), (65, 1))
    for upper_bound, code in bands:
        if popularity < upper_bound:
            return code
    return 0

# Compute the category code of every row with score() and insert it, in place,
# as the column at position 5.
data.insert(
    loc=5,
    column="categorized popularity",
    value=data["popularity"].apply(score),
)

In [None]:
# Continue with a seeded random subset of 30000 rows, then print its summary.
short_data = data.sample(n=30000, random_state=seed)
short_data.info()

In [None]:
# Feature matrix: the nine audio-feature columns that get projected to 2D.
X = short_data.loc[
    :,
    [
        'acousticness',
        'instrumentalness',
        'liveness',
        'danceability',
        'energy',
        'speechiness',
        'tempo',
        'valence',
        'duration_ms',
    ],
]
# Category code of each row; used below to colour the scatter plot.
y = short_data.loc[:, 'categorized popularity']

In [None]:
from sklearn.preprocessing import StandardScaler

# t-SNE compares rows by Euclidean distance, so a column with a much larger
# numeric range dominates the embedding. The feature set mixes `duration_ms`
# and `tempo` with ratio-like columns such as `danceability` (presumably in
# [0, 1] -- confirm with X.describe()), so standardise every column to zero
# mean / unit variance before projecting. X itself is left untouched.
X_scaled = StandardScaler().fit_transform(X)

# Project to two components; seeded for a repeatable embedding.
tsne = TSNE(n_components=2, random_state=seed, n_jobs=6)
X_reduced = tsne.fit_transform(X_scaled)

In [None]:
print('Projecting %d-dimensional data to 2D' % X.shape[1])

# Scatter plot of the 2D projection, coloured by the category code in `y`.
plt.figure(figsize=(20, 15))
plt.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    c=y,
    edgecolor='k',
    alpha=0.7,
    s=50,
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9; the pyplot function takes the same
    # (name, lut) arguments. lut=4 gives one discrete colour per category.
    cmap=plt.get_cmap('nipy_spectral', 4),
)
plt.colorbar()
plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [None]:
# Cluster the 2D projection into four groups.
# The former `algorithm='full'` argument is dropped: scikit-learn renamed it to
# 'lloyd' in 1.1 (where it also became the default) and removed 'full' in 1.3,
# so omitting it selects the same algorithm on current releases and still runs
# on older ones. `n_init=10` pins the number of restarts that was the default
# while 'full' was accepted (the default changed to 'auto' in 1.4).
y_pred4 = KMeans(n_clusters=4, random_state=seed, n_init=10).fit_predict(X_reduced)

In [None]:
# Scatter plot of the 2D projection, coloured by the KMeans cluster label.
plt.figure(figsize=(20, 15))
plt.scatter(
    X_reduced[:, 0],
    X_reduced[:, 1],
    c=y_pred4,
    # plt.get_cmap replaces plt.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9; same (name, lut) arguments.
    cmap=plt.get_cmap('nipy_spectral', 4),
    edgecolor='k',
)
plt.colorbar()
plt.show()

In [None]:
# Group the projected points by KMeans label (0..3).
#   y_zipped[k]   -- list of (point, label) tuples for cluster k
#   y_filtered[k] -- the same points without the label
y_zipped = [
    list(zip(X_reduced[y_pred4 == label], y_pred4[y_pred4 == label]))
    for label in range(4)
]
y_filtered = [[point for point, _ in members] for members in y_zipped]

# Spot check: print up to the first five (point, label) pairs of each cluster.
for members in y_zipped:
    for pair in members[:5]:
        print(pair)

In [None]:
# Show a saved screenshot in a 15x15 figure. The file name suggests a lecture
# slide -- presumably the distance formulas implemented in the next cell;
# confirm against the image.
plt.figure(figsize = (15, 15))
plt.imshow(plt.imread('data/lecture_screen.png'))
plt.show()

In [None]:
from math import hypot

def dist(p1, p2) -> float:
    """Return the Euclidean distance between two points in the plane.

    Only the coordinates at index 0 and 1 of each point are used.
    """
    delta_x = p2[0] - p1[0]
    delta_y = p2[1] - p1[1]
    return hypot(delta_x, delta_y)

def inner_cluster(filtered) -> float:
    """Return the mean distance between points that share a cluster.

    Args:
        filtered: Sequence of clusters; each cluster is a sequence of points
            accepted by ``dist``.

    Returns:
        The sum of ``dist`` over every unordered pair of distinct points
        lying in the same cluster, divided by the number of such pairs.
        ``nan`` when there is no such pair (no clusters, or every cluster
        holds fewer than two points), since the mean is undefined then.
    """
    dist_sum = 0.0
    cnt = 0
    for points in filtered:
        # Points of this cluster already visited; pairing each new point only
        # with these counts every unordered pair exactly once.
        seen = []
        for point in points:
            for earlier in seen:
                dist_sum += dist(earlier, point)
            cnt += len(seen)
            seen.append(point)
    if cnt == 0:
        # No pairs at all: avoid ZeroDivisionError and signal "undefined".
        return float("nan")
    return dist_sum / cnt

def inter_cluster(filtered) -> float:
    """Return the mean distance between points that lie in different clusters.

    Args:
        filtered: Sequence of clusters (must support ``len`` and slicing);
            each cluster is a sequence of points accepted by ``dist``.

    Returns:
        The sum of ``dist`` over every pair made of one point from each of
        two different clusters, divided by the number of such pairs.
        ``nan`` when there is no such pair (fewer than two non-empty
        clusters), since the mean is undefined then.
    """
    dist_sum = 0.0
    cnt = 0
    for position, later_cluster in enumerate(filtered):
        # Pair each cluster only with the clusters before it, so every
        # unordered pair of clusters is visited exactly once.
        for earlier_cluster in filtered[:position]:
            for first in earlier_cluster:
                for second in later_cluster:
                    dist_sum += dist(first, second)
            cnt += len(earlier_cluster) * len(later_cluster)
    if cnt == 0:
        # No cross-cluster pairs: avoid ZeroDivisionError, signal "undefined".
        return float("nan")
    return dist_sum / cnt

In [None]:
# Mean pairwise distance inside the four KMeans clusters. inner_cluster visits
# every same-cluster pair in a Python loop, so the work grows quadratically
# with the cluster sizes.
inner = inner_cluster(y_filtered)

In [None]:
# Mean pairwise distance between points of different KMeans clusters.
# inter_cluster visits every cross-cluster pair in a Python loop, so the work
# grows with the product of the cluster sizes.
inter = inter_cluster(y_filtered)

In [None]:
# Report both means and their ratio. A smaller ratio means points are, on
# average, closer to members of their own cluster than to other clusters.
print("Mean incluster distance for 4 clusters: ", inner)
print("Mean intercluster distance for 4 clusters:", inter)
print("Incluster / intercluster for 4 clusters: ", inner / inter)

In [None]:
# Silhouette coefficient of the KMeans labelling on the 2D projection.
# Reference: https://scikit-learn.org/stable/modules/clustering.html#silhouette-coefficient
print("Silhouette score for 4 clusters: ", silhouette_score(X_reduced, y_pred4))