In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, silhouette_score
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans,DBSCAN, MiniBatchKMeans
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.manifold import TSNE
import numpy as np
from yellowbrick.cluster import KElbowVisualizer
from yellowbrick.text import TSNEVisualizer
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import random
# Shared seed handed to the `random_state` arguments further down (for example
# the train/test split) so those steps repeat between runs.
rs = 42 # random state
# NOTE: the global NumPy seed is left disabled, so helpers that draw from
# `np.random` directly (e.g. `random_prediction`) vary from run to run.
# np.random.seed(42)

In [2]:
def random_prediction(X_test, y_test):
    """Score a baseline that guesses a label uniformly at random for every sample.

    Parameters
    ----------
    X_test : any
        Test samples. Kept for signature compatibility; the values are never
        read and one guess is produced per entry of `y_test`.
    y_test : sized iterable of hashable labels
        True labels. The candidate labels are the distinct values found here.

    Returns
    -------
    float
        Accuracy of the random guesses, as computed by ``accuracy_score``.
    """
    # Sort the distinct labels: the iteration order of a set of strings changes
    # between interpreter runs, which would make even a seeded run irreproducible.
    labels = sorted(set(y_test))
    # One vectorised draw instead of one `np.random.choice` call per sample.
    prediction = np.random.choice(labels, size=len(y_test))
    return accuracy_score(y_test, prediction)

def get_random_prediction(X_test, y_test, trials=1000):
    """Average the random-guess baseline accuracy over several trials.

    Parameters
    ----------
    X_test, y_test
        Forwarded unchanged to ``random_prediction`` on every trial.
    trials : int, default 1000
        Number of independent random baselines to average.

    Returns
    -------
    float
        Mean of the per-trial accuracies.

    Raises
    ------
    ValueError
        If `trials` is smaller than 1 (the mean would be undefined).
    """
    if trials < 1:
        raise ValueError(f"trials must be a positive integer, got {trials!r}")
    # Use the built-in `sum` rather than shadowing it with a local accumulator.
    total = sum(random_prediction(X_test, y_test) for _ in range(trials))
    return total / trials

# Colour per predicted cluster id; the plotting helpers index this list with the cluster id.
COLORS = [
    'tab:blue',
    'tab:orange',
    'tab:green',
    'tab:red',
    'tab:purple',
    'tab:brown',
    'tab:pink',
    'tab:olive',
    'tab:cyan',
    'tab:gray',
]
# Marker per integer-encoded true class; the last entry ('X') is also used for centroids.
MARKERS = [
    'o', 'v', 's', '<', '>',
    '8', '^', 'p', '*', 'h',
    'H', 'D', 'd', 'P', 'X',
]

def plot2d(X, y_pred, y_true, mode=None, centroids=None):
    """Scatter-plot samples in 2-D, coloured by predicted cluster and shaped by true class.

    Parameters
    ----------
    X : array-like or sparse matrix
        Samples to draw. Must already have exactly two columns unless `mode`
        is given.
    y_pred : sequence of int
        Cluster id per sample; selects the colour from `COLORS`.
    y_true : sequence of int
        Integer-encoded class per sample; selects the marker from `MARKERS`.
    mode : callable, optional
        Reducer factory, called as ``mode(n_components=2, random_state=42)``.
        `X` is projected with the resulting object's ``fit_transform``.
    centroids : array-like, optional
        Cluster centres in the same space as `X`. When `mode` is given they
        are projected with the fitted reducer's ``transform``, so the reducer
        must provide that method.

    Notes
    -----
    Colours and markers are reused cyclically when there are more clusters or
    classes than entries in `COLORS` / `MARKERS`.
    """
    transformer = None
    X_r = X

    if mode is not None:
        transformer = mode(n_components=2, random_state=42)
        X_r = transformer.fit_transform(X)

    # Densify so the boolean-mask indexing below also works for sparse input.
    X_r = X_r.toarray() if hasattr(X_r, 'toarray') else np.asarray(X_r)
    assert X_r.ndim == 2 and X_r.shape[1] == 2, 'plot2d only works with 2-dimensional data'

    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    clusters = np.unique(y_pred)
    classes = np.unique(y_true)

    plt.grid()
    # One plot call per (cluster, class) group instead of one per point.
    for cluster in clusters:
        for cls in classes:
            mask = (y_pred == cluster) & (y_true == cls)
            if mask.any():
                plt.plot(X_r[mask, 0], X_r[mask, 1],
                         linestyle='none',
                         c=COLORS[cluster % len(COLORS)],
                         marker=MARKERS[cls % len(MARKERS)])

    # Empty proxy artists give the legend one entry per colour and one per
    # marker; without any labelled artist `plt.legend()` would render nothing.
    for cluster in clusters:
        plt.plot([], [], linestyle='none', marker='o',
                 c=COLORS[cluster % len(COLORS)],
                 label=f'cluster {cluster}')
    for cls in classes:
        plt.plot([], [], linestyle='none', c='black',
                 marker=MARKERS[cls % len(MARKERS)],
                 label=f'class {cls}')

    if centroids is not None:
        C_r = centroids
        if transformer is not None:
            C_r = transformer.transform(centroids)
        C_r = np.asarray(C_r)
        if C_r.size:
            plt.plot(C_r[:, 0], C_r[:, 1],
                     linestyle='none',
                     marker=MARKERS[-1],
                     markersize=10,
                     c='red',
                     label='centroid')
    plt.legend()
    plt.show()

# def plot_clusters(X, y_pred, y_true, transformer='pca', centroids):
    
#     pca = PCA(n_components=2, random_state=42)
#     tsne = TSNE(n_components=2, random_state=42)
    
#     tr = pca
#     if mode == 'tsne':
#         tr = tsne
    
#     reduced_features = pca.fit_transform(features.toarray())
#     reduced_centroids = tr.transform(centroids)
#     plt.scatter(reduced_features[:,0], reduced_features[:,1], c=cls.predict(features))
#     plt.scatter(reduced_cluster_centers[:, 0], reduced_cluster_centers[:,1], marker='x', s=150, c='b')


def plot3d(X, y_pred, y_true, mode=None, centroids=None):
    """Scatter-plot samples in 3-D, coloured by predicted cluster and shaped by true class.

    Parameters
    ----------
    X : array-like or sparse matrix
        Samples to draw. Must already have exactly three columns unless `mode`
        is given.
    y_pred : sequence of int
        Cluster id per sample; selects the colour from `COLORS`.
    y_true : sequence of int
        Integer-encoded class per sample; selects the marker from `MARKERS`.
    mode : callable, optional
        Reducer factory, called as ``mode(n_components=3, random_state=42)``
        (same seed as `plot2d`). `X` is projected with ``fit_transform``.
    centroids : array-like, optional
        Cluster centres in the same space as `X`. When `mode` is given they
        are projected with the fitted reducer's ``transform``, so the reducer
        must provide that method.

    Notes
    -----
    Colours and markers are reused cyclically when there are more clusters or
    classes than entries in `COLORS` / `MARKERS`.
    """
    transformer = None
    X_r = X
    if mode is not None:
        transformer = mode(n_components=3, random_state=42)
        X_r = transformer.fit_transform(X)

    # Densify so the boolean-mask indexing below also works for sparse input.
    X_r = X_r.toarray() if hasattr(X_r, 'toarray') else np.asarray(X_r)
    assert X_r.ndim == 2 and X_r.shape[1] == 3, 'plot3d only works with 3-dimensional data'

    y_pred = np.asarray(y_pred)
    y_true = np.asarray(y_true)
    classes = np.unique(y_true)

    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')
    ax.elev = 30
    ax.azim = 120

    # One plot call per (cluster, class) group instead of one per point.
    for cluster in np.unique(y_pred):
        for cls in classes:
            mask = (y_pred == cluster) & (y_true == cls)
            if mask.any():
                ax.plot(X_r[mask, 0], X_r[mask, 1], X_r[mask, 2],
                        linestyle='none',
                        c=COLORS[cluster % len(COLORS)],
                        marker=MARKERS[cls % len(MARKERS)])

    if centroids is not None:
        C_r = centroids
        if transformer is not None:
            # Reuse the projection fitted on X. Refitting on the centroids
            # would place them in a different space from the samples.
            C_r = transformer.transform(centroids)
        C_r = np.asarray(C_r)
        if C_r.size:
            ax.plot(C_r[:, 0], C_r[:, 1], C_r[:, 2],
                    linestyle='none',
                    marker=MARKERS[-1],
                    markersize=10,
                    c='red')
    plt.show()

In [22]:
# Load the pre-processed dataset; later cells read its `content` and `genre` columns.
path = r'./data/processed_full.csv'
data = pd.read_csv(path)
# Optional experiment: drop the 'rock' genre before modelling.
# data = data[data['genre'] != 'rock']


In [24]:
# Samples per genre, followed by the total number of rows.
# print(data.genre.unique())
print(Counter(data['genre']))
print(data.shape[0])

Counter({'rock': 9430, 'electronic': 1993, 'rap': 1488, 'pop/r&b': 1087, 'folk/country': 574, 'experimental': 562, 'metal': 357, 'jazz': 341, 'global': 183})
16015


In [20]:
# Cast every text and every genre to a NumPy string.
# Presumably this guards against non-string cells (e.g. NaN) — confirm against the CSV.
X = data['content'].apply(np.str_)
y = data['genre'].apply(np.str_)

In [5]:
# X = data.content.head(1500).apply(lambda x: np.str_(x))
# y = data.genre.head(1500).apply(lambda x: np.str_(x))

In [6]:
# Sanity check: features and targets must have the same length.
print(X.shape, y.shape, sep='\n')

(1500,)
(1500,)


In [7]:
# TF-IDF text vectoriser. Per scikit-learn, an integer `min_df` is the minimum
# number of documents a term must appear in to enter the vocabulary.
tfidf = TfidfVectorizer(min_df=30)
# Encoder that turns the genre strings into integer ids; the plotting helpers
# index `MARKERS` with integer `y_true` values.
le = LabelEncoder()

In [8]:
# Shuffle and hold out 20% of the samples for testing; seeded with `rs` so the split repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=rs,
)

In [9]:
# Learn the vocabulary and IDF weights on the training texts only, then apply
# the same fitted vectoriser to the test texts.
# NOTE: this cell rebinds X_train / X_test from raw text to TF-IDF matrices, so
# re-run the split cell before running it a second time.
X_train  = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

# Fit the label encoder on the training genres and reuse that mapping for the
# test genres. Refitting on y_test could assign different integers to the same
# genre whenever the two splits do not contain the same set of classes.
y_train_labels = le.fit_transform(y_train)
y_test_labels = le.transform(y_test)

# Baseline prediction

In [10]:
# print(get_random_prediction(X_test, y_test, 2) * 100)

# Random Forest

In [10]:
# model = RandomForestClassifier()
# model.fit(X_train, y_train)
# print(model.score(X_test, y_test) * 100)

62.33333333333333


# k Means

In [12]:
# nr_clusters = len(set(y_train))
# print(nr_clusters)

In [13]:
# km = MiniBatchKMeans(random_state=42)

# sil = KElbowVisualizer(km, k=(4,10), metric='silhouette', timings=False)

# sil.fit(X_train)        # Fit the data to the visualizer
# sil.show()        # Finalize and render the figure

In [14]:
# distortion = KElbowVisualizer(km, k=(4,15), timings=False)

# distortion.fit(X_train)        # Fit the data to the visualizer
# distortion.show()        # Finalize and render the figure

In [11]:
# Pin the seed so the centroid initialisation, and therefore the cluster ids,
# repeat between runs like the seeded train/test split.
kmeans = KMeans(n_clusters=8, random_state=rs)
kmeans.fit(X_train)

KMeans()

In [12]:
# Cluster assignments next to the encoded training genres. Cluster ids are
# arbitrary, so the two arrays are not expected to match element-wise.
print(kmeans.labels_, y_train_labels, sep='\n')

[0 0 7 ... 3 7 5]
[1 6 6 ... 0 0 5]


In [None]:
# Silhouette of the fitted clustering, estimated on a seeded random sample of
# 300 training points. The score must be computed on the matrix the labels
# belong to (X_train), not on the raw DataFrame.
print(silhouette_score(X_train, kmeans.labels_, metric="euclidean", sample_size=300, random_state=rs))