# Hierarchische Clusteranalyse

In [2]:
from sklearn.datasets import make_blobs
from sklearn.cluster import AgglomerativeClustering, KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import squareform
from sklearn.preprocessing import normalize
import pandas as pd
from matplotlib import pyplot as plt
import numpy as np

In [3]:
# Echo the installed scikit-learn version so the results can be tied to a library release.
import sklearn
sklearn.__version__

'0.22.1'

Daten importieren

In [4]:
# Load the song corpus; the path is resolved relative to the notebook's working directory.
# NOTE(review): the saved output of this cell is a FileNotFoundError for this path —
# make sure the CSV exists one directory above the notebook before running.
df = pd.read_csv("../songs_longtexts.csv")

FileNotFoundError: [Errno 2] File ../songs_longtexts.csv does not exist: '../songs_longtexts.csv'

spezielle Daten auswählen, in dem Fall zwei bestimmte Künstler aus der Spalte "artist"

In [None]:
# Selection by artist. Every selection cell rebinds data1/data2, so on a
# top-to-bottom run the selection cell executed last is the one that counts.
data1, data2 = (df[df['artist'] == artist] for artist in ('Eminem', 'ABBA'))

Spezielle Daten auswählen, in diesem Fall zwei Subgenres aus der Spalte "genre1"

In [None]:
# Selection by subgenre (column 'genre1'). Rebinds data1/data2, replacing
# whatever an earlier selection cell stored there.
data1, data2 = (
    df[df['genre1'] == subgenre]
    for subgenre in ('alternative rock', 'dance pop')
)

Spezielle Daten auswählen, in diesem Fall zwei Genres aus der Spalte "Genre1"

In [None]:
# Selection by genre (column 'Genre1'). Rebinds data1/data2, replacing
# whatever an earlier selection cell stored there.
# NOTE(review): the column is spelled 'Genre1' here but 'genre1' in the subgenre
# selection — presumably two distinct columns; confirm against the CSV header.
data1, data2 = (df[df['Genre1'] == genre] for genre in ('Hip Hop', 'Latin'))

Daten zusammenführen

In [None]:
# Stack the two selections on top of each other (row-wise concatenation).
data = pd.concat((data1, data2), axis="index")

Daten vorbereiten, in dem Fall für die Spalte "POS" 

In [None]:
# Turn the "POS" column (cast to str) into TF-IDF features.
tf_idf_vectorizor = TfidfVectorizer()
tf_idf = tf_idf_vectorizor.fit_transform(data["POS"].values.astype(str))

# Row-wise L2 normalisation (the defaults of sklearn's normalize, spelled out).
# NOTE(review): TfidfVectorizer presumably already L2-normalises rows by default,
# which would make this step a no-op — confirm before removing it.
tf_idf_norm = normalize(tf_idf, norm="l2", axis=1)

# Dense copy of the feature matrix; the clustering cells below consume this array.
tf_idf_array = tf_idf_norm.toarray()

In [None]:
# Display the combined frame to sanity-check the selection.
data

# HC - Agglomeratives Clusterverfahren

In [None]:
def linkage_matrix(n_samples, children, distances):
    """
    Build a SciPy-style linkage matrix for the dendrogram function in scipy.

    Parameters
    ----------
    n_samples : int
        Number of original samples (leaves). A child index below this value
        is counted as a single leaf; a larger index refers to the cluster
        created by merge number ``index - n_samples``.
    children : array-like of shape (n_merges, 2)
        The two node indices joined at each merge. Nested lists and ndarrays
        are both accepted.
    distances : array-like of shape (n_merges,)
        Distance at which each merge happened.

    Returns
    -------
    numpy.ndarray of float, shape (n_merges, 4)
        One row per merge: ``[child_a, child_b, distance, sample_count]``.
    """
    # Coerce to arrays so plain nested lists work too (``.shape`` is used below).
    children = np.asarray(children)
    distances = np.asarray(distances)

    # Number of original samples contained in the cluster formed by each merge.
    counts = np.zeros(children.shape[0])
    for i, merge in enumerate(children):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                # Internal node: reuse the size of the earlier merge it refers to.
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    return np.column_stack([children, distances, counts]).astype(float)

Zuerst PCA (Reduktion auf 2 Komponenten), danach HC mit vollständigem, nicht abgeschnittenem Dendrogramm

In [None]:
# Project the TF-IDF matrix onto two principal components.
sklearn_pca = PCA(n_components=2)
Y_sklearn = sklearn_pca.fit_transform(tf_idf_array)

# Agglomerative clustering on the projection. Per the scikit-learn docs,
# n_clusters=None with distance_threshold=0 computes the full merge tree,
# which is what makes model.distances_ available.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
model.fit(Y_sklearn)
link_matrix = linkage_matrix(
    Y_sklearn.shape[0],
    model.children_,
    model.distances_,
)

# Full dendrogram, with the artist of each song as leaf label.
plt.figure(figsize=(12, 8))
plt.title('HC')
dendrogram(link_matrix, labels=data.artist.values, leaf_font_size=10);
plt.savefig('../images/HC/hc_alternative_rock_dance_pop.png', bbox_inches="tight")

# Dendrogramme erstellen 

https://joernhees.de/blog/2015/08/26/scipy-hierarchical-clustering-and-dendrogram-tutorial/#Selecting-a-Distance-Cut-Off-aka-Determining-the-Number-of-Clusters

Abgeschnittene Dendrogramme

In [None]:
def fancy_dendrogram(*args, **kwargs):
    """
    Draw a SciPy dendrogram and annotate its merge heights.

    All positional and keyword arguments are forwarded to
    ``scipy.cluster.hierarchy.dendrogram``, except for two extra keywords
    that are consumed here:

    max_d : float, optional
        Cut-off distance. When truthy, a horizontal black line is drawn at
        this height and, unless ``color_threshold`` was passed explicitly,
        the value is also used as the dendrogram's ``color_threshold``.
    annotate_above : float, default 0
        Only merges whose height is strictly greater than this value get a
        marker and a text label.

    Nothing is decorated when ``no_plot=True`` is passed.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    max_d = kwargs.pop('max_d', None)
    if max_d and 'color_threshold' not in kwargs:
        kwargs['color_threshold'] = max_d
    annotate_above = kwargs.pop('annotate_above', 0)

    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        # Decorate the axes the dendrogram was drawn on: an explicitly passed
        # ``ax`` if there is one, otherwise the current pyplot axes.
        ax = kwargs.get('ax')
        if ax is None:
            ax = plt.gca()

        ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
        ax.set_xlabel('sample index or (cluster size)')
        ax.set_ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            # Each link is drawn as a "U"; its horizontal bar spans i[1]..i[2]
            # at height d[1], so the marker goes at the middle of that bar.
            x = 0.5 * sum(i[1:3])
            y = d[1]
            if y > annotate_above:
                # Marker-only format string: the colour comes solely from ``c``
                # (a colour in the format string would be redundant with it).
                ax.plot(x, y, 'o', c=c)
                ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                            textcoords='offset points',
                            va='top', ha='center')
        if max_d:
            ax.axhline(y=max_d, c='k')
    return ddata

Angaben der Distanzen innerhalb des Dendrogramms

In [None]:
# Cut-off distance forwarded to fancy_dendrogram (horizontal line + colour threshold).
# NOTE(review): no visible cell defines max_d; None disables the cut-off line.
# Set it to a distance value to enable the cut-off.
max_d = None

# Per the scikit-learn docs, n_clusters=None with distance_threshold=0 computes
# the full merge tree, which is what makes model.distances_ available.
model = AgglomerativeClustering(n_clusters=None, distance_threshold=0).fit(tf_idf_array)
link_matrix = linkage_matrix(tf_idf_array.shape[0], model.children_, model.distances_)
fancy_dendrogram(
    link_matrix,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    above_threshold_color='y',
    show_contracted=True,
    annotate_above=10,  # useful in small plots so annotations don't overlap
    max_d=max_d,
)
plt.savefig('../images/HC/hc_latin_hiphop.png', bbox_inches = "tight")
plt.show()

Markierungen der Entfernungen innerhalb des Dendrogramms

In [None]:
def augmented_dendrogram(*args, **kwargs):
    """
    Draw a SciPy dendrogram and label every merge with its distance.

    All positional and keyword arguments are forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``. Unless ``no_plot=True`` is
    passed, a title and axis labels are set and each link gets a marker and
    a text label showing its height.

    Returns
    -------
    dict
        The dictionary returned by ``dendrogram``.
    """
    ddata = dendrogram(*args, **kwargs)

    if not kwargs.get('no_plot', False):
        # Decorate the axes the dendrogram was drawn on: an explicitly passed
        # ``ax`` if there is one, otherwise the current pyplot axes.
        ax = kwargs.get('ax')
        if ax is None:
            ax = plt.gca()

        ax.set_title('Hierarchical Clustering Dendrogram (truncated)')
        ax.set_xlabel('sample index or (cluster size)')
        ax.set_ylabel('distance')
        for i, d, c in zip(ddata['icoord'], ddata['dcoord'], ddata['color_list']):
            # Each link is drawn as a "U"; its horizontal bar spans i[1]..i[2]
            # at height d[1], so the marker goes at the middle of that bar.
            x = 0.5 * sum(i[1:3])
            y = d[1]
            # Marker-only format string: the colour comes solely from ``c``
            # (a colour in the format string would be redundant with it).
            ax.plot(x, y, 'o', c=c)
            ax.annotate("%.3g" % y, (x, y), xytext=(0, -5),
                        textcoords='offset points',
                        va='top', ha='center')

    return ddata

In [None]:
# Ward linkage computed directly by SciPy on the dense TF-IDF matrix.
# (No scikit-learn model is fitted here: this cell only uses the SciPy linkage.)
Z = linkage(tf_idf_array, 'ward')
augmented_dendrogram(
    Z,
    truncate_mode='lastp',
    p=12,
    leaf_rotation=90.,
    leaf_font_size=12.,
    above_threshold_color='y',
    show_contracted=True)
# NOTE(review): this is the same output path that the fancy_dendrogram cell
# writes to, so running both overwrites the earlier image — confirm intended.
plt.savefig('../images/HC/hc_latin_hiphop.png', bbox_inches = "tight")
plt.show()

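Linkage-Matrix generieren -> Ward ist ein Linkage-Kriterium, das festlegt, wie die Distanz zwischen neu gebildeten Clustern berechnet wird: Es werden jeweils die beiden Cluster verschmolzen, deren Zusammenlegung die Varianz innerhalb der Cluster am wenigsten erhöht.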

In [None]:
# SciPy linkage matrix (Ward) for the dense TF-IDF matrix; preview the first 20 merges.
Z = linkage(tf_idf_array, method='ward')
Z[:20]

Linkage Matrix generieren 

In [None]:
# Same preview via scikit-learn: fit the merge tree on the dense TF-IDF matrix,
# convert it with linkage_matrix and show the first 20 merges.
model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)
model.fit(tf_idf_array)
link_matrix = linkage_matrix(
    tf_idf_array.shape[0],
    model.children_,
    model.distances_,
)
link_matrix[:20]