# Clustering

In [237]:
import numpy as np
from math import sqrt
import matplotlib.pyplot as plt
import scipy
from scipy.cluster.hierarchy import dendrogram, linkage

In [203]:
%matplotlib notebook

In [204]:
# Point labels, in the same order as the rows of X below.
names = list('abcdefgh')

# Eight 2-D sample points, one [x, y] row per label.
X = [
    [0.6, 1.9],  # a
    [1.8, 1.6],  # b
    [2.7, 2.0],  # c
    [3.0, 2.1],  # d
    [3.0, 2.6],  # e
    [3.1, 4.5],  # f
    [3.8, 0.6],  # g
    [4.2, 2.7],  # h
]

In [192]:
def cosin(v1, v2):
    """Return the cosine similarity of the vectors ``v1`` and ``v2``.

    The dot product is taken over ``zip(v1, v2)``, so it only covers the
    common prefix when the lengths differ. A vector whose norm is zero
    makes the final division fail with ``ZeroDivisionError``.
    """
    dot_product = sum(a * b for a, b in zip(v1, v2))
    norm_v1 = sqrt(sum(a * a for a in v1))
    norm_v2 = sqrt(sum(b * b for b in v2))
    return dot_product / (norm_v1 * norm_v2)

In [236]:
# Hierarchical clustering of X with single linkage on cosine distance.
# NOTE(review): the result is stored in `Complete` although this cell uses
# method='single'; the notebook-level name is kept unchanged.
Complete = linkage(X, metric='cosine', method='single')

# Replace the merge heights (third column of the linkage matrix) with
# log(height) + 20 before drawing -- presumably to spread out the small
# cosine distances; TODO confirm intent. A zero height would become -inf.
Complete[:, 2] = 20 + np.log(Complete[:, 2])

plt.title('Single linkage')
plt.ylabel('distance')
plt.xlabel('names')
dendrogram(Complete, orientation='top', color_threshold=10, labels=names)
plt.show()

<IPython.core.display.Javascript object>

In [218]:
# Hierarchical clustering of X with complete linkage on cosine distance.
Complete = linkage(X, metric='cosine', method='complete')

# Replace the merge heights (third column of the linkage matrix) with
# log(height) + 20 before drawing -- presumably to spread out the small
# cosine distances; TODO confirm intent. A zero height would become -inf.
Complete[:, 2] = 20 + np.log(Complete[:, 2])

plt.title('Complete linkage')
plt.ylabel('distance')
plt.xlabel('names')
dendrogram(Complete, orientation='top', color_threshold=10, labels=names)
plt.show()

<IPython.core.display.Javascript object>

In [271]:
def union(clusters, mod):
    """Merge the two most similar clusters.

    Every unordered pair of clusters is scored with the linkage selected by
    ``mod`` and the highest-scoring pair is merged. On ties the pair that is
    visited first (lowest indices) wins.

    Args:
        clusters: list of clusters, each cluster being a list of points.
        mod: ``'single_link'`` (scored with ``sim_single_link``) or
            ``'complete_link'`` (scored with ``sim_complete_link``).

    Returns:
        A tuple ``(sim_clusters, joined_clusters)``. ``sim_clusters`` is
        ``[(i, clusters[i]), (j, clusters[j])]`` for the merged pair, with
        ``i < j``. ``joined_clusters`` contains the untouched clusters in
        their original order followed by ``clusters[i] + clusters[j]``.

    Raises:
        IndexError: if fewer than two clusters are given.
        ValueError: if ``mod`` is not one of the supported linkage names.
    """
    if len(clusters) < 2:
        # Same exception type the previous pair-list indexing produced,
        # but with a message that says what went wrong.
        raise IndexError('union() needs at least two clusters to merge')

    # Resolve the linkage once. An unknown mode used to fall through both
    # branches and silently merge clusters 0 and 1 with an empty pair.
    if mod == 'single_link':
        similarity = sim_single_link
    elif mod == 'complete_link':
        similarity = sim_complete_link
    else:
        raise ValueError(
            "unknown linkage mode %r; expected 'single_link' or "
            "'complete_link'" % (mod,)
        )

    best_value = None
    best_i = None
    best_j = None
    for i in range(len(clusters) - 1):
        for j in range(i + 1, len(clusters)):
            value = similarity(clusters[i], clusters[j])
            # Strict '>' keeps the earliest pair when similarities tie.
            if best_i is None or value > best_value:
                best_value, best_i, best_j = value, i, j

    sim_clusters = [(best_i, clusters[best_i]), (best_j, clusters[best_j])]

    joined_clusters = [
        cluster
        for index, cluster in enumerate(clusters)
        if index != best_i and index != best_j
    ]
    # The merged cluster always goes last.
    joined_clusters.append(clusters[best_i] + clusters[best_j])
    return sim_clusters, joined_clusters

In [264]:
def sim_single_link(c1, c2):
    """Return the single-link similarity of clusters ``c1`` and ``c2``.

    This is the largest ``cosin`` value over all cross-cluster point pairs
    produced by ``get_pairs``. Raises ``IndexError`` when there are no
    pairs (either cluster is empty).
    """
    candidates = get_pairs(c1, c2)
    first_left, first_right = candidates[0][0], candidates[0][1]
    best = cosin(first_left, first_right)
    for left, right in candidates[1:]:
        score = cosin(left, right)
        if score > best:
            best = score
    return best

In [265]:
def sim_complete_link(c1, c2):
    """Return the complete-link similarity of clusters ``c1`` and ``c2``.

    This is the smallest ``cosin`` value over all cross-cluster point pairs
    produced by ``get_pairs``. Raises ``IndexError`` when there are no
    pairs (either cluster is empty).
    """
    candidates = get_pairs(c1, c2)
    first_left, first_right = candidates[0][0], candidates[0][1]
    worst = cosin(first_left, first_right)
    for left, right in candidates[1:]:
        score = cosin(left, right)
        if score < worst:
            worst = score
    return worst

In [266]:
def get_pairs(c1, c2):
    """Return every ``[item1, item2]`` combination of ``c1`` and ``c2``.

    Pairs are ordered with ``c1`` as the outer loop and ``c2`` as the inner
    loop; the result is empty when either input is empty.
    """
    return [[left, right] for left in c1 for right in c2]

In [267]:
def agglomerative_clustering(X, names, mod):
    """Run agglomerative clustering on ``X`` and plot every merge step.

    Starts with one cluster per point and calls ``union`` until a single
    cluster remains. For each merge a new 7x7 figure is drawn: all points
    in black with their labels, the first merged cluster in red and the
    second in green. Each figure is saved as ``./Clusters/<step>.png``
    (step counted from 0) and then shown.

    Args:
        X: sequence of points; index 0 is used as x and index 1 as y.
        names: labels, ``names[i]`` being drawn next to ``X[i]``.
        mod: linkage mode passed straight through to ``union``; also used
            as the figure title.

    Returns:
        None.
    """
    clusters = [[point] for point in X]
    step = 0
    while len(clusters) != 1:
        merged_pair, clusters = union(clusters, mod)

        # Members of the two clusters that were just merged.
        first_members = merged_pair[0][1]
        second_members = merged_pair[1][1]
        first_x = [point[0] for point in first_members]
        first_y = [point[1] for point in first_members]
        second_x = [point[0] for point in second_members]
        second_y = [point[1] for point in second_members]

        all_x = [point[0] for point in X]
        all_y = [point[1] for point in X]

        plt.figure(figsize=(7, 7))
        plt.title(mod)
        for index, label in enumerate(names):
            # Small offset so the label does not sit on top of its marker.
            plt.text(X[index][0] + .01, X[index][1] + .1, label)
        plt.scatter(all_x, all_y, s=40, c='black', marker='o')
        # Larger markers highlight the pair merged in this step.
        plt.scatter(first_x, first_y, s=110, c='red', marker='o')
        plt.scatter(second_x, second_y, s=110, c='green', marker='o')
        plt.savefig('./Clusters/' + str(step) + '.png')
        plt.show()
        step += 1

## Images of Single Link Agglomerative Clustering

#### На каждом этапе объединяющиеся кластеры представлены красным и зеленым цветом.

In [235]:
# Run the step-by-step clustering with single-link similarity; every merge is
# plotted and saved as ./Clusters/<step>.png.
agglomerative_clustering(X, names, 'single_link')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Images of Complete Link Agglomerative Clustering

In [272]:
# Run the step-by-step clustering with complete-link similarity.
# NOTE(review): the figures are saved under the same ./Clusters/<step>.png
# names regardless of the mode, so this run overwrites images from an
# earlier single-link run.
agglomerative_clustering(X, names, 'complete_link')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>