In [1]:
# default_exp utils.clusterization

In [3]:
! pip install pyclustering

Collecting pyclustering
  Downloading pyclustering-0.10.1.2.tar.gz (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.2 MB/s eta 0:00:01
Collecting Pillow>=5.2.0
  Downloading Pillow-8.2.0-cp36-cp36m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 32.6 MB/s eta 0:00:01
Building wheels for collected packages: pyclustering
  Building wheel for pyclustering (setup.py) ... [?25ldone
[?25h  Created wheel for pyclustering: filename=pyclustering-0.10.1.2-py3-none-any.whl size=2395105 sha256=51deee957a1775079a5e9e71acbb383af11cb1b6a306647375519dc586752d86
  Stored in directory: /root/.cache/pip/wheels/5c/69/89/146543430cba41ea0dd0c553a8a325367ce91dba20cf1c9086
Successfully built pyclustering
Installing collected packages: Pillow, pyclustering
Successfully installed Pillow-8.2.0 pyclustering-0.10.1.2
You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


## clusterization


In [10]:
#export
import logging

import sentencepiece as sp

from pyclustering.cluster.kmedoids import kmedoids
from pyclustering.utils.metric import euclidean_distance_square
from pyclustering.cluster.silhouette import silhouette, silhouette_ksearch_type, silhouette_ksearch

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, pairwise_distances_argmin_min

import numpy as np
from abc import ABC
from typing import Tuple

# Configs
# Module-level logger, named after this module, with its level set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

## Distance metrics

To allow flexible implementation of several clustering techniques, a base `CustomDistance` class is defined; concrete distance metrics subclass it and implement `compute_distance`.

In [6]:
# export

class CustomDistance(ABC):
    """
    Base class for pluggable distance metrics.

    Concrete metrics subclass it and override `compute_distance`.
    """
    def compute_distance(self, x, y) -> float:
        """
        Computes the distance between 2 vectors according to a
        particular distance metric
        :param x: Vector
        :param y: Vector
        :return: Distance between x and y
        :raises NotImplementedError: when a subclass does not override this method
        """
        # Fail loudly here: silently returning None would only surface later
        # as an unrelated arithmetic error in the caller.
        raise NotImplementedError(
            f"{type(self).__name__} must implement compute_distance"
        )

In [7]:
# export

class EuclideanDistance(CustomDistance):
    """
    Distance metric backed by pyclustering's `euclidean_distance_square`.

    NOTE(review): despite the class name, the helper's name indicates that the
    value returned is the *squared* Euclidean distance -- confirm this is intended.
    """
    def compute_distance(self, x, y) -> float:
        """
        :param x: Vector
        :param y: Vector
        :return: Result of `euclidean_distance_square(x, y)`
        """
        squared_distance = euclidean_distance_square(x, y)
        return squared_distance

## Utils

In [11]:
# export

# Uses PCA first and then t-SNE
def reduce_dims(feature_vectors, dims = 2, random_state = None):
    """
    Reduces the dimensionality of the feature vectors with PCA followed by t-SNE.

    :param feature_vectors: 2-D array-like, one row per sample
    :param dims: Number of dimensions of the t-SNE output
    :param random_state: Optional seed forwarded to both PCA and t-SNE so that
                         results can be reproduced (None keeps the libraries' defaults)
    :return: Output of the t-SNE `fit_transform` call
    """
    n_samples, n_features = np.shape(feature_vectors)

    # hyperparameters from https://towardsdatascience.com/visualising-high-dimensional-datasets-using-pca-and-t-sne-in-python-8ef87e7915b
    # PCA cannot extract more components than min(n_samples, n_features), so the
    # target of 50 is clamped for small or low-dimensional inputs.
    pca = PCA(n_components=min(50, n_samples, n_features), random_state=random_state)
    pca_features = pca.fit_transform(feature_vectors)

    # t-SNE needs a perplexity smaller than the number of samples; keep the
    # target of 40 whenever the data set is large enough.
    perplexity = min(40, max(1, n_samples - 1))
    # NOTE(review): `n_iter` was renamed to `max_iter` in recent scikit-learn
    # releases -- revisit when the pinned scikit-learn version is upgraded.
    tsne = TSNE(n_components=dims, verbose=1, perplexity=perplexity, n_iter=300,
                random_state=random_state)
    tsne_features = tsne.fit_transform(pca_features)

    return tsne_features

In [19]:
# export

def get_silhouette(samples1, samples2):
    """
    Computes the average silhouette score of the partition in which each of the
    two sample sets forms its own cluster.

    :param samples1: Samples of the first cluster
    :param samples2: Samples of the second cluster
    :return: Mean of the per-sample silhouette scores
    """
    cluster1, _, _ = run_kmedoids(samples1, 1)
    cluster2, _, _ = run_kmedoids(samples2, 1)

    # Indices of the second set are shifted because both sets are stacked
    # into one array. The clusters are kept as a list of index lists so the
    # two sets may have different sizes (stacking them in a 2-D array cannot).
    offset = len(samples1)
    clusters = [
        [int(idx) for idx in cluster1[0]],
        [offset + int(idx) for idx in cluster2[0]],
    ]
    samples = np.concatenate((samples1, samples2), axis=0)
    scores = silhouette(samples, clusters).process().get_score()

    return sum(scores) / len(samples)

## k-means

In [10]:
# export
def k_means(feature_vectors, k_range=(2, 3)):
    """
    Runs k-means for every k in `k_range` and keeps the model with the highest
    silhouette score.

    :param feature_vectors: Samples to cluster
    :param k_range: Non-empty sequence of candidate numbers of clusters
    :return: Tuple (labels, centroids, fitted KMeans model, result of
             `pairwise_distances_argmin_min(centroids, feature_vectors)`)
             for the best k
    """
    # finding best k
    bst_k          = k_range[0]
    # -inf (instead of -1) guarantees that the first candidate is always
    # recorded, even when its silhouette score is the minimum possible value.
    bst_silhouette = float("-inf")
    bst_labels     = None
    bst_centroids  = None
    bst_kmeans     = None
    for k in k_range:
        kmeans = KMeans(n_clusters = k)
        kmeans.fit(feature_vectors)

        labels    = kmeans.predict(feature_vectors)
        centroids = kmeans.cluster_centers_

        silhouette_avg = silhouette_score(feature_vectors, labels)
        if silhouette_avg > bst_silhouette:
            bst_k          = k
            bst_silhouette = silhouette_avg
            bst_labels     = labels
            bst_centroids  = centroids
            bst_kmeans     = kmeans
    logger.info(f'Best k = {bst_k} with a silhouette score of {bst_silhouette}')

    # Closest feature vector to each centroid of the best model
    centroid_mthds = pairwise_distances_argmin_min(bst_centroids, feature_vectors)
    return bst_labels, bst_centroids, bst_kmeans, centroid_mthds

In [13]:
# export

def clusterize(feature_vecs, k_range = [2], dims = 2):
    """
    Reduces the feature vectors and clusters them with k-means.

    :param feature_vecs: Sequence of tuples whose second element is the feature
                         vector (presumably (identifier, vector) pairs -- confirm
                         against callers)
    :param k_range: Candidate numbers of clusters forwarded to `k_means`
    :param dims: Number of dimensions forwarded to `reduce_dims`
    :return: Tuple (reduced vectors, centroid_mthds, labels, centroids, kmeans model)
    """
    # Transpose the tuples so the second "column" holds every feature vector.
    columns = list(zip(*feature_vecs))
    raw_vectors = np.array(columns[1])

    feature_vectors = reduce_dims(raw_vectors, dims = dims)
    labels, centroids, kmeans, centroid_mthds = k_means(feature_vectors, k_range = k_range)

    return (feature_vectors, centroid_mthds, labels, centroids, kmeans)

In [14]:
# export

def find_best_k(samples, kmin = 2, kmax = 10):
    """
    Searches for the number of clusters with the best silhouette score using
    pyclustering's silhouette k-search with the k-medoids algorithm.

    :param samples: Samples to cluster
    :param kmin: Lower bound of the search
    :param kmax: Upper bound of the search; clamped to the number of samples
    :return: Number of clusters reported by the search
    """
    # The search rejects an upper bound larger than the number of points, so
    # small sample sets need the bound clamped.
    kmax = min(kmax, len(samples))
    search_instance = silhouette_ksearch(samples, kmin, kmax, algorithm=silhouette_ksearch_type.KMEDOIDS).process()
    amount = search_instance.get_amount()
    scores = search_instance.get_scores()

    print(f"Best Silhouette Score for k = {amount}: {scores[amount]}")

    return amount

In [15]:
# export
def run_kmedoids(samples, k):
    """
    Runs pyclustering's k-medoids on the samples with k clusters.

    :param samples: Samples to cluster
    :param k: Number of clusters
    :return: Tuple (clusters, medoid ids, processed k-medoids instance)
    """
    # The first k sample indices are used as the initial medoids.
    seed_medoids = [index for index in range(k)]

    model = kmedoids(samples, seed_medoids)
    model.process()

    return model.get_clusters(), model.get_medoids(), model

In [11]:
# export

def perform_clusterize_kmedoids(data: np.array, dims: int = 2) -> Tuple:
    """
    Reduces the data, searches the best number of clusters and runs k-medoids.

    :param data: Feature vectors, one row per sample
    :param dims: Number of dimensions forwarded to `reduce_dims`
    :return: Tuple (reduced data, clusters, medoid ids, k-medoids instance)
    """
    embedded = reduce_dims(data, dims = dims)

    # The number of clusters is picked on the reduced data, not on the raw input.
    kmedoids_result = run_kmedoids(embedded, find_best_k(embedded))
    clusters, medoid_ids, kmedoids_instance = kmedoids_result

    return (embedded, clusters, medoid_ids, kmedoids_instance)

In [16]:
# export
def clusterize_kmedoids(samples, dims = 2):
    """
    Reduces the feature vectors and clusters them with k-medoids.

    :param samples: Sequence of tuples whose second element is the feature
                    vector (presumably (identifier, vector) pairs -- confirm
                    against callers)
    :param dims: Number of dimensions forwarded to `reduce_dims`
    :return: Tuple (reduced samples, clusters, medoid ids, k-medoids instance)
    """
    # Transpose the tuples and keep the second "column": the feature vectors.
    vector_column = list(zip(*samples))[1]
    reduced = reduce_dims(np.array(vector_column), dims = dims)

    best_k = find_best_k(reduced)
    clusters, medoid_ids, kmedoids_instance = run_kmedoids(reduced, best_k)

    return reduced, clusters, medoid_ids, kmedoids_instance

In [17]:
# export
def new_clusterize_kmedoids(h_samples, m1_samples, m2_samples, m3_samples, dims = 2):
    """
    Reduces the four sample sets together and then clusters each set on its
    own with k-medoids.

    :param h_samples: First sample set
    :param m1_samples: Second sample set
    :param m2_samples: Third sample set
    :param m3_samples: Fourth sample set
    :param dims: Number of dimensions forwarded to `reduce_dims`
    :return: Tuple with one (reduced samples, clusters, medoid ids,
             k-medoids instance) tuple per sample set, in argument order
    """
    groups = (h_samples, m1_samples, m2_samples, m3_samples)

    # All sets are reduced in a single call so they share the same embedding.
    embedded = reduce_dims(np.concatenate(groups, axis=0), dims = dims)

    # Row at which each set starts inside the stacked array; the last set
    # runs to the end of the array.
    starts = [0]
    for group in groups[:-1]:
        starts.append(starts[-1] + len(group))
    stops = starts[1:] + [None]

    results = []
    for start, stop in zip(starts, stops):
        group_embedded = embedded[start:stop]
        best_k = find_best_k(group_embedded)
        clusters, medoid_ids, kmedoids_instance = run_kmedoids(group_embedded, best_k)
        results.append((group_embedded, clusters, medoid_ids, kmedoids_instance))

    return tuple(results)

## Prototypes and criticisms

In [18]:
# export

def gen_criticisms(samples, prototypes, n = None, distance = None):
    """
    Scores every sample by how much farther it is, on average, from all the
    samples than from the prototypes, and selects the highest-scoring ones.

    :param samples: Sequence of sample vectors
    :param prototypes: Non-empty sequence of prototype vectors
    :param n: Number of criticisms to select (defaults to the number of prototypes)
    :param distance: Object exposing `compute_distance(x, y)`
                     (defaults to `EuclideanDistance()`)
    :return: Tuple (array with the score of every sample, array with the indices
             of the n highest scores in descending score order)
    """
    if n is None: n = len(prototypes)
    if distance is None:
        distance = EuclideanDistance()
    crits = []
    for x in samples:
        # Mean over the number of samples (not over the dimensionality of x).
        mean_dist_x = sum(
            (distance.compute_distance(x, x_i) for x_i in samples), 0.
        ) / len(samples)

        mean_dist_proto = sum(
            (distance.compute_distance(x, z_j) for z_j in prototypes), 0.
        ) / len(prototypes)

        crits.append(mean_dist_x - mean_dist_proto)

    crits = np.array(crits)
    # Reverse first and slice afterwards so that n == 0 selects nothing
    # (`[-0:]` would select every index).
    crit_ids = crits.argsort()[::-1][:n]

    return crits, crit_ids

In [12]:
# Run nbdev's notebook2script; the cell output lists the notebooks it converted.
from nbdev.export import notebook2script
notebook2script()

Converted 0.0_mgmnt.prep.i.ipynb.
Converted 0.1_mgmnt.prep.conv.ipynb.
Converted 0.3_mgmnt.prep.bpe.ipynb.
Converted 0.6_mgmnt.prep.nltk.ipynb.
Converted 0.7_mgmnt.prep.files_mgmnt.ipynb.
Converted 0.8_mgmnt.prep.bpe_tokenization.ipynb.
Converted 1.0_exp.i.ipynb.
Converted 1.1_exp.info-[inspect].ipynb.
Converted 1.1_exp.info.ipynb.
Converted 1.2_exp.csnc.ipynb.
Converted 1.2_exp.gen.code.ipynb.
Converted 1.3_exp.csnc_python.ipynb.
Converted 10.0_utils.clusterization.ipynb.
Converted 10.1_utils.visualization.ipynb.
Converted 2.0_repr.codebert.ipynb.
Converted 2.0_repr.i.ipynb.
Converted 2.1_repr.codeberta.ipynb.
Converted 2.1_repr.roberta.train.ipynb.
Converted 2.2_repr.roberta.eval.ipynb.
Converted 2.3_repr.word2vec.train.ipynb.
Converted 2.6_repr.word2vec.eval.ipynb.
Converted 2.7_repr.distmetrics.ipynb.
Converted 2.8_repr.sentence_transformers.ipynb.
Converted 3.1_mining.unsupervised.traceability.eda.ipynb.
Converted 3.2_mining.unsupervised.eda.traceability.d2v.ipynb.
This cell doesn