In [2]:
# default_exp interpretability.prototypes_criticisms

In [1]:
# export

import numpy as np
import pandas as pd

from typing import Tuple, List, Optional, Any, Iterable
from abc import ABC, abstractmethod

from pathlib import Path
# ds4se
import ds4se
from ds4se.utils.clusterization import *
from ds4se.utils.visualization import *

## prototypes_criticisms

> Prototypes and criticisms analysis based on [Section 6.3](https://christophm.github.io/interpretable-ml-book/proto.html) of the [Interpretable Machine Learning book](https://christophm.github.io/interpretable-ml-book/)

## Utils

In [2]:
# export

def load_np_vectors(path: str) -> np.ndarray:
    """
    Load vectors stored in a NumPy ``.npy`` file.

    :param path: Location of the .npy file to be loaded

    :return: Np array corresponding to the loaded vectors
    :raises FileNotFoundError: If ``path`` does not exist. This is a subclass
                               of ``Exception``, so callers catching
                               ``Exception`` keep working.
    """
    # Imported here so the function does not rely on `logging` leaking into
    # the namespace through one of the wildcard imports.
    import logging

    path = Path(path)
    if not path.exists():
        msg = "Vectors could not be found"
        logging.error(msg)
        raise FileNotFoundError(msg)
    return np.load(str(path))

In [3]:
# export

def reshape_vectors(vectors):
    """
    Turn every vector into a single-row matrix and stack the results.

    :param vectors: Iterable of numpy arrays, one per vector

    :return: Np array containing one (1, n) row matrix per input vector,
             where n is the number of elements of that vector
    """
    # reshape(1, -1) infers the vector length instead of assuming 500
    # elements; for 500-element vectors the result is the same as
    # reshape(500, 1).T.
    return np.array([v.reshape(1, -1) for v in vectors])

## Parameterization

In [4]:
# Filesystem locations used by this notebook (absolute, environment-specific).
params = dict(
    vectors_path="/tf/main/dvc-ds4se/results/d2v_vectors",
    plots_path="/tf/main/dvc-ds4se/results/plotting",
)

## Find Prototypes

The authors of the book use a method based on the maximum mean discrepancy (MMD). However, they highlight the fact that any* clustering algorithm can be used.

\* The clustering algorithm used to find prototypes should return <i>real</i> data points as centroids.

In [6]:
# export

class Clusterizer(ABC):
    """Abstract interface that every clustering strategy has to implement."""

    def __init__(self):
        # No state to set up at this level.
        return None

    @abstractmethod
    def perform_clusterization(self, data_vectors: np.ndarray, dims: Optional[int]=2):
        """
        Cluster the given dataset; concrete subclasses provide the algorithm.

        :param data_vectors: Dataset to be clustered
        :param dims: Int. forwarded to the concrete implementation
                     (defaults to 2)
        """
        ...

In [7]:
# export

class KMedoidsClusterizer(Clusterizer):
    """Clusterizer that delegates to the ``perform_clusterize_kmedoids`` helper."""

    def __init__(self):
        # TODO
        super().__init__()

    def perform_clusterization(self, data_vectors: np.ndarray, dims: Optional[int]=2) -> Tuple:
        """
        Perform clusterization using k-medoids
        - First perform dimensionality reduction by means of PCA + t-SNE
        - Finds best k

        :param data_vectors: Np array with the vectors to be clustered
        :param dims: Int. indicating the number of dimensions for
                     dim. reduction

        :return: Tuple (reduced data, clusters, medoid ids, k_medoids_instance (pyclustering obj.))
        """
        # The whole pipeline lives in the helper; this class only adapts it
        # to the Clusterizer interface.
        return perform_clusterize_kmedoids(data_vectors, dims)
        

In [None]:
# TODO: Integrate gravitational clustering

In [8]:
# Load the `doc2_vec_sample_java_df.npy` sample from the configured vectors directory.
test_vectors = load_np_vectors(f"{params['vectors_path']}/doc2_vec_sample_java_df.npy")

In [9]:
# Display the loaded vectors.
# NOTE(review): this renders the array repr; `test_vectors.shape` may be a lighter sanity check.
test_vectors

array([[-0.37762475, -0.38540062, -0.01277232, ...,  0.45781606,
         0.19079459,  0.26853234],
       [-0.17740302, -0.01685066,  0.3694753 , ..., -0.18609756,
        -0.0205524 , -0.05089067],
       [-0.6700186 ,  0.37853143, -0.04435137, ...,  1.0894055 ,
         0.01246784, -0.19507329],
       ...,
       [ 0.2327957 ,  0.30608505, -0.06560351, ...,  0.2620101 ,
         0.41106102, -0.15154514],
       [ 0.05280151,  0.4234941 , -0.5358744 , ...,  0.02667005,
         0.4553478 ,  0.18505383],
       [-0.02004011,  0.30035594,  0.02699976, ...,  0.19926223,
         0.04953865, -0.06755862]], dtype=float32)

In [10]:
# Instantiate the k-medoids based clusterizer.
km_clusterizer = KMedoidsClusterizer()

In [None]:
# Cluster the sample vectors with dims=2; the logged t-SNE run below covers 10000 samples.
cluster_result = km_clusterizer.perform_clusterization(test_vectors, dims=2)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 10000 samples in 0.001s...
[t-SNE] Computed neighbors for 10000 samples in 1.431s...
[t-SNE] Computed conditional probabilities for sample 1000 / 10000
[t-SNE] Computed conditional probabilities for sample 2000 / 10000
[t-SNE] Computed conditional probabilities for sample 3000 / 10000
[t-SNE] Computed conditional probabilities for sample 4000 / 10000
[t-SNE] Computed conditional probabilities for sample 5000 / 10000
[t-SNE] Computed conditional probabilities for sample 6000 / 10000
[t-SNE] Computed conditional probabilities for sample 7000 / 10000
[t-SNE] Computed conditional probabilities for sample 8000 / 10000
[t-SNE] Computed conditional probabilities for sample 9000 / 10000
[t-SNE] Computed conditional probabilities for sample 10000 / 10000
[t-SNE] Mean sigma: 1.332237
[t-SNE] KL divergence after 250 iterations with early exaggeration: 89.484192
[t-SNE] KL divergence after 300 iterations: 3.684873


In [14]:
# Unpack the result tuple: (reduced data, clusters, medoid ids, k-medoids instance).
feat_vectors, clusters, medoid_ids, kmedoids_instance = cluster_result

In [None]:
# NOTE(review): looks like a leftover debugging print — consider removing this cell.
print("ahhhhhhhhhhhh")

In [12]:
# TODO: Integrate Gravitational clustering

## Find Criticisms

Points are selected as criticisms where the distribution of the prototypes differs from the distribution of the data.

In [70]:
class CriticismFinder:
    """Thin wrapper around ``gen_criticisms`` for locating criticism points."""

    # Declared static because the method never takes `self`; without the
    # decorator, calling it on an instance would bind the instance to `data`.
    @staticmethod
    def get_critisicms(data, prototypes: List, n: Optional[int]=None,
                       distance: Optional[Any]=None) ->Tuple:
        """
        Find the criticisms of a dataset given its prototypes.

        :param data: Dataset
        :param prototypes: List of found prototypes
        :param n: Numbers of criticisms to find
        :param distance: Distance object instantiating appropriate distance calculation

        :return: Tuple (criticisms points, criticisms ids)
        """
        crit_points, crit_ids = gen_criticisms(data, prototypes, n, distance)
        return crit_points, crit_ids

    # Correctly spelled alias; the original (misspelled) name is kept so
    # existing callers keep working.
    get_criticisms = get_critisicms

In [90]:
# Instantiate the criticism finder (not used yet; see the TODO in the next cell).
crit_finder = CriticismFinder()

In [None]:
# TODO: crit_finder.get_criticisms(feat_vectors, )

## Gaussian mixture models

In [None]:
# Display the configured plots path.
params['plots_path']

## Perform plotting

In [None]:
# nbdev export step — presumably writes the cells marked `# export` to the library module; confirm against the nbdev version in use.
from nbdev.export import notebook2script
notebook2script()