In [13]:
import sys
# Put the parent directory at the front of the import search path, presumably
# so the project-local modules imported in the next cell resolve (TODO confirm).
# The membership guard keeps re-runs of this cell from adding duplicates.
path = ".."
if sys.path.count(path) == 0:
    sys.path.insert(0, path)

In [14]:
import numpy as np
from data_retrieval import lipade_groundtruth
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from absolute_path import absolutePath
import umap
import clustering
import evaluators
import csv
import os

# Notebook-wide configuration. Both values are interpolated into the
# result/score paths built further down.
methodType = "image"
corpus = "lipade_groundtruth"

In [15]:
# Only the third element returned by getDataset is kept; it is used below as
# the reference labels `y` for every evaluation.
(_, _, y) = lipade_groundtruth.getDataset(mode="similar")

# Number of clusters requested from K-Means / GMM.
# NOTE(review): assumes labels are 0-based contiguous integers — TODO confirm.
num_clusters = 1 + max(y)

In [16]:
def sortByScore(keyValue):
    """Sort key that orders ``(key, score)`` pairs by their numeric score.

    Args:
        keyValue: a two-element pair whose second element is convertible
            with ``float`` (the score table stores scores as strings).

    Returns:
        The score as a ``float``.
    """
    (_ignored_key, raw_score) = keyValue
    numeric_score = float(raw_score)
    return numeric_score

# Clustering

## From distance

In [17]:
# Directory whose files are loaded as distance arrays by the application cell
# of this section.
# NOTE: `path` is rebound again further down for the raw representations, so
# re-run this cell before re-running the distance-based application cell.
path = absolutePath + f"representation/{methodType}/results/distance/{corpus}/"

Dendrogram threshold

In [18]:
def best_threshold(distance, min=0, max=1, precision=1000):
    """Pick the threshold with the highest F1 and score the resulting clustering.

    ``precision`` evenly spaced thresholds between ``min`` and ``max``
    (inclusive) are evaluated against the notebook-level labels ``y``; the
    first threshold reaching the maximum F1 is used to derive the prediction.

    Args:
        distance: distance data forwarded unchanged to the ``evaluators`` and
            ``clustering`` helpers (presumably a pairwise distance matrix —
            TODO confirm).
        min: lower bound of the threshold grid.
        max: upper bound of the threshold grid.
        precision: number of thresholds in the grid.

    Returns:
        ``(predicted_y, kappa)``: the prediction obtained at the selected
        threshold and the value of ``evaluators.kappa(y, predicted_y)``.
    """
    # `min` / `max` shadow the builtins inside this function; the names are
    # kept because they are part of the call signature.
    candidates = np.linspace(min, max, precision)
    _p, _r, f1_scores = evaluators.p_r_f1_byThresholds(candidates, distance, y)
    # np.argmax returns the first index of the maximum, so ties resolve to the
    # smallest threshold.
    selected = candidates[np.argmax(f1_scores)]
    labels = clustering.getPredictionFromThreshold(selected, distance)
    return labels, evaluators.kappa(y, labels)

Application

In [19]:
# Run every distance-based clustering method on every saved distance array,
# merge the resulting kappa scores into the shared score table and save the
# predicted labels next to it.
methods = [best_threshold]

scores_file = 'evaluation/' + corpus + '/scores.csv'

# Existing scores, keyed by (representation name, method name).
# NOTE: despite its name, f1_dict stores the kappa values (as "%.3f" strings).
f1_dict = {}
if os.path.exists(scores_file):
    # newline='' is what the csv module expects for files it reads/writes.
    with open(scores_file, mode='r', newline='') as infile:
        reader = csv.reader(infile)
        # Ignore blank/short rows (e.g. the empty lines a writer opened
        # without newline='' leaves on Windows) instead of raising IndexError.
        f1_dict = {(rows[0], rows[1]): rows[2] for rows in reader if len(rows) >= 3}

# sorted() makes the processing order deterministic; anything that is not a
# .npy array (hidden files, checkpoint folders, ...) is skipped.
for npy in sorted(os.listdir(path)):
    if not npy.endswith('.npy'):
        continue
    distance = np.load(path + npy)
    for method in methods:
        predicted_y, kappa = method(distance)
        f1_dict[(npy.split('.')[0], method.__name__)] = "{:.3f}".format(kappa)
        out_dir = "clusters/" + corpus + "/" + method.__name__ + "/"
        os.makedirs(out_dir, exist_ok=True)
        np.save(out_dir + npy, predicted_y)

# Sort BEFORE opening the file in 'w' mode: opening truncates it, so a failure
# while sorting would otherwise wipe the existing scores.
kv = sorted(f1_dict.items(), key=sortByScore, reverse=True)

os.makedirs(os.path.dirname(scores_file), exist_ok=True)
with open(scores_file, mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    for key, value in kv:
        writer.writerow(list(key) + [value])

Thresholds: 100%|██████████| 1000/1000 [00:08<00:00, 113.04it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 148.40it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 151.76it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 149.30it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 152.36it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 149.13it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 148.36it/s]
Thresholds: 100%|██████████| 1000/1000 [00:08<00:00, 114.89it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 143.57it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 150.25it/s]
Thresholds: 100%|██████████| 1000/1000 [00:08<00:00, 115.50it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 144.40it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 152.25it/s]
Thresholds: 100%|██████████| 1000/1000 [00:08<00:00, 117.06it/s]
Thresholds: 100%|██████████| 1000/1000 [00:06<00:00, 155.82it/s]
Thresholds: 100%|████████

## From representation

In [20]:
# Directory whose files are loaded as raw representations by the application
# cell of this section.
# NOTE: this rebinds `path`, which earlier pointed at the distance results.
path = absolutePath + f"representation/{methodType}/results/raw/{corpus}/"

UMAP + K-Means

In [21]:
def umap_k_means(representation):
    """Cluster a representation with K-Means after a UMAP reduction.

    The input is reduced to 10 components with UMAP, then split into
    ``num_clusters`` clusters by K-Means (``n_init=10``); both steps use
    ``random_state=42``.

    Args:
        representation: data passed to ``umap.UMAP.fit_transform``
            (presumably one row per sample, aligned with ``y`` — TODO confirm).

    Returns:
        ``(predicted_y, kappa)``: the K-Means labels and the value of
        ``evaluators.kappa(predicted_y, y)``.
    """
    reducer = umap.UMAP(n_components=10, random_state=42)
    embedding = reducer.fit_transform(representation)
    clusterer = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
    labels = clusterer.fit_predict(embedding)
    return labels, evaluators.kappa(labels, y)

UMAP + GMM

In [22]:
def umap_gmm(representation):
    """Cluster a representation with a Gaussian mixture after a UMAP reduction.

    The input is reduced to 10 components with UMAP, then assigned to one of
    ``num_clusters`` mixture components; both steps use ``random_state=42``.

    Args:
        representation: data passed to ``umap.UMAP.fit_transform``
            (presumably one row per sample, aligned with ``y`` — TODO confirm).

    Returns:
        ``(predicted_y, kappa)``: the mixture assignments and the value of
        ``evaluators.kappa(predicted_y, y)``.
    """
    reducer = umap.UMAP(n_components=10, random_state=42)
    embedding = reducer.fit_transform(representation)
    mixture = GaussianMixture(n_components=num_clusters, random_state=42)
    labels = mixture.fit_predict(embedding)
    return labels, evaluators.kappa(labels, y)

Application

In [23]:
# Run every representation-based clustering method on every saved raw
# representation, merge the resulting kappa scores into the shared score table
# and save the predicted labels next to it.
methods = [umap_k_means, umap_gmm]

scores_file = 'evaluation/' + corpus + '/scores.csv'

# Existing scores, keyed by (representation name, method name).
# NOTE: despite its name, f1_dict stores the kappa values (as "%.3f" strings).
f1_dict = {}
if os.path.exists(scores_file):
    # newline='' is what the csv module expects for files it reads/writes.
    with open(scores_file, mode='r', newline='') as infile:
        reader = csv.reader(infile)
        # Ignore blank/short rows (e.g. the empty lines a writer opened
        # without newline='' leaves on Windows) instead of raising IndexError.
        f1_dict = {(rows[0], rows[1]): rows[2] for rows in reader if len(rows) >= 3}

# sorted() makes the processing order deterministic; anything that is not a
# .npy array (hidden files, checkpoint folders, ...) is skipped.
for npy in sorted(os.listdir(path)):
    if not npy.endswith('.npy'):
        continue
    representation = np.load(path + npy)
    for method in methods:
        predicted_y, kappa = method(representation)
        f1_dict[(npy.split('.')[0], method.__name__)] = "{:.3f}".format(kappa)
        out_dir = "clusters/" + corpus + "/" + method.__name__ + "/"
        os.makedirs(out_dir, exist_ok=True)
        np.save(out_dir + npy, predicted_y)

# Sort BEFORE opening the file in 'w' mode: opening truncates it, so a failure
# while sorting would otherwise wipe the existing scores.
kv = sorted(f1_dict.items(), key=sortByScore, reverse=True)

os.makedirs(os.path.dirname(scores_file), exist_ok=True)
with open(scores_file, mode='w', newline='') as outfile:
    writer = csv.writer(outfile)
    for key, value in kv:
        writer.writerow(list(key) + [value])

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
