In [None]:
import copy
import json

import pandas as pd
import seaborn as sns
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.decomposition import PCA
from scipy.cluster import hierarchy
from keras.models import Sequential
from tqdm import tqdm

In [None]:
# Root folder of the image dataset; handed to image_dataset_from_directory.
data_dir = './data/All'


def preprocess(images, labels):
    """Run ResNet50 input preprocessing on a batch of images.

    Args:
        images: Image batch passed to
            ``tf.keras.applications.resnet50.preprocess_input``.
        labels: Labels that belong to ``images``; returned unchanged.

    Returns:
        A ``(preprocessed_images, labels)`` tuple.
    """
    prepared_images = tf.keras.applications.resnet50.preprocess_input(images)
    return prepared_images, labels


# Image size and batch size used when reading the image folder.
batch_size = 64
img_height, img_width = 320, 240

print('Loading dataset')
dataset = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    label_mode='categorical',
    image_size=(img_height, img_width),
    batch_size=batch_size,
    seed=0,
)

# Class names are read here, before `dataset` is rebound to the mapped
# dataset below. class_name_map translates a class index into its name.
class_names = dataset.class_names
class_name_map = dict(enumerate(class_names))

print('Preprocessing')
dataset = dataset.map(preprocess)

In [None]:
# ImageNet-pretrained ResNet50 without its classification head, used only as
# a frozen feature extractor (pooling='avg' applies global average pooling to
# the last convolutional output). The input shape is taken from the same
# img_height / img_width used to load the dataset so the two cannot drift.
pretrained_model = tf.keras.applications.ResNet50(include_top=False,
                                                  input_shape=(img_height, img_width, 3),
                                                  pooling='avg',
                                                  weights='imagenet')
for layer in pretrained_model.layers:
    layer.trainable = False

resnet_model = Sequential()
resnet_model.add(pretrained_model)

y = []
features = []

model = resnet_model

print('Extracting features')
for samples, labels in tqdm(dataset):
    # Features and labels are collected in the same pass over the dataset so
    # that row i of `features` always belongs to entry i of `y`.
    # predict_on_batch handles exactly one batch; Model.predict is documented
    # as not intended for use inside a loop over batches.
    predictions = model.predict_on_batch(samples)
    features.extend(predictions)
    # Convert the one-hot label batch to NumPy once instead of storing one
    # tensor per sample.
    y.extend(labels.numpy())

# One row per image, one column per extracted feature.
X = pd.DataFrame(features)
# Labels were loaded with label_mode='categorical'; argmax recovers the
# integer class index of each sample.
y = np.argmax(y, axis=1)


In [None]:
# Tag every sample with its class name, then average the feature vectors of
# each class: `categories` has one row per class name.
X['Label'] = pd.Series(y, index=X.index).map(class_name_map)
categories = X.groupby('Label').mean()
categories.head()

In [None]:
print('Clustering')

linkage = 'single'

n_clusters = 12

# Agglomerative clustering of the per-class mean feature vectors.
# compute_distances=True is required so that `distances_` is available for
# building the linkage matrix below.
clustering = AgglomerativeClustering(compute_distances=True, n_clusters=n_clusters, linkage=linkage)
clusters = clustering.fit_predict(categories)

# Fourth linkage-matrix column: number of original observations under each
# merge. A child index below n_leaves is a leaf (counts 1); any other index
# refers to the earlier merge `index - n_leaves`, whose count is already
# known. The previous `arange(2, ...)` placeholder did not reflect the tree.
n_leaves = len(clustering.labels_)
no_of_observations = np.zeros(clustering.children_.shape[0])
for merge_idx, merge in enumerate(clustering.children_):
    no_of_observations[merge_idx] = sum(
        1 if child < n_leaves else no_of_observations[child - n_leaves]
        for child in merge
    )
linkage_matrix = np.column_stack([clustering.children_, clustering.distances_, no_of_observations]).astype(float)

plt.figure(figsize=(10, 6))
hierarchy.dendrogram(linkage_matrix, labels=class_names, leaf_font_size=8, color_threshold=0, truncate_mode='lastp', p=n_clusters)
plt.show()

# Class index -> cluster id.
# NOTE(review): assumes the rows of `categories` are in the same order as
# `class_names` - confirm.
cluster_map = {class_idx: clusters[class_idx] for class_idx in range(len(class_names))}
for class_idx, class_name in enumerate(class_names):
    print(f'{class_name}: cluster {clusters[class_idx]}')

# Relabel every sample with the cluster id of its class.
X['Label'] = y
X['Label'] = X['Label'].map(cluster_map)

In [None]:
# Features and cluster assignment of every sample, extracted once.
sample_features = X.drop('Label', axis=1)
sample_clusters = X['Label']

# The per-sample silhouette values are computed once; the average silhouette
# (what silhouette_score returns) is their mean, so the pairwise-distance
# work does not have to be repeated.
sample_silhouette_values = silhouette_samples(sample_features, sample_clusters)
silhouette_avg = sample_silhouette_values.mean()
print("silhouette_avg:", silhouette_avg)

fig, (ax1) = plt.subplots(1, 1)
fig.set_size_inches(8, 8)
ax1.set_xlim([-1, 1])
# Room for every sample plus a 10-unit gap around each cluster band.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

y_lower = 10
for i in range(n_clusters):
    # Sorted silhouette values of the samples assigned to cluster i.
    cluster_silhouette_values = sample_silhouette_values[X['Label'] == i]
    cluster_silhouette_values.sort()
    size_cluster = cluster_silhouette_values.shape[0]

    y_upper = size_cluster + y_lower
    color = cm.nipy_spectral(float(i) / n_clusters)
    ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)
    # Cluster id, placed at the vertical middle of its band.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster, str(i))
    # Leave a 10-unit gap before the next band.
    y_lower = y_upper + 10

ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")

# Dashed red line marks the average silhouette over all samples.
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

ax1.set_yticks([])
ax1.set_xticks([-1, -0.8, -0.6, -0.4, -0.2, 0, 0.2, 0.4, 0.6, 0.8, 1])

plt.show()


In [None]:
# One row per candidate cluster count n (the index):
#   Silhouette - average silhouette of all samples for that clustering
#   y          - dendrogram height halfway between the two merges that
#                bracket the cut producing n clusters
#   difference - vertical distance between those two merges
silhouette_scores = pd.DataFrame(columns=['Silhouette', 'y', 'difference'])
linkage_matrix = None
n_cuts = 3

linkage = 'single'

# Number of leaves of the hierarchy (one per row of `categories`). Deriving it
# here replaces the previously hard-coded 12 / 13.
n_leaves = len(categories)

# The feature columns do not change inside the loop; only 'Label' does.
sample_features = X.drop(columns='Label', errors='ignore')

for n in range(2, n_leaves + 1):
    print(n)
    n_clusters = n

    clustering = AgglomerativeClustering(compute_distances=True, n_clusters=n_clusters, linkage=linkage)
    clusters = clustering.fit_predict(categories)

    # Fourth linkage-matrix column: number of original observations under
    # each merge (a child index below n_leaves is a leaf, anything else
    # refers to the earlier merge `index - n_leaves`).
    no_of_observations = np.zeros(clustering.children_.shape[0])
    for merge_idx, merge in enumerate(clustering.children_):
        no_of_observations[merge_idx] = sum(
            1 if child < n_leaves else no_of_observations[child - n_leaves]
            for child in merge
        )
    linkage_matrix = np.column_stack([clustering.children_, clustering.distances_, no_of_observations]).astype(float)

    # Relabel every sample with the cluster id of its class.
    cluster_map = {class_idx: clusters[class_idx] for class_idx in range(len(class_names))}
    X['Label'] = y
    X['Label'] = X['Label'].map(cluster_map)

    silhouette_avg = silhouette_score(sample_features, X['Label'])
    print("silhouette_avg:", silhouette_avg)

    # Heights of the merges directly above and below the cut that yields n
    # clusters. With n == n_leaves nothing has been merged yet, so the lower
    # bound is 0.
    upper = linkage_matrix[n_leaves - n][2]
    lower = linkage_matrix[n_leaves - n - 1][2] if n < n_leaves else 0
    silhouette_scores.loc[n] = [silhouette_avg, (upper - lower)/2 + lower, upper - lower]

    if n == n_leaves:
        # Last iteration: draw the dendrogram with the silhouette of every
        # candidate cut overlaid. The figure size is passed to subplots();
        # a separate plt.figure(...) call would only open an empty figure.
        fig, ax1 = plt.subplots(figsize=(10, 6))
        hierarchy.dendrogram(linkage_matrix, labels=class_names, leaf_font_size=8, color_threshold=0, truncate_mode='lastp', p=n_clusters, ax=ax1, distance_sort=True, leaf_rotation=45)
        ax2 = ax1.twiny()
        ax2.axis('off')
        ax2.set_xlim(-0.05,0.05)
        ax2.barh(silhouette_scores['y'].to_numpy(), silhouette_scores['Silhouette'].to_numpy(), height=linkage_matrix[0][2]/20, color=(1, 0, 0, 0.5))

        # only show best cuts: keep the n_cuts rows with the largest gap,
        # ordered by cluster count
        silhouette_scores = silhouette_scores.nlargest(n_cuts, ['difference']).sort_index(ascending=True)

        for row in silhouette_scores.itertuples():
            ax2.axhline(y=row.y, color=(1, 0, 0, 0.5))
            ax2.text(y=row.y, x=1.05, s=round(row.Silhouette, 4), fontsize='x-small')


In [None]:
print(silhouette_scores)

filename = './aggolomerativeOntologySingle.json'

# Map every merge of the hierarchy to [[child_a, child_b], height].
# Row r of the linkage matrix creates cluster id `n_leaves + r`, and a matrix
# with m rows describes m + 1 leaves, so the first id is derived from the
# matrix itself rather than from a loop variable left over from another cell.
n_leaves = len(linkage_matrix) + 1
splits = {}
split = n_leaves
largest_y = 0
for row in linkage_matrix:
    splits[split] = [[row[0], row[1]], row[2]]
    # After the loop this holds the height of the last row.
    largest_y = row[2]
    split += 1
print(splits, split)
print(linkage_matrix)

# Build one dict per selected cut. For cut i, the dict holds the merges whose
# height lies in the band (cut, largest_y]; merges that are nested inside the
# same band are folded into their parent so each surviving entry lists the
# ids found directly below the band.
# NOTE(review): the folding relies on `splits` being iterated with a child
# merge before its parent - confirm against how `splits` is built.
levels = []
for i in range(n_cuts):
    # Cut heights are taken in the row order of silhouette_scores.
    cut = silhouette_scores['y'].iloc[i]
    splits_copy = {}
    # `split` is rebound here to a [[children], height] entry of `splits`.
    for key, split in splits.items():
        if largest_y >= split[1] > cut:
            children = split[0]
            # Work on a deep copy so the entries of `splits` stay untouched;
            # `children` still refers to the original, unmodified child list.
            split_copy = copy.deepcopy(split)
            splits_copy[key] = split_copy
            for child in children:
                # A child that is itself a merge inside the current band is
                # replaced by its own children.
                if child in splits.keys() and largest_y >= splits[child][1] > cut:
                    if child in splits_copy.keys():
                        # Use the child's already-folded child list and drop
                        # the child's own entry from this level.
                        split_copy[0].extend(splits_copy[child][0])
                        splits_copy.pop(child)
                    else:
                        # No entry for the child at this level: fall back to
                        # its raw children from `splits`.
                        split_copy[0].extend(splits[child][0])
                    split_copy[0].remove(child)
    levels.append(splits_copy)
    # The next band ends where this one started.
    largest_y = cut

print(levels)
# Root of the ontology: every id listed by a split of the first level becomes
# a direct child of the root, each starting with an empty subtree.
children = {
    child: {}
    for split in levels[0].values()
    for child in split[0]
}
ontology = {"name": "dataset", "children": children}
print(ontology)

def update_ontology(level, parent):
    """Expand the nested-dict ontology in place, one level per recursion.

    For every key of ``parent``:
      * if the key is a split of ``levels[level]``, the ids listed by that
        split are added as (empty) entries below it;
      * if the key still has no entries afterwards, it is given itself as its
        only entry;
      * while ``level + 1 < n_cuts``, the function recurses into the key's
        subtree with the next level.

    Args:
        level: Index into the module-level ``levels`` list.
        parent: Dict mapping ids to their (dict) subtrees; the subtrees are
            modified in place.

    Returns:
        None.
    """
    for node_id, subtree in parent.items():
        level_splits = levels[level]
        if node_id in level_splits:
            members = level_splits[node_id][0]
            for member in members:
                subtree[member] = {}
        if len(subtree) == 0:
            subtree[node_id] = {}
        if level + 1 < n_cuts:
            update_ontology(level + 1, subtree)

# The root's children come from levels[0]; expansion therefore starts with
# level 1.
update_ontology(level=1, parent=ontology["children"])
print(ontology)

def restructure_ontology(parent):
    """Convert a nested-dict subtree into a list of node dictionaries.

    Every key of ``parent`` becomes ``{"name": str(key), "children": [...]}``
    where ``children`` is the recursively converted subtree. A node whose
    converted subtree is empty additionally gets a ``"leaf"`` entry holding
    ``class_names[int(key)]``.

    Args:
        parent: Dict mapping ids to their (dict) subtrees.

    Returns:
        List of node dictionaries, in the iteration order of ``parent``.
    """
    nodes = []
    for node_id in parent:
        descendants = restructure_ontology(parent[node_id])
        node = {"name": str(node_id), "children": descendants}
        if len(descendants) == 0:
            node["leaf"] = class_names[int(node_id)]
        nodes.append(node)
    return nodes

# Swap the nested-dict tree for its list-of-nodes form, then write the whole
# ontology to `filename` as indented JSON.
ontology.update(children=restructure_ontology(ontology["children"]))

with open(filename, mode='w') as ontologyFile:
    json.dump(ontology, fp=ontologyFile, indent=4)