In [1]:
%load_ext autoreload
%autoreload 2

import sklearn
import os, sys
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from scipy import interpolate
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm

sys.path.insert(1, os.path.join(sys.path[0], ".."))
sys.path.append('../../')
from reproduction.pipeline.load import load_data
from reproduction import analysis

# Disable Warnings
import warnings; warnings.simplefilter('ignore')
tf.logging.set_verbosity(tf.logging.WARN)

In [2]:
def gridsearch(start, step, stop, max_samples=5000, sample_steps=4, trials=30):
    """Grid-search sample size and cluster count for clustering stability.

    For every sample size ``i`` (log-spaced between ``start + 2`` and
    ``max_samples``) and every cluster count ``j`` in
    ``range(start, stop, step)``, agglomerative clustering is fitted twice per
    trial: once on the first half of the encodings and once on all of them.
    The adjusted mutual information between the two labelings of that first
    half is the stability score of the trial.

    Reads the module-level ``ENCODER_DEF``, ``ENCODER_WEIGHTS`` and ``DATA``
    settings, which must be defined before the function is called.

    Args:
        start: Smallest cluster count; ``start + 2`` is the smallest sample size.
        step: Increment between consecutive cluster counts.
        stop: Exclusive upper bound on the cluster count.
        max_samples: Largest sample size.
        sample_steps: Number of log-spaced sample sizes to try.
        trials: Number of repetitions per (samples, clusters) combination.

    Returns:
        A list of ``(samples, clusters, mean_score, std_score)`` tuples; the
        mean and standard deviation ignore NaN scores.
    """
    with open(ENCODER_DEF, "r") as f:
        encoder = tf.keras.models.model_from_json(f.read())
    encoder.load_weights(ENCODER_WEIGHTS)
    # Loop-invariant: the per-example input shape expected by the encoder.
    input_shape = encoder.input_shape[1:]

    samples = np.logspace(np.log10(start + 2), np.log10(max_samples), num=sample_steps).astype(int)
    search_results = []
    for i in samples:
        half = int(i / 2)
        for j in range(start, stop, step):
            # Skip combinations where the half-sample would hold fewer points
            # than the requested number of clusters (requires i / 2 > j).
            if not i / j > 2:
                continue
            print('Samples: ', i,' Clusters: ',j)
            minfoac = []
            for _ in range(trials):
                # Data is reloaded on every trial — presumably AEData draws a
                # different sample each time; verify against analysis.AEData.
                data = analysis.AEData(load_data(DATA, input_shape), n=i)
                data.add_encoder(encoder)
                ag1 = AgglomerativeClustering(n_clusters=j).fit_predict(data.encs[:half])
                ag2 = AgglomerativeClustering(n_clusters=j).fit_predict(data.encs)
                minfoac.append(sklearn.metrics.adjusted_mutual_info_score(ag1, ag2[:half]))
            minfo_mean = np.nanmean(minfoac)
            minfo_std = np.nanstd(minfoac)
            # Record `j` directly: it does not depend on the trial loop having run.
            search_results.append((i, j, minfo_mean, minfo_std))
            print('Average Mutual information: ', minfo_mean, 'MI_STD: ', minfo_std, 'Precision: ', np.count_nonzero(~np.isnan(minfoac)))
    return search_results
    

In [3]:
# Glob pattern for the input TFRecord files.
DATA = "/project/foster/clouds/data/2015_05/*.tfrecord"

# The encoder architecture (JSON) and its weights (HDF5) live side by side
# in one output directory.
_ENCODER_DIR = "/home/rlourenco/rdcep_clouds/output/m9-22_oceans"
ENCODER_DEF = _ENCODER_DIR + "/encoder.json"
ENCODER_WEIGHTS = _ENCODER_DIR + "/encoder.h5"
# N_CLUSTERS is intentionally not set here; it is chosen after the grid search.

In [None]:
# Cluster counts 2..39 in steps of 1, over 10 log-spaced sample sizes up to 10000.
result = gridsearch(start=2, step=1, stop=40, max_samples=10000, sample_steps=10)

Samples:  9  Clusters:  2
Instructions for updating:
Use `tf.data.Dataset.batch(..., drop_remainder=True)`.
Average Mutual information:  0.5444444444444445 MI_STD:  0.5269291422835278 Precision:  30
Samples:  9  Clusters:  3
Average Mutual information:  0.5354398544495964 MI_STD:  0.3789628814819357 Precision:  30
Samples:  9  Clusters:  4
Average Mutual information:  -4.7090467064810746e-18 MI_STD:  8.994920560887689e-17 Precision:  27
Samples:  22  Clusters:  2
Average Mutual information:  0.6087440135861429 MI_STD:  0.3990821932635022 Precision:  30
Samples:  22  Clusters:  3


In [None]:
# Display the raw grid-search output: one (samples, clusters, mean, std) tuple
# per evaluated combination.
result
--

In [None]:
# gridsearch() yields 4-tuples (samples, clusters, mean score, score std), so
# four column names are required; assigning only three raises a ValueError.
df = pd.DataFrame(result, columns=["Samples", "Clusters", "MInfo", "MInfoStd"])

# Mutual Information Plots

### 28 Samples, 13 clusters maximum

In [None]:
%matplotlib inline

plt.xlim(0,40)
plt.ylim(0,1)
y_28 =  df.loc[df['Samples'] == 28]
x_28 = y_28['Clusters']
plt.plot(x_28, y_28["MInfo"])
plt.show()


### 200 Samples, 40 clusters maximum

In [None]:
%matplotlib inline

plt.xlim(0,40)
plt.ylim(0,1)
y_200 =  df.loc[df['Samples'] == 200]
x_200 = y_200['Clusters']
plt.plot(x_200, y_200["MInfo"])
plt.show()

### 1414 Samples, 40 clusters maximum

In [None]:
%matplotlib inline

plt.xlim(0,40)
plt.ylim(0,1)
y_200 =  df.loc[df['Samples'] == 1414]
x_200 = y_200['Clusters']
plt.plot(x_200, y_200["MInfo"])
plt.show()

### 10000 Samples, 13 clusters maximum

In [None]:
%matplotlib inline

plt.xlim(0,40)
plt.ylim(0,1)
y_200 =  df.loc[df['Samples'] == 10000]
x_200 = y_200['Clusters']
plt.plot(x_200, y_200["MInfo"])
plt.show()

#### Best result seems to be when using 200 samples, with 9 clusters, with a MI score of 0.84.

In [None]:
---

In [None]:
# Cluster count fixed by hand from the grid-search plots (the note above
# reports 9 clusters at 200 samples as the best mutual-information score)
# rather than read programmatically from `result`.
N_CLUSTERS = 9

In [None]:
# Rebuild the encoder from its JSON architecture, then restore the trained weights.
with open(ENCODER_DEF, "r") as f:
    encoder_json = f.read()
encoder = tf.keras.models.model_from_json(encoder_json)
encoder.load_weights(ENCODER_WEIGHTS)

In [None]:
# Load the records with the encoder's per-example input shape, wrap 200 of
# them in an AEData container, and attach the encoder to it.
dataset = load_data(DATA, encoder.input_shape[1:])
data = analysis.AEData(dataset, n=200)
data.add_encoder(encoder)

In [None]:
# Cluster the first 100 encodings. fit_predict() fits the model and returns its
# labels, so the model (m1) and the labels (ag1) come from a single fit instead
# of fitting two identical models.
m1 = AgglomerativeClustering(n_clusters=N_CLUSTERS)
ag1 = m1.fit_predict(data.encs[:100])

# Same again on the full set of encodings.
m2 = AgglomerativeClustering(n_clusters=N_CLUSTERS)
ag2 = m2.fit_predict(data.encs)

In [None]:
# One histogram cell per cluster label.
nbins=N_CLUSTERS
# Explicit edges centred on the integer labels 0..nbins-1, so each label keeps
# its own cell even when a label is missing from one of the two labelings.
bin_edges = np.arange(nbins + 1) - 0.5

plt.figure(figsize=(8,8))
plt.hist2d(ag1, ag2[:100], bins=bin_edges);
plt.title("Cluster overlap")
plt.xticks(range(0, nbins, 5)); plt.yticks(range(0, nbins, 5));
# x holds the labels from the fit on the first 100 samples, y the labels of
# those same samples from the fit on the full set.
plt.xlabel("ag1 (fit on first 100 samples)"); plt.ylabel("ag2 (fit on all samples)");
plt.colorbar()
print('Mutual Information score: ',sklearn.metrics.adjusted_mutual_info_score(ag1, ag2[:100]))

In [None]:
# Mask over the first 100 samples: True where both clusterings assigned the
# same label id.
# NOTE(review): label ids from two separate fits are not guaranteed to refer
# to the same cluster, so raw-id equality may misstate agreement — confirm intent.
same_label = ag1 == ag2[:100]
overlap = data.imgs[:100, :, :, 0][same_label]
overlap_c = ag1[same_label]



In [None]:
fig, ax = plt.subplots(5, 8, figsize=(16, 10))

# First-channel images of the first 100 samples that ag1 assigned to cluster 2.
clu = data.imgs[:100,:,:,0][ag1 == 2]

# The grid has 40 panels but the cluster may hold fewer images; draw what is
# available and blank the remaining panels instead of indexing past the end.
for i, a in enumerate(ax.ravel()):
    a.set_xticks([])
    a.set_yticks([])
    if i < len(clu):
        a.imshow(clu[i], cmap="bone")
    else:
        a.axis("off")



In [None]:

from scipy.cluster.hierarchy import dendrogram

In [None]:
def plot_dendrogram(model, **kwargs):
    """Plot a dendrogram for a fitted hierarchical clustering model.

    Builds a SciPy-style linkage matrix with columns
    ``[child_a, child_b, distance, n_observations]`` from ``model.children_``
    and passes it to ``scipy.cluster.hierarchy.dendrogram``.

    Args:
        model: Fitted estimator exposing ``children_``, one row per merge. If
            it also exposes ``distances_``, those merge distances are used;
            otherwise merges are spaced uniformly (0, 1, 2, ...).
        **kwargs: Forwarded unchanged to ``dendrogram``.
    """
    # Children of hierarchical clustering
    children = model.children_
    n_merges = children.shape[0]
    # A full binary merge tree over n leaves has n - 1 merges.
    n_samples = n_merges + 1

    # Use real merge distances when the model computed them; fall back to a
    # uniform spacing that only preserves the merge order.
    distance = getattr(model, "distances_", None)
    if distance is None:
        distance = np.arange(n_merges)

    # Number of original observations under each merge: a child index below
    # n_samples is a leaf (one observation), otherwise it refers to the
    # earlier merge `child - n_samples`, whose count is already known.
    no_of_observations = np.zeros(n_merges)
    for row, pair in enumerate(children):
        total = 0
        for child in pair:
            if child < n_samples:
                total += 1
            else:
                total += no_of_observations[child - n_samples]
        no_of_observations[row] = total

    # Create linkage matrix and then plot the dendrogram
    linkage_matrix = np.column_stack([children, distance, no_of_observations]).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)



That's hard to interpret...

In [None]:
# Draw the merge tree of the full-data model, labelling each leaf with the
# cluster id assigned to it.
# NOTE(review): a 500 x 100 inch canvas is extremely large to render — confirm
# this size is intended.
dendrogram_fig = plt.figure(figsize=(500, 100))
plot_dendrogram(model=m2, labels=m2.labels_)