In [None]:
from clustering import LGKUtils
from robustness import verify_regions, save_verified_regions

class MockLGKCluster:
    """Minimal stand-in that serves a fixed list of regions and categories.

    It exposes only ``get_regions`` and ``get_categories``, so a subset of
    regions can be handed to code that expects a clustering object with
    those two accessors.
    """

    def __init__(self, regions, categories):
        """Store the regions and categories to serve, without copying them."""
        self.__regions = regions
        self.__categories = categories

    def get_regions(self, sort=True):
        """Return the wrapped regions as a new list.

        Args:
            sort: when True (default) order by ``(n, density)`` descending;
                when False keep the order the regions were supplied in.
        """
        if not sort:
            # Honour the flag; previously the list was sorted regardless.
            return list(self.__regions)
        return sorted(self.__regions, key=lambda r: (r.n, r.density), reverse=True)

    def get_categories(self):
        """Return the categories object passed to the constructor."""
        return self.__categories

def verify_subset_regions(startidx, endidx, save=True):
    """Verify the regions at positions ``[startidx, endidx)`` of the sorted region list.

    Loads the pickled clustering from '../artifacts/test/lgkm.pkl', slices its
    sorted regions, wraps the slice together with the clustering's categories
    in a MockLGKCluster and passes that to ``verify_regions``.

    Args:
        startidx: first position (inclusive) in the sorted region list.
        endidx: end position (exclusive).
        save: when truthy, write the result with ``save_verified_regions`` to a
            directory named after the inclusive index range.

    Returns:
        Whatever ``verify_regions`` returned for the subset.
    """
    source = LGKUtils.load('../artifacts/test/lgkm.pkl')
    subset = source.get_regions(sort=True)[startidx:endidx]
    subset_cluster = MockLGKCluster(subset, source.get_categories())
    verified = verify_regions(
        '../network/models/latest/model.nnet',
        subset_cluster,
        nmin=2,
        eprec=0.0001,
        rpad=10,
        verbose=1,
    )
    if save:
        save_verified_regions(verified, outdir=f'../artifacts/test/vr_{startidx}_{endidx-1}')
    return verified

# Verify regions 500-999 in batches of 100 and collect every verified region.
results = []
for batch in range(5, 10):
    start = batch * 100
    end = start + 100
    print(f'verifying regions {start}-{end-1}')
    results += verify_subset_regions(start, end)
results

In [None]:
from clustering import LGKUtils

# Output count handed to load_dataset; later plotting cells reuse it to size palettes.
n_outputs = 5
# Location of the verification dataset.
verification_csv = '../data/latest/verification.csv'
# Keep the loaded arrays under *_orig names; later cells work on copies of them.
X_orig, y_orig = LGKUtils.load_dataset(verification_csv, n_outputs)

In [None]:
# Show the shapes of the loaded inputs and targets.
X_orig.shape, y_orig.shape

In [None]:
import numpy as np
from scipy import stats

def remove_outliers(X, y, tolerance):
    """Drop the rows of ``X`` (and the matching entries of ``y``) that hold an outlier.

    A row is kept when the absolute column-wise z-score of every feature is
    below ``tolerance``. Features that are constant across all rows have an
    undefined (NaN) z-score; they cannot mark a row as an outlier, so they are
    ignored instead of causing every row to be discarded.

    Args:
        X: 2-D numpy array, one row per sample.
        y: array indexable with the same row indices as ``X``.
        tolerance: exclusive upper bound on the absolute z-score.

    Returns:
        Tuple ``(X_kept, y_kept)`` containing only the retained rows.
    """
    # Constant columns produce 0/0 inside zscore; silence the resulting warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        z = np.abs(stats.zscore(X))
    # max - min == 0 identifies constant columns exactly; columns containing
    # NaN data give a NaN range and therefore stay subject to the z-score test.
    constant = np.ptp(X, axis=0) == 0
    keep = ((z < tolerance) | constant).all(axis=1)
    idxs = np.where(keep)[0]
    return X[idxs], y[idxs]

In [None]:
from clustering import LGKClustering, LGKUtils

# Work on copies so X_orig / y_orig stay intact and this cell can be re-run.
X, y = X_orig.copy(), y_orig.copy()
# Drop every row that has a feature whose |z-score| is not below 10.
X, y = remove_outliers(X, y, 10)
# Fit the clustering on the filtered data.
# NOTE(review): the trailing comment presumably lists the accepted init_centroid values — confirm.
lgkmc = LGKClustering().fit(X, y, init_centroid='rand') # rand, first, none

In [None]:
# Intentionally empty: the stray token that used to be here raised NameError and halted Run All.

In [None]:
# Shapes of the working arrays after outlier removal.
X.shape, y.shape

In [None]:
# Persist the fitted clustering under ../artifacts/test.
LGKUtils.save(lgkmc, outdir='../artifacts/test')

In [None]:
# Print the regions of the fitted clustering using the project helper.
LGKUtils.print_regions(lgkmc)

In [None]:
# Summarise how many regions fall into each size bucket.
# Fetch the regions once instead of once per printed line.
region_sizes = [r.n for r in lgkmc.get_regions()]
size_checks = [('n == 1', lambda n: n == 1), ('n > 1', lambda n: n > 1)]
# Bind each threshold as a default argument so every lambda keeps its own value.
size_checks += [(f'n >= {t}', lambda n, t=t: n >= t) for t in (10, 100, 1000, 10000)]
for label, matches in size_checks:
    print('%s: %d' % (label, sum(1 for n in region_sizes if matches(n))))

In [None]:
# Take the first region of the sorted list and show its size, radius and density.
r0 = lgkmc.get_regions(sort=True)[0]
r0.n, r0.radius, r0.density

In [None]:
# Shape of that region's X array.
r0.X.shape

In [None]:
# from clustering import LGKUtils
# from robustness import verify_region
# from tot_net import TOTNet

# net = TOTNet('../network/models/latest/model.nnet')
# lgkc = LGKUtils.load('../artifacts/test/lgkm.pkl')
# r, ncategories, eprec = lgkc.get_regions(sort=True)[0], len(lgkc.get_categories()), 0.0001
# vr = verify_region(net, r, ncategories, eprec, rpad=1, verbose=1)
# vr

In [None]:
from clustering import LGKUtils, LGKClustering
from robustness import verify_regions, save_verified_regions

# Verify the whole pickled clustering (not a subset) against the saved network file.
nnet_path = '../network/models/latest/model.nnet'
lgkc = LGKUtils.load('../artifacts/test/lgkm.pkl')
# NOTE(review): nmin presumably is the minimum region size to verify (10 here, 2 in verify_subset_regions) — confirm.
vregions = verify_regions(nnet_path, lgkc, nmin=10, eprec=0.0001, rpad=10, verbose=1)
# The bare expression displays the full result; it may be large.
vregions

In [None]:
# Write the verified regions from the previous cell to ../artifacts/vregions.
save_verified_regions(vregions, outdir='../artifacts/vregions')

In [None]:
import os
import seaborn as sns
import pandas as pd
from matplotlib import pyplot as plt
from utils import TOTUtils, tohex

# No cell above this one imports LGKMeansUtils, so import it here; otherwise a
# fresh-kernel Run All stops on this cell with a NameError.
from clustering import LGKMeansUtils

# Load the pickled clustering and collect every member point of every region,
# followed by one extra row per region for its centroid.
lgkmc = LGKMeansUtils.load('../artifacts/lgkm.pkl')
regions = lgkmc.get_regions(sort=False)
viz_X = [x for r in regions for x in r.X] + [r.centroid for r in regions]
# Centroid rows are labelled n_outputs + category so they form their own hue groups.
viz_y = [y for r in regions for y in r.y] + [n_outputs+r.category for r in regions]

features, categories = TOTUtils.get_feature_names(), TOTUtils.get_category_names()
viz_df = pd.DataFrame(viz_X, columns=features)
viz_df['y'] = viz_y

# One colour per category; hue value n_outputs + k (a centroid) reuses the colour of category k.
colors = [tohex(r,g,b) for r,g,b in sns.color_palette('rainbow_r', n_outputs)]
palette = {i:colors[i % n_outputs] for i in range(n_outputs*2)}
# Circles for the first n_outputs hue values, diamonds for the centroid hue values.
markers = ['o' if i < n_outputs else 'D' for i in range(n_outputs*2)]

In [None]:
# Lower-triangle pair plot of every feature pair, coloured by the 'y' column.
g = sns.pairplot(viz_df, hue='y', corner=True, palette=palette, markers=markers, plot_kws=dict(alpha=0.5, s=10))
g = g.add_legend()
# Save to disk at 300 dpi rather than showing inline.
g.savefig('../artifacts/lgkmtest.png', dpi=300)
# plt.show()

In [None]:
# KDE-based pair grid over the feature columns, coloured by the 'y' column.
g = sns.PairGrid(viz_df, hue='y', vars=features, palette=palette, hue_kws={'marker': markers})
g = g.map_diag(sns.kdeplot)
# g = g.map_lower(sns.scatterplot, edgecolor='w', s=20, alpha=0.5)
# NOTE(review): the upper triangle is mapped twice with kdeplot; the second call omits
# shade_lowest=False and draws over the first. One of the two is probably a leftover
# (possibly meant to be map_lower) — confirm which rendering is wanted.
g = g.map_upper(sns.kdeplot, shade=True, shade_lowest=False)
g = g.map_upper(sns.kdeplot, shade=True)
# g = g.add_legend({i:l for i,l in enumerate(categories)})
g.savefig('../artifacts/lgkm_kde_test.png', dpi=300)
# plt.show()

In [None]:
# 2D PCA

# import seaborn as sns
# import pandas as pd
# import numpy as np
# from sklearn.decomposition import PCA #Principal Component Analysis
# import plotly as py
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from utils import TOTUtils, tohex
# from clustering import LGKMeansUtils

# n_outputs = 5
# features = TOTUtils.get_feature_names()
# colors = [tohex(r,g,b) for r,g,b in sns.color_palette('rainbow_r', n_outputs)]
# lgkmc = LGKMeansUtils.load('../artifacts/lgkm.pkl')
# regions = lgkmc.get_regions(sort=False)[0:1000]

# X, y, rX = zip(*[(r.X[i], r.y[i], ri) for ri,r in enumerate(regions) for i in range(r.n)])
# viz_df = pd.DataFrame(X, columns=features)
# viz_df['y'] = y
# viz_df['region'] = rX
# plot_X = viz_df.drop(['y', 'region'], axis=1)
# pcs_2d = pd.DataFrame(PCA(n_components=2).fit_transform(plot_X), columns=['pc1', 'pc2'])
# plot_X = pd.concat([plot_X, pcs_2d], axis=1, join='inner')
# data = []
# for i,r in enumerate(regions):
#     r_X = plot_X[viz_df['region'] == i]
#     trace = go.Scatter(x=r_X['pc1'], y=r_X['pc2'], mode='markers', name=f'region_{i}', marker=dict(color=colors[r.category]), text=None)
#     data.append(trace)
# layout = dict(title='Regions in 2D using PCA',
#               xaxis=dict(title='PC1', ticklen=5, zeroline=False),
#               yaxis=dict(title='PC2', ticklen=5, zeroline=False))
# fig = dict(data=data, layout=layout)
# iplot(fig)

In [None]:
# # 3D PCA
# import seaborn as sns
# import pandas as pd
# import numpy as np
# from sklearn.decomposition import PCA #Principal Component Analysis
# import plotly as py
# import plotly.graph_objs as go
# from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# from utils import TOTUtils, tohex
# from clustering import LGKMeansUtils

# n_outputs = 5
# features = TOTUtils.get_feature_names()
# lgkmc = LGKMeansUtils.load('../artifacts/lgkm.pkl')
# regions = lgkmc.get_regions(sort=False)[0:100]
# colors = [tohex(r,g,b) for r,g,b in sns.color_palette('rainbow_r', n_outputs)]

# X, y, rX = zip(*[(r.X[i], r.y[i], ri) for ri,r in enumerate(regions) for i in range(r.n)])
# viz_df = pd.DataFrame(X, columns=features)
# viz_df['y'] = y
# viz_df['region'] = rX
# plot_X = viz_df.drop(['y', 'region'], axis=1)
# pcs_3d = pd.DataFrame(PCA(n_components=3).fit_transform(plot_X), columns=['pc1', 'pc2', 'pc3'])
# plot_X = pd.concat([plot_X, pcs_3d], axis=1, join='inner')
# data = []
# for i,r in enumerate(regions):
#     r_X = plot_X[viz_df['region'] == i]
#     trace = go.Scatter3d(x=r_X['pc1'], y=r_X['pc2'], z=r_X['pc3'], mode='markers', name=f'region_{i}', marker=dict(color=colors[r.category]), text=None)
#     data.append(trace)
# layout = dict(title='Regions in 3D using PCA',
#               xaxis=dict(title='PC1', ticklen=5, zeroline=False),
#               yaxis=dict(title='PC2', ticklen=5, zeroline=False))
# fig = dict(data=data, layout=layout)
# iplot(fig)

In [None]:
# This cell has no imports of its own: np, TOTUtils and LGKMeansUtils must already be in the kernel.
features = TOTUtils.get_feature_names()
lgkmc = LGKMeansUtils.load('../artifacts/lgkm.pkl')
regions = lgkmc.get_regions(sort=True)
# Distinct categories among the first 100 regions of the sorted list.
np.unique([r.category for r in regions[:100]])

In [None]:
# T-SNE
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
from utils import TOTUtils, tohex
from clustering import LGKMeansUtils
from IPython.display import display

# Load the clustering and keep the first 20 regions of the sorted list.
features = TOTUtils.get_feature_names()
lgkmc = LGKMeansUtils.load('../artifacts/lgkm.pkl')
regions = lgkmc.get_regions(sort=True)
regions = regions[:20]

# Flatten the regions into per-point rows (features, label, region index) and
# per-region centroid rows (centroid, category, region index).
X, y, rX = zip(*[(r.X[i], r.y[i], ri) for ri,r in enumerate(regions) for i in range(r.n)])
cX, cy, crX = zip(*[(r.centroid, r.category, ri) for ri,r in enumerate(regions)])
# colors is indexed by r.category further down, so the palette must reach the largest
# label present. Sizing it by the number of *distinct* labels breaks as soon as a
# lower-numbered category is missing from the selected regions.
n_outputs = int(max(np.max(y), np.max(cy))) + 1
colors = [tohex(r,g,b) for r,g,b in sns.color_palette('rainbow_r', n_outputs)]

# Member points: centroid flag 0.
df = pd.DataFrame(X, columns=features)
df['y'] = y
df['region'] = rX
df['centroid'] = 0
# df['radius'] = radii
# Centroids: centroid flag 1.
c_df = pd.DataFrame(cX, columns=features)
c_df['y'] = cy
c_df['region'] = crX
c_df['centroid'] = 1
# c_df['radius'] = cradii
viz_df = pd.concat([df, c_df], ignore_index=True)
# Split into the numeric matrix that gets embedded and the metadata used to select rows.
plot_X = viz_df.drop(['y', 'region', 'centroid'], axis=1)
plot_Meta = viz_df[['region', 'y', 'centroid']]

# Embedding / plotting settings.
perplexity = 100
dimensions = 3
outline_regions = True

display(f'Performing TSNE on {len(regions)} regions...')
tcs = [f'tc{i+1}' for i in range(dimensions)]
tsne = pd.DataFrame(TSNE(n_components=dimensions, perplexity=perplexity).fit_transform(plot_X), columns=tcs)
if dimensions == 1:
    # Pad a 1-D embedding with a constant second coordinate and register the column,
    # so the 2-D plotting branch below can look it up through tcs[1].
    tsne['tc2'] = 0
    tcs.append('tc2')
plot_X = pd.concat([plot_X, tsne], axis=1, join='inner')
traces, shapes = [], []

display(f'Plotting {len(regions)} regions as {dimensions}D TSNE...')
for i,r in enumerate(regions):
    # Rows of this region: member points (centroid == 0) and its centroid row (centroid == 1).
    in_region = plot_Meta['region'] == i
    r_X = plot_X[in_region & (plot_Meta['centroid'] == 0)]
    c_X = plot_X[in_region & (plot_Meta['centroid'] == 1)]
    rname, cname = f'r_{i}', f'c_{i}'
    color = colors[r.category]
    marker = dict(color=color, size=2)
    # Centroid marker: same as the point marker plus a dark outline.
    cmarker = dict(marker, line=dict(color='#444444', width=1))
    circle = dict(type='circle', xref='x', yref='y', fillcolor=color, line_color=color, opacity=0.11)
    # Coordinates use their own names (px, py, ...) so the cell-level y / cy label
    # tuples are not overwritten by the loop.
    if dimensions < 3:
        px, py = r_X[tcs[0]], r_X[tcs[1]]
        cpx, cpy = c_X[tcs[0]], c_X[tcs[1]]
        traces.append(go.Scatter(x=px, y=py, mode='markers', name=rname, marker=marker))
        traces.append(go.Scatter(x=cpx, y=cpy, mode='markers', name=cname, marker=cmarker, showlegend=False))
        if outline_regions:
            # Ellipse inscribed in the bounding box of the region's embedded points.
            shapes.append(dict(circle, x0=px.min(), y0=py.min(), x1=px.max(), y1=py.max()))
    else:
        px, py, pz = r_X[tcs[0]], r_X[tcs[1]], r_X[tcs[2]]
        cpx, cpy, cpz = c_X[tcs[0]], c_X[tcs[1]], c_X[tcs[2]]
        traces.append(go.Scatter3d(x=px, y=py, z=pz, mode='markers', name=rname, marker=marker))
        # Name the centroid trace the same way the 2-D branch does.
        traces.append(go.Scatter3d(x=cpx, y=cpy, z=cpz, mode='markers', name=cname, marker=cmarker, showlegend=False))
        if outline_regions:
            traces.append(go.Mesh3d(alphahull=5, opacity=.1, x=px, y=py, z=pz, color=color, showscale=False, showlegend=False))

# os is used below but is not imported by this cell's own import block.
import os

title = f'Regions in {dimensions}D using T-SNE (p={perplexity})'
axis, xtitle, ytitle = dict(title='', ticklen=5, zeroline=False), 'TC1', ('TC2' if dimensions > 1 else '')
layout = dict(title=title, xaxis=dict(axis, title=xtitle), yaxis=dict(axis, title=ytitle), showlegend=True)

# Assemble the figure from the collected traces and (2-D only) outline shapes.
fig = go.Figure(layout=layout)
for t in traces:
    fig.add_trace(t)
if len(shapes) > 0:
    fig.update_layout(shapes=shapes)

# NOTE(review): this overrides the showlegend=True set in the layout above — confirm which is intended.
fig.update_layout(showlegend=False)
# fig.show()

# Write the interactive plot as standalone HTML.
outdir = '../artifacts/plots'
outpath = os.path.join(outdir, 'post-tsne-test.html')
# exist_ok avoids the check-then-create race and is a no-op when the directory exists.
os.makedirs(outdir, exist_ok=True)
fig.write_html(outpath)
display(f'wrote plot to {outpath}')

In [None]:
# np.unique([r.category for r in regions])
# Display the selected regions (the whole list is rendered).
regions

In [None]:
# shapes
# Row count per region index in viz_df.
viz_df['region'].value_counts()

In [None]:
# LGKMeansUtils.plot_regions(lgkmc, save=True, outdir='../artifacts')

In [None]:
# from numpy import unique
# from numpy import where
# from sklearn.datasets import make_classification
# from sklearn.cluster import OPTICS
# from matplotlib import pyplot
# # define the model
# model = OPTICS(eps=0.8, min_samples=10)
# # fit model and predict clusters
# yhat = model.fit_predict(X)
# # retrieve unique clusters
# clusters = unique(yhat)
# # create scatter plot for samples from each cluster
# for cluster in clusters:
# 	# get row indexes for samples with this cluster
# 	row_ix = where(yhat == cluster)
# 	# create scatter of these samples
# 	pyplot.scatter(X[row_ix, 0], X[row_ix, 1])
# # show the plot
# pyplot.show()

In [None]:
from clustering import LGKMeansUtils
# Reload the verification dataset, rebinding X_orig / y_orig; the path and output
# count are hard-coded here rather than taken from verification_csv / n_outputs.
X_orig, y_orig = LGKMeansUtils.load_dataset('../data/latest/verification.csv', 5)

In [None]:
import random
import numpy as np
from sklearn.cluster import DBSCAN

def sample(X, y, frac=1):
    """Return a random subset of matching rows from ``X`` and ``y``.

    ``int(X.shape[0] * frac)`` distinct row indices are drawn without
    replacement using the ``random`` module, and both arrays are indexed with
    the same indices so rows stay aligned.
    """
    n_rows = X.shape[0]
    chosen = random.sample(range(n_rows), int(n_rows * frac))
    return X[chosen], y[chosen]

# Cluster a 2% random sample of the dataset with DBSCAN.
X, y = X_orig.copy(), y_orig.copy()
X, y = sample(X, y, frac=0.02)

dbscan = DBSCAN(eps=2.35, min_samples=2).fit(X)
# DBSCAN marks noise points with the label -1; that is not a cluster, so it is
# left out of the reported cluster count.
n_found = np.unique(dbscan.labels_[dbscan.labels_ != -1]).shape[0]
print(f'n_inputs:{X.shape[0]}, n_clusters:{n_found}')

In [None]:
import random
import numpy as np
from sklearn.cluster import MeanShift, AffinityPropagation, SpectralClustering, estimate_bandwidth
from sklearn.decomposition import PCA
from clustering import LGKMeansUtils

def sample(X, y, frac=1):
    """Pick ``int(len(X) * frac)`` random rows, without replacement, from ``X`` and ``y``.

    The same indices are applied to both arrays so rows stay aligned.
    This repeats the ``sample`` helper defined in the DBSCAN cell earlier in
    the notebook.
    """
    total = X.shape[0]
    picked = random.sample(range(total), int(total * frac))
    return X[picked], y[picked]

# Start again from copies of the full dataset and keep a 0.1% random sample.
X, y = X_orig.copy(), y_orig.copy()
X, y = sample(X, y, frac=0.001)
# X = PCA(n_components=2).fit_transform(X)

# bandwidth = estimate_bandwidth(X)
# ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
# ms.fit(X)
# labels = ms.labels_
# cluster_centers = ms.cluster_centers_

# af = AffinityPropagation(preference=-50).fit(X)
# cluster_centers_indices = af.cluster_centers_indices_
# labels = af.labels_

# Cluster the sample and report how many clusters are pure (single target).
sc = SpectralClustering(n_clusters=110)
sc.fit(X)
labels = sc.labels_

labels_unique, label_counts = np.unique(labels, return_counts=True)
n_clusters = labels_unique.shape[0]

# A cluster is "correct" when all of its members share one target.
# - Iterate over the labels that actually occur rather than range(n_clusters), so
#   gaps in the label ids cannot produce phantom empty clusters.
# - Compare targets row-wise (axis=0): without it a 2-D target array is flattened
#   and a one-hot row always yields two distinct values. For 1-D targets the
#   result is the same as before.
#   NOTE(review): other cells index y.shape[1], which suggests y is 2-D — confirm.
correct = [np.unique(y[labels == c], axis=0).shape[0] == 1 for c in labels_unique]
print(f'number of clusters: {n_clusters} ({X.shape[0]} inputs)')
print(f'num correct: {len([i for i in correct if i])}')
print(f'num incorrect: {len([i for i in correct if not i])}')
print(f'cluster sizes: {label_counts}')

In [None]:

import random
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth
from clustering import LGKMeansUtils

# Recursively split a 10000-row random sample until every cluster holds a single
# target, collecting each pure cluster as a region.
# X, y = X_orig.copy(), LGKMeansUtils.reduce_classes(y_orig.copy())
X, y = X_orig.copy(), y_orig.copy()
# NOTE(review): random.sample raises ValueError when the dataset has fewer than 10000 rows.
randidxs = random.sample(range(X.shape[0]), 10000)
X, y = X[randidxs], y[randidxs]
# Categories are counted as distinct target rows.
n_inputs, n_categories = X.shape[0], np.unique(y, axis=0).shape[0]
# X, y = PCA(n_components=2).fit_transform(X, y), np.array([tocat(yi, n_categories) for yi in y])
# y.shape[1] is read here, so y must be 2-D at this point.
n_features, n_targets = X.shape[1], y.shape[1]

# Work queue of (X, y) subsets still containing more than one target; processed first-in, first-out.
regions, remaining = [], [(X, y)]
while remaining:
    X, y = remaining.pop(0)
    # Ask for as many clusters as there are distinct target rows in this subset.
    n = np.unique(y, axis=0).shape[0]
    # NOTE(review): scikit-learn's MeanShift does not appear to accept n_clusters, its
    # fit_predict has no `categorical` argument, and cluster_centroids_ (used below) is
    # not one of its attributes. These calls look written for a different estimator
    # (one that clusters mixed numeric/categorical data) — confirm the intended class
    # before running this cell.
    model = MeanShift(n_clusters=n)
    # Features and targets are clustered jointly; the target columns are flagged as categorical.
    clusters = model.fit_predict(np.concatenate([X, y], axis=1), categorical=list(range(n_features, n_features+n_targets)))
    for c in np.unique(clusters):
        # TODO: update region to support centroid's y value
        centroid = model.cluster_centroids_[0][c]
        xis = np.where(clusters == c)[0]
        Xc, yc = X[xis], y[xis]
        if np.unique(yc, axis=0).shape[0] == 1:
            # TODO: update region class to support onehot
            # Pure cluster: store each target row as the index of its first entry equal to 1.
            # NOTE(review): LGKMeansRegion is not imported by any cell shown here.
            regions.append(LGKMeansRegion(centroid, Xc, np.array([np.where(yi==1)[0][0] for yi in yc])))
        else:
            # Mixed cluster: queue it for another split.
            # NOTE(review): if a mixed subset is returned as a single cluster it is
            # re-queued unchanged, so the loop may not terminate — confirm.
            remaining.append((Xc, yc))

# pickle is not imported by any earlier cell and os only by a plotting cell, so
# import both here to keep this cell runnable.
import os
import pickle

print(f'identified {len(regions)} regions from {n_inputs} inputs of {n_categories} categories')
savepath = os.path.join('../artifacts', 'lgkp-regions.pkl')
# The context manager flushes and closes the file even if pickling fails.
with open(savepath, 'wb') as fh:
    pickle.dump(regions, fh)
print(f'saved regions to {savepath}')

In [None]:
# Purity report for the DBSCAN fit: how many clusters contain a single target.
db_labels = dbscan.labels_
targets = np.asarray(y)
# y is whatever the most recently run cell left behind; refuse to report numbers
# when it is not the target array the DBSCAN model was fitted alongside.
assert targets.shape[0] == db_labels.shape[0], (
    'y does not match dbscan.labels_; re-run the DBSCAN cell before this one')

cluster_ids, cluster_counts = np.unique(db_labels, return_counts=True)
# DBSCAN labels noise as -1. Iterate over the real cluster ids instead of
# range(n_clusters), which skipped nothing but noise and added an empty phantom
# cluster whenever noise was present.
real_ids = cluster_ids[cluster_ids != -1]
n_clusters = real_ids.shape[0]
clusters = [np.where(db_labels == c)[0] for c in real_ids]
# Compare targets row-wise so 2-D (e.g. one-hot) targets are not flattened.
correct = [np.unique(targets[c], axis=0).shape[0] == 1 for c in clusters]
print(f'num correct: {len([i for i in correct if i])}')
print(f'num incorrect: {len([i for i in correct if not i])}')
# cluster_counts still includes the noise bucket (first entry) when noise exists.
print(f'cluster counts: {cluster_counts}')