In [1]:
import numpy as np
import pandas as pdcoding
import gzip

In [2]:
#SOMToolbox Parser
from SOMToolBox_Parse import SOMToolBox_Parse

# Forward slashes work on Windows, Linux and macOS alike; the previous
# backslash-separated literals only resolved on Windows.
# idata   : parsed input vectors (iris.vec)
# weights : parsed trained SOM codebook (iris.wgt.gz)
idata = SOMToolBox_Parse("datasets/iris/iris.vec").read_weight_file()
weights = SOMToolBox_Parse("datasets/iris/iris.wgt.gz").read_weight_file()

In [3]:
#HitHistogram
def HitHist(_m, _n, _weights, _idata):
    """Count how many input vectors are mapped onto each SOM unit.

    For every vector in ``_idata`` the row of ``_weights`` with the smallest
    Euclidean distance (the best-matching unit) is determined and its
    counter is incremented.

    Parameters
    ----------
    _m, _n : int
        Shape of the returned histogram; ``_m * _n`` counters are allocated.
    _weights : array-like, one row per unit
        Codebook vectors.
    _idata : iterable of vectors
        Input vectors, each broadcastable against ``_weights``.

    Returns
    -------
    numpy.ndarray of float, shape ``(_m, _n)``
        Hit counts per unit.
    """
    counts = np.zeros(_m * _n)

    # Index of the closest codebook row for every input sample.
    winners = [
        np.argmin(np.sqrt(np.sum((_weights - sample) ** 2, axis=1)))
        for sample in _idata
    ]

    for bmu in winners:
        counts[bmu] += 1

    return counts.reshape(_m, _n)

#U-Matrix - implementation
def UMatrix(_m, _n, _weights, _dim):
    """Compute the U-Matrix of a rectangular SOM.

    The ``_m x _n`` grid of codebook vectors is expanded to a
    ``(2*_m - 1) x (2*_n - 1)`` grid. Cells inserted between two units hold
    the Euclidean distance between those units, cells inserted between four
    units hold the mean of the two diagonal distances divided by sqrt(2),
    and the cells of the units themselves hold the median of their directly
    adjacent (inserted) cells.

    Parameters
    ----------
    _m, _n : int
        Number of rows / columns of the SOM.
    _weights : array-like with ``_m * _n * _dim`` elements
        Codebook vectors; reshaped to ``(_m, _n, _dim)``.
    _dim : int
        Dimensionality of a codebook vector.

    Returns
    -------
    numpy.ndarray of float, shape ``(2*_m - 1, 2*_n - 1)``
    """
    # Force a float array: with an integer codebook every distance and
    # median written back below would be silently truncated.
    U = np.asarray(_weights, dtype=float).reshape(_m, _n, _dim)
    # Interleave zero vectors between neighbouring columns and rows;
    # np.insert returns a new array, so the caller's data is not modified.
    U = np.insert(U, np.arange(1, _n), values=0, axis=1)
    U = np.insert(U, np.arange(1, _m), values=0, axis=0)
    #calculate interpolation
    # The distance is stored in component 0 of the (all-zero) inserted
    # vector, so the component sum further down yields exactly that value.
    for i in range(U.shape[0]):
        if i%2==0:
            # Row of real units: fill cells between horizontal neighbours.
            for j in range(1,U.shape[1],2):
                U[i,j][0] = np.linalg.norm(U[i,j-1] - U[i,j+1], axis=-1)
        else:
            # Inserted row.
            for j in range(U.shape[1]):
                if j%2==0:
                    # Cell between two vertical neighbours.
                    U[i,j][0] = np.linalg.norm(U[i-1,j] - U[i+1,j], axis=-1)
                else:
                    # Cell between four units: average of both diagonals,
                    # scaled by the diagonal length sqrt(2).
                    U[i,j][0] = (np.linalg.norm(U[i-1,j-1] - U[i+1,j+1], axis=-1) + np.linalg.norm(U[i+1,j-1] - U[i-1,j+1], axis=-1))/(2*np.sqrt(2))

    U = np.sum(U, axis=2) #move from Vector to Scalar

    for i in range(0, U.shape[0], 2): #count new values
        for j in range(0, U.shape[1], 2):
            region = []
            if j>0: region.append(U[i][j-1]) #neighbour at column j-1
            if i>0: region.append(U[i-1][j]) #neighbour at row i-1
            if j<U.shape[1]-1: region.append(U[i][j+1]) #neighbour at column j+1
            if i<U.shape[0]-1: region.append(U[i+1][j]) #neighbour at row i+1

            # A single-unit map has no neighbours; use 0.0 instead of the
            # NaN (plus warning) that the median of an empty list produces.
            U[i,j] = np.median(region) if region else 0.0

    return U

#SDH - implementation
def SDH(_m, _n, _weights, _idata, factor, approach):
    """Smoothed Data Histogram of a SOM.

    Every input vector votes for its ``factor`` closest units instead of
    only for the single best-matching one.

    Parameters
    ----------
    _m, _n : int
        Shape of the returned histogram; ``_m * _n`` accumulators are used.
    _weights : array-like, one row per unit
        Codebook vectors.
    _idata : iterable of vectors
        Input vectors.
    factor : int
        Number of closest units that receive a vote per input vector.
        Values larger than the number of units are clamped to it.
    approach : int
        0 - rank based: the r-th closest unit (r = 0..k-1) gets
            ``(k - r) / (k*(k+1)/2)``.
        1 - distance based: each selected unit gets ``1 / distance``.
        2 - min-max normalised: each selected unit gets
            ``1 - (d - dmin) / (dmax - dmin)`` over the selected distances.
        Any other value adds no votes (histogram of zeros).

    Returns
    -------
    numpy.ndarray of float, shape ``(_m, _n)``
    """
    import heapq

    sdh_m = np.zeros(_m * _n)

    # Never request more winners than there are units; otherwise
    # nsmallest returns fewer indices than the vote loops expect.
    k = min(factor, len(_weights))
    if k <= 0:
        return sdh_m.reshape(_m, _n)

    # Sum of the rank weights k + (k-1) + ... + 1.
    cs = k * (k + 1) // 2

    for vector in _idata:
        dist = np.sqrt(np.sum(np.power(_weights - vector, 2), axis=1))
        # Indices of the k closest units, closest first (ties keep index order).
        c = heapq.nsmallest(k, range(len(dist)), key=dist.__getitem__)
        if approach == 0: # normalized
            for rank, unit in enumerate(c):
                sdh_m[unit] += (k - rank) / cs
        elif approach == 1: # based on distance
            # NOTE(review): an input vector that coincides exactly with a
            # codebook vector has distance 0 and contributes inf here.
            for unit in c:
                sdh_m[unit] += 1.0 / dist[unit]
        elif approach == 2: # min-max normalised distance
            selected = dist[c]
            dmin = selected.min()
            spread = selected.max() - dmin
            for unit in c:
                if spread == 0:
                    # All selected units are equally close (e.g. k == 1):
                    # give each the full vote instead of dividing 0 by 0.
                    sdh_m[unit] += 1.0
                else:
                    sdh_m[unit] += 1.0 - (dist[unit] - dmin) / spread

    return sdh_m.reshape(_m, _n)

In [4]:
import panel as pn
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

# Map geometry: rows come from 'ydim', columns from 'xdim'. Passing 'ydim'
# for both only works for square maps.
# NOTE(review): the 'xdim' key is assumed to be returned by SOMToolBox_Parse
# alongside 'ydim' - confirm against the parser.
som_rows, som_cols = weights['ydim'], weights['xdim']
# Codebook dimensionality taken from the data instead of a hard-coded 4.
vec_dim = weights['arr'].shape[1]

SDH_FACTOR = 25    # number of closest units each input vector votes for
SDH_APPROACH = 0   # 0 = rank-based (normalized) weighting

hithist = hv.Image(HitHist(som_rows, som_cols, weights['arr'], idata['arr'])).opts(xaxis=None, yaxis=None)
um = hv.Image(UMatrix(som_rows, som_cols, weights['arr'], vec_dim)).opts(xaxis=None, yaxis=None)
sdh = hv.Image(SDH(som_rows, som_cols, weights['arr'], idata['arr'], SDH_FACTOR, SDH_APPROACH)).opts(xaxis=None, yaxis=None)

# Last expression of the cell: the three visualisations side by side.
hv.Layout([hithist.relabel('HitHist').opts(cmap='kr'),
           um.relabel('U-Matrix').opts(cmap='jet'), sdh.relabel('SDH').opts(cmap='viridis')])

C) Evaluation Report
1) Perform and document (!) the testing of the components you coded by defining and
evaluating suitable tests to evaluate the correctness and robustness of the coded modules.

2) For systematic evaluation of tasks a-h, pick the Chainlink Data Set and the 10-Clusters
dataset from
http://www.ifs.tuwien.ac.at/dm/somtoolbox/datasets.html (→ already available in the pysomvis datasets)

3) Train a 10x10 (small) and a 100x60 (large) SOM. Make sure that the SOMs are properly
trained, i.e. that the structures to be expected in the SOM become clearly visible by identifying
suitable parameters for the initial neighborhood radius and initial learning rate.

In [12]:
# Train a small (10x10) and a large (100x60) SOM on the Chainlink data set
# and show both in PySOMVis.
from minisom import MiniSom
from pysomvis import PySOMVis

idata_chainlink = SOMToolBox_Parse("datasets/chainlink/chainlink.vec").read_weight_file()
idata_10clusters = SOMToolBox_Parse("datasets/10clusters/10clusters.vec").read_weight_file()

# Fixed seed so that weight initialisation - and therefore the trained maps
# and the plots derived from them - are reproducible on Restart & Run All.
SOM_SEED = 42
CHAINLINK_DIM = 3        # input_len passed to MiniSom for the Chainlink vectors
TRAIN_ITERATIONS = 10000

small_som_chainlink = MiniSom(10, 10, CHAINLINK_DIM, sigma=7, learning_rate=0.7, random_seed=SOM_SEED)
small_som_chainlink.train(idata_chainlink['arr'], TRAIN_ITERATIONS)
large_som_chainlink = MiniSom(100, 60, CHAINLINK_DIM, sigma=7, learning_rate=0.7, random_seed=SOM_SEED)
large_som_chainlink.train(idata_chainlink['arr'], TRAIN_ITERATIONS)

# Use the public accessor rather than the private `_weights` attribute.
vis_small_chainlink = PySOMVis(weights=small_som_chainlink.get_weights(), input_data=idata_chainlink['arr'])
vis_large_chainlink = PySOMVis(weights=large_som_chainlink.get_weights(), input_data=idata_chainlink['arr'])
vis = vis_large_chainlink  # keep the original name bound to the last visualisation

# Only the last expression of a cell is rendered automatically, so the first
# view has to be displayed explicitly (`display` is provided by IPython).
display(vis_small_chainlink._mainview)
vis_large_chainlink._mainview

# TODO: train and visualise the 10-Clusters SOMs as well (task 2/3).
# small_som_10clusters = MiniSom(10, 10, 10, learning_rate=0.7)
# small_som_10clusters.train(idata_10clusters['arr'], 1000)
# large_som_10clusters = MiniSom(100, 60, 10, learning_rate=0.7)
# large_som_10clusters.train(idata_10clusters['arr'], 1000)

BokehModel(combine_events=True, render_bundle={'docs_json': {'9ec403e0-5442-4745-8407-03b50587e1eb': {'version…

4) Show the visualizations, providing examples with different parameter settings and
comparisons that allow a validation of the correctness of the implementation. Specifically,
test a few extreme values for the parameter settings.

5) Where an identical visualization exists in the Java SOMToolbox, read a SOM pre-trained
with the Java SOMToolbox (import functions are provided in the notebook) and compare
your visualization with the one produced by the Java SOMToolbox (using either the
pre-trained SOMs provided with the toolbox, or any that your colleagues who do the analytics
option of the exercise share with you). (→ Note: the aligned SOM is not part of the Java SOMToolbox.)

6) Provide (export/print) the notebook as a separate PDF report that comprises all information.
Hence, the PDF export of the report needs to contain the fully-computed notebook with the
according visualizations shown as results and the information that can be derived from the
visualizations clearly described and semantically analyzed. Make sure that each visualization
includes the parameter setting applied. Specifically, the PDF export needs to contain:
- the implementation developed, explaining key parts of the code of each cell.
- the way the code was systematically tested for correctness, including the test cases
as part of the notebook.
- the evaluations performed under item 3) above, demonstrating the correctness of the
implementation, and the information gained.
- Where applicable: Comparison of the visualization with the identical visualizations
(reading the same trained SOM files) using the SOM Java Toolbox


In [None]:
#probably some more code