In [1]:
import numpy as np
import pandas as pdcoding
import gzip

In [2]:
#SOMToolbox Parser
from SOMToolBox_Parse import SOMToolBox_Parse
# Iris input vectors; later cells read the parsed result through idata['arr'].
# NOTE(review): read_weight_file() is also what gets called for the .vec input file here --
# presumably the parser handles both file formats with this one method; confirm.
idata = SOMToolBox_Parse("datasets/iris/iris.vec").read_weight_file()
# SOM weights read from a gzipped .wgt file; later cells read weights['arr'] and weights['ydim'].
weights = SOMToolBox_Parse("datasets/iris/iris.wgt.gz").read_weight_file()

In [3]:
#HitHistogram
def HitHist(_m, _n, _weights, _idata):
    """Count, for every SOM unit, how many input vectors it is the best match for.

    Parameters
    ----------
    _m, _n : int
        Map dimensions; the result is reshaped to (_m, _n).
    _weights : array-like
        Unit weight vectors, either flat with one row per unit, shape
        (_m * _n, dim), or as a grid of shape (_m, _n, dim). A grid is
        flattened row by row so unit (r, c) becomes row r * _n + c.
    _idata : iterable of array-like
        Input vectors, each of length dim.

    Returns
    -------
    np.ndarray
        Float array of shape (_m, _n) holding the hit count of every unit.
    """
    units = np.asarray(_weights)
    if units.ndim == 3:
        # Without this, the axis=1 reduction below would sum over map columns
        # instead of over features and argmin would return meaningless indices.
        units = units.reshape(-1, units.shape[-1])

    hist = np.zeros(_m * _n)
    for vector in _idata:
        # Euclidean distance from this sample to every unit; the closest unit scores a hit.
        position = np.argmin(np.sqrt(np.sum(np.power(units - vector, 2), axis=1)))
        hist[position] += 1

    return hist.reshape(_m, _n)

#U-Matrix - implementation
def UMatrix(_m, _n, _weights, _dim):
    """Compute the U-Matrix of a rectangular SOM.

    The (_m, _n) unit grid is expanded to (2*_m - 1, 2*_n - 1) cells by
    inserting one cell between every pair of neighbouring units. Inserted
    cells hold the Euclidean distance between the units they separate
    (diagonal cells: the sum of both diagonal distances divided by 2*sqrt(2)).
    Unit cells hold the median of their directly adjacent inserted cells.

    Parameters
    ----------
    _m, _n : int
        Map dimensions.
    _weights : np.ndarray
        Unit weights; must be reshapeable to (_m, _n, _dim).
    _dim : int
        Length of one weight vector.

    Returns
    -------
    np.ndarray
        2-D array of shape (2*_m - 1, 2*_n - 1).
    """
    # Put the units on a grid and open an all-zero gap cell between neighbours.
    grid = _weights.reshape(_m, _n, _dim)
    grid = np.insert(grid, np.arange(1, _n), values=0, axis=1)
    grid = np.insert(grid, np.arange(1, _m), values=0, axis=0)
    n_rows, n_cols = grid.shape[0], grid.shape[1]

    # Fill every gap cell; the distance is stored in component 0, the rest stays 0.
    for r in range(n_rows):
        on_unit_row = r % 2 == 0
        for c in range(n_cols):
            on_unit_col = c % 2 == 0
            if on_unit_row and on_unit_col:
                continue  # a real unit, handled in the second pass
            if on_unit_row:
                # gap between left and right unit
                gap = np.linalg.norm(grid[r, c - 1] - grid[r, c + 1], axis=-1)
            elif on_unit_col:
                # gap between the units in the rows before and after
                gap = np.linalg.norm(grid[r - 1, c] - grid[r + 1, c], axis=-1)
            else:
                # gap surrounded by four units: combine both diagonals
                first_diagonal = np.linalg.norm(grid[r - 1, c - 1] - grid[r + 1, c + 1], axis=-1)
                second_diagonal = np.linalg.norm(grid[r + 1, c - 1] - grid[r - 1, c + 1], axis=-1)
                gap = (first_diagonal + second_diagonal)/(2*np.sqrt(2))
            grid[r, c][0] = gap

    # Collapse the vector axis; for gap cells this is exactly the stored distance.
    heights = np.sum(grid, axis=2)

    # Replace every unit cell by the median of the gap cells that touch it.
    for r in range(0, n_rows, 2):
        for c in range(0, n_cols, 2):
            candidates = (
                (c > 0, r, c - 1),
                (r > 0, r - 1, c),
                (c < n_cols - 1, r, c + 1),
                (r < n_rows - 1, r + 1, c),
            )
            region = [heights[rr][cc] for inside, rr, cc in candidates if inside]
            heights[r, c] = np.median(region)

    return heights

#SDH - implementation
def SDH(_m, _n, _weights, _idata, factor, approach):
    """Smoothed Data Histogram: every input vector votes for its `factor` nearest units.

    Parameters
    ----------
    _m, _n : int
        Map dimensions; the result is reshaped to (_m, _n).
    _weights : array-like
        Unit weight vectors, shape (_m * _n, dim) or (_m, _n, dim).
    _idata : iterable of array-like
        Input vectors, each of length dim.
    factor : int
        Number of nearest units that receive a vote per input vector. Values
        larger than the number of units are clamped to the number of units;
        values <= 0 produce an all-zero map.
    approach : int
        0 -- rank based: the j-th nearest unit gets (factor - j) / sum(1..factor).
        1 -- distance based: each unit gets 1 / distance. An input vector that
             coincides exactly with a unit still divides by zero and
             contributes inf, as before.
        2 -- min-max based: 1 for the nearest, 0 for the farthest of the
             selected units, linear in between. If all selected units are
             equally far (always the case for factor == 1) each gets 1.
        Any other value leaves the map at zero.

    Returns
    -------
    np.ndarray
        Float array of shape (_m, _n).
    """
    units = np.asarray(_weights)
    if units.ndim == 3:
        # Accept a (rows, cols, dim) grid; distances need one row per unit.
        units = units.reshape(-1, units.shape[-1])

    sdh_m = np.zeros(_m * _n)

    # A sample cannot vote for more units than the map has.
    k = min(factor, units.shape[0])
    if k <= 0:
        return sdh_m.reshape(_m, _n)

    # Rank votes k, k-1, ..., 1 normalised by their sum k*(k+1)/2.
    rank_votes = np.arange(k, 0, -1) / (k * (k + 1) // 2)

    for vector in _idata:
        dist = np.sqrt(np.sum(np.power(units - vector, 2), axis=1))
        # Stable sort keeps the lower unit index first on ties.
        nearest = np.argsort(dist, kind="stable")[:k]
        if approach == 0:  # normalized
            sdh_m[nearest] += rank_votes
        elif approach == 1:  # based on distance
            sdh_m[nearest] += 1.0 / dist[nearest]
        elif approach == 2:  # min-max scaled distance
            near = dist[nearest]  # ascending, so first/last are min/max
            spread = near[-1] - near[0]
            if spread > 0:
                sdh_m[nearest] += 1.0 - (near - near[0]) / spread
            else:
                # All selected units are equally close: treat each as "nearest"
                # instead of producing 0/0 = NaN.
                sdh_m[nearest] += 1.0

    return sdh_m.reshape(_m, _n)

In [None]:
import panel as pn
import holoviews as hv
from holoviews import opts
hv.extension('bokeh')

# Render the three visualizations of the iris SOM as holoviews images without axes.
# NOTE(review): weights['ydim'] is passed for BOTH map dimensions (_m and _n) in all three
# calls, so the layout can only be right for square maps -- confirm whether one of them
# should be the map's other dimension.
hithist = hv.Image(HitHist(weights['ydim'], weights['ydim'], weights['arr'], idata['arr'])).opts(xaxis=None, yaxis=None) 
# 4 is the hard-coded weight-vector length (_dim) -- presumably the number of iris features; confirm.
um = hv.Image(UMatrix(weights['ydim'], weights['ydim'], weights['arr'], 4)).opts(xaxis=None, yaxis=None) 
# factor=25 nearest units per sample, approach=0 (the rank-normalized variant of SDH).
sdh = hv.Image(SDH(weights['ydim'], weights['ydim'], weights['arr'], idata['arr'], 25, 0)).opts(xaxis=None, yaxis=None)   

# Last expression of the cell: the three images side by side, each with its own colormap.
hv.Layout([hithist.relabel('HitHist').opts(cmap='kr'), 
           um.relabel('U-Matrix').opts(cmap='jet'), sdh.relabel('SDH').opts(cmap='viridis')])

In [5]:
import minisom

class AlignedSomLayer(minisom.MiniSom):
    """One layer of an aligned SOM: a MiniSom with feature-weighted distances.

    Layer ``i`` of ``n`` multiplies features listed in ``featuresA`` by
    ``(n - i - 1) / (n - 1)`` and all remaining features by ``i / (n - 1)``
    before the configured distance function is evaluated. Layer 0 therefore
    only "sees" the A features and layer n-1 only the others.

    ``n`` must be at least 2: the scale factors divide by ``n - 1``, so a
    single-layer stack raises ZeroDivisionError when a scaled distance is
    first computed.
    """

    def __init__(self, n, i, featuresA, x, y, input_len, sigma=1.0, learning_rate=0.5,
                 decay_function=minisom.asymptotic_decay,
                 neighborhood_function='gaussian', topology='rectangular',
                 activation_distance='euclidean', random_seed=42):
        """Create layer ``i`` of ``n``.

        Parameters
        ----------
        n : int
            Total number of layers in the aligned SOM.
        i : int
            Index of this layer, 0 <= i < n.
        featuresA : collection of int
            Indices of the features belonging to set A.
        x, y, input_len, sigma, learning_rate, decay_function,
        neighborhood_function, topology, activation_distance, random_seed
            Forwarded unchanged to ``minisom.MiniSom``.
        """
        super().__init__(x, y, input_len, sigma=sigma, learning_rate=learning_rate,
                         decay_function=decay_function,
                         neighborhood_function=neighborhood_function, topology=topology,
                         activation_distance=activation_distance, random_seed=random_seed)
        self.n = n
        self.i = i
        self.featuresA = featuresA

        # Keep the distance function MiniSom selected and put the scaling wrapper
        # in its place. NOTE(review): this relies on the parent class looking up
        # self._activation_distance when it searches for a winner (private API) --
        # confirm when upgrading minisom.
        self.underyling_activation_function = self._activation_distance
        self._activation_distance = self.activation_scale_wrapper

    def _feature_scales(self, featuresA, n, i):
        """Return the per-feature scale vector of layer ``i`` out of ``n`` layers."""
        scale_a = (n - i - 1) / (n - 1)
        scale_b = i / (n - 1)
        return np.array([scale_a if j in featuresA else scale_b
                         for j in range(self._input_len)])

    def activation_scale_wrapper(self, x, w):
        """Evaluate the wrapped distance function on feature-scaled copies of ``x`` and ``w``.

        Neither argument is modified. The products are computed out of place,
        so integer-typed samples are promoted to float instead of failing on
        an in-place float multiplication.
        """
        scale = self._feature_scales(self.featuresA, self.n, self.i)
        return self.underyling_activation_function(np.asarray(x) * scale,
                                                   np.asarray(w) * scale)

    def rescale(self, featuresA, n, i):
        """Multiply this layer's weights in place by the scale vector of layer ``i`` of ``n``."""
        self._weights *= self._feature_scales(featuresA, n, i)

    def update_scaled(self, x, win, t, max_iteration, scaling_factor):
        """
        Updates the weights of the neurons. Scaled by the scaling factor.
        In practice, this factor can be something like 1 / how far the layer is from the "pivot" layer

        Parameters
        ----------
        x : np.array
            Current pattern to learn.
        win : tuple
            Position of the winning neuron for x (array or tuple).
        t : int
            rate of decay for sigma and learning rate
        max_iteration : int
            If use_epochs is True:
                Number of epochs the SOM will be trained for
            If use_epochs is False:
                Maximum number of iterations (one iteration per sample).
        scaling_factor: float
            factor the update is scaled by
        """
        eta = self._decay_function(self._learning_rate, t, max_iteration)
        # sigma and learning rate decrease with the same rule
        sig = self._decay_function(self._sigma, t, max_iteration)
        # fold learning rate and layer scaling into the neighbourhood once
        g = self.neighborhood(win, sig)*eta*scaling_factor
        # w_new = w + g * (x - w), applied to every unit at once
        self._weights += np.einsum('ij, ijk->ijk', g, x-self._weights)

    def update_3d(self, x, win, t, max_iteration, chosen_layer_index):
        """Updates the weights of the neurons without any layer scaling.

        Parameters
        ----------
        x : np.array
            Current pattern to learn.
        win : tuple
            Position of the winning neuron for x (array or tuple).
        t : int
            rate of decay for sigma and learning rate
        max_iteration : int
            If use_epochs is True:
                Number of epochs the SOM will be trained for
            If use_epochs is False:
                Maximum number of iterations (one iteration per sample).
        chosen_layer_index : int
            Currently unused; accepted so callers can already pass the index
            of the layer that produced the winner.
        """
        # Identical to update_scaled with a neutral factor (multiplying by 1.0 is exact).
        self.update_scaled(x, win, t, max_iteration, 1.0)

    # Inactive reference copies of the distance functions:
    # def _cosine_distance(self, x, w):
    #     num = (w * x).sum(axis=2)
    #     denum = multiply(linalg.norm(w, axis=2), linalg.norm(x))
    #     return 1 - num / (denum+1e-8)

    # def _euclidean_distance(self, x, w):
    #     return linalg.norm(subtract(x, w), axis=-1)

    # def _manhattan_distance(self, x, w):
    #     return linalg.norm(subtract(x, w), ord=1, axis=-1)

    # def _chebyshev_distance(self, x, w):
    #     return max(subtract(x, w), axis=-1)


# import minisom
class AlignedSom:
    """A stack of ``n`` AlignedSomLayer maps that are trained together.

    Every training step picks one layer at random, finds the winning unit of
    the current sample on that layer, and then updates all layers around that
    same unit position; the update of a layer is damped by
    ``1 / (1 + distance to the chosen layer)``.
    """

    def __init__(self, x, y, input_len, n, featuresA, sigma=1.0, learning_rate=0.5,
                 decay_function=minisom.asymptotic_decay,
                 neighborhood_function='gaussian', topology='rectangular',
                 activation_distance='euclidean', random_seed=42):
        """
        Initializes the AlignedSom class.

        n guides how many maps are generated (must be >= 2, the layer scale
        factors divide by n - 1).
        featuresA sets which features are in the A set (the rest is in B).
        This should be a list of indices < input_len.

        All the other params get handed over to the "children soms". Every
        layer receives the same random_seed.
        """
        self.n = n
        self.featuresA = featuresA
        self.x = x
        self.y = y
        self._random_seed = random_seed

        self._learning_rate = learning_rate
        self._sigma = sigma
        self._input_len = input_len

        self.topology = topology

        self._decay_function = decay_function

        # Drives the random layer choice and, if requested, the sample order.
        self._random_generator = np.random.RandomState(random_seed)

        self._layers = [AlignedSomLayer(n, i, featuresA, x, y, input_len, sigma=sigma, learning_rate=learning_rate,
                                        decay_function=decay_function,
                                        neighborhood_function=neighborhood_function, topology=topology,
                                        activation_distance=activation_distance, random_seed=random_seed
                                        ) for i in range(n)]

        # References (not copies) to the layers' weight arrays; the layer updates
        # in this file modify those arrays in place, so this list stays current.
        self._weights = [layer._weights for layer in self._layers]

    def get_scaled_vector(self, x, i):
        """Return a copy of ``x`` scaled the way layer ``i`` scales its features.

        Features in ``featuresA`` are multiplied by ``(n - 1 - i) / (n - 1)``,
        all others by ``i / (n - 1)``. ``x`` itself is left untouched.
        """
        scaleA = (self.n - 1 - i)/(self.n-1)
        scaleB = i/(self.n-1)

        xnew = x.copy()
        for j in range(len(x)):
            xnew[j] *= scaleA if j in self.featuresA else scaleB

        return xnew

    def quantization_error(self, data):
        """Return one quantization error per layer, in layer order.

        Each value is whatever the layer's inherited ``quantization_error``
        returns for ``data``. NOTE(review): the inherited method presumably
        measures plain, unscaled distances -- confirm against minisom.
        """
        return [layer.quantization_error(data) for layer in self._layers]

    def train(self, data, num_iteration,
              random_order=False, verbose=False, use_epochs=False):
        """
        Trains the SOM.

        Parameters
        ----------
        data : np.array or list
            Data matrix.

        num_iteration : int
            If use_epochs is False, the weights will be
            updated num_iteration times. Otherwise they will be updated
            len(data)*num_iteration times.

        random_order : bool (default=False)
            If True, samples are picked in random order.
            Otherwise the samples are picked sequentially.

        verbose : bool (default=False)
            If True the status of the training will be
            printed each time the weights are updated, and the per-layer
            quantization errors are printed once training has finished.

        use_epochs : bool (default=False)
            If True the SOM will be trained for num_iteration epochs.
            In one epoch the weights are updated len(data) times and
            the learning rate is constant throughout a single epoch.
        """
        data_len = len(data)

        random_generator = self._random_generator if random_order else None
        iterations = minisom._build_iteration_indexes(data_len, num_iteration,
                                                      verbose, random_generator,
                                                      use_epochs)

        for t, iteration in enumerate(iterations):
            chosen_layer_index = self._random_generator.randint(low=0, high=self.n)
            # With epochs the decay only advances once per full pass over the data.
            decay_rate = t // data_len if use_epochs else t

            cur_training_sample = data[iteration]
            # The winner is searched on the chosen layer only (with its feature scaling) ...
            winner = self._layers[chosen_layer_index].winner(cur_training_sample)

            # ... but every layer is pulled towards the sample around that position,
            # the more weakly the further it is from the chosen layer.
            for update_layer_index in range(self.n):
                scaling_factor = 1/(1+abs(chosen_layer_index - update_layer_index))
                self._layers[update_layer_index].update_scaled(cur_training_sample, winner,
                        decay_rate, num_iteration, scaling_factor)

        if verbose:
            print('\n quantization error:', self.quantization_error(data))


    


In [6]:
# Smoke test: 2x2 map, 10-dimensional input, 11 layers, features 0-2 form set A.
alignedSom = AlignedSom(2, 2, 10, 11, [0, 1, 2])

# Two constant 10-dimensional samples (plain Python lists of ints), trained for 10 iterations.
data = [[1]*10,[2]*10]
alignedSom.train(data, 10)

In [18]:
# import panel as pn
# import holoviews as hv
# from holoviews import opts
# hv.extension('bokeh')

class AlignedSomVis():
    """Shows one hit histogram per layer of an aligned SOM, side by side.

    After construction ``_images`` holds one ``hv.Image`` per layer and
    ``_mainview`` the ``hv.Layout`` combining them (labelled 'Layer <i>').
    """

    def __init__(self, weights, chosen_visulization=None, input_data=None):
        """
        Parameters
        ----------
        weights : sequence of np.ndarray
            One weight grid of shape (xdim, ydim, dim) per layer; all layers
            are assumed to share the shape of the first one.
        chosen_visulization : optional
            Currently unused; the hit histogram is always drawn.
        input_data : iterable of array-like
            Input vectors used for the hit histograms. Required despite the
            ``None`` default, which only exists for signature compatibility.

        Raises
        ------
        TypeError
            If ``input_data`` is None.
        """
        if input_data is None:
            # Previously this surfaced as "'NoneType' object is not iterable" inside HitHist.
            raise TypeError("AlignedSomVis needs input_data: the hit histograms "
                            "are computed from the input vectors")

        self._weights = weights
        self._idata = input_data

        self._num_layers = len(self._weights)
        xdim = self._weights[0].shape[0]
        ydim = self._weights[0].shape[1]

        # HitHist compares each sample against one weight row per unit, so the
        # (xdim, ydim, dim) grid of a layer is flattened row by row first.
        self._images = [
            hv.Image(HitHist(xdim, ydim,
                             np.asarray(layer_weights).reshape(xdim * ydim, -1),
                             self._idata)).opts(xaxis=None, yaxis=None)
            for layer_weights in self._weights
        ]

        self._mainview = hv.Layout([self._images[i].relabel(f'Layer {i}').opts(cmap='kr') for i in range(self._num_layers)])

In [8]:
# from pysomvis import PySOMVis, OBJECTS_CLASSES
# # import pysomvis
# from visualizations.complane import ComponentPlane
# from visualizations.dmatrix import DMatrix
# from visualizations.hithistogram import HitHist
# from visualizations.sdh import SDH
# from visualizations.qerror import QError
# from visualizations.umatrix import UMatrix
# from visualizations.upmatrix import UStar_PMatrix
# from visualizations.neighbourhood_graph import NeighbourhoodGraph
# from visualizations.clustering import Clustering
# from visualizations.metromap import MetroMap
# from visualizations.piechart import PieChart
# from visualizations.chessboard import Chessboard
# from visualizations.somstreamvis import SOMStreamVis
# from visualizations.sky_metaphor import SkyMetaphor
# from visualizations.topographic_error import TopographicError
# from visualizations.intrinsic_distance import IntrinsicDistance
# from visualizations.activityhist import ActivityHist
# from visualizations.minimumSpanningTree import MinimumSpanningTree
# from visualizations.cluster_connection import ClusterConnection
# from mnemonics.mnemonicSOM import MnemonicSOM

# from holoviews.streams import Pipe, Buffer
# from controls.controllers import MainController, PointOptions, SegmentOptions

# class AlignedSomVisFirstAttempt(PySOMVis):
#     # def __init__(self, weights, input_data=None):
#     #     self._weights = weights
#     #     self._idata = input_data

#     def __init__(self, weights, m=None, n=None, dimension=None, input_data=None, classes_names=None, classes=None, component_names=None):
        
#         self._height = self._width = 500
#         self._pipe = Pipe(data=[])
#         self._pipe_points = Pipe(data=[])
#         self._pipe_paths = Pipe(data=[])
#         self._visualizations = []

#         self._weights = weights
#         #check ratio of the input map
#         if len(self._weights[0].shape)==3 and m==None and n==None and dimension==None:
#             self._m = self._weights[0].shape[0]
#             self._n = self._weights[0].shape[1]
#             self._dim = self._weights[0].shape[2]
#             for layer_index in range(len(self._weights)):
#                 self._weights[layer_index] = self._weights[layer_index].reshape(-1, self._dim)
#         else:
#             self._m = m
#             self._n = n
#             self._dim = dimension

#         self._idata = input_data
        
#         if input_data is not None:
#             self._distance = np.linalg.norm(self._idata[:, None, :] - self._idata[None, :, :], axis=-1)
        
#         if classes is not None: self._classes = classes.astype(int) 
#         else:       self._classes = classes
#         if component_names is not None: self._component_names = component_names
#         else:                           self._component_names = None
#         if classes_names is not None: self._classes_names = classes_names
#         else:                         self._classes_names = None


#         self._plot = None
#         self._maincontrol = MainController(self._interpolation, self._rotate, self._flip, self._visualizations, OBJECTS_CLASSES, name='')
#         self._pointoptions = PointOptions(name="Points")        
#         self._segmentoptions = SegmentOptions(name="Segments")
#         self._point_segment_options = pn.Tabs(self._pointoptions, self._segmentoptions)
#         self._mainp = pn.Column(pn.panel(self._maincontrol, default_layout=pn.Row, width=700))

#         self._xlim = (-.5*self._m/self._n,.5*self._m/self._n) if self._m>self._n else (-.5,.5)
#         self._ylim = (-.5*self._n/self._m,.5*self._n/self._m) if self._n>self._m else (-.5,.5)
#         #_COLOURS_93
#         self._Image = hv.DynamicMap(hv.Image, streams=[self._pipe]).apply.opts(cmap=self._maincontrol.param.colormap, 
#             width=self._width, height=self._height, xlim=self._xlim, ylim=self._ylim)
#         self._Paths = hv.DynamicMap(hv.Segments, streams=[self._pipe_paths]).apply.opts(alpha='alpha', line_width=self._segmentoptions.param.size, 
#                                                                                                                     color=self._segmentoptions.param.color)
#         self._Points = hv.DynamicMap(hv.Points, streams=[self._pipe_points]).apply.opts(size=self._pointoptions.param.size, color=self._pointoptions.param.color,
#                                                                                                                     marker=self._pointoptions.param.marker)
        
#         self._pdmap = pn.Column(self._Image * self._Paths * self._Points)

#         self._controls = pn.Row()
#         self._somstreamvis = pn.Row()
#         self._mainview = pn.Column(pn.Column(self._mainp, pn.Row(self._pdmap, self._controls)), pn.Column(self._somstreamvis))
       
#         self._visualizations.append(ComponentPlane(self))
#         if input_data is not None: self._visualizations.append(HitHist(self))
#         self._visualizations.append(UMatrix(self))
#         self._visualizations.append(DMatrix(self))
#         if input_data is not None:
#             self._visualizations.append(UStar_PMatrix(self))
#             self._visualizations.append(SDH(self))
#             self._visualizations.append(PieChart(self))
#             self._visualizations.append(NeighbourhoodGraph(self))
#             self._visualizations.append(Chessboard(self))
#             self._visualizations.append(Clustering(self))
#             self._visualizations.append(MetroMap(self))
#             self._visualizations.append(QError(self))
#             self._visualizations.append(SOMStreamVis(self))     
#             self._visualizations.append(SkyMetaphor(self)) 
#             self._visualizations.append(TopographicError(self)) 
#             self._visualizations.append(IntrinsicDistance(self)) 
#             self._visualizations.append(ActivityHist(self))

#         self._visualizations.append(MinimumSpanningTree(self))
#         self._visualizations.append(ClusterConnection(self))
#         self._visualizations.append(MnemonicSOM(self))
#         self._visualizations[0]._activate_controllers()
    
#     def _rotate(self, k):
#         for layer_index in range(len(self._weights)):
#             self._weights[layer_index] = np.rot90(self._weights[layer_index].reshape(self._m, self._n, self._dim), k).reshape(-1,self._dim)
#         self._pipe.send(np.rot90(self._pipe.data, k)) #TODO: this also in loop?
#         if self._m != self._n: 
#             self._m, self._n = self._n, self._m
#             self._ylim, self._xlim = self._xlim, self._ylim
#             self._pdmap[0] = pn.Column(self._Image.opts(xlim=self._xlim, ylim=self._ylim) * self._Points * self._Paths)

#     def _flip(self, horizontal):
#         if horizontal:
#             for layer_index in range(len(self._weights)):
#                 self._weights[layer_index] = np.fliplr(self._weights[layer_index].reshape(self._m, self._n, self._dim)).reshape(-1,self._dim)
#             self._pipe.send(np.fliplr(self._pipe.data)) #TODO: this also in loop?
#         else:
#             for layer_index in range(len(self._weights)):
#                 self._weights[layer_index] = np.flipud(self._weights[layer_index].reshape(self._m, self._n, self._dim)).reshape(-1,self._dim)
#             self._pipe.send(np.flipud(self._pipe.data)) #TODO: this also in loop?

C) Evaluation Report
1) Perform and document (!) the testing of the components you coded by defining and
evaluating suitable tests to evaluate the correctness and robustness of the coded modules.

2) For systematic evaluation of tasks a-h, pick the Chainlink Data Set and the 10-Clusters
dataset from
http://www.ifs.tuwien.ac.at/dm/somtoolbox/datasets.html 						--> already available in pysomvis datasets

3) Train a 10x10 (small) and a 100x60 (large) SOM. Make sure that the SOMs are properly
trained, i.e. that the structures to be expected in the SOM become clearly visible by identifying
suitable parameters for the initial neighborhood radius and initial learning rate.

In [16]:
# code goes here
# NOTE(review): MiniSom is only referenced by the commented-out baseline lines below.
from minisom import MiniSom
# from pysomvis import PySOMVis #

# Input vectors of both evaluation data sets; only the chainlink data is used in this cell.
idata_chainlink   = SOMToolBox_Parse("datasets/chainlink/chainlink.vec").read_weight_file()
idata_10clusters   = SOMToolBox_Parse("datasets/10clusters/10clusters.vec").read_weight_file()

# Feature indices forming set A for each data set (all remaining features are set B).
A_set_chainlink = [0, 1]
A_set_10clusters = [0, 2, 4, 6, 8]

# Number of layers of the aligned SOM per data set.
num_layers_chainlink = 10
num_layers_10clusters = 10

# Small aligned SOM: 10x10 units, input_len=3, trained for 10000 iterations.
# assumes chainlink vectors have 3 components -- TODO confirm against the .vec file
small_som_chainlink = AlignedSom(10, 10, 3, num_layers_chainlink, A_set_chainlink, sigma=7, learning_rate=0.7)
# small_som_chainlink = MiniSom(10, 10, 3, sigma=7, learning_rate=0.7)
small_som_chainlink.train(idata_chainlink['arr'], 10000)
# large_som_chainlink = MiniSom(100, 60, 3, sigma=7, learning_rate=0.7)
# large_som_chainlink.train(idata_chainlink['arr'], 10000)



In [None]:
# Build one hit-histogram image per layer of the trained chainlink SOM and display the layout.
vis = AlignedSomVis(weights=small_som_chainlink._weights, input_data=idata_chainlink['arr'])
vis._mainview
# vis = PySOMVis(weights=small_som_chainlink._weights, input_data=idata_chainlink['arr'])
# vis._mainview
# vis = PySOMVis(weights=large_som_chainlink._weights, input_data=idata_chainlink['arr'])
# vis._mainview

# small_som_10clusters = MiniSom(10, 10, 10, learning_rate=0.7)
# small_som_10clusters.train(idata_10clusters['arr'], 1000)
# large_som_10clusters = MiniSom(100, 60, 10, learning_rate=0.7)
# large_som_10clusters.train(idata_10clusters['arr'], 1000)


4) Show the visualizations, providing examples with different parameter settings and
comparisons that allow a validation of the correctness of the implementation. Specifically,
test a few extreme values for the parameter settings.

5) Where an identical visualization exists in the JAVA SOM toolbox, read a SOM pre-trained
with the JAVA SOM Toolbox (import functions are provided in the notebook) and compare
your visualization with the one produced by the Java SOMToolbox (using either the
pre-trained SOMs provided with the toolbox, or any that your colleagues who do the analytics
option of the exercise share with you). --> aligned SOM not part of JAVA SOM Toolbox

6) Provide (export/print) the notebook as separate PDF report that comprises all information.
Hence, the PDF export of the report needs to contain the fully-computed notebook with the
according visualizations shown as results and the information that can be derived from the
visualizations clearly described and semantically analyzed. Make sure that each visualization
includes the parameter setting applied. Specifically, the PDF export needs to contain:
- the implementation developed, explaining key parts of the code of each cell.
- the way the code was systematically tested for correctness, including the test cases
as part of the notebook.
- the evaluations performed under item 3) above, demonstrating the correctness of the
implementation, and the information gained.
- Where applicable: Comparison of the visualization with the identical visualizations
(reading the same trained SOM files) using the SOM Java Toolbox


In [11]:
#probably some more code