In [None]:
# Read data from Java SOMToolbox
from SOMToolBox_Parse import SOMToolBox_Parse
# Parse the three SOMToolbox artefacts of the Iris example in one pass:
# input vectors (.vec), trained codebook (.wgt.gz) and class labels (.cls).
# Each file is read through the same read_weight_file() entry point.
idata, weights, classes = (
    SOMToolBox_Parse(path).read_weight_file()
    for path in (
        "datasets/iris/iris.vec",
        "datasets/iris/iris.wgt.gz",
        "datasets/iris/iris.cls",
    )
)

In [None]:
# Visualization by PySOMVis
from pysomvis import PySOMVis

# Build the viewer from the SOMToolbox-trained map loaded above.
# NOTE(review): the recorded output of this cell shows
# "ValueError: Supplied cmap jet not found ..." -- presumably a colormap
# incompatibility between the installed plotting libraries; confirm versions.
vis = PySOMVis(
    weights=weights['arr'],
    m=weights['ydim'],
    n=weights['xdim'],
    dimension=weights['vec_dim'],
    input_data=idata['arr'],
    classes=classes['arr'][:, 1],
    component_names=classes['classes_names'],
)
vis._mainview

ValueError: Supplied cmap jet not found among matplotlib, bokeh, or colorcet colormaps.

Column
    [0] Column
        [0] Column
            [0] Row(margin=(5, 10), width=700)
                [0] StaticText(value='<b></b>')
                [1] Select(options=OrderedDict({'Component Pl...]), value=0)
                [2] Select(options=OrderedDict({'PiYG': 'PiYG...]), value='jet')
                [3] Button(name='↶')
                [4] Button(name='↷')
                [5] Button(name='↔')
                [6] Button(name='↕')
                [7] Checkbox(name='interpolation')
        [1] Row
            [0] Column
                [0] HoloViews(DynamicMap)
            [1] Row
                [0] Column
                    [0] Column(margin=(5, 10), name='Component Planes')
                        [0] StaticText(value='<b>Component Planes</b>')
                        [1] IntSlider(end=3, name='setosa')
                    [1] Str(str)
    [1] Column
        [0] Row()

In [7]:
# Use any library for training the SOM (e.g. MiniSOM, SOMOClu, SOMpy, PopSOM, etc.)
from pysomvis import PySOMVis
from minisom import MiniSom    

# 10x10 map over the 4-dimensional Iris vectors. The seed is fixed so the
# trained map (and every plot derived from it) is reproducible after
# "Restart Kernel -> Run All"; without it each run yields a different map.
som = MiniSom(10, 10, 4, random_seed=42)
som.train(idata['arr'], 10000)

# Use the public accessor instead of the private `_weights` attribute.
vis = PySOMVis(weights=som.get_weights(), input_data=idata['arr'])
vis._mainview

BokehModel(combine_events=True, render_bundle={'docs_json': {'3fc9f66c-f9c9-43c6-896d-067ad04b87a8': {'version…

# Dataset
##  Select a data set from the OpenML Machine Learning Repository 

(http://www.openml.org) with the
following requirements:<br>
a. minimum 1000 instances,<br>
b. minimum 20 attributes,<br>
c. minimum 4 class labels (for visualizing class distributions on the map).<br>
Alternatively, you can also<br>
 opt to create an artificial dataset, preferably via parameterized scripts (in Matlab, Java, R,<br>
Python…) similar to the 10-Gaussians dataset, creating data of different densities combining<br>
i. Data on a finite area of a 1-d (line), 2-d, 3-d, 5-d hyperplanes<br>
ii. Data on (hyper-)spheres with different radius as well as Gaussians<br>
iii. Linear data sets in different intertwined settings<br>
iv. Other cluster characteristics that you find interesting<br>

## Register the dataset you picked with your group number in the TUWEL Wiki.<br>
 You must make sure<br>
that your dataset is unique, i.e. no two groups may take the same data set! (first come, first serve -<br>
do it early to get a data set that you also find interesting to work.)<br>

## Create a machine-actionable description of the dataset following Croissant / Schema.org<br>
descriptions for datasets (c.f. Croissant: https://neurips.cc/virtual/2024/poster/97627,<br>
https://docs.mlcommons.org/croissant/docs/croissant-spec.html; schema.org:<br>
https://schema.org/Dataset, c.f. the JSON example provided at https://schema.org/Dataset#eg-0478)<br>

## Analyze and describe the characteristics of the dataset (size, attribute types as discussed in class,<br>
value ranges, sparsity, min/max values, outliers, missing values, correlations, ...), and describe this<br>
in the report. Also, describe any hypotheses you might have concerning the distribution of the data,<br>
number of clusters and their relationship, majority/minority classes.<br>

# Preprocessing: 

Get the data into the form needed for training SOMs. Describe your preprocessing<br>
steps (e.g. transcoding, scaling), why you did it and how you did it. Specifically, if your dataset turns<br>
out to be extremely large (very high-dimensional and huge number of vectors so that it does not fit<br>
into memory for training SOMs) you may choose to apply subsampling for the training data.<br>

In [15]:
## TODO LOADING AND PREPROCESSING THE DATA

# C) SOM Training and Analysis

## 1) Train a reasonably sized „regular“ SOM

In [16]:
from minisom import MiniSom
from sklearn.datasets import load_iris
import numpy as np
from pysomvis import PySOMVis

def calculate_som_size(data_size, fraction=0.1):
    """Derive the side lengths of a square SOM grid from the dataset size.

    The target number of units is ``int(data_size * fraction)``; the grid side
    is the integer square root of that target, so the resulting map has at
    most that many units.

    Parameters:
    - data_size (int): Number of data items.
    - fraction (float): Desired number of units as a fraction of ``data_size``
      (default is 0.1).

    Returns:
    - (int, int): ``(rows, cols)`` of a square grid, each at least 1.

    Raises:
    - ValueError: If ``data_size`` or ``fraction`` is negative.
    """
    if data_size < 0 or fraction < 0:
        raise ValueError(
            f"data_size and fraction must be non-negative, got {data_size!r} and {fraction!r}"
        )

    # Target number of units (truncated towards zero)
    total_units = int(data_size * fraction)

    # Square grid: side = floor(sqrt(units)). Clamp to 1 so that tiny datasets
    # or tiny fractions never produce an unusable 0x0 map.
    som_dim = max(1, int(np.sqrt(total_units)))
    return som_dim, som_dim

def train_som(dataloader, **params):
    """
    Build and train a MiniSom on the dataset returned by ``dataloader``.

    Parameters:
    - dataloader (callable): Zero-argument function returning an object with
      ``data`` (features), ``target`` (labels) and ``target_names`` attributes.
      The features are used as-is; any scaling is expected to have been done
      by the dataloader.

    Optional keyword parameters (read from ``**params``; keys other than the
    ones listed here are ignored):
    - size_fraction (float): Fraction of the number of data items used to
      derive the map size via ``calculate_som_size`` (default 0.1).
    - sigma (float): Neighborhood radius passed to MiniSom (default 1.0).
    - learning_rate (float): Learning rate passed to MiniSom (default 0.5).
    - neighborhood_function (str): Neighborhood function name passed to
      MiniSom (default 'gaussian').
    - random_seed (int or None): Seed passed to MiniSom (default None).
      Provide a fixed value for reproducible runs.
    - num_iterations (int): Number of training iterations (default 1000).

    Returns:
    - som (MiniSom): The trained SOM object.
    - X (array): The feature matrix used for training.
    - y (array): The labels belonging to ``X``.
    - class_names (array): The names of the target classes.
    """
    dataset = dataloader()
    X, y, class_names = dataset.data, dataset.target, dataset.target_names

    # Square grid whose unit count is a fraction of the number of samples
    rows, cols = calculate_som_size(
        len(X), fraction=params.get('size_fraction', 0.1)
    )

    som = MiniSom(
        rows,
        cols,
        X.shape[1],  # one weight per input feature
        sigma=params.get('sigma', 1.0),
        learning_rate=params.get('learning_rate', 0.5),
        neighborhood_function=params.get('neighborhood_function', 'gaussian'),
        random_seed=params.get('random_seed', None),
    )
    som.train(X, params.get('num_iterations', 1000))

    return som, X, y, class_names

def iris_dataloader():
    """Return the Iris dataset with every feature column min-max scaled to [0, 1]."""
    iris = load_iris()
    col_min = iris.data.min(axis=0)
    col_max = iris.data.max(axis=0)
    # Per-column scaling: (value - min) / (max - min)
    iris.data = (iris.data - col_min) / (col_max - col_min)
    return iris


Train a SOM with „regular“ size (i.e. number of units as a certain fraction of the number of data<br>
items) and reasonable training parameters (sufficiently large initial neighborhood, learning<br>
rate; provide a justification for the selection of the parameters. NOTE: Learning rates for SOMs<br>
differ from those usually encountered in Deep Neural Networks, c.f. lecture)<br>

In [17]:
# TODO TRAIN SOM WITH REG SIZE

# Training configuration of the "regular"-sized map.
params = dict(
    size_fraction=0.4,
    sigma=1.5,
    learning_rate=0.3,
    num_iterations=5000,
    random_seed=42,  # fixed seed for the weight initialization
)

# Train on the normalized Iris data with the configuration above
som, X, y, class_names = train_som(iris_dataloader, **params)

Analyse in detail the class distribution, cluster structure, quantization errors, topology
violations.<br> a) Can you identify the border effect and magnification factors?<br> b) How well do class
distribution and cluster structure match?<br> c) Which classes fall into sub-clusters, which classes
are split across clusters, which classes mix in clusters.<br> d) How is the quantization error
distributed on the map, how does this correspond with perceived cluster separation and
quality?

In [18]:
#TODO above
# Package the trained codebook under the same keys the SOMToolbox-based cell
# reads ('arr', 'xdim', 'ydim', 'vec_dim'), using MiniSom's public accessor
# instead of the private `_weights` attribute.
codebook = som.get_weights()
weights = {
    'arr': codebook,
    'xdim': codebook.shape[0],
    'ydim': codebook.shape[1],
    'vec_dim': codebook.shape[2]
}

# NOTE(review): axis 0 is labelled 'xdim' here, while the SOMToolbox-based
# cell passes m=weights['ydim'], n=weights['xdim'] -- confirm the axis
# convention before passing m/n (and classes=y, component_names=...) explicitly.
vis = PySOMVis(weights=weights['arr'], input_data=X)
vis._mainview


**Describe and compare the structures found** (providing detailed info on visualizations and
parameters)

In [19]:
#TODO show structures

#TODO comparison text

## 2) Analyze different initializations of the SOM

Train one further „regular-sized“ SOM using the same training parameters as above, but using
a different random seed for initializing the SOM.

In [20]:
# Same configuration as the regular map, only the initialization seed changes.
# Note: this updates `params` in place, so it carries seed 43 from here on.
params.update(random_seed=43)

# Retrain on the same dataset with the new seed
som2, X, y, class_names = train_som(iris_dataloader, **params)


**Show and describe** <br> a) how the cluster structures and class distributions shift on the two
SOMs,<br> b) the effect on topology violations, cluster relationships, etc.<br> c) Which clusters show
a stable relationship, which ones change their relative position?<br> d) Which data instances are
stably mapped with similar data instances, which change a lot? Are they part of the same
clusters?

In [21]:
# A notebook cell only renders its *last* expression, so a bare
# `vis1._mainview` in the middle of the cell is silently discarded and only
# the second map would appear. Render both views explicitly with `display`
# (provided by the IPython kernel) so the two maps can be compared.

# First SOM (som, seed 42)
vis1 = PySOMVis(weights=som._weights, input_data=X)
display(vis1._mainview)

# Second SOM (som2), trained with a different random seed
vis2 = PySOMVis(weights=som2._weights, input_data=X)
display(vis2._mainview)

#TODO comparison text

**Describe and compare the structures found** (providing detailed info on visualizations and
parameters)

In [22]:
#TODO show structures

#todo comparison text

## 3) Analyze different map sizes

Train 2 additional SOMs varying the size (very small / very large) (provide reasons for choice
of sizes)<br>
Train each map with rather large neighborhood radius and high learning rate (provide reasons
for the definition of „high“!)

In [23]:
# Both extra maps use a large neighborhood radius (sigma=3.0) and a high
# learning rate (0.9); they differ only in grid size and seed.

# Very small SOM: units amount to 5% of the number of data items
params_small = dict(
    size_fraction=0.05,
    sigma=3.0,
    learning_rate=0.9,
    num_iterations=5000,
    random_seed=44,
)
som_small, X, y, class_names = train_som(iris_dataloader, **params_small)

# Very large SOM: units amount to 100% of the number of data items
params_large = dict(
    size_fraction=1.0,
    sigma=3.0,
    learning_rate=0.9,
    num_iterations=5000,
    random_seed=45,
)
som_large, X, y, class_names = train_som(iris_dataloader, **params_large)




Analyse in detail the<br> a) class distribution,<br> b) cluster structure,<br> c) quantization errors,<br> d)
topology violations. Also,<br> e) analyze how clusters shift, change in relative size, and how their
relative position to each other changes or remains the same.<br> f) Check for aspects such as
magnification factors. What is the resulting granularity of clusters visible on the small and large
maps? Are the same clusters visible in the very large map as in the regular map?

In [25]:


# Interactive view of the very small map (som_small) over the training data
vis_small = PySOMVis(
    weights=som_small._weights,
    input_data=X,
)
vis_small._mainview




In [26]:
# Interactive view of the very large map (som_large) over the training data
vis_large = PySOMVis(
    weights=som_large._weights,
    input_data=X,
)
vis_large._mainview

**Describe and compare the structures found** (providing detailed info on visualizations and
parameters)

In [None]:
#todo maybe show structures

#todo comparison text

## 4) Analyze different initial neighborhood radius settings

Train the very large SOM as specified above, but with a much too small neighborhood radius.

In [None]:
#TODO above

Analyse the<br> a) cluster structure,<br> b) quantization errors,<br> c) topology violations.<br> d) In how far
does this map differ from the very large map trained with a correct/high initial neighborhood
radius?

In [None]:
#TODO above

**Describe and compare the structures found** (what is the effect of a „too small“
neighborhood radius? How to detect it?)

In [None]:
#TODO maybe show structures.

#TODO describe from above

## 5) Analyze different initial learning rates

Train the regular-sized SOM as specified above, but with a (I) much too large / (II) much too<br>
small learning rate (provide justification for the setting of the parameter)

In [None]:
#TODO above

Analyse for both (I) and (II)<br> a) cluster structure,<br> b) quantization errors,<br> c) topology violations.<br>
d) In how far do these two maps differ from the well-trained map analyzed above?

In [None]:
#TODO above

Describe and compare the structures found (how can you detect „too small“ learning<br>
rates? When do they start to make sense?)

In [None]:
#TODO maybe show structures

#todo comparison text

## 6) Analyze different max iterations

Train a regular SOM using 2, 5, 10, 50, 100, 1000, 5000, 10000 iterations

In [None]:
#TODO above

Analyse cluster structure. <br>a) When do cluster structures start to emerge?<br> b) After how many
iterations do they stabilize?<br> c) How can you tell from the quality measures whether the map is
stable?<br> d) Which visualizations help you discover not-yet stable SOM mappings?

In [None]:
#TODO above

Describe and compare the structures found (how can you detect „too small“ learning
rates? When do they start to make sense?)

In [None]:
#TODO maybe show structures

#todo comparison text

## 7) Detailed analysis of an „Optimal SOM“

Train a SOM using what you consider to be „optimal parameters“ based on sub-tasks 1-6.

In [None]:
#TODO train best som

Describe the final model following the FAIR4ML schema (cf.
https://doi.org/10.5281/zenodo.14002310, https://rda-fair4ml.github.io/FAIR4MLschema/
release/0.1.0/index.html, https://github.com/RDA-FAIR4ML/FAIR4ML-schema)

#todo above

### SUBTASKS a-e

Provide a detailed interpretation of the cluster/class structures using a combination of<br>
visualizations and their parameter settings. Describe the findings in detail, specifically<br>
analyzing and providing rationale for

#### a 

Cluster densities / cardinalities, shapes: what can you tell about the cluster sizes<br>
shapes, their cardinalities and densities? Can you observe areas of higher/lower<br>
densities? Compare different visualizations that support (or contradict) your hypothesis<br>
and reason/explain why they do so.

In [None]:
#TODO above 

#### b

Hierarchical cluster relationships: can you detect any hierarchies in the data? How do<br>
they seem to be structured? Which clusters are similar, which are very distant, how<br>
could they be related? Compare different visualizations that support (or contradict)<br>
your hypothesis and reason/explain why they do so.

In [None]:
#TODO above

#### c

Topological relations / violations: in which areas can you observe topology violations?<br>
What types of violations do you observe in which areas of the map (i.e. actual violations<br>
due to bad training or the inherent structure of the data vs. cluster data that is mapped<br>
onto the plane). In how far do different visualizations agree on these violations?<br>
Compare different visualizations that support (or contradict) your hypothesis and<br>
reason/explain why they do so.

In [None]:
#TODO above

#### d 

Class distribution: Which classes are mapped onto which parts of the map? How do<br>
they relate to each other? In how far does the class distribution match the cluster<br>
structure? Which classes are well-separated, which ones less so? What might be the<br>
reason for these overlaps? Is the mapping less correct in these regions (e.g. higher<br>
error measures)? Are these areas well-separated? Which classes form homogeneous<br>
clusters, which form sub-clusters, how similar are these sub-clusters?

In [None]:
#TODO above

#### e

Quality of the map in terms of vector quantization and topology violation: is the quality<br>
homogeneous, are there certain areas or classes where the quality of the mapping is<br>
lower, others where it is higher?

In [None]:
#TODO above

# D) Summarize your findings



## 1

Summarize your overall findings and lessons learned:<br>
a. Which parameters have what kind of influence on the SOM?<br>
b. How sensitive is the setting of these parameters?<br>
c. Which visualizations are most useful to reveal what kind of information? Which combination

WRITE SUMMARY

## 2

(optional) Provide feedback on the exercise in general: which parts were useful / less useful; which<br>
other kind of experiment would have been interesting, … (this section is, obviously, optional and will<br>
not be considered for grading. You may also decide to provide that kind of feedback anonymously via<br>
the feedback mechanism in TISS – in any case we would appreciate learning about it to adjust the<br>
exercises for next year.)

P