In [1]:
import pandas as pd
import numpy as np
import pickle

import sys
sys.path.append('..')

from topomap.visualizations import plot_topomap_comparison_highlight, plot_hierarchical_treemap

from topomap import TopoTree
from topomap import TopoMap
from topomap.HierarchicalTopoMap import HierarchicalTopoMap

## Synthetic datasets

In [2]:
# Load the 3-blobs point cloud and keep only its coordinate columns.
df_blobs = pd.read_csv('../data/3blobs.csv')
data_blobs = df_blobs[['x', 'y', 'z']].to_numpy()
n_points_blobs = data_blobs.shape[0]

# Baseline TopoMap projection; the fitted object is reused below.
topomap_blobs = TopoMap()
proj_topomap_blobs = topomap_blobs.fit_transform(data_blobs)

# TopoTree with a minimum box size of 5% of the points. The MST and edge
# ordering stored on the fitted TopoMap are assigned before fit()
# (presumably so TopoTree does not recompute them -- confirm in TopoTree).
topotree_blobs = TopoTree(min_box_size=0.05 * n_points_blobs)
for shared_attr in ('mst', 'sorted_edges'):
    setattr(topotree_blobs, shared_attr, getattr(topomap_blobs, shared_attr))
comp_info_blobs = topotree_blobs.fit(data_blobs)
df_comp_blobs = pd.DataFrame.from_dict(comp_info_blobs)

In [3]:
# Treemap view of the component table produced by TopoTree for the blobs data.
blobs_treemap_layout = dict(title='TopoTree - 3 Blobs Dataset')
fig = plot_hierarchical_treemap(df_comp_blobs)
fig.update_layout(**blobs_treemap_layout)
fig.show()

In [4]:
# Component ids passed as `components_to_scale` (the ones the layout enlarges).
components_to_highligth = [0, 1, 2]

hiertopomap_blobs = HierarchicalTopoMap(components_to_scale=components_to_highligth)
# Use the same size threshold as the TopoTree, then hand over the MST and
# edge ordering already stored on the fitted TopoMap before fitting.
hiertopomap_blobs.min_points_component = topotree_blobs.min_box_size
for shared_attr in ('mst', 'sorted_edges'):
    setattr(hiertopomap_blobs, shared_attr, getattr(topomap_blobs, shared_attr))
proj_hier_blobs = hiertopomap_blobs.fit_transform(data_blobs)

Scalling component 1 - Scalar: 20.000 - initial area: 163.191... final area: 65276.594.
Scalling component 0 - Scalar: 20.000 - initial area: 315.548... final area: 126219.070.
Scalling component 2 - Scalar: 20.000 - initial area: 66.657... final area: 26662.668.
[INFO] Number of edges hit. Edges processed: 798


In [5]:
# Compare the plain TopoMap projection with the hierarchical one,
# highlighting the selected components.
blobs_comparison_args = (
    proj_topomap_blobs,
    proj_hier_blobs,
    components_to_highligth,
    df_comp_blobs,
    hiertopomap_blobs,
)
fig = plot_topomap_comparison_highlight(*blobs_comparison_args)
fig.update_layout(title='3 Blobs Dataset')
fig.show()

In [6]:
# Load the 3-rings point cloud and keep only its coordinate columns.
df_rings = pd.read_csv('../data/3rings.csv')
data_rings = df_rings[['x', 'y', 'z']].to_numpy()
n_points_rings = data_rings.shape[0]

# Baseline TopoMap projection; the fitted object is reused below.
topomap_rings = TopoMap()
proj_topomap_rings = topomap_rings.fit_transform(data_rings)

# TopoTree with a minimum box size of 5% of the points, seeded with the
# MST and edge ordering stored on the fitted TopoMap
# (presumably to avoid recomputing them -- confirm in TopoTree).
topotree_rings = TopoTree(min_box_size=0.05 * n_points_rings)
for shared_attr in ('mst', 'sorted_edges'):
    setattr(topotree_rings, shared_attr, getattr(topomap_rings, shared_attr))
comp_info_rings = topotree_rings.fit(data_rings)
df_comp_rings = pd.DataFrame.from_dict(comp_info_rings)

In [7]:
# Treemap view of the component table produced by TopoTree for the rings data.
rings_treemap_layout = dict(title='TopoTree - 3 Rings Dataset')
fig = plot_hierarchical_treemap(df_comp_rings)
fig.update_layout(**rings_treemap_layout)
fig.show()

In [8]:
# Component ids passed as `components_to_scale` (the ones the layout enlarges).
components_to_highligth = [2, 11, 15, 19, 3, 10]

hiertopomap_rings = HierarchicalTopoMap(components_to_scale=components_to_highligth)
# Use the same size threshold as the TopoTree, then hand over the MST and
# edge ordering already stored on the fitted TopoMap before fitting.
hiertopomap_rings.min_points_component = topotree_rings.min_box_size
for shared_attr in ('mst', 'sorted_edges'):
    setattr(hiertopomap_rings, shared_attr, getattr(topomap_rings, shared_attr))
proj_hier_rings = hiertopomap_rings.fit_transform(data_rings)

Scalling component 10 - Scalar: 20.000 - initial area: 10.309... final area: 4123.742.
Scalling component 3 - Scalar: 20.000 - initial area: 3.274... final area: 1309.404.
Scalling component 11 - Scalar: 20.000 - initial area: 12.179... final area: 4871.769.
Scalling component 2 - Scalar: 20.000 - initial area: 3.726... final area: 1490.534.
Scalling component 19 - Scalar: 20.000 - initial area: 29.759... final area: 11903.457.
Scalling component 15 - Scalar: 20.000 - initial area: 8.441... final area: 3376.598.
[INFO] Number of edges hit. Edges processed: 3208


In [9]:
# Compare the plain TopoMap projection with the hierarchical one,
# highlighting the selected components.
rings_comparison_args = (
    proj_topomap_rings,
    proj_hier_rings,
    components_to_highligth,
    df_comp_rings,
    hiertopomap_rings,
)
fig = plot_topomap_comparison_highlight(*rings_comparison_args)
fig.update_layout(title='3 Rings Dataset')
fig.show()

In [10]:
# Load the 2-cavities point cloud; its coordinate columns are upper-case in
# the CSV, so they are renamed to match the other synthetic datasets.
df_cavities = pd.read_csv('../data/2cavities.csv').rename(
    columns={'X': 'x', 'Y': 'y', 'Z': 'z'}
)
data_cavities = df_cavities[['x', 'y', 'z']].to_numpy()
n_points_cavities = data_cavities.shape[0]

# Baseline TopoMap projection; the fitted object is reused below.
topomap_cavities = TopoMap()
proj_topomap_cavities = topomap_cavities.fit_transform(data_cavities)

# TopoTree with a minimum box size of 6% of the points, seeded with the
# MST and edge ordering stored on the fitted TopoMap
# (presumably to avoid recomputing them -- confirm in TopoTree).
topotree_cavities = TopoTree(min_box_size=0.06 * n_points_cavities)
for shared_attr in ('mst', 'sorted_edges'):
    setattr(topotree_cavities, shared_attr, getattr(topomap_cavities, shared_attr))
comp_info_cavities = topotree_cavities.fit(data_cavities)
df_comp_cavities = pd.DataFrame.from_dict(comp_info_cavities)

In [11]:
# Treemap view of the component table produced by TopoTree for the cavities data.
cavities_treemap_layout = dict(title='TopoTree - 2 Cavities Dataset')
fig = plot_hierarchical_treemap(df_comp_cavities)
fig.update_layout(**cavities_treemap_layout)
fig.show()

In [12]:
# Component ids passed as `components_to_scale` (the ones the layout enlarges).
components_to_highligth = [8, 7, 0]

hiertopomap_cavities = HierarchicalTopoMap(components_to_scale=components_to_highligth)
# Use the same size threshold as the TopoTree, then hand over the MST and
# edge ordering already stored on the fitted TopoMap before fitting.
hiertopomap_cavities.min_points_component = topotree_cavities.min_box_size
for shared_attr in ('mst', 'sorted_edges'):
    setattr(hiertopomap_cavities, shared_attr, getattr(topomap_cavities, shared_attr))
proj_hier_cavities = hiertopomap_cavities.fit_transform(data_cavities)

Scalling component 8 - Scalar: 14.759 - initial area: 136.756... final area: 29787.445.
Scalling component 7 - Scalar: 14.789 - initial area: 11.196... final area: 2448.645.
Scalling component 0 - Scalar: 20.000 - initial area: 49.325... final area: 19730.080.
[INFO] Number of edges hit. Edges processed: 4000


In [13]:
# Compare the plain TopoMap projection with the hierarchical one,
# highlighting the selected components.
cavities_comparison_args = (
    proj_topomap_cavities,
    proj_hier_cavities,
    components_to_highligth,
    df_comp_cavities,
    hiertopomap_cavities,
)
fig = plot_topomap_comparison_highlight(*cavities_comparison_args)
fig.update_layout(title='2 Cavities Dataset')
fig.show()

## MFeat dataset

In [14]:
# Read the whitespace-delimited MFeat file (no header row) into 64 feature
# columns named x1..x64. The separator is a raw string so that `\s` is not
# parsed as an (invalid) Python escape sequence.
data_mfeat = pd.read_csv('../data/UCI/multiple+features/mfeat-kar', sep=r'\s+',
                         header=None,
                         names=['x'+str(i) for i in range(1,65)])

# Label each row with row_number // 200, i.e. consecutive runs of 200 rows
# share a class (assumes the file is ordered by class -- verify against the
# dataset description). Vectorized instead of a per-row `.loc` assignment.
data_mfeat['class'] = np.arange(len(data_mfeat)) // 200

# Split into a feature matrix (NumPy array) and a one-column label frame.
y_mfeat = data_mfeat[['class']]
X_mfeat = data_mfeat.drop(columns=['class']).to_numpy()

In [15]:
# Fit the baseline TopoMap projection on the MFeat feature matrix. The fitted
# object is kept because later cells read its `mst` and `sorted_edges`.
topomap_mfeat = TopoMap()
proj_topomap_mfeat = topomap_mfeat.fit_transform(X_mfeat)

In [16]:
# TopoTree with a minimum box size of 1% of the samples, seeded with the
# MST and edge ordering stored on the fitted TopoMap
# (presumably to avoid recomputing them -- confirm in TopoTree).
n_samples_mfeat = X_mfeat.shape[0]
topotree_mfeat = TopoTree(min_box_size=0.01 * n_samples_mfeat)
for shared_attr in ('mst', 'sorted_edges'):
    setattr(topotree_mfeat, shared_attr, getattr(topomap_mfeat, shared_attr))
comp_info_mfeat = topotree_mfeat.fit(X_mfeat)

# Tabular form of the component information returned by fit().
df_comp_mfeat = pd.DataFrame.from_dict(comp_info_mfeat)

In [17]:
# Treemap of the MFeat component table, with `color='died_at'` passed to the
# plotting helper. The title previously named a different dataset
# ("Times Square"); it now matches the data actually plotted.
fig = plot_hierarchical_treemap(df_comp_mfeat, color='died_at')
fig.update_layout(title='TopoTree - MFeat Dataset')
fig.show()

In [18]:
# No explicit component list is passed here; the model is built with
# k_components=10 instead (the recorded run log shows ten components scaled).
# A manual selection tried earlier was [3, 8, 6, 1, 7, 10].
hiertopomap_mfeat = HierarchicalTopoMap(k_components=10)
# Use the same size threshold as the TopoTree, then hand over the MST and
# edge ordering already stored on the fitted TopoMap before fitting.
hiertopomap_mfeat.min_points_component = topotree_mfeat.min_box_size
for shared_attr in ('mst', 'sorted_edges'):
    setattr(hiertopomap_mfeat, shared_attr, getattr(topomap_mfeat, shared_attr))
proj_hier_mfeat = hiertopomap_mfeat.fit_transform(X_mfeat)

Scalling component 2 - Scalar: 10.178 - initial area: 71193.930... final area: 7374811.000.
Scalling component 5 - Scalar: 8.920 - initial area: 17256.732... final area: 1373089.000.
Scalling component 15 - Scalar: 8.089 - initial area: 19423.602... final area: 1271053.250.
Scalling component 7 - Scalar: 8.376 - initial area: 17530.398... final area: 1230013.375.
Scalling component 17 - Scalar: 8.467 - initial area: 51308.039... final area: 3678645.500.
Scalling component 8 - Scalar: 8.590 - initial area: 36685.348... final area: 2706930.250.
Scalling component 3 - Scalar: 8.233 - initial area: 85637.281... final area: 5805358.500.
Scalling component 23 - Scalar: 7.761 - initial area: 103876.141... final area: 6256718.500.
Scalling component 28 - Scalar: 7.396 - initial area: 92408.430... final area: 5054148.000.
Scalling component 26 - Scalar: 7.458 - initial area: 405295.531... final area: 22545974.000.
[INFO] Number of edges hit. Edges processed: 1998


In [19]:
# Highlight whatever the fitted model exposes as `components_to_scale`.
components_to_highligth = hiertopomap_mfeat.components_to_scale

mfeat_comparison_args = (
    proj_topomap_mfeat,
    proj_hier_mfeat,
    components_to_highligth,
    df_comp_mfeat,
    hiertopomap_mfeat,
)
fig = plot_topomap_comparison_highlight(*mfeat_comparison_args)

# Tick labels are switched off on the first and second x/y axes.
hidden_ticks = {axis_name: dict(showticklabels=False)
                for axis_name in ('xaxis', 'yaxis', 'xaxis2', 'yaxis2')}
fig.update_layout(height=600, width=1200, title='MFeat Dataset',
                  legend={'itemsizing': 'constant'},
                  **hidden_ticks)
fig.show()

## LLM datasets - MMLU Test

In [20]:
# Question metadata and the pickled embedding matrix for the MMLU data.
questions_data = pd.read_csv('../data/LLM/mmlu_val_questions_data.csv')

# Open the pickle in a context manager so the file handle is closed
# deterministically (the bare `open(...)` previously leaked it).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/LLM/mmlu_val_last_emb.pkl', 'rb') as emb_file:
    last_emb = pickle.load(emb_file)

# Displayed as the cell output to sanity-check the loaded array's shape.
last_emb.shape

(1482, 4096)

In [21]:
# Fit the baseline TopoMap projection on the MMLU embeddings. The fitted
# object is kept because later cells read its `mst` and `sorted_edges`.
topomap_emb = TopoMap()
proj_topomap_emb = topomap_emb.fit_transform(last_emb)

In [22]:
# TopoTree with a minimum box size of 1% of the samples, seeded with the
# MST and edge ordering stored on the fitted TopoMap
# (presumably to avoid recomputing them -- confirm in TopoTree).
n_samples_emb = last_emb.shape[0]
topotree_emb = TopoTree(min_box_size=0.01 * n_samples_emb)
for shared_attr in ('mst', 'sorted_edges'):
    setattr(topotree_emb, shared_attr, getattr(topomap_emb, shared_attr))
comp_info_emb = topotree_emb.fit(last_emb)

# Tabular form of the component information returned by fit().
df_comp_emb = pd.DataFrame.from_dict(comp_info_emb)

In [23]:
# Treemap of the MMLU component table, with `color='died_at'` passed to the
# plotting helper.
emb_treemap_layout = dict(title='TopoTree - LLM MMLU Dataset')
fig = plot_hierarchical_treemap(df_comp_emb, color='died_at')
fig.update_layout(**emb_treemap_layout)
fig.show()

In [24]:
# Component ids passed as `components_to_scale` (the ones the layout enlarges).
components_to_highligth = [0, 1, 3, 4, 7, 9]

hiertopomap_emb = HierarchicalTopoMap(components_to_scale=components_to_highligth)
# Use the same size threshold as the TopoTree, then hand over the MST and
# edge ordering already stored on the fitted TopoMap before fitting.
hiertopomap_emb.min_points_component = topotree_emb.min_box_size
for shared_attr in ('mst', 'sorted_edges'):
    setattr(hiertopomap_emb, shared_attr, getattr(topomap_emb, shared_attr))
proj_hier_emb = hiertopomap_emb.fit_transform(last_emb)

Scalling component 0 - Scalar: 20.000 - initial area: 79355.211... final area: 31742084.000.
Scalling component 1 - Scalar: 15.759 - initial area: 3601281.750... final area: 894311616.000.
Scalling component 3 - Scalar: 13.862 - initial area: 136905.281... final area: 26308768.000.
Scalling component 4 - Scalar: 13.093 - initial area: 138017.828... final area: 23660822.000.
Scalling component 7 - Scalar: 12.805 - initial area: 51898.219... final area: 8509063.000.
Scalling component 9 - Scalar: 4.741 - initial area: 618796.250... final area: 13906075.000.
[INFO] Number of edges hit. Edges processed: 1480


In [25]:
# Compare the plain TopoMap projection with the hierarchical one,
# highlighting the selected components.
emb_comparison_args = (
    proj_topomap_emb,
    proj_hier_emb,
    components_to_highligth,
    df_comp_emb,
    hiertopomap_emb,
)
fig = plot_topomap_comparison_highlight(*emb_comparison_args)

# Tick labels are switched off on the first and second x/y axes.
hidden_ticks = {axis_name: dict(showticklabels=False)
                for axis_name in ('xaxis', 'yaxis', 'xaxis2', 'yaxis2')}
fig.update_layout(height=600, width=1200, title='LLM MMLU Dataset',
                  legend={'itemsizing': 'constant'},
                  **hidden_ticks)
fig.show()

## LLM datasets - Amazon

In [26]:
# Review metadata and the pickled per-layer embeddings for the Amazon data.
df_amazon = pd.read_csv('../data/LLM/df_amazon.csv')

# Open the pickle in a context manager so the file handle is closed
# deterministically (the bare `open(...)` previously leaked it).
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../data/LLM/embeddings_amazon.pickle', 'rb') as emb_file:
    emb_amazon_all_layers = pickle.load(emb_file)

# Stack the values stored under key 32 into one array
# (presumably key 32 is a model layer index -- TODO confirm).
emb_amazon = np.array(list(emb_amazon_all_layers[32].values()))

# Displayed as the cell output to sanity-check the stacked array's shape.
emb_amazon.shape

(1800, 4096)

In [27]:
# Fit the baseline TopoMap projection on the Amazon embeddings. The fitted
# object is kept because later cells read its `mst` and `sorted_edges`.
topomap_amazon = TopoMap()
proj_topomap_amazon = topomap_amazon.fit_transform(emb_amazon)

In [28]:
# TopoTree with a minimum box size of 0.5% of the samples, seeded with the
# MST and edge ordering stored on the fitted TopoMap
# (presumably to avoid recomputing them -- confirm in TopoTree).
n_samples_amazon = emb_amazon.shape[0]
topotree_amazon = TopoTree(min_box_size=0.005 * n_samples_amazon)
for shared_attr in ('mst', 'sorted_edges'):
    setattr(topotree_amazon, shared_attr, getattr(topomap_amazon, shared_attr))
comp_info_amazon = topotree_amazon.fit(emb_amazon)

# Tabular form of the component information returned by fit().
df_comp_amazon = pd.DataFrame.from_dict(comp_info_amazon)

In [29]:
# Treemap of the Amazon component table, with `color='died_at'` passed to the
# plotting helper.
amazon_treemap_layout = dict(title='TopoTree - LLM Amazon Dataset')
fig = plot_hierarchical_treemap(df_comp_amazon, color='died_at')
fig.update_layout(**amazon_treemap_layout)
fig.show()

In [30]:
# Component ids passed as `components_to_scale` (the ones the layout enlarges).
components_to_highligth = [8, 15, 10, 14]

hiertopomap_amazon = HierarchicalTopoMap(components_to_scale=components_to_highligth)
# Use the same size threshold as the TopoTree, then hand over the MST and
# edge ordering already stored on the fitted TopoMap before fitting.
hiertopomap_amazon.min_points_component = topotree_amazon.min_box_size
for shared_attr in ('mst', 'sorted_edges'):
    setattr(hiertopomap_amazon, shared_attr, getattr(topomap_amazon, shared_attr))
proj_hier_amazon = hiertopomap_amazon.fit_transform(emb_amazon)

Scalling component 14 - Scalar: 15.156 - initial area: 65951.930... final area: 15148604.000.
Scalling component 10 - Scalar: 15.663 - initial area: 24205.213... final area: 5937994.000.
Scalling component 8 - Scalar: 16.089 - initial area: 2465814.750... final area: 638309696.000.
Scalling component 15 - Scalar: 15.940 - initial area: 8248.676... final area: 2095779.625.
[INFO] Number of edges hit. Edges processed: 1798


In [31]:
# Compare the plain TopoMap projection with the hierarchical one,
# highlighting the selected components.
fig = plot_topomap_comparison_highlight(proj_topomap_amazon, proj_hier_amazon,
                                        components_to_highligth, df_comp_amazon,
                                        hiertopomap_amazon)
# Title typo fixed ('LMM' -> 'LLM') so it matches the section heading and the
# treemap title for the same dataset. Tick labels are switched off on the
# first and second x/y axes.
fig.update_layout(height=600, width=1200, title='LLM Amazon Dataset',
                  legend={'itemsizing': 'constant'},
                  xaxis=dict(showticklabels=False),
                  yaxis=dict(showticklabels=False),
                  xaxis2=dict(showticklabels=False),
                  yaxis2=dict(showticklabels=False)
                  )
fig.show()