In [4]:
import pandas as pd
import numpy as np
import pickle

from ucimlrepo import fetch_ucirepo 

import plotly.io as pio

import sys
sys.path.append('..')

from topomap import TopoTree
from topomap.visualizations import plot_hierarchical_treemap

## Synthetic datasets - 3 Blobs

In [5]:
# Load the 3-blobs point cloud and keep only the coordinate columns as an array.
df_blobs = pd.read_csv('../data/3blobs.csv')
data_blobs = df_blobs.loc[:, ['x', 'y', 'z']].to_numpy()

# min_box_size is set to 5% of the number of points.
topotree_blobs = TopoTree(min_box_size=0.05 * len(data_blobs))
comp_info_blobs = topotree_blobs.fit(data_blobs)

# Tabulate what fit() returned so the plotting cell can consume it.
df_comp_blobs = pd.DataFrame.from_dict(comp_info_blobs)

In [6]:
# Build the figure from the 3-blobs component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_blobs)
fig.update_layout(title='TopoTree - 3 Blobs Dataset')
fig.show()

## 3 Rings

In [4]:
# Read the 3-rings point cloud; only the x/y/z columns are passed to TopoTree.
df_rings = pd.read_csv('../data/3rings.csv')
data_rings = df_rings.loc[:, ['x', 'y', 'z']].to_numpy()

# min_box_size is 5% of the number of points.
topotree_rings = TopoTree(min_box_size=0.05 * len(data_rings))
comp_info_rings = topotree_rings.fit(data_rings)

# Turn the value returned by fit() into a DataFrame for plotting.
df_comp_rings = pd.DataFrame.from_dict(comp_info_rings)

In [5]:
# Build the figure from the 3-rings component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_rings)
fig.update_layout(title='TopoTree - 3 Rings Dataset')
fig.show()

## 2 Cavities

In [6]:
# This CSV uses upper-case coordinate headers; lower-case them right after
# loading so the column selection matches the other synthetic datasets.
df_cavities = (
    pd.read_csv('../data/2cavities.csv')
    .rename(columns={'X': 'x', 'Y': 'y', 'Z': 'z'})
)
data_cavities = df_cavities.loc[:, ['x', 'y', 'z']].to_numpy()

# min_box_size is 6% of the number of points here (the other synthetic sets use 5%).
topotree_cavities = TopoTree(min_box_size=0.06 * len(data_cavities))
comp_info_cavities = topotree_cavities.fit(data_cavities)

# Turn the value returned by fit() into a DataFrame for plotting.
df_comp_cavities = pd.DataFrame.from_dict(comp_info_cavities)

In [7]:
# Build the figure from the 2-cavities component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_cavities)
fig.update_layout(title='TopoTree - 2 Cavities Dataset')
fig.show()

## MFeat dataset

In [8]:
# Load the header-less, whitespace-separated MFeat file into 64 feature
# columns named x1..x64.
data_mfeat = pd.read_csv(
    '../data/UCI/multiple+features/mfeat-kar',
    sep=r'\s+',  # raw string: '\s' in a plain literal is an invalid escape sequence
    header=None,
    names=[f'x{i}' for i in range(1, 65)],
)

# Row i is labelled with class i // 200, i.e. the file is assumed to store
# 200 consecutive rows per class. Computed in one vectorized step instead of
# a per-row iterrows()/.loc loop; the labels are identical.
data_mfeat['class'] = np.arange(len(data_mfeat), dtype='int64') // 200

# Feature matrix only (labels removed) for TopoTree.
X_mfeat = data_mfeat.drop(columns=['class'])

In [9]:
# Fit TopoTree on the MFeat features; min_box_size is 1% of the number of rows.
topotree = TopoTree(min_box_size=0.01 * len(X_mfeat))
comp_info_mfeat = topotree.fit(X_mfeat.values)

In [10]:
# Tabulate the fitted component info and preview its first five rows.
df_comp_mfeat = pd.DataFrame.from_dict(comp_info_mfeat)
df_comp_mfeat.head(n=5)

Unnamed: 0,id,size,points,persistence,created_at,children,parent,died_at,persistence_density
0,0,32,"[1566, 1582, 1494, 1496, 1554, 1462, 1500, 155...",0.504754,7.121154,5,2.0,7.625909,4.196221
1,1,25,"[1505, 1561, 1483, 1473, 1552, 1451, 1433, 150...",0.229002,7.396906,5,2.0,7.625909,3.278298
2,2,178,"[1505, 1561, 1483, 1473, 1552, 1451, 1433, 150...",2.713125,7.625909,66,10.0,10.339033,17.21631
3,3,154,"[279, 303, 244, 274, 227, 235, 342, 229, 246, ...",3.042566,9.170355,63,27.0,12.212921,12.609596
4,4,52,"[89, 187, 20, 93, 138, 42, 194, 80, 133, 32, 1...",1.085822,9.300947,15,13.0,10.386769,5.006369


In [11]:
# Build the figure from the MFeat component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_mfeat)
fig.update_layout(title='TopoTree - MFeat Dataset')
fig.show()

In [12]:
# pio.write_image(fig, '../images/TopoTree.png', scale=6, width=800, height=500)

## Iris dataset

In [12]:
# Fetch UCI repository dataset id 53 (requires network access).
iris = fetch_ucirepo(id=53)

# Feature table of the fetched dataset (a pandas DataFrame).
X = iris.data.features

# min_box_size is 5% of the number of samples.
topotree = TopoTree(min_box_size=0.05 * len(X))
comp_info_iris = topotree.fit(X.values)

# Turn the value returned by fit() into a DataFrame for plotting.
df_comp_iris = pd.DataFrame.from_dict(comp_info_iris)

In [13]:
# Build the figure from the Iris component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_iris)
fig.update_layout(title='TopoTree - Iris dataset')
fig.show()

## LLM datasets - MMLU (validation split)

In [14]:
# Question metadata that accompanies the embeddings loaded below.
questions_data = pd.read_csv('../data/LLM/mmlu_val_questions_data.csv')

# Use a context manager so the file handle is closed deterministically
# (the bare pickle.load(open(...)) form leaves it open until garbage collection).
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
with open('../data/LLM/mmlu_val_last_emb.pkl', 'rb') as emb_file:
    last_emb = pickle.load(emb_file)

In [15]:
# Sanity check: display the shape of the loaded embeddings.
last_emb.shape

(1482, 4096)

In [16]:
# Fit TopoTree on the MMLU embeddings; min_box_size is 0.5% of the number of rows.
topotree = TopoTree(min_box_size=last_emb.shape[0] * 0.005)
comp_info_llm = topotree.fit(last_emb)

# Turn the value returned by fit() into a DataFrame for plotting.
df_comp_llm = pd.DataFrame.from_dict(comp_info_llm)

In [17]:
# Build the figure from the MMLU-embedding component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_llm)
fig.update_layout(title='TopoTree - LLM dataset')
fig.show()

## LLM datasets - Amazon

In [18]:
# Table that accompanies the Amazon embeddings loaded below.
df_amazon = pd.read_csv('../data/LLM/df_amazon.csv')

# Use a context manager so the file handle is closed deterministically
# (the bare pickle.load(open(...)) form leaves it open until garbage collection).
# NOTE(security): unpickling can execute arbitrary code; only load trusted files.
with open('../data/LLM/embeddings_amazon.pickle', 'rb') as emb_file:
    emb_amazon_all_layers = pickle.load(emb_file)

# Stack the values stored under key 32 into one 2-D array.
# NOTE(review): 32 is presumably the index of the last model layer - confirm.
emb_amazon = np.array(list(emb_amazon_all_layers[32].values()))

In [19]:
# Sanity check: display the shape of the stacked Amazon embeddings.
emb_amazon.shape

(1800, 4096)

In [20]:
# Fit TopoTree on the Amazon embeddings; min_box_size is 0.3% of the number of rows.
topotree = TopoTree(min_box_size=len(emb_amazon) * 0.003)
comp_info_amazon = topotree.fit(emb_amazon)

# Turn the value returned by fit() into a DataFrame for plotting.
df_comp_amazon = pd.DataFrame.from_dict(comp_info_amazon)

In [21]:
# Build the figure from the Amazon-embedding component table, set its title, and display it.
fig = plot_hierarchical_treemap(df_comp_amazon)
fig.update_layout(title='TopoTree - Amazon dataset')
fig.show()