## Load dataset


In [1]:
import torch, json, os, logging
import numpy as np
from hmc.dataset import HMCDatasetManager, initialize_dataset
from torch.utils.data import Dataset
import networkx as nx
import logging
import ast

In [2]:
from hmc.utils import __load_json__

In [3]:
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

In [4]:
import pandas as pd

In [5]:
dataset_path = '/home/bruno/storage/data/datasets'

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
# Criar um logger
logger = logging.getLogger(__name__)

In [8]:
dataset_kwargs = {
    "output_path": "/home/bruno/storage/data/outputs",
    "device": "cuda",
    "is_local": False,
    "is_global": True,
    "input_scaler": True
}

In [9]:
dataset = initialize_dataset('seq_GO', dataset_path, **dataset_kwargs)

2025-02-26 20:51:27,524 - INFO - Loading dataset from /home/bruno/storage/data/datasets/gene-ontology-annotated-datasets/seq_GO.train.csv and /home/bruno/storage/data/datasets/gene-ontology-annotated-datasets/seq_GO-labels.json
2025-02-26 20:51:29,350 - INFO - Transforming labels
2025-02-26 20:51:30,236 - INFO - Transforming labels
2025-02-26 20:51:31,605 - INFO - Transforming labels


In [10]:
train, val, test = dataset.get_torch_dataset()

In [11]:
nodes = dataset.g.nodes()

In [12]:
nodes

NodeView(('GO0003674', 'root', 'GO0003774', 'GO0000146', 'GO0003777', 'GO0008574', 'GO0003824', 'GO0000150', 'GO0000249', 'GO0000385', 'GO0000386', 'GO0003969', 'GO0008251', 'GO0004133', 'GO0004134', 'GO0004135', 'GO0004386', 'GO0003678', 'GO0004003', 'GO0043140', 'GO0043141', 'GO0043138', 'GO0043139', 'GO0003724', 'GO0004004', 'GO0008026', 'GO0008639', 'GO0004842', 'GO0016976', 'GO0019776', 'GO0019777', 'GO0019949', 'GO0008641', 'GO0004839', 'GO0019778', 'GO0019779', 'GO0019781', 'GO0019948', 'GO0042292', 'GO0008686', 'GO0009975', 'GO0003963', 'GO0004016', 'GO0016491', 'GO0000170', 'GO0004497', 'GO0004506', 'GO0016709', 'GO0000254', 'GO0004502', 'GO0008398', 'GO0015997', 'GO0009055', 'GO0003954', 'GO0050136', 'GO0008137', 'GO0003959', 'GO0004128', 'GO0004148', 'GO0004362', 'GO0004783', 'GO0004791', 'GO0015039', 'GO0045153', 'GO0015002', 'GO0004129', 'GO0015036', 'GO0015035', 'GO0015037', 'GO0015038', 'GO0030508', 'GO0016614', 'GO0004368', 'GO0004457', 'GO0004458', 'GO0004460', 'GO0016

In [24]:
ds_train.levels

{0: ['root'],
 1: ['GO0003674', 'GO0005575', 'GO0008150'],
 2: ['GO0003774',
  'GO0003824',
  'GO0005198',
  'GO0005215',
  'GO0005488',
  'GO0016209',
  'GO0030188',
  'GO0030234',
  'GO0030528',
  'GO0031386',
  'GO0045182',
  'GO0060089',
  'GO0005576',
  'GO0031974',
  'GO0031975',
  'GO0032991',
  'GO0043226',
  'GO0044422',
  'GO0044464',
  'GO0000003',
  'GO0008152',
  'GO0009987',
  'GO0022414',
  'GO0022610',
  'GO0032502',
  'GO0040007',
  'GO0048511',
  'GO0050896',
  'GO0051179',
  'GO0051234',
  'GO0051235',
  'GO0051704',
  'GO0065007'],
 3: ['GO0000146',
  'GO0003777',
  'GO0000150',
  'GO0000249',
  'GO0000385',
  'GO0003969',
  'GO0004133',
  'GO0004386',
  'GO0008639',
  'GO0008641',
  'GO0008686',
  'GO0009975',
  'GO0016491',
  'GO0004362',
  'GO0004791',
  'GO0004601',
  'GO0016740',
  'GO0016787',
  'GO0043492',
  'GO0016829',
  'GO0016853',
  'GO0016874',
  'GO0017140',
  'GO0019239',
  'GO0032451',
  'GO0003735',
  'GO0005199',
  'GO0005200',
  'GO0017056',
  'G

In [25]:
ds_test.df

Unnamed: 0,features,categories
0,"[4.8, 0.7, 3.1, 3.6, 3.9, 5.5, 1.4, 6.0, 8.0, ...",GO0004519@GO0005739@GO0008150
1,"[5.2, 0.5, 3.6, 2.3, 6.5, 6.5, 1.4, 7.7, 7.9, ...",GO0004519@GO0005739@GO0006314@GO0008380
2,"[4.3, 1.1, 2.5, 2.2, 5.7, 6.3, 2.4, 9.2, 6.0, ...",GO0004519@GO0005739@GO0006316
3,"[5.7, 1.0, 2.1, 1.3, 9.4, 6.5, 3.4, 10.4, 1.8,...",GO0008121@GO0005750@GO0005739@GO0006122@GO0009060
4,"[2.9, 1.2, 3.3, 2.3, 5.4, 5.0, 1.9, 11.4, 8.9,...",GO0003723@GO0005739@GO0000372
...,...,...
1327,"[3.2, 0.0, 7.6, 8.3, 2.5, 4.5, 1.3, 7.6, 5.1, ...",GO0016455@GO0000119@GO0006366
1328,"[8.1, 1.4, 6.0, 4.6, 5.1, 4.9, 2.3, 5.7, 4.0, ...",GO0005096@GO0030127@GO0005737@GO0030433@GO0006...
1329,"[3.5, 1.2, 2.3, 8.1, 5.8, 7.0, 2.3, 4.7, 4.7, ...",GO0031202@GO0005682@GO0005685@GO0046540@GO0000398
1330,"[2.6, 0.5, 5.8, 5.4, 3.3, 4.9, 3.0, 5.3, 5.8, ...",GO0005515@GO0019898@GO0006914@GO0006623


In [26]:
ds_test.Y

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [27]:
ds_test.nodes_idx.get("GO0004519")

762

In [28]:
teste = ['12', '85', '404']

In [29]:
teste[:0]

[]

In [30]:
from collections import defaultdict

def build_complete_paths(pairs):
    """
    Constrói caminhos completos a partir de pares pai-filho.

    :param pairs: Lista de strings representando relações pai-filho (exemplo: "A.B").
    :return: Lista de strings representando os caminhos completos.
    """
    child_to_parent = {}
    roots = set()
    all_nodes = set()

    # Construir dicionário de relação filho -> pai
    for pair in pairs:
        parent, child = pair.split('.')
        child_to_parent[child] = parent
        all_nodes.update([parent, child])

    # Identificar raízes (nós que não são filhos de ninguém)
    roots = all_nodes - set(child_to_parent.keys())

    # Reconstruir os caminhos completos
    complete_paths = []
    for root in roots:
        stack = [(root, root)]  # (nó atual, caminho acumulado)

        while stack:
            node, path = stack.pop()
            children = [c for c, p in child_to_parent.items() if p == node]

            if children:
                for child in children:
                    stack.append((child, f"{path}.{child}"))
            else:
                complete_paths.append(path)  # Nó folha -> salvar caminho completo

    return complete_paths


In [18]:

labels = ["root.GO0003674", "GO0003674.GO0003774", "GO0003774.GO0000146", "GO0003774.GO0003777", "GO0003777.GO0043141", "GO0003678.GO0043138", "GO0043138.GO0043140", "GO0003678.GO0043139", "GO0043139.GO0043141", "GO0004386.GO0003724"
]

In [19]:
labels_file = "/home/bruno/storage/data/datasets/gene-ontology-annotated-datasets/seq_GO-labels.json"

In [20]:
labels_go = __load_json__(labels_file)['labels']

In [21]:
from tqdm.notebook import tqdm

In [22]:
def load_structure(labels):
    # Load labels JSON
    g = nx.DiGraph()
    for cat in tqdm(labels):
        terms = cat.split('.')
    
        g.add_edge(terms[1], terms[0])
        
    nodes = sorted(g.nodes(), key=lambda x: (len(x.split('.')), x))
    nodes_idx = dict(zip(nodes, range(len(nodes))))
    g_t = g.reverse()
    
    ### Save networkx graph
    # Para salvar em formato GraphML
    #nx.write_graphml(self.g, self.graph_path)
    
    A = nx.to_numpy_array(g, nodelist=nodes)

    return g, g_t

In [23]:
g, g_t = load_structure(labels_go)

  0%|          | 0/5839 [00:00<?, ?it/s]

In [24]:
nx.shortest_path_length(g_t, "root")

{'root': 0,
 'GO0003674': 1,
 'GO0005575': 1,
 'GO0008150': 1,
 'GO0003774': 2,
 'GO0003824': 2,
 'GO0005198': 2,
 'GO0005215': 2,
 'GO0005488': 2,
 'GO0016209': 2,
 'GO0030188': 2,
 'GO0030234': 2,
 'GO0030528': 2,
 'GO0031386': 2,
 'GO0045182': 2,
 'GO0060089': 2,
 'GO0005576': 2,
 'GO0031974': 2,
 'GO0031975': 2,
 'GO0032991': 2,
 'GO0043226': 2,
 'GO0044422': 2,
 'GO0044464': 2,
 'GO0000003': 2,
 'GO0008152': 2,
 'GO0009987': 2,
 'GO0022414': 2,
 'GO0022610': 2,
 'GO0032502': 2,
 'GO0040007': 2,
 'GO0048511': 2,
 'GO0050896': 2,
 'GO0051179': 2,
 'GO0051234': 2,
 'GO0051235': 2,
 'GO0051704': 2,
 'GO0065007': 2,
 'GO0000146': 3,
 'GO0003777': 3,
 'GO0000150': 3,
 'GO0000249': 3,
 'GO0000385': 3,
 'GO0003969': 3,
 'GO0004133': 3,
 'GO0004386': 3,
 'GO0008639': 3,
 'GO0008641': 3,
 'GO0008686': 3,
 'GO0009975': 3,
 'GO0016491': 3,
 'GO0016740': 3,
 'GO0016787': 3,
 'GO0016829': 3,
 'GO0016853': 3,
 'GO0016874': 3,
 'GO0017140': 3,
 'GO0019239': 3,
 'GO0032451': 3,
 'GO0003735': 3,
 '

In [37]:
level_by_node = nx.shortest_path_length(g_t, "root")

In [40]:
levels = {}
for node in nodes():
    depth = level_by_node.get(node)
    if depth not in levels:
        levels[depth] = []
    levels[depth].append(node)

In [41]:
levels

{1: ['GO0003674', 'GO0005575', 'GO0008150'],
 0: ['root'],
 2: ['GO0003774',
  'GO0003824',
  'GO0005198',
  'GO0005215',
  'GO0005488',
  'GO0016209',
  'GO0030188',
  'GO0030234',
  'GO0030528',
  'GO0031386',
  'GO0045182',
  'GO0060089',
  'GO0005576',
  'GO0031974',
  'GO0031975',
  'GO0032991',
  'GO0043226',
  'GO0044422',
  'GO0044464',
  'GO0000003',
  'GO0008152',
  'GO0009987',
  'GO0022414',
  'GO0022610',
  'GO0032502',
  'GO0040007',
  'GO0048511',
  'GO0050896',
  'GO0051179',
  'GO0051234',
  'GO0051235',
  'GO0051704',
  'GO0065007'],
 3: ['GO0000146',
  'GO0003777',
  'GO0000150',
  'GO0000249',
  'GO0000385',
  'GO0003969',
  'GO0004133',
  'GO0004386',
  'GO0008639',
  'GO0008641',
  'GO0008686',
  'GO0009975',
  'GO0016491',
  'GO0004362',
  'GO0004791',
  'GO0004601',
  'GO0016740',
  'GO0016787',
  'GO0043492',
  'GO0016829',
  'GO0016853',
  'GO0016874',
  'GO0017140',
  'GO0019239',
  'GO0032451',
  'GO0003735',
  'GO0005199',
  'GO0005200',
  'GO0017056',
  'G

In [42]:
level_by_node.get("GO0001310")

12

In [29]:
nx.ancestors(g_t, "GO0001310")

{'GO0001301',
 'GO0001304',
 'GO0001307',
 'GO0006139',
 'GO0006259',
 'GO0006323',
 'GO0006325',
 'GO0006996',
 'GO0007001',
 'GO0007580',
 'GO0008150',
 'GO0008152',
 'GO0009987',
 'GO0016043',
 'GO0016568',
 'GO0043170',
 'GO0043283',
 'GO0044237',
 'GO0044238',
 'GO0051276',
 'root'}

In [27]:
structured_paths = build_complete_paths(labels)
print(structured_paths)

NameError: name 'build_complete_paths' is not defined

In [16]:
ds_test.df.categories.values

array(['GO0004519@GO0005739@GO0008150',
       'GO0004519@GO0005739@GO0006314@GO0008380',
       'GO0004519@GO0005739@GO0006316', ...,
       'GO0031202@GO0005682@GO0005685@GO0046540@GO0000398',
       'GO0005515@GO0019898@GO0006914@GO0006623',
       'GO0004402@GO0005737@GO0016573'], dtype=object)

In [1]:
import pandas as pd

In [2]:
df_teste = pd.read_csv('/home/bruno/storage/data/datasets/fma_rock_electronic/test.csv', sep="|")

In [5]:
df_teste.iloc[0]['track_id'] 

np.int64(62870)

In [11]:
fma_path = os.path.join(dataset_path, 'fma_rock_electronic')

In [12]:
labels_json = os.path.join(fma_path, 'labels.json')

In [13]:
labels_json.split('/')[-1]

'labels.json'

In [294]:
t = "12.20.30.40"

In [300]:
nodes = t.split(".")
for idx in range(1, len(nodes)+1):
    local_label = nodes[:idx]
    if len(local_label) > 1:
        local_label = ".".join(local_label)
    else:
        local_label = local_label[0]
    
    print(local_label)
    print(ds_train.local_nodes_idx[idx-1])

12
{'15.183': 0, '12.25': 1, '12.98': 2, '15.495': 3, '15.468': 4, '15.695': 5, '15.337': 6, '12.45': 7, '12.31': 8, '15.297': 9, '12.66': 10, '12.359': 11, '12.314': 12, '15.181': 13, '15.42': 14, '15.182': 15, '12.85': 16, '15.296': 17, '12.27': 18, '12.88': 19, '15.185': 20, '15.184': 21, '15.286': 22, '12.58': 23, '12.26': 24, '15.236': 25, '12.70': 26}
12.20
{'15': 0, '12': 1}
12.20.30
{'15.468.491': 0, '12.25.71': 1, '12.31.101': 2, '12.31.167': 3, '12.25.89': 4, '12.45.53': 5, '15.181.401': 6, '15.182.400': 7, '12.85.404': 8, '12.25.111': 9, '15.297.240': 10, '12.25.64': 11, '12.26.113': 12, '12.25.109': 13, '12.31.439': 14}
12.20.30.40
{'12.25.109.361': 0}


In [296]:
nodes = t.split(".")
for idx in range(1, len(nodes)+1):
    local_label = nodes[:idx]
    if len(local_label) > 1:
        local_label = ".".join(local_label)
    else:
        local_label = local_label[0]
    print(self.local_nodes_idx[idx].get(local_label))

12
12.20
12.20.30
12.20.30.40


In [16]:
elements['nodes']

['12',
 '15',
 '12.25',
 '12.26',
 '12.27',
 '12.31',
 '12.314',
 '12.359',
 '12.45',
 '12.58',
 '12.66',
 '12.70',
 '12.85',
 '12.88',
 '12.98',
 '15.181',
 '15.182',
 '15.183',
 '15.184',
 '15.185',
 '15.236',
 '15.286',
 '15.296',
 '15.297',
 '15.337',
 '15.42',
 '15.468',
 '15.495',
 '15.695',
 '12.25.109',
 '12.25.111',
 '12.25.64',
 '12.25.71',
 '12.25.89',
 '12.26.113',
 '12.31.101',
 '12.31.167',
 '12.31.439',
 '12.45.53',
 '12.85.404',
 '15.181.401',
 '15.182.400',
 '15.297.240',
 '15.468.491',
 '12.25.109.361']

In [17]:
elements['A'][4]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
elements['graph']

<networkx.classes.digraph.DiGraph at 0x7a3d55357a70>

In [19]:
for a in nx.ancestors(elements['g_t'], '15.183'):
    print(a)

15


In [43]:
df_teste.categories

0      12.25.89
1      12.25.89
2      12.25.89
3      12.25.89
4      12.25.89
         ...   
104    12.25.89
105    12.25.89
106    12.25.89
107    12.25.89
108    12.25.89
Name: categories, Length: 109, dtype: object

In [44]:
Y = []

for df_teste.categories in df_teste.categories.values:
    y_ = np.zeros(len(elements['nodes']))
    for t in labels.split('@'):
        y_[[elements['nodes_idx'].get(a) for a in nx.ancestors(elements['g_t'], t)]] = 1
        y_[elements['nodes_idx'][t]] = 1
    Y.append(y_)
Y = np.stack(Y)

In [45]:
Y[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [65]:
features_path = os.path.join(dataset_path, 'fma', 'fma_large')

In [16]:
import os
import pandas as pd
import tensorflow as tf

In [33]:
def parse_tfr_element(element):
    # use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb': tf.io.FixedLenFeature([], tf.string),
        'track_id': tf.io.FixedLenFeature([], tf.int64),
    }

    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']

    # get our 'feature'-- our image -- and reshape it appropriately
    features = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (features, track_id)


In [35]:
def load_features(dataset_path):
    dataset_path = [os.path.join(dataset_path, path) for path in os.listdir(dataset_path) if path.endswith('.tfrecord')]
    dataset = get_dataset(dataset_path)

    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['features', 'track_id']
    )

    df.dropna(inplace=True)

    
    return df

In [36]:
df = load_features(features_path)

2025-02-02 17:05:02.589689: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [37]:
df

Unnamed: 0,features,track_id
0,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,2
1,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,3
2,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,5
3,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,10
4,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,20
...,...,...
104181,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138179
104182,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138180
104183,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138181
104184,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138182


In [12]:
ds_train.df

Unnamed: 0,features,categories
0,"[0.15, None, -0.22, 0.07, -0.15, -0.15, -0.21,...",GO0003709@GO0000127@GO0005739@GO0006384
1,"[-1.22, -0.27, -0.1, 0.23, -0.14, -0.71, 0.1, ...",GO0003746@GO0005840@GO0005853@GO0006414
2,"[-0.6, 1.01, 0.24, 0.65, -0.05, -0.53, -0.47, ...",GO0016887@GO0051082@GO0005832@GO0005634@GO0000...
3,"[0.25, -0.79, -0.22, -0.54, -0.03, -0.27, 0.17...",GO0003674@GO0016021@GO0042175@GO0006997@GO0007...
4,"[-0.12, -0.54, -0.12, -0.18, 0.0, -0.01, 0.12,...",GO0000703@GO0008534@GO0005634@GO0005739@GO0006...
...,...,...
1620,"[0.28, None, -0.45, -0.27, 0.04, 0.2, -0.18, 0...",GO0003674@GO0005575@GO0008150
1621,"[-0.18, -0.25, -0.03, -0.17, 0.23, -0.03, 0.43...",GO0015239@GO0005887@GO0042493@GO0015893
1622,"[0.12, -0.3, -0.14, 0.19, 0.14, 0.09, 0.25, -0...",GO0003702@GO0005634@GO0005737@GO0046685@GO0045944
1623,"[0.14, -1.18, -0.27, -0.18, -0.09, 0.04, -0.06...",GO0030611@GO0005575@GO0046685


In [10]:
len(ds_train.Y[0])

4126

In [11]:
ds_train.Y[0]

array([1., 1., 1., ..., 0., 1., 0.])

In [7]:
scaler = preprocessing.StandardScaler().fit(np.concatenate((ds_train.X_cont, ds_val.X_cont)))
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(np.concatenate((ds_train.X_cont, ds_val.X_cont)))

In [8]:
ds_val.X_count, ds_val.Y = scaler.transform(imp_mean.transform(ds_val.X_cont)), torch.tensor(ds_val.Y).to(device)
ds_train.X_count, ds_train.Y = scaler.transform(imp_mean.transform(ds_train.X_cont)), torch.tensor(ds_train.Y).to(device)