## Load dataset


In [1]:
import torch, json, os, logging
import numpy as np
from hmc.dataset import initialize_dataset


In [2]:
from hmc.utils import __load_json__

In [3]:
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

In [4]:
import pandas as pd

In [5]:
dataset_path = '/home/bruno/storage/data/datasets'

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [7]:
ds_train, ds_val, ds_test = initialize_dataset('fma_rock_electronic', dataset_path, output_path='/home/bruno/storage/data/outputs')

2025-02-21 20:58:55,384 - INFO - Category: 15.183 - Length: 2
2025-02-21 20:58:55,385 - INFO - New max length: 2
2025-02-21 20:58:55,386 - INFO - Adding edge: 15.183 -> 15
2025-02-21 20:58:55,387 - INFO - Category: 12.25 - Length: 2
2025-02-21 20:58:55,388 - INFO - Adding edge: 12.25 -> 12
2025-02-21 20:58:55,388 - INFO - Category: 12.98 - Length: 2
2025-02-21 20:58:55,389 - INFO - Adding edge: 12.98 -> 12
2025-02-21 20:58:55,390 - INFO - Category: 15.495 - Length: 2
2025-02-21 20:58:55,391 - INFO - Adding edge: 15.495 -> 15
2025-02-21 20:58:55,391 - INFO - Category: 15.468.491 - Length: 3
2025-02-21 20:58:55,392 - INFO - New max length: 3
2025-02-21 20:58:55,393 - INFO - Adding edge: 15.468 -> 15
2025-02-21 20:58:55,393 - INFO - Category: 15.695 - Length: 2
2025-02-21 20:58:55,394 - INFO - Adding edge: 15.695 -> 15
2025-02-21 20:58:55,394 - INFO - Category: 15.337 - Length: 2
2025-02-21 20:58:55,394 - INFO - Adding edge: 15.337 -> 15
2025-02-21 20:58:55,394 - INFO - Category: 12.25.71

In [8]:
df_teste = pd.read_csv('/home/bruno/storage/data/datasets/fma_rock_electronic/test.csv', sep="|")

In [9]:
df_teste

Unnamed: 0,track_id,file_path,track_genre_top,categories
0,62870,/home/bruno/storage/data/fma/fma_large/062/062...,Rock,12.25.89
1,128705,/home/bruno/storage/data/fma/fma_large/128/128...,Rock,12.88@12.85
2,29500,/home/bruno/storage/data/fma/fma_large/029/029...,Rock,12.25.111@12.85
3,107152,/home/bruno/storage/data/fma/fma_large/107/107...,Electronic,15.184
4,4172,/home/bruno/storage/data/fma/fma_large/004/004...,Rock,12.25@12.27
...,...,...,...,...
104,60832,/home/bruno/storage/data/fma/fma_large/060/060...,Electronic,15
105,1746,/home/bruno/storage/data/fma/fma_large/001/001...,Rock,12
106,55906,/home/bruno/storage/data/fma/fma_large/055/055...,Rock,12.27@12.70
107,43157,/home/bruno/storage/data/fma/fma_large/043/043...,Electronic,15.468


In [10]:
import networkx as nx
from collections import defaultdict

def count_classes_per_level(graph, root=None):
    """
    Counts the number of classes per level in a hierarchical networkx graph.

    Parameters:
    - graph (networkx.DiGraph): Directed graph representing the hierarchy.
    - root (any, optional): The root node. If None, it is inferred as the node with no predecessors.

    Returns:
    - dict: A dictionary where keys are levels (depths) and values are the number of nodes at that level.
    """
    if root is None:
        # Assume root is the node with no predecessors
        root_candidates = [node for node in graph.nodes if graph.in_degree(node) == 0]
        if len(root_candidates) != 1:
            raise ValueError("Unable to determine a unique root node. Please specify one.")
        root = root_candidates[0]

    # Perform BFS to count nodes per level
    level_count = defaultdict(int)
    queue = [(root, 0)]  # (node, depth)

    while queue:
        node, depth = queue.pop(0)
        level_count[depth] += 1
        for child in graph.successors(node):
            queue.append((child, depth + 1))

    return dict(level_count)


In [11]:
fma_path = os.path.join(dataset_path, 'fma_rock_electronic')

In [12]:
labels_json = os.path.join(fma_path, 'labels.json')

In [13]:
labels_json.split('/')[-1]

'labels.json'

In [14]:
def load_structure(labels_json):
    # Load labels JSON
    elements = {}
    is_go = False
    is_fma = True
    output_path = "."
    labels_json_name = labels_json.split('/')[-1]
    graph_path = os.path.join(output_path, labels_json_name.replace('labels.json', 'graphml'))
    
    max_len = 0
    with open(labels_json, 'r') as f:
        categories = json.load(f)

    g = nx.DiGraph()
    for cat in categories['labels']:
        terms = cat.split('.')
        cat_len = len(terms)
        print(f'Category: {cat} - Length: {cat_len}')
        if cat_len > max_len:
            max_len = cat_len
            print(f'New max length: {max_len}')

        if is_go:
            g.add_edge(terms[1], terms[0])
        else:
            if len(terms) == 1:
                print(f'Adding edge: {terms[0]} -> root')
                g.add_node(terms[0])
            else:
                print(f'Adding edge: {".".join(terms[:2])} -> {".".join(terms[:1])}')
                for i in range(2, len(terms) + 1):
                    g.add_edge('.'.join(terms[:i]), '.'.join(terms[:i - 1]))
        # Ordenação dos nós para criar os elementos do grafo
        nodes = sorted(g.nodes(), key=lambda x: (len(x.split('.')), x))
        elements['nodes'] = nodes
        nodes_idx = dict(zip(nodes, range(len(nodes))))
        elements['nodes_idx'] = nodes_idx
        g_t = g.reverse()
        elements['g_t'] = g_t
                        
        elements['graph'] = g
        ### Save networkx graph
        # Para salvar em formato GraphML
        nx.write_graphml(g, graph_path)

    
    A =  nx.to_numpy_array(g, nodelist=nodes)
    elements['A'] = A
    return elements


In [15]:
elements = load_structure(labels_json)

Category: 15.183 - Length: 2
New max length: 2
Adding edge: 15.183 -> 15
Category: 12.25 - Length: 2
Adding edge: 12.25 -> 12
Category: 12.98 - Length: 2
Adding edge: 12.98 -> 12
Category: 15.495 - Length: 2
Adding edge: 15.495 -> 15
Category: 15.468.491 - Length: 3
New max length: 3
Adding edge: 15.468 -> 15
Category: 15.695 - Length: 2
Adding edge: 15.695 -> 15
Category: 15.337 - Length: 2
Adding edge: 15.337 -> 15
Category: 12.25.71 - Length: 3
Adding edge: 12.25 -> 12
Category: 12.45 - Length: 2
Adding edge: 12.45 -> 12
Category: 12.31.101 - Length: 3
Adding edge: 12.31 -> 12
Category: 15.297 - Length: 2
Adding edge: 15.297 -> 15
Category: 12.31.167 - Length: 3
Adding edge: 12.31 -> 12
Category: 12.25.89 - Length: 3
Adding edge: 12.25 -> 12
Category: 12.45.53 - Length: 3
Adding edge: 12.45 -> 12
Category: 12.66 - Length: 2
Adding edge: 12.66 -> 12
Category: 12.359 - Length: 2
Adding edge: 12.359 -> 12
Category: 12.314 - Length: 2
Adding edge: 12.314 -> 12
Category: 15 - Length: 1
A

In [16]:
elements['nodes']

['12',
 '15',
 '12.25',
 '12.26',
 '12.27',
 '12.31',
 '12.314',
 '12.359',
 '12.45',
 '12.58',
 '12.66',
 '12.70',
 '12.85',
 '12.88',
 '12.98',
 '15.181',
 '15.182',
 '15.183',
 '15.184',
 '15.185',
 '15.236',
 '15.286',
 '15.296',
 '15.297',
 '15.337',
 '15.42',
 '15.468',
 '15.495',
 '15.695',
 '12.25.109',
 '12.25.111',
 '12.25.64',
 '12.25.71',
 '12.25.89',
 '12.26.113',
 '12.31.101',
 '12.31.167',
 '12.31.439',
 '12.45.53',
 '12.85.404',
 '15.181.401',
 '15.182.400',
 '15.297.240',
 '15.468.491',
 '12.25.109.361']

In [17]:
elements['A'][4]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [18]:
elements['graph']

<networkx.classes.digraph.DiGraph at 0x7a3d55357a70>

In [19]:
for a in nx.ancestors(elements['g_t'], '15.183'):
    print(a)

15


In [43]:
df_teste.categories

0      12.25.89
1      12.25.89
2      12.25.89
3      12.25.89
4      12.25.89
         ...   
104    12.25.89
105    12.25.89
106    12.25.89
107    12.25.89
108    12.25.89
Name: categories, Length: 109, dtype: object

In [44]:
Y = []

for df_teste.categories in df_teste.categories.values:
    y_ = np.zeros(len(elements['nodes']))
    for t in labels.split('@'):
        y_[[elements['nodes_idx'].get(a) for a in nx.ancestors(elements['g_t'], t)]] = 1
        y_[elements['nodes_idx'][t]] = 1
    Y.append(y_)
Y = np.stack(Y)

In [45]:
Y[0]

array([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [65]:
features_path = os.path.join(dataset_path, 'fma', 'fma_large')

In [16]:
import os
import pandas as pd
import tensorflow as tf

In [33]:
def parse_tfr_element(element):
    # use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb': tf.io.FixedLenFeature([], tf.string),
        'track_id': tf.io.FixedLenFeature([], tf.int64),
    }

    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']

    # get our 'feature'-- our image -- and reshape it appropriately
    features = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (features, track_id)


In [35]:
def load_features(dataset_path):
    dataset_path = [os.path.join(dataset_path, path) for path in os.listdir(dataset_path) if path.endswith('.tfrecord')]
    dataset = get_dataset(dataset_path)

    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['features', 'track_id']
    )

    df.dropna(inplace=True)

    
    return df

In [36]:
df = load_features(features_path)

2025-02-02 17:05:02.589689: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [37]:
df

Unnamed: 0,features,track_id
0,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,2
1,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,3
2,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,5
3,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,10
4,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,20
...,...,...
104181,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138179
104182,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138180
104183,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138181
104184,b'\x08\x01\x12\t\x12\x02\x08\x1d\x12\x03\x08\x...,138182


In [12]:
ds_train.df

Unnamed: 0,features,categories
0,"[0.15, None, -0.22, 0.07, -0.15, -0.15, -0.21,...",GO0003709@GO0000127@GO0005739@GO0006384
1,"[-1.22, -0.27, -0.1, 0.23, -0.14, -0.71, 0.1, ...",GO0003746@GO0005840@GO0005853@GO0006414
2,"[-0.6, 1.01, 0.24, 0.65, -0.05, -0.53, -0.47, ...",GO0016887@GO0051082@GO0005832@GO0005634@GO0000...
3,"[0.25, -0.79, -0.22, -0.54, -0.03, -0.27, 0.17...",GO0003674@GO0016021@GO0042175@GO0006997@GO0007...
4,"[-0.12, -0.54, -0.12, -0.18, 0.0, -0.01, 0.12,...",GO0000703@GO0008534@GO0005634@GO0005739@GO0006...
...,...,...
1620,"[0.28, None, -0.45, -0.27, 0.04, 0.2, -0.18, 0...",GO0003674@GO0005575@GO0008150
1621,"[-0.18, -0.25, -0.03, -0.17, 0.23, -0.03, 0.43...",GO0015239@GO0005887@GO0042493@GO0015893
1622,"[0.12, -0.3, -0.14, 0.19, 0.14, 0.09, 0.25, -0...",GO0003702@GO0005634@GO0005737@GO0046685@GO0045944
1623,"[0.14, -1.18, -0.27, -0.18, -0.09, 0.04, -0.06...",GO0030611@GO0005575@GO0046685


In [10]:
len(ds_train.Y[0])

4126

In [11]:
ds_train.Y[0]

array([1., 1., 1., ..., 0., 1., 0.])

In [7]:
scaler = preprocessing.StandardScaler().fit(np.concatenate((ds_train.X_cont, ds_val.X_cont)))
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean').fit(np.concatenate((ds_train.X_cont, ds_val.X_cont)))

In [8]:
ds_val.X_count, ds_val.Y = scaler.transform(imp_mean.transform(ds_val.X_cont)), torch.tensor(ds_val.Y).to(device)
ds_train.X_count, ds_train.Y = scaler.transform(imp_mean.transform(ds_train.X_cont)), torch.tensor(ds_train.Y).to(device)