In [55]:
import os
import json
import tensorflow as tf
import pandas as pd

from tqdm.notebook import tqdm

In [56]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "train_id": "hierarchical_sample_test"
})


In [57]:

job_path = "/mnt/disks/data/fma/trains"


# In[15]:


train_path = os.path.join(job_path,args.train_id)


# In[16]:


base_path = os.path.join(args.root_dir,"fma")


# In[17]:


models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")


# In[18]:


metadata_file = os.path.join(train_path,"metadata.json")


labels_file = os.path.join(train_path,"labels.json")


In [58]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [59]:
import tensorflow as tf
import multiprocessing



class Dataset:
    def __init__(self, tfrecords_path, epochs, batch_size):
        self.tfrecords_path = tfrecords_path
        self.epochs = epochs
        self.batch_size = batch_size

    def list_files(self):
        return [os.path.join(tfrecords_path,file_path) for file_path in os.listdir(tfrecords_path)]

    def build(self):
        files = self.list_files()

        print("build_tf record: files_count: {} / batch_size: {} / epochs: {}".format(len(files), self.batch_size, self.epochs))

        ds = tf.data.TFRecordDataset(files, num_parallel_reads=multiprocessing.cpu_count())
                      

        return ds
    
   
    @staticmethod
    def __parse__(example):
        parsed = tf.parse_single_example(example, features={
            'emb' : tf.io.FixedLenFeature([], tf.string),
            'track_id' : tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True)
        })
        
        content = tf.io.parse_single_example(element, data)

        label = tf.cast(content['track_id'], tf.int32)
        label_hot = tf.one_hot(label1[0], label1[1])
        
        emb = content['emb']
        #get our 'feature'
        feature = tf.io.parse_tensor(emb, out_type=tf.float32)

        inp = {'emb': feature}
        out = {'global_output': label_hot}

        return inp, out


In [60]:

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
  array = tf.io.serialize_tensor(array)
  return array



def parser(serialized_example):
    features_description = {'emb': tf.io.FixedLenFeature([], tf.string)}
    features = tf.io.parse_single_example(serialized_example, features_description)
    features = tf.io.decode_raw(features['emb'], tf.float32)
    return features


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [61]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [62]:
df = load_dataset(args.dataset_path,dataset=args.embeddings)

In [63]:
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [64]:
df.dropna(inplace=True)

In [65]:
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [66]:
tracks_df = pd.read_csv(os.path.join(train_path,"tracks.csv"))

In [67]:
labels = __load_json__(labels_file)

In [68]:
labels

{'label1': {'17': 0,
  '2': 1,
  '20': 2,
  '10': 3,
  '13': 4,
  '9': 5,
  '21': 6,
  '3': 7,
  '8': 8,
  '4': 9,
  '38': 10,
  '5': 11,
  '14': 12,
  '1235': 13,
  '12': 14,
  '15': 15},
 'label2': {'8-0': 0,
  '2-130': 1,
  '12-27': 2,
  '38-125': 3,
  '5-444': 4,
  '4-97': 5,
  '17-49': 6,
  '21-580': 7,
  '9-169': 8,
  '20-378': 9,
  '38-41': 10,
  '21-811': 11,
  '38-514': 12,
  '12-440': 13,
  '38-247': 14,
  '10-76': 15,
  '38-47': 16,
  '2-118': 17,
  '12-58': 18,
  '12-66': 19,
  '4-37': 20,
  '15-286': 21,
  '20-65': 22,
  '5-187': 23,
  '38-186': 24,
  '38-456': 25,
  '21-83': 26,
  '10-0': 27,
  '12-26': 28,
  '12-314': 29,
  '2-46': 30,
  '10-362': 31,
  '13-170': 32,
  '9-137': 33,
  '9-63': 34,
  '4-179': 35,
  '21-693': 36,
  '20-0': 37,
  '38-32': 38,
  '12-0': 39,
  '4-0': 40,
  '9-0': 41,
  '4-74': 42,
  '14-19': 43,
  '21-539': 44,
  '2-86': 45,
  '2-77': 46,
  '15-236': 47,
  '20-138': 48,
  '12-98': 49,
  '2-177': 50,
  '14-0': 51,
  '1235-0': 52,
  '12-88': 53,


In [69]:
tracks_df.labels_1.unique()

array([  15,    4,   12,   10,   17,    2,   38,    5, 1235,   21,    9,
         20,    3,   13,   14,    8])

In [70]:
tqdm.pandas()

In [71]:
tracks_df['labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])

tracks_df['labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])

tracks_df['labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])

tracks_df['labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])

tracks_df['labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])


  0%|          | 0/10419 [00:00<?, ?it/s]

  0%|          | 0/10419 [00:00<?, ?it/s]

  0%|          | 0/10419 [00:00<?, ?it/s]

  0%|          | 0/10419 [00:00<?, ?it/s]

  0%|          | 0/10419 [00:00<?, ?it/s]

In [72]:
tracks_df = tracks_df.merge(df, on='track_id')

In [112]:
genres_df = tracks_df.drop_duplicates(subset=['labels_5'])[['labels_1','labels_2','labels_3','labels_4','labels_5']]

In [119]:
genres_df

Unnamed: 0,labels_1,labels_2,labels_3,labels_4,labels_5
0,15,84,103,82,57
1,9,42,59,5,48
2,14,82,85,128,30
3,14,18,134,145,89
4,14,82,33,102,92
...,...,...,...,...,...
6791,1,94,62,42,142
6953,1,107,42,70,79
7370,1,66,116,12,9
7485,1,30,115,38,71


In [121]:

# Cria um dicionário que mapeia o ID de cada gênero musical aos IDs de seus subgêneros
genre_dict = {
    '<ROOT>':genres_df.labels_1.unique().tolist()}

def add_node(genre_id,parent_id):
    if pd.notna(parent_id):
        if parent_id not in genre_dict:
            genre_dict[parent_id] = []
        genre_dict[parent_id].append(genre_id)



for i, row in genres_df.iterrows():
    genre_id = row['labels_3']
    parent_id = row['labels_2']
    add_node(genre_id,parent_id)
    
    genre_id = row['labels_4']
    parent_id = row['labels_3']
    add_node(genre_id,parent_id)
    
    genre_id = row['labels_5']
    parent_id = row['labels_4']
    add_node(genre_id,parent_id)
    
    
    

# In[13]:


genre_dict



{'<ROOT>': [15, 9, 14, 3, 0, 1, 10, 11, 13, 6, 5, 2, 7, 4, 12, 8],
 84: [103, 100, 146, 119],
 103: [82, 123, 41],
 82: [57, 85, 33, 0, 7, 45, 118, 55, 33],
 42: [59, 142, 70],
 59: [5, 126, 145],
 5: [48, 120, 97],
 85: [128, 105, 135],
 128: [30, 78],
 18: [134, 65, 135],
 134: [145, 147],
 145: [89, 66],
 33: [102, 12, 56, 98],
 102: [92, 4, 113],
 31: [35, 151, 90],
 35: [81, 64, 68],
 81: [32, 19, 48],
 112: [36, 43, 107],
 36: [76, 54, 29],
 76: [137, 21, 148, 92],
 17: [138, 84, 54],
 138: [133, 6],
 133: [10, 32],
 106: [130, 49, 130, 26, 9],
 130: [104, 130, 24],
 104: [109, 17, 20],
 2: [61, 28, 35],
 61: [49, 46, 128],
 49: [69, 48, 89],
 14: [3, 120, 27],
 3: [50, 60, 139],
 50: [126, 15, 13],
 116: [149, 145, 139, 12],
 149: [119, 97],
 119: [111, 63],
 19: [6, 140, 73],
 6: [55, 72, 93],
 55: [11, 93, 87],
 93: [60, 27, 25, 112],
 60: [50, 17, 106],
 56: [147, 14, 5],
 147: [74, 64],
 74: [150, 33, 119],
 54: [99, 132, 79],
 99: [150, 82, 76],
 150: [29, 62],
 48: [44, 10

In [122]:
from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled


In [123]:
base_estimator = make_pipeline(
    TruncatedSVD(n_components=24),
    svm.SVC(
        gamma=0.001,
        kernel="rbf",
        probability=True
    ),
)
clf = HierarchicalClassifier(
    base_estimator=base_estimator,
    class_hierarchy=genre_dict,
)

In [124]:
X_train, X_test, y_train, y_test = train_test_split(
    tracks_df.feature.values.tolist(),
    tracks_df.labels_5.values.tolist(),
    test_size=0.2,
    random_state=2,
)


In [None]:

clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))