In [1]:
import os
import json
import tensorflow as tf
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import logging


import sys
# Raise the interpreter's recursion limit to 1,000,000 frames.
# NOTE(review): presumably required to fit/pickle the hierarchical model — confirm.
# A limit this high can end in a hard interpreter crash instead of a RecursionError.
sys.setrecursionlimit(1000000)


2023-04-16 14:05:13.409337: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# Run configuration; stored in a Series so entries read as attributes
# (e.g. ``args.train_id``).
args = pd.Series(
    data=dict(
        root_dir="/mnt/disks/data/",
        dataset_path="/mnt/disks/data/fma/fma_large",
        embeddings="music_style",
        train_id="hierarchical_partition",
    )
)


In [3]:

# --- Filesystem layout ---------------------------------------------------
# All locations are derived from `args` plus one fixed jobs directory.

# Parent directory of the per-run folder selected by args.train_id.
job_path = "/mnt/disks/data/fma/trains"

# Top-level locations under the data root.
base_path, models_path = (
    os.path.join(args.root_dir, leaf) for leaf in ("fma", "models")
)
metadata_path_fma = os.path.join(base_path, "fma_metadata")

# Files belonging to the selected training run.
train_path = os.path.join(job_path, args.train_id)
metadata_file, labels_file = (
    os.path.join(train_path, name) for name in ("metadata.json", "labels.json")
)


In [4]:


def __load_json__(path):
    """Open the file at ``path`` for reading and return its parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)




In [5]:
import tensorflow as tf
import multiprocessing



class Dataset:
    """Thin wrapper that builds a ``tf.data`` pipeline over a directory of TFRecord files.

    Attributes are plain values supplied by the caller:
      tfrecords_path -- directory whose entries are all treated as TFRecord files
      epochs         -- stored and reported by ``build``; not applied to the dataset here
      batch_size     -- stored and reported by ``build``; not applied to the dataset here
    """

    def __init__(self, tfrecords_path, epochs, batch_size):
        self.tfrecords_path = tfrecords_path
        self.epochs = epochs
        self.batch_size = batch_size

    def list_files(self):
        """Return the full path of every entry in ``self.tfrecords_path``, sorted by name."""
        # Use the instance attribute: the bare name `tfrecords_path` is not
        # defined in this scope and raised NameError.
        # Sorting makes the result independent of the filesystem's listing order.
        return [
            os.path.join(self.tfrecords_path, file_name)
            for file_name in sorted(os.listdir(self.tfrecords_path))
        ]

    def build(self):
        """Create a ``TFRecordDataset`` over all listed files, read in parallel.

        The returned dataset yields raw serialized examples; parsing
        (``__parse__``), batching and repetition are not applied here.
        """
        files = self.list_files()

        print("build_tf record: files_count: {} / batch_size: {} / epochs: {}".format(len(files), self.batch_size, self.epochs))

        ds = tf.data.TFRecordDataset(files, num_parallel_reads=multiprocessing.cpu_count())

        return ds

    @staticmethod
    def __parse__(example):
        """Decode one serialized example into ``(inputs, outputs)`` dictionaries.

        Args:
            example: a single serialized ``tf.train.Example``.

        Returns:
            ``({'emb': float32 tensor}, {'global_output': one-hot tensor})``.
        """
        # Parse once, with the TF2 `tf.io` API used by the rest of this file.
        # The original parsed twice, the second time with undefined names.
        content = tf.io.parse_single_example(example, features={
            'emb' : tf.io.FixedLenFeature([], tf.string),
            'track_id' : tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True)
        })

        label = tf.cast(content['track_id'], tf.int32)
        # NOTE(review): assumes the 'track_id' sequence holds [class_index, depth];
        # the original indexed an undefined `label1` the same way. Confirm against
        # the code that wrote the records.
        label_hot = tf.one_hot(label[0], label[1])

        # 'emb' holds a serialized tensor; restore it as float32.
        feature = tf.io.parse_tensor(content['emb'], out_type=tf.float32)

        inp = {'emb': feature}
        out = {'global_output': label_hot}

        return inp, out


In [6]:

def _bytes_feature(value):
    """Wrap a string / byte value in a ``tf.train.Feature`` holding a BytesList."""
    tensor_type = type(tf.constant(0))
    # A tensor is unwrapped to its underlying value before being stored.
    payload = value.numpy() if isinstance(value, tensor_type) else value
    bytes_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
    """Wrap a float / double in a ``tf.train.Feature`` holding a FloatList."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint in a ``tf.train.Feature`` holding an Int64List."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
    """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
    return tf.io.serialize_tensor(array)



def parser(serialized_example):
    """Extract the 'emb' field of a serialized example, decoded as raw float32 values."""
    schema = {'emb': tf.io.FixedLenFeature([], tf.string)}
    example = tf.io.parse_single_example(serialized_example, schema)
    # NOTE(review): this decodes raw bytes, whereas parse_tfr_element reads the
    # same field with tf.io.parse_tensor — confirm which encoding the writer used.
    return tf.io.decode_raw(example['emb'], tf.float32)


def parse_tfr_element(element):
    """Decode one serialized example into an ``(embedding, track_id)`` pair."""
    # Layout of a stored record: a string field plus a scalar int64 id.
    schema = {
        'emb': tf.io.FixedLenFeature([], tf.string),
        'track_id': tf.io.FixedLenFeature([], tf.int64),
    }
    record = tf.io.parse_single_example(element, schema)

    # 'emb' holds a serialized tensor; restore it as float32.
    embedding = tf.io.parse_tensor(record['emb'], out_type=tf.float32)
    return (embedding, record['track_id'])


def get_dataset(filename):
    """Build a TFRecord dataset from ``filename`` and decode every record.

    Each element of the returned dataset is the output of ``parse_tfr_element``.
    """
    raw_records = tf.data.TFRecordDataset(filename)
    return raw_records.map(parse_tfr_element)




In [7]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    """Load every TFRecord file of one embedding set into a DataFrame.

    Args:
        path: dataset root; records are read from ``<path>/tfrecords/<dataset>``.
        dataset: name of the embedding sub-directory (defaults to ``args.embeddings``).

    Returns:
        pandas.DataFrame with columns ``feature`` and ``track_id``. ``feature`` is
        the first entry along axis 0 of each decoded tensor, or ``None`` when that
        tensor is empty (the notebook drops those rows in a later cell).

    Raises:
        OSError: if the tfrecords directory cannot be listed (from ``os.listdir``).
    """
    # Distinct names per stage: the original reused `path`, `dataset` and
    # `tfrecords_path` for different kinds of value.
    records_dir = os.path.join(path, 'tfrecords', dataset)
    record_files = [os.path.join(records_dir, name) for name in os.listdir(records_dir)]

    frame = pd.DataFrame(
        get_dataset(record_files).as_numpy_iterator(),
        columns=['feature', 'track_id'],
    ).dropna()

    def _first_row(emb):
        # Empty tensors have nothing to take; mark them so they can be dropped later.
        return emb[0] if emb.shape[0] != 0 else None

    # No try/except here: the original bare `except` printed an undefined name,
    # which replaced any real failure with a NameError. Let the real error surface.
    return frame.assign(feature=frame['feature'].apply(_first_row))
    


In [8]:
# Load the configured embedding set from the dataset root into a DataFrame (feature, track_id).
df = load_dataset(args.dataset_path,dataset=args.embeddings)

In [9]:
# Preview the loaded embeddings (one row per track_id).
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [10]:
# Discard rows with a missing value (e.g. tracks whose feature was set to None on load).
df = df.dropna()

In [11]:
# Re-display after dropping rows with missing values.
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [12]:
# Per-track label table stored with this training run (train_path/tracks.csv).
tracks_df = pd.read_csv(os.path.join(train_path,"tracks.csv"))

In [13]:
# Preview the per-track hierarchical labels (labels_1 ... labels_5).
tracks_df

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5
0,153877,"[38, '247']",38,38-247,38-247-0,38-247-0-0,38-247-0-0-0
1,27635,['38'],38,38-0,38-0-0,38-0-0-0,38-0-0-0-0
2,72586,"[12, 26, '113']",12,12-26,12-26-113,12-26-113-0,12-26-113-0-0
3,97875,['38'],38,38-0,38-0-0,38-0-0-0,38-0-0-0-0
4,64475,"[12, 25, '89']",12,12-25,12-25-89,12-25-89-0,12-25-89-0-0
...,...,...,...,...,...,...,...
43771,31013,['38'],38,38-0,38-0-0,38-0-0-0,38-0-0-0-0
43772,4502,"[12, '58']",12,12-58,12-58-0,12-58-0-0,12-58-0-0-0
43773,69261,"[12, 25, '111']",12,12-25,12-25-111,12-25-111-0,12-25-111-0-0
43774,53353,['12'],12,12-0,12-0-0,12-0-0-0,12-0-0-0-0


In [14]:
# Parsed content of this run's labels.json.
# NOTE(review): `labels` is only referenced by the commented-out mapping cell further down — confirm it is still needed.
labels = __load_json__(labels_file)

In [15]:
# Register tqdm's `progress_apply` on pandas objects.
# NOTE(review): the only `progress_apply` calls in this notebook are commented out — confirm this is still needed.
tqdm.pandas()

In [16]:
# tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])

# tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])

# tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])

# tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])

# tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])


In [17]:
# Attach each track's embedding. `merge` defaults to an inner join, so tracks
# without an embedding (and embeddings without labels) are dropped here.
tracks_df = tracks_df.merge(df, on='track_id')

In [18]:
# genres_df = tracks_df.drop_duplicates(subset=['labels_5'])[['labels_1','labels_2','labels_3','labels_4','labels_5']]
# Keep one row per distinct labels_2 value, with only the labels_1 / labels_2 columns.
# The commented line above is the five-level variant of the same selection.
genres_df = tracks_df.drop_duplicates(subset=['labels_2'])[['labels_1','labels_2']]

In [19]:
from sklearn import svm
from sklearn import tree
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sklearn_hierarchical_classification.classifier import HierarchicalClassifier
from sklearn_hierarchical_classification.constants import ROOT
from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled


In [20]:

# Build a dictionary that maps each music-genre ID to the IDs of its sub-genres.
# The top-level genres (labels_1) hang directly off the hierarchy ROOT.
genre_dict = {ROOT: genres_df.labels_1.unique().tolist()}


def add_node(genre_id,parent_id):
    """Register ``genre_id`` as a child of ``parent_id`` in ``genre_dict``.

    Does nothing when ``parent_id`` is missing (NaN / None).
    """
    if pd.isna(parent_id):
        return
    genre_dict.setdefault(parent_id, []).append(genre_id)


# Only two levels are linked: labels_1 (parent) -> labels_2 (child).
# Deeper levels (labels_3 ... labels_5) would be chained the same way,
# each using the previous level as its parent; they are currently disabled.
for parent_id, genre_id in zip(genres_df['labels_1'], genres_df['labels_2']):
    add_node(genre_id, parent_id)
    
    
    

# In[13]:


# genre_dict



In [21]:
# Inspect the hierarchy: ROOT -> top-level genres -> their sub-genre ids.
genre_dict

{'<ROOT>': [38, 12],
 38: ['38-247',
  '38-0',
  '38-224',
  '38-250',
  '38-30',
  '38-32',
  '38-125',
  '38-456',
  '38-41',
  '38-514',
  '38-6',
  '38-47',
  '38-1',
  '38-22',
  '38-186'],
 12: ['12-26',
  '12-25',
  '12-0',
  '12-36',
  '12-27',
  '12-66',
  '12-45',
  '12-58',
  '12-314',
  '12-85',
  '12-359',
  '12-70',
  '12-88',
  '12-31',
  '12-98',
  '12-440']}

In [22]:
# SVD (24 components) followed by an RBF-kernel SVC with probability estimates.
# NOTE(review): this pipeline is built but never passed to the classifier below,
# which uses `tree_estimator` — confirm whether the SVC was meant to be used.
base_estimator = make_pipeline(
    TruncatedSVD(n_components=24),
    svm.SVC(
        gamma=0.001,
        kernel="rbf",
        probability=True
    ),
)

# NOTE(review): no random_state is set, unlike the train/test split — fits may differ between runs; confirm.
tree_estimator = tree.DecisionTreeClassifier()

# Hierarchical classifier over the genre tree built above; progress is reported through tqdm.
clf = HierarchicalClassifier(
    base_estimator=tree_estimator,
    class_hierarchy=genre_dict,
    progress_wrapper=tqdm,
    feature_extraction="preprocessed"
)

In [23]:
# Hold out 20% of the merged tracks for evaluation; the fixed seed makes the split repeatable.
# Inputs are the embedding vectors, targets are the second-level labels cast to str.
# NOTE(review): the split is not stratified — rare sub-genres may be absent from one side; confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    tracks_df.feature.values.tolist(),
    tracks_df.labels_2.astype(str).values.tolist(),
    test_size=0.2,
    random_state=42,
)


In [24]:
# Suppress every log record of severity CRITICAL and below, process-wide.
# NOTE(review): presumably meant to silence verbose output during fit — confirm; it also hides warnings and errors.
logging.disable(logging.CRITICAL)

In [None]:
# Fit the hierarchical classifier on the training split.
# NOTE(review): by scikit-learn convention fit() returns the estimator itself, so `model` is presumably the same object as `clf` — confirm.
model = clf.fit(X_train, y_train)

Building features:   0%|          | 0/34 [00:00<?, ?it/s]

Training base classifiers:   0%|          | 0/34 [00:00<?, ?it/s]

In [None]:

# Persist the fitted classifier inside this run's directory.
filename = os.path.join(train_path,'hsvm.model')
# The context manager closes (and flushes) the file even if pickling fails;
# the original passed an anonymous open() handle that was never closed.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# Evaluate on the held-out split.
y_pred = clf.predict(X_test)

print("Classification Report:\n", classification_report(y_test, y_pred))

In [27]:
# Completion marker printed once the notebook has run to the end.
print('cabou')

cabou
