In [1]:
import pandas as pd
import ast
import os
import json
import time
import logging
from datetime import datetime as dt

import numpy as np

In [2]:
from tqdm.notebook import tqdm

In [3]:
from sklearn import svm
from sklearn import tree
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from hiclass.MultiLabelHierarchicalClassifier import MultiLabelHierarchicalClassifier
from hiclass.MultiLabelLocalClassifierPerNode import MultiLabelLocalClassifierPerNode
from hiclass.metrics import precision, recall, f1

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
#from sklearn.gaussian_process import GaussianProcessClassifier
#from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.multiclass import OneVsRestClassifier


In [4]:
from dataset.dataset import load_features, load_dataset, pre_process

2024-05-08 21:19:13.596231: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
from utils.data import load 
from utils.dir import create_dir

In [6]:
# Configure the root logger so INFO-level messages are emitted
logging.basicConfig(level=logging.INFO)

In [7]:
def __load_json__(path):
    """Open the file at ``path`` for reading and return its parsed JSON content."""
    with open(path, 'r') as handle:
        return json.load(handle)

In [27]:

# Run configuration, kept in a Series so entries can be read as attributes
# (e.g. ``args.root_dir``).
args = pd.Series({
    # Base folder; the "fma" and "models" folders are resolved under it.
    "root_dir": "/mnt/disks/data/",
    # Not read by the cells visible in this notebook -- presumably the audio files; TODO confirm.
    "dataset_path": "/mnt/disks/data/fma/fma_large", 
    # Folder from which features.csv is loaded.
    "metadata_path": "/mnt/disks/data/fma/fma_metadata", 
    # Not read by the cells visible in this notebook -- TODO confirm whether still needed.
    "embeddings": "music_style",
    # Not read by the cells visible in this notebook -- TODO confirm whether still needed.
    "sequence_size": 1280,
    # Name of the training-job sub-folder that holds metadata.json / labels.json.
    "train_id": "hierarchical_hiclass"
})


In [28]:

# Folder that holds one sub-folder per training job.
job_path = "/mnt/disks/data/fma/trains"

# Folders derived from the run configuration in `args`.
base_path = os.path.join(args.root_dir, "fma")
models_path = os.path.join(args.root_dir, "models")
train_path = os.path.join(job_path, args.train_id)

# Files read from this job's folder, plus the folder that receives the
# hiclass baseline outputs.
metadata_file, labels_file, hiclass_path = (
    os.path.join(train_path, leaf)
    for leaf in ("metadata.json", "labels.json", "hiclass_baseline")
)

In [29]:
# Prepare the baseline output folder before any results are written to it
# (presumably creates it when missing -- see utils.dir.create_dir).
create_dir(hiclass_path)

True

In [30]:
# Parsed content of <train_path>/metadata.json; later handed to load_dataset.
metadata = __load_json__(metadata_file)

In [31]:
# Parsed content of <train_path>/labels.json (per-level label lookup tables).
labels = __load_json__(labels_file)

In [32]:
labels

{'label_1': {'10': 0,
  '15': 1,
  '38': 2,
  '12': 3,
  '1235': 4,
  '9': 5,
  '17': 6,
  '3': 7,
  '2': 8,
  '21': 9,
  '4': 10,
  '5': 11,
  '8': 12,
  '13': 13,
  '14': 14,
  '20': 15},
 'label_1_name': {'10': 'Pop',
  '15': 'Electronic',
  '38': 'Experimental',
  '12': 'Rock',
  '1235': 'Instrumental',
  '9': 'Country',
  '17': 'Folk',
  '3': 'Blues',
  '2': 'International',
  '21': 'Hip-Hop',
  '4': 'Jazz',
  '5': 'Classical',
  '8': 'Old-Time / Historic',
  '13': 'Easy Listening',
  '14': 'Soul-RnB',
  '20': 'Spoken'},
 'label_1_inverse': [10,
  15,
  38,
  12,
  1235,
  9,
  17,
  3,
  2,
  21,
  4,
  5,
  8,
  13,
  14,
  20],
 'label_1_count': 16,
 'label_2': {'32': 1,
  '25': 2,
  '70': 3,
  '107': 4,
  '137': 5,
  '181': 6,
  '26': 7,
  '267': 8,
  '184': 9,
  '103': 10,
  '224': 11,
  '286': 12,
  '236': 13,
  '66': 14,
  '102': 15,
  '250': 16,
  '27': 17,
  '1': 18,
  '811': 19,
  '30': 20,
  '76': 21,
  '94': 22,
  '441': 23,
  '297': 24,
  '296': 25,
  '130': 26,
  '22

### Load baseline features

In [33]:
# Load the pre-computed feature table (features.csv) from the metadata folder.
features = load(os.path.join(args.metadata_path, 'features.csv'))

In [34]:
#echonest = load(os.path.join(args.metadata_path, 'echonest.csv'))

In [35]:
features.columns.values

array([('chroma_cens', 'kurtosis', '01'),
       ('chroma_cens', 'kurtosis', '02'),
       ('chroma_cens', 'kurtosis', '03'),
       ('chroma_cens', 'kurtosis', '04'),
       ('chroma_cens', 'kurtosis', '05'),
       ('chroma_cens', 'kurtosis', '06'),
       ('chroma_cens', 'kurtosis', '07'),
       ('chroma_cens', 'kurtosis', '08'),
       ('chroma_cens', 'kurtosis', '09'),
       ('chroma_cens', 'kurtosis', '10'),
       ('chroma_cens', 'kurtosis', '11'),
       ('chroma_cens', 'kurtosis', '12'), ('chroma_cens', 'max', '01'),
       ('chroma_cens', 'max', '02'), ('chroma_cens', 'max', '03'),
       ('chroma_cens', 'max', '04'), ('chroma_cens', 'max', '05'),
       ('chroma_cens', 'max', '06'), ('chroma_cens', 'max', '07'),
       ('chroma_cens', 'max', '08'), ('chroma_cens', 'max', '09'),
       ('chroma_cens', 'max', '10'), ('chroma_cens', 'max', '11'),
       ('chroma_cens', 'max', '12'), ('chroma_cens', 'mean', '01'),
       ('chroma_cens', 'mean', '02'), ('chroma_cens', 'mean', '

### Load split dataset

In [36]:
# Train / test frames as returned by load_dataset for this job's metadata.
df_train, df_test = load_dataset(metadata)

### Normalize labels

In [37]:
def norm_labels(label):
    """Turn a serialized label (a Python-literal string) into a Python object.

    Parameters
    ----------
    label : str or object
        Usually the string stored in a ``full_genre_id`` cell.  Any value that
        is not a string is assumed to be parsed already and is returned as is.

    Returns
    -------
    object
        The result of ``ast.literal_eval`` for strings, otherwise ``label``
        itself.

    Raises
    ------
    ValueError, SyntaxError
        If ``label`` is a string that is not a valid Python literal.
    """
    # The cells that call this overwrite the column in place; letting parsed
    # values through keeps those cells safe to re-run (literal_eval rejects
    # lists with ValueError).
    if not isinstance(label, str):
        return label
    return ast.literal_eval(label)
    

In [38]:
# Parse every serialized label of the training split.
df_train['full_genre_id'] = df_train['full_genre_id'].apply(norm_labels)

In [39]:
df_train

Unnamed: 0,track_id,full_genre_id
0,70061,"[[12, 66, , , ], [9, 137, , , ], [17, 94, , , ]]"
1,67240,"[[12, , , , ]]"
2,83269,"[[12, 27, , , ], [10, 76, , , ]]"
3,109541,"[[15, 42, , , ]]"
4,640,"[[20, 65, 43, , ]]"
...,...,...
9918,55341,"[[10, , , , ], [15, , , , ]]"
9919,106930,"[[15, 236, , , ], [38, 47, , , ]]"
9920,27587,"[[12, 66, , , ], [38, , , , ]]"
9921,42667,"[[17, 103, , , ], [10, , , , ]]"


In [40]:
# Parse every serialized label of the test split.
df_test['full_genre_id'] = df_test['full_genre_id'].apply(norm_labels)

In [41]:
#features[features.index == 128973]

#### Preprocessing

In [42]:
# Feature groups to benchmark: display name -> selection handed to pre_process.
# Only the last entry is active; the others are kept commented out.
# NOTE(review): the active key reads ".../centroid/zcr" but its list also
# contains 'tonnetz' -- confirm which one is intended before comparing runs.
feature_sets = {
   # 'echonest_audio': ('echonest', 'audio_features'),
   # 'echonest_temporal': ('echonest', 'temporal_features'),
    #'mfcc': 'mfcc',
    #'mfcc/contrast/chroma/centroid/tonnetz': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'tonnetz'],
    'mfcc/contrast/chroma/centroid/zcr': ['mfcc', 'spectral_contrast', 'chroma_cens', 'spectral_centroid', 'zcr', 'tonnetz'],
}

In [43]:
# Base estimators to benchmark: display name -> estimator instance.  Each one is
# used as the local classifier of the hierarchical model; only 'MLP1' is active.
# NOTE(review): no random_state is passed to MLPClassifier, so results are
# presumably not reproducible from run to run -- consider fixing a seed.
classifiers = {
    #'XGB': xgb_estimator,
    #'LR': LogisticRegression(),
    #'kNN': KNeighborsClassifier(n_neighbors=5),
    #'SVCrbf': SVC(kernel='rbf',probability=True),
    #'SVCpoly1': SVC(kernel='poly', degree=1,probability=True),
    #'linSVC1': SVC(kernel="linear",probability=True),
    #GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True),
    #'DT': DecisionTreeClassifier(max_depth=5, max_features='sqrt'),
    #'RF': RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    #'AdaBoost': AdaBoostClassifier(n_estimators=10),
    'MLP1': MLPClassifier(hidden_layer_sizes=(100,), max_iter=2000),
    #'MLP2': MLPClassifier(hidden_layer_sizes=(200, 50), max_iter=2000),
    #'NB': GaussianNB(),
    #'QDA': QuadraticDiscriminantAnalysis(),
}

In [44]:

def converter_segundos(segundos):
    """Split a duration in seconds into ``(hours, minutes, seconds)``.

    Hours are whole multiples of 3600, minutes are the whole multiples of 60
    left in the remainder, and seconds are ``segundos`` modulo 60.
    """
    horas, resto = divmod(segundos, 3600)
    minutos = resto // 60
    return horas, minutos, segundos % 60

# In[125]:



def test_classifiers_features(classifiers, features_all, feature_sets, multi_label=False):
    """Fit and run one hierarchical model per (feature set, classifier) pair.

    For every entry of ``feature_sets`` the train and test inputs are built with
    ``pre_process`` from the notebook-level ``df_train`` / ``df_test`` frames.
    Each classifier is wrapped in a ``MultiLabelLocalClassifierPerNode``, fitted
    on the training data and used to predict the test data.  A copy of
    ``df_test`` with an added ``y_pred`` column is written to ``predict.csv`` and
    the elapsed time to ``time.txt``, both under ``<hiclass_path>/<clf_name>``.

    Parameters
    ----------
    classifiers : dict
        Maps a classifier name to the estimator used as ``local_classifier``.
    features_all
        Feature table, forwarded unchanged to ``pre_process``.
    feature_sets : dict
        Maps a feature-set name to the selection forwarded to ``pre_process``.
    multi_label : bool, default False
        Forwarded unchanged to ``pre_process``.

    Returns
    -------
    pandas.DataFrame
        Elapsed seconds (fit + predict + conversion of the predictions), indexed
        by feature-set name with one column per classifier name.

    Notes
    -----
    Relies on the notebook globals ``df_train``, ``df_test`` and ``hiclass_path``.
    The output folder depends only on the classifier name, so when several
    feature sets are given, later runs overwrite the files of earlier ones.
    """
    times = pd.DataFrame(columns=classifiers.keys(), index=feature_sets.keys())
    for fset_name, fset in tqdm(feature_sets.items(), desc='features'):
        X_train, y_train = pre_process(df_train, features_all, fset, multi_label)
        # The test targets are not needed here; they stay in df_test and are
        # saved next to the predictions.
        X_test, _ = pre_process(df_test, features_all, fset, multi_label)
        for clf_name, clf in tqdm(classifiers.items(), desc='classifiers'):
            # Wall-clock timer: process_time() only counts CPU time of the
            # current process, so it misreports a fit that runs with n_jobs=8.
            start = time.perf_counter()
            df_test_pred = df_test.copy(deep=True)
            clf_path = os.path.join(hiclass_path, clf_name)
            create_dir(clf_path)
            logging.info(f'Init training step for {clf_name}')
            hclf = MultiLabelLocalClassifierPerNode(local_classifier=clf, n_jobs=8, verbose=1)
            hclf.fit(X_train, y_train)
            y_pred = hclf.predict(X_test)
            # Convert the predictions to nested lists of plain strings
            # (empty entries stay as empty strings).
            y_pred = [[[str(num) for num in sublist] for sublist in lista] for lista in y_pred]
            df_test_pred['y_pred'] = y_pred
            end_time = time.perf_counter() - start
            logging.info(f'End training with {end_time}')
            times.loc[fset_name, clf_name] = end_time
            # Persist the measured duration next to the predictions
            with open(os.path.join(clf_path, "time.txt"), "w") as f:
                f.write("Tempo de Treinamento: {} segundos".format(end_time))

            print("Tempo de Treinamento:", end_time, "segundos")
            df_test_pred.to_csv(os.path.join(clf_path, 'predict.csv'))
    return times

In [None]:
# Run the benchmark.  The function returns only the timing table; the
# predictions are written to disk by the function itself.
times = test_classifiers_features(classifiers, features, feature_sets, multi_label=True)


features:   0%|          | 0/1 [00:00<?, ?it/s]

classifiers:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:root:Init training step for MLP1
INFO:LCPN:Creating digraph from 84035 3D labels
INFO:LCPN:Detected 16 roots
INFO:LCPN:Initializing local classifiers
INFO:LCPN:Initializing siblings binary policy
INFO:LCPN:Fitting local classifiers
