In [7]:
import os
import tensorflow as tf
import pandas as pd


In [8]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "train_id": "hierarchical_mini_test",
    'sample_size':0.01
})


In [9]:

job_path = "/mnt/disks/data/fma/trains"


# In[15]:


train_path = os.path.join(job_path,args.train_id)


# In[16]:


base_path = os.path.join(args.root_dir,"fma")


# In[17]:


models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")


# In[18]:


metadata_file = os.path.join(train_path,"metadata.json")


labels_file = os.path.join(train_path,"labels.json")


In [10]:
import tensorflow as tf
import multiprocessing



class Dataset:
    def __init__(self, tfrecords_path, epochs, batch_size):
        self.tfrecords_path = tfrecords_path
        self.epochs = epochs
        self.batch_size = batch_size

    def list_files(self):
        return [os.path.join(tfrecords_path,file_path) for file_path in os.listdir(tfrecords_path)]

    def build(self):
        files = self.list_files()

        print("build_tf record: files_count: {} / batch_size: {} / epochs: {}".format(len(files), self.batch_size, self.epochs))

        ds = tf.data.TFRecordDataset(files, num_parallel_reads=multiprocessing.cpu_count())
        
        '''''
            Shuffle and reapeat
        '''''
        
        
        ds = ds.shuffle(buffer_size=1024 * 1 * 10)
        ds = ds.repeat(count=self.epochs)
        
        
        
        '''''
            Map and batch
        '''''
        
                      
        ds = ds.map(self.__parse__, num_parallel_calls=None)
        ds = ds.batch(self.batch_size,drop_remainder=False)
        
        
                      
        ds = ds.prefetch(buffer_size=5)
                      

        return ds
    
   
    @staticmethod
    def __parse__(example):
        parsed = tf.parse_single_example(example, features={
            'emb' : tf.io.FixedLenFeature([], tf.string),
            'label' : tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True)
        })
        
        content = tf.io.parse_single_example(element, data)

        label = tf.cast(content['label'], tf.int32)
        label_hot = tf.one_hot(label1[0], label1[1])
        
        emb = content['emb']
        #get our 'feature'
        feature = tf.io.parse_tensor(emb, out_type=tf.float32)

        inp = {'emb': feature}
        out = {'global_output': label_hot}

        return inp, out


In [14]:

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
  array = tf.io.serialize_tensor(array)
  return array



def parser(serialized_example):
    features_description = {'emb': tf.io.FixedLenFeature([], tf.string)}
    features = tf.io.parse_single_example(serialized_example, features_description)
    features = tf.io.decode_raw(features['emb'], tf.float32)
    return features


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'label' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    label = content['label']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, label)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [20]:
import numpy as np


def load_dataset(train_path,dataset='train'):
    tfrecords_path = os.path.join(train_path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['Feature', 'Label']
    )
        
    df.Label = df.Label.apply(lambda x: x[0])
    df.dropna(inplace=True)
    
    
    try:
        df.Feature = df.Feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [21]:
df_train = load_dataset(train_path,dataset='train')

[]


In [17]:
df_train

Unnamed: 0,Feature,Label


In [18]:
df_test = load_dataset(train_path,dataset='test')

In [19]:
df_test

Unnamed: 0,Feature,Label


In [12]:
df_val = load_dataset(train_path,dataset='val')

In [22]:
df_train.shape

(95878, 2)

In [10]:
df_test.dropna(inplace=True)

In [11]:
df_test.shape

(5396, 2)

In [19]:
df_train.dropna(inplace=True)

In [20]:
df_val.dropna(inplace=True)

In [21]:
df_val.shape

(5128, 2)

In [23]:
import xgboost as xgb

In [24]:
xgb_model = xgb.XGBClassifier(random_state=42,eval_metric="auc",n_jobs=20)

In [26]:
# Show all messages, including ones pertaining to debugging
xgb.set_config(verbosity=2)

# Get current value of global configuration
# This is a dict containing all parameters in the global configuration,
# including 'verbosity'
config = xgb.get_config()
assert config['verbosity'] == 2

# Example of using the context manager xgb.config_context().
# The context manager will restore the previous value of the global
# configuration upon exiting.
assert xgb.get_config()['verbosity'] == 2  # old value restored

In [None]:
xgb_model.fit(df_train.Feature.values.tolist()[:100], df_train.Label.values.tolist()[:100], 
        eval_set=[(df_val.Feature.values.tolist(), df_val.Label.values.tolist())])

[14:48:06] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 82 extra nodes, 0 pruned nodes, max_depth=6
[14:48:08] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 120 extra nodes, 0 pruned nodes, max_depth=6
[14:48:10] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 96 extra nodes, 0 pruned nodes, max_depth=6
[14:48:12] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 110 extra nodes, 0 pruned nodes, max_depth=6
[14:48:13] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 110 extra nodes, 0 pruned nodes, max_depth=6
[14:48:15] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6
[14:48:17] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 84 extra nodes, 0 pruned nodes, max_depth=6
[14:48:19] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 42 extra nodes, 0 pruned nodes, max_depth=6
[14:48:21] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 26 extra nodes, 0 pruned nodes, m

Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fdf881a1550>
Traceback (most recent call last):
  File "/home/bruno/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 160, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fdf881a1550>
Traceback (most recent call last):
  File "/home/bruno/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 160, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 
Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fdf881a1550>
Traceback (most recent call last):
  File "/home/bruno/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 160, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


[14:49:56] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 30 extra nodes, 0 pruned nodes, max_depth=6


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fdf881a1550>
Traceback (most recent call last):
  File "/home/bruno/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 160, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


[14:49:59] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 62 extra nodes, 0 pruned nodes, max_depth=6


Exception ignored on calling ctypes callback function: <function _log_callback at 0x7fdf881a1550>
Traceback (most recent call last):
  File "/home/bruno/anaconda3/lib/python3.9/site-packages/xgboost/core.py", line 160, in _log_callback
    def _log_callback(msg: bytes) -> None:
KeyboardInterrupt: 


[14:50:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 16 extra nodes, 0 pruned nodes, max_depth=6


In [None]:
from sklearn.metrics import confusion_matrix, mean_squared_error, classification_report

In [None]:
xgb_model.save_model(os.path.join(train_path,"model.json"))

In [None]:
y_pred = xgb_model.predict(df_test.Feature.values.tolist())

mean_squared_error(df_test.Label.values.tolist(), y_pred)

In [41]:
report_metrics = pd.DataFrame(classification_report(df_test.Label.values.tolist(), y_pred,output_dict=True)).transpose()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,precision,recall,f1-score,support
0,0.000000,0.000000,0.000000,1.000000
1,0.133333,0.400000,0.200000,5.000000
2,0.000000,0.000000,0.000000,1.000000
3,0.000000,0.000000,0.000000,1.000000
4,0.000000,0.000000,0.000000,1.000000
...,...,...,...,...
93,1.000000,1.000000,1.000000,1.000000
94,1.000000,1.000000,1.000000,1.000000
accuracy,0.370968,0.370968,0.370968,0.370968
macro avg,0.333487,0.347911,0.337561,124.000000


In [None]:
report_metrics.to_csv(os.path.join(train_path,'report_train_xgboost.csv'),index=False)

In [34]:
df = pd.read_csv(os.path.join(train_path,'report_train_xgboost.csv'))

In [35]:
df

Unnamed: 0,precision,recall,f1-score,support
0,0.878222,0.679123,0.765945,2007.00000
1,0.691616,0.756546,0.722626,7829.00000
2,0.984200,0.960797,0.972358,1556.00000
3,0.989390,0.987426,0.988407,1511.00000
4,0.936823,0.923488,0.930108,1686.00000
...,...,...,...,...
147,0.000000,0.000000,0.000000,1.00000
148,0.958506,0.982979,0.970588,235.00000
149,0.835030,0.835030,0.835030,0.83503
150,0.944028,0.928823,0.935587,95878.00000


### Predict after training

In [12]:
import pandas as pd

In [13]:
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score, classification_report

In [14]:
import xgboost as xgb

In [21]:
xgb_model = xgb.XGBClassifier(random_state=42,eval_metric="auc",n_jobs=20)

In [22]:
xgb_model.load_model(os.path.join(train_path,"model.json"))

In [23]:
xgb_model.get_params()

{'objective': 'multi:softprob',
 'use_label_encoder': None,
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': 'auc',
 'feature_types': None,
 'gamma': None,
 'gpu_id': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': 20,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': 42,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [24]:
y_pred = xgb_model.predict(df_test.Feature.values.tolist())

accuracy_score(df_test.Label.values.tolist(), y_pred)

0.2707561156412157

In [25]:
!nvidia-smi

Mon Mar 13 16:15:10 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 510.108.03   Driver Version: 510.108.03   CUDA Version: 11.6     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ...  Off  | 00000000:65:00.0 Off |                  N/A |
|  0%   33C    P8    21W / 260W |   9968MiB / 11264MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [18]:
df_test.Label.values.tolist()

[4,
 28,
 32,
 15,
 15,
 3,
 32,
 3,
 10,
 15,
 30,
 15,
 15,
 15,
 2,
 28,
 21,
 1,
 34,
 4,
 137,
 4,
 107,
 3,
 0,
 15,
 32,
 89,
 25,
 54,
 32,
 25,
 15,
 15,
 15,
 119,
 57,
 9,
 32,
 23,
 10,
 15,
 61,
 12,
 23,
 5,
 15,
 18,
 23,
 0,
 22,
 54,
 32,
 16,
 13,
 32,
 1,
 28,
 15,
 15,
 67,
 10,
 18,
 17,
 21,
 15,
 15,
 15,
 15,
 2,
 15,
 32,
 40,
 25,
 116,
 19,
 1,
 18,
 148,
 15,
 18,
 57,
 18,
 15,
 15,
 23,
 15,
 87,
 15,
 15,
 105,
 10,
 15,
 29,
 25,
 12,
 15,
 17,
 15,
 10,
 23,
 15,
 34,
 4,
 89,
 15,
 17,
 15,
 47,
 15,
 10,
 32,
 1,
 10,
 54,
 17,
 0,
 125,
 32,
 29,
 12,
 32,
 56,
 1,
 15,
 15,
 20,
 24,
 15,
 12,
 82,
 45,
 1,
 21,
 45,
 32,
 34,
 28,
 40,
 18,
 1,
 28,
 49,
 18,
 1,
 18,
 15,
 12,
 4,
 28,
 18,
 12,
 27,
 103,
 15,
 29,
 15,
 34,
 12,
 15,
 18,
 15,
 15,
 32,
 9,
 10,
 21,
 2,
 105,
 23,
 15,
 41,
 28,
 62,
 36,
 18,
 1,
 15,
 10,
 1,
 99,
 124,
 38,
 34,
 14,
 15,
 12,
 78,
 1,
 21,
 46,
 1,
 27,
 40,
 12,
 12,
 32,
 26,
 1,
 21,
 34,
 15,
 8,
 98,
 

In [29]:
y_pred

array([ 15,  15,  12, ..., 105,  15,  10])

In [32]:
report_metrics = pd.DataFrame(classification_report(df_test.Label.values.tolist(), y_pred,output_dict=True)).transpose()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
report_metrics

Unnamed: 0,precision,recall,f1-score,support
0,0.030769,0.017857,0.022599,112.000000
1,0.221311,0.311060,0.258621,434.000000
2,0.385965,0.252874,0.305556,87.000000
3,0.301887,0.190476,0.233577,84.000000
4,0.184211,0.148936,0.164706,94.000000
...,...,...,...,...
147,0.000000,0.000000,0.000000,1.000000
148,1.000000,0.071429,0.133333,14.000000
accuracy,0.270756,0.270756,0.270756,0.270756
macro avg,0.199030,0.099529,0.114855,5396.000000
