In [1]:
import os
import json

from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerNode

import tensorflow as tf
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import logging


2023-04-24 16:45:19.341914: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:

# Run configuration. A pandas Series is used so entries can be read as
# attributes (args.root_dir, args.train_id, ...).
args = pd.Series(
    dict(
        root_dir="/mnt/disks/data/",
        dataset_path="/mnt/disks/data/fma/fma_large",
        embeddings="music_style",
        train_id="hierarchical_single",
    )
)


In [3]:

# --- Dataset-wide locations (derived from args.root_dir) ---
base_path = os.path.join(args.root_dir, "fma")
models_path = os.path.join(args.root_dir, "models")
metadata_path_fma = os.path.join(base_path, "fma_metadata")

# --- Locations specific to this training run (args.train_id) ---
job_path = "/mnt/disks/data/fma/trains"
train_path = os.path.join(job_path, args.train_id)
metadata_file = os.path.join(train_path, "metadata.json")
labels_file = os.path.join(train_path, "labels.json")


In [4]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [5]:
import tensorflow as tf
import multiprocessing



class Dataset:
    """Builds a ``tf.data`` dataset over a directory of TFRecord files.

    ``epochs`` and ``batch_size`` are stored and reported by ``build`` but are
    not applied to the dataset it returns.
    """

    def __init__(self, tfrecords_path, epochs, batch_size):
        self.tfrecords_path = tfrecords_path
        self.epochs = epochs
        self.batch_size = batch_size

    def list_files(self):
        """Return the full path of every entry in ``self.tfrecords_path``, sorted by name."""
        # Read the directory from the instance: a bare `tfrecords_path` is not
        # defined in this scope. Sorting makes the file order deterministic.
        return [
            os.path.join(self.tfrecords_path, file_name)
            for file_name in sorted(os.listdir(self.tfrecords_path))
        ]

    def build(self):
        """Create a ``TFRecordDataset`` over all listed files, read in parallel.

        Returns the raw (still serialized) dataset; ``__parse__`` is not applied here.
        """
        files = self.list_files()

        print("build_tf record: files_count: {} / batch_size: {} / epochs: {}".format(len(files), self.batch_size, self.epochs))

        ds = tf.data.TFRecordDataset(files, num_parallel_reads=multiprocessing.cpu_count())

        return ds

    @staticmethod
    def __parse__(example):
        """Parse one serialized example into ``(inputs, outputs)`` dicts.

        Returns ``({'emb': <float32 tensor>}, {'global_output': <one-hot tensor>})``.
        """
        feature_spec = {
            'emb': tf.io.FixedLenFeature([], tf.string),
            'track_id': tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
        }
        # Parse once with the TF2 API, using this function's own argument.
        content = tf.io.parse_single_example(example, feature_spec)

        label = tf.cast(content['track_id'], tf.int32)
        # NOTE(review): the original referenced an undefined `label1`; `label` is
        # assumed to be the intended tensor, with element 0 the class index and
        # element 1 the one-hot depth -- confirm against the TFRecord writer.
        label_hot = tf.one_hot(label[0], label[1])

        # 'emb' holds a serialized tensor; decode it back to float32.
        feature = tf.io.parse_tensor(content['emb'], out_type=tf.float32)

        inp = {'emb': feature}
        out = {'global_output': label_hot}

        return inp, out


In [6]:

def _bytes_feature(value):
    """Wrap a string / bytes value (or an eager tensor holding one) in a bytes_list Feature."""
    eager_tensor_type = type(tf.constant(0))
    # Eager tensors are unwrapped to their underlying value first.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
    """Wrap a float / double in a single-element float_list Feature."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a bool / enum / int / uint in a single-element int64_list Feature."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
    """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
    return tf.io.serialize_tensor(array)



def parser(serialized_example):
    """Parse one serialized example and decode its ``emb`` bytes as raw float32 values."""
    spec = {'emb': tf.io.FixedLenFeature([], tf.string)}
    parsed = tf.io.parse_single_example(serialized_example, spec)
    return tf.io.decode_raw(parsed['emb'], tf.float32)


def parse_tfr_element(element):
    """Parse one serialized example into an ``(embedding, track_id)`` tuple.

    ``emb`` is read as a serialized tensor and decoded to float32;
    ``track_id`` is read as a scalar int64.
    """
    feature_spec = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    example = tf.io.parse_single_example(element, feature_spec)

    # Decode the serialized embedding tensor; no reshape is applied here.
    embedding = tf.io.parse_tensor(example['emb'], out_type=tf.float32)
    return (embedding, example['track_id'])


def get_dataset(filename):
    """Build a TFRecord dataset from ``filename`` with each record mapped through ``parse_tfr_element``."""
    raw_records = tf.data.TFRecordDataset(filename)
    return raw_records.map(parse_tfr_element)




In [7]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    """Load every TFRecord under ``<path>/tfrecords/<dataset>`` into a DataFrame.

    Args:
        path: Root directory that contains the ``tfrecords`` folder.
        dataset: Name of the sub-folder to read (defaults to ``args.embeddings``,
            evaluated when this function is defined).

    Returns:
        DataFrame with columns ``feature`` and ``track_id``. Each ``feature`` is the
        first element of the decoded embedding, or ``None`` when the decoded
        embedding has a zero-length first axis; callers may drop those rows.
    """
    tfrecords_dir = os.path.join(path, 'tfrecords', dataset)
    tfrecord_files = [os.path.join(tfrecords_dir, name) for name in os.listdir(tfrecords_dir)]
    records = get_dataset(tfrecord_files)

    frame = pd.DataFrame(
        records.as_numpy_iterator(),
        columns=['feature', 'track_id']
    ).dropna()

    # No blanket try/except here: the previous handler printed an undefined name,
    # so any real failure was replaced by a NameError. Let the real error surface.
    first_elements = frame['feature'].apply(lambda emb: emb[0] if emb.shape[0] != 0 else None)

    return frame.assign(feature=first_elements)
    


In [8]:
# Load the (feature, track_id) table for the configured embeddings from the TFRecords.
df = load_dataset(args.dataset_path,dataset=args.embeddings)

2023-04-24 16:45:23.842359: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8792 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5
2023-04-24 16:45:24.046981: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


In [9]:
print(df)

                                                  feature  track_id
0       [0.0008698702, 0.008201729, 0.018753221, -0.03...    124573
1       [-0.00097459555, -0.0051385164, -0.024011323, ...    124574
2       [0.039855253, 0.0076441965, -0.00922821, -0.04...    124575
3       [0.0029335518, 0.020818433, 0.04269241, -0.016...    124576
4       [0.057992022, -0.0510619, -0.048113894, -0.032...    124577
...                                                   ...       ...
104181  [-0.0080008805, 6.110469e-05, 0.18494046, 0.02...     94245
104182  [0.017404526, 0.0132987695, 0.004312843, 0.048...     94246
104183  [0.01726994, 0.005624622, 0.10627997, 0.007201...     94247
104184  [0.07001238, -3.400445e-05, 0.03528729, 0.0683...     94248
104185  [-0.013616562, -0.012981772, -0.0065422454, -0...     94249

[104186 rows x 2 columns]


In [10]:
# Drop rows with any missing value, in particular features that load_dataset
# set to None for empty embeddings. Mutates df in place.
df.dropna(inplace=True)

In [11]:
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [135]:
# Read the per-track label table (tracks.csv) of the current training run.
tracks_df = pd.read_csv(os.path.join(train_path,"tracks.csv"))

In [136]:
tracks_df

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5
0,48192,"[14, '19']",14,14-19,14-19-0,14-19-0-0,14-19-0-0-0
1,92014,"[14, '19']",14,14-19,14-19-0,14-19-0-0,14-19-0-0-0
2,80238,"[3, '567']",3,3-567,3-567-0,3-567-0-0,3-567-0-0-0
3,117638,"[3, '567']",3,3-567,3-567-0,3-567-0-0,3-567-0-0-0
4,38991,"[14, '19']",14,14-19,14-19-0,14-19-0-0,14-19-0-0-0
...,...,...,...,...,...,...,...
605,38976,"[14, '19']",14,14-19,14-19-0,14-19-0-0,14-19-0-0-0
606,123630,"[14, '19']",14,14-19,14-19-0,14-19-0-0,14-19-0-0-0
607,118651,['3'],3,3-0,3-0-0,3-0-0-0,3-0-0-0-0
608,48197,"[14, '19']",14,14-19,14-19-0,14-19-0-0,14-19-0-0-0


In [137]:
# Label dictionaries from labels.json; indexed below as labels['label1'], labels['label2'].
labels = __load_json__(labels_file)

In [138]:
labels['label2']

{'14-0': 2, '3-567': 3, '14-19': 4, '3-0': 5, '14-11': 6}

In [139]:
tqdm.pandas()

In [140]:
# Replace the string labels of each hierarchy level with their ids from `labels`.
# Only levels 1 and 2 are mapped; extend the tuple (up to 5) for deeper levels.
# NOTE: not idempotent -- re-running on already-mapped columns raises KeyError.
for level in (1, 2):
    column = 'labels_{}'.format(level)
    level_ids = labels['label{}'.format(level)]
    # Plain column assignment replaces the whole column, so the result dtype does
    # not depend on how the installed pandas handles `.loc[:, col] = ...`.
    tracks_df[column] = tracks_df[column].astype(str).progress_apply(lambda x: level_ids[x])


  0%|          | 0/610 [00:00<?, ?it/s]

  0%|          | 0/610 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])


In [141]:
# Attach the embedding column to each track by track_id (merge's default join type is used).
tracks_df = tracks_df.merge(df, on='track_id')

In [142]:
# One row per distinct level-2 label together with its level-1 label
# (the first occurrence in tracks_df is kept).
genres_df = tracks_df[['labels_1', 'labels_2']].drop_duplicates(subset=['labels_2'])

In [143]:
from sklearn import svm
from sklearn import tree
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [144]:
tracks_df

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,48192,"[14, '19']",0,4,14-19-0,14-19-0-0,14-19-0-0-0,"[0.28188887, 0.018675039, 0.0009581049, -0.047..."
1,92014,"[14, '19']",0,4,14-19-0,14-19-0-0,14-19-0-0-0,"[0.0014705658, -0.040685076, -0.019948969, 0.0..."
2,80238,"[3, '567']",1,3,3-567-0,3-567-0-0,3-567-0-0-0,"[0.09746888, -0.016016701, -0.012167622, 0.024..."
3,117638,"[3, '567']",1,3,3-567-0,3-567-0-0,3-567-0-0-0,"[0.05056459, -0.021571884, -0.030859172, 0.011..."
4,38991,"[14, '19']",0,4,14-19-0,14-19-0-0,14-19-0-0-0,"[-0.013123721, -0.043863863, 0.04856713, 0.083..."
...,...,...,...,...,...,...,...,...
605,38976,"[14, '19']",0,4,14-19-0,14-19-0-0,14-19-0-0-0,"[0.104641505, 0.04290545, 0.0416345, 0.1105095..."
606,123630,"[14, '19']",0,4,14-19-0,14-19-0-0,14-19-0-0-0,"[0.030703971, 0.012356709, -0.05860837, -0.041..."
607,118651,['3'],1,5,3-0-0,3-0-0-0,3-0-0-0-0,"[0.0019271672, 0.004137248, 0.18107063, -0.089..."
608,48197,"[14, '19']",0,4,14-19-0,14-19-0-0,14-19-0-0-0,"[0.021714428, 0.05667938, -0.009912391, 0.0812..."


In [183]:
def __map_target(y,genres_df):
    
    genre = genres_df[genres_df['labels_2'] == y]
    
    return genre.values

In [233]:
# Split features and hierarchical targets into train and test sets.
# Each target is the genres_df row (labels_1, labels_2) matching the track's
# labels_2 value; __map_target filters genres_df once per track.
X_train, X_test, y_train, y_test = train_test_split(
    tracks_df.feature.values.tolist(),
    np.concatenate(list(map(lambda x: __map_target(x,genres_df), tracks_df.labels_2.values)),axis=0).tolist(),
    test_size=0.05,
    random_state=25,
    # Stratify on the level-2 label.
    stratify=tracks_df.labels_2.values,
)


In [234]:
y_train

[[0, 4],
 [1, 5],
 [0, 4],
 [1, 5],
 [0, 4],
 [0, 2],
 [0, 2],
 [0, 4],
 [1, 5],
 [1, 5],
 [0, 4],
 [1, 3],
 [1, 5],
 [0, 4],
 [0, 4],
 [0, 4],
 [0, 6],
 [0, 4],
 [0, 4],
 [1, 5],
 [1, 5],
 [1, 3],
 [0, 4],
 [0, 4],
 [0, 4],
 [1, 5],
 [0, 4],
 [1, 3],
 [0, 4],
 [0, 4],
 [0, 4],
 [1, 5],
 [1, 5],
 [0, 2],
 [1, 5],
 [1, 5],
 [0, 4],
 [1, 5],
 [0, 4],
 [0, 6],
 [0, 6],
 [0, 4],
 [1, 5],
 [1, 5],
 [0, 6],
 [1, 5],
 [0, 4],
 [1, 3],
 [0, 2],
 [0, 6],
 [1, 5],
 [1, 3],
 [0, 2],
 [0, 2],
 [1, 5],
 [0, 4],
 [0, 4],
 [1, 5],
 [1, 3],
 [1, 5],
 [0, 4],
 [0, 4],
 [0, 4],
 [0, 4],
 [1, 3],
 [1, 3],
 [0, 6],
 [0, 4],
 [1, 5],
 [0, 4],
 [0, 6],
 [0, 4],
 [0, 4],
 [1, 3],
 [0, 4],
 [1, 3],
 [0, 4],
 [0, 6],
 [0, 4],
 [1, 3],
 [0, 2],
 [0, 4],
 [1, 3],
 [0, 4],
 [0, 6],
 [0, 4],
 [0, 4],
 [1, 3],
 [0, 4],
 [0, 4],
 [0, 4],
 [0, 4],
 [0, 6],
 [0, 4],
 [0, 4],
 [0, 2],
 [0, 6],
 [0, 4],
 [0, 4],
 [0, 4],
 [1, 3],
 [0, 4],
 [0, 4],
 [0, 2],
 [0, 6],
 [0, 2],
 [0, 4],
 [0, 4],
 [1, 5],
 [1, 5],
 [1, 5],
 

In [189]:
genres_df[genres_df['labels_2'] == 4]

Unnamed: 0,labels_1,labels_2
0,0,4


In [235]:
import xgboost as xgb

In [236]:
# NOTE(review): not referenced by any later visible cell; an identically configured
# classifier is created further down as xgb_estimator.
xgb_model = xgb.XGBClassifier(random_state=42,eval_metric="auc",n_jobs=20)

In [237]:
# Set XGBoost's global verbosity to 0 (silent), suppressing log messages.
xgb.set_config(verbosity=0)

# Get current value of global configuration
# This is a dict containing all parameters in the global configuration,
# including 'verbosity'
config = xgb.get_config()
assert config['verbosity'] == 0

# Sanity check: the global verbosity is still the value set above.
# (No xgb.config_context() block is used in this cell, so nothing is restored.)
assert xgb.get_config()['verbosity'] == 0  # unchanged since set_config above

In [238]:
# xgb_model.fit(df_train.Feature.values.tolist()[:100], df_train.Label.values.tolist()[:100], eval_set=[(df_val.Feature.values.tolist(), df_val.Label.values.tolist())])

In [269]:
# Logistic-regression candidate for the per-node local classifier
# (default hyper-parameters; not the estimator passed to LocalClassifierPerNode below).
logistic_estimator = LogisticRegression()

In [270]:
# SVM candidate: a one-step pipeline holding an RBF-kernel SVC with probability estimates enabled.
svm_estimator = make_pipeline(svm.SVC(probability=True, kernel="rbf"))


In [271]:
xgb_estimator = xgb.XGBClassifier(random_state=42,eval_metric="auc",n_jobs=20)

In [272]:
tree_estimator = tree.DecisionTreeClassifier(min_samples_leaf=7,max_features='sqrt')


In [273]:
# Hierarchical classifier that uses the decision-tree estimator as its local classifier.
classifier = LocalClassifierPerNode(local_classifier=tree_estimator)

In [274]:
# Fit on the training split; whatever fit() returns is kept as `model` for saving and prediction.
model = classifier.fit(X_train, y_train)

In [275]:
# Persist the fitted model inside the training-run directory.
# NOTE(review): the file is named 'hsvm.model' although the classifier above was
# built from tree_estimator -- confirm the intended name.
filename = os.path.join(train_path,'hsvm.model')
# The context manager closes (and flushes) the file even if pickling fails.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


In [276]:
# Predict the test split and convert to plain Python integers.
# The builtin `int` replaces the alias `np.int`, which is deprecated since
# NumPy 1.20 and removed in 1.24.
y_pred = model.predict(X_test).astype(int).tolist()

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_pred = model.predict(X_test).astype(np.int).tolist()


In [277]:
# Only element 0 of each target/prediction (the labels_1 level) is scored here;
# the labels_2 level is not evaluated.
print("Classification Report:\n", classification_report([x[0] for x in y_test], [x[0] for x in y_pred]))

Classification Report:
               precision    recall  f1-score   support

           0       0.72      0.82      0.77        22
           1       0.33      0.22      0.27         9

    accuracy                           0.65        31
   macro avg       0.53      0.52      0.52        31
weighted avg       0.61      0.65      0.62        31

