In [8]:
import os
import json

from sklearn.linear_model import LogisticRegression
from hiclass import LocalClassifierPerNode

import tensorflow as tf
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import logging
import ast


In [9]:

# Run configuration for this notebook, read via attribute access (e.g. args.train_id).
args = pd.Series(
    dict(
        root_dir="/mnt/disks/data/",
        dataset_path="/mnt/disks/data/fma/fma_large",
        embeddings="music_style",
        train_id="hierarchical_all",
    )
)


In [10]:

# --- Filesystem layout ------------------------------------------------------
# `job_path` is a literal; every other path is derived from it or from `args`.

# Directory of the selected training job: <job_path>/<args.train_id>.
job_path = "/mnt/disks/data/fma/trains"
train_path = os.path.join(job_path, args.train_id)

# JSON artifacts stored inside the training-job directory.
metadata_file = os.path.join(train_path, "metadata.json")
labels_file = os.path.join(train_path, "labels.json")

# <root_dir>/fma and its metadata sub-directory.
base_path = os.path.join(args.root_dir, "fma")
metadata_path_fma = os.path.join(base_path, "fma_metadata")

# <root_dir>/models.
models_path = os.path.join(args.root_dir, "models")


In [11]:


def __load_json__(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the standard JSON encoding) so the
    result does not depend on the platform's locale default.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)




In [12]:

def _bytes_feature(value):
    """Wrap a single string / bytes value in a ``tf.train.Feature`` holding a one-element bytes_list."""
    tensor_type = type(tf.constant(0))
    # A tensor is unwrapped to its underlying value before being stored.
    raw = value.numpy() if isinstance(value, tensor_type) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[raw]))

def _float_feature(value):
    """Wrap a single float / double in a ``tf.train.Feature`` holding a one-element float_list."""
    float_list = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a ``tf.train.Feature`` holding a one-element int64_list."""
    int64_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int64_list)

def serialize_array(array):
    """Return ``array`` serialized with ``tf.io.serialize_tensor``."""
    return tf.io.serialize_tensor(array)




def parse_tfr_element(element):
    """Decode one serialized example into a ``(feature, track_id)`` pair.

    Args:
        element: A serialized example carrying a string ``emb`` field and an
            int64 ``track_id`` field.

    Returns:
        Tuple ``(feature, track_id)``: ``feature`` is the ``emb`` field parsed
        back into a float32 tensor, ``track_id`` is the stored int64 scalar.
    """
    # Schema of the stored record; both fields are declared as scalars.
    schema = {
        'emb': tf.io.FixedLenFeature([], tf.string),
        'track_id': tf.io.FixedLenFeature([], tf.int64),
    }
    parsed = tf.io.parse_single_example(element, schema)

    # 'emb' holds a serialized tensor; restore it as float32.
    feature = tf.io.parse_tensor(parsed['emb'], out_type=tf.float32)
    return (feature, parsed['track_id'])


def get_dataset(filename):
    """Build a dataset over the TFRecord file(s) in ``filename``.

    Every record is decoded with ``parse_tfr_element``, so iterating the
    result yields ``(feature, track_id)`` pairs.
    """
    return tf.data.TFRecordDataset(filename).map(parse_tfr_element)




In [13]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    """Load every TFRecord file under ``<path>/tfrecords/<dataset>`` into a DataFrame.

    Args:
        path: Directory that contains a ``tfrecords`` sub-directory.
        dataset: Name of the sub-directory of ``tfrecords`` to read. The
            default is ``args.embeddings`` as it was when this function was
            defined.

    Returns:
        A DataFrame with columns ``feature`` and ``track_id``. ``feature`` is
        the first entry along the leading axis of each stored array; records
        whose stored array is empty are dropped.

    Raises:
        OSError: If the TFRecord directory cannot be listed.
    """
    shard_dir = os.path.join(path, 'tfrecords', dataset)
    # os.listdir returns entries in arbitrary order; sort so the row order is
    # reproducible between runs.
    shard_files = [
        os.path.join(shard_dir, shard_name)
        for shard_name in sorted(os.listdir(shard_dir))
    ]
    records = get_dataset(shard_files)

    df = pd.DataFrame(
        records.as_numpy_iterator(),
        columns=['feature', 'track_id']
    ).dropna()

    # Keep the first entry of each stored array; an empty array becomes None
    # so the final dropna() removes that record. Errors are allowed to
    # propagate: the previous bare `except` printed an undefined name and so
    # replaced the real error with a NameError.
    first_rows = df['feature'].apply(
        lambda emb: emb[0] if emb.shape[0] != 0 else None
    )
    df = df.assign(feature=first_rows)

    return df.dropna()
    


In [14]:
# Load the embeddings selected by args.embeddings from the dataset directory.
df = load_dataset(
    path=args.dataset_path,
    dataset=args.embeddings,
)

2023-05-12 16:59:00.779185: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8494 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5
2023-05-12 16:59:03.072236: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


In [9]:
# Print a plain-text view of the embeddings frame (columns: feature, track_id).
print(df)

                                                  feature  track_id
0       [0.0008698702, 0.008201729, 0.018753221, -0.03...    124573
1       [-0.00097459555, -0.0051385164, -0.024011323, ...    124574
2       [0.039855253, 0.0076441965, -0.00922821, -0.04...    124575
3       [0.0029335518, 0.020818433, 0.04269241, -0.016...    124576
4       [0.057992022, -0.0510619, -0.048113894, -0.032...    124577
...                                                   ...       ...
104181  [-0.0080008805, 6.110469e-05, 0.18494046, 0.02...     94245
104182  [0.017404526, 0.0132987695, 0.004312843, 0.048...     94246
104183  [0.01726994, 0.005624622, 0.10627997, 0.007201...     94247
104184  [0.07001238, -3.400445e-05, 0.03528729, 0.0683...     94248
104185  [-0.013616562, -0.012981772, -0.0065422454, -0...     94249

[104186 rows x 2 columns]


In [17]:
# Sanity check: shape of a single stored embedding (the row at position 1).
print(df.feature.iloc[1].shape)

(1280,)


In [10]:
# Drop, in place, any row that still has a missing value
# (load_dataset maps empty embeddings to None).
df.dropna(inplace=True)

In [11]:
# Display the embeddings frame after incomplete rows were dropped.
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [12]:
# Per-track genre labels, read from tracks.csv in the training-job directory.
tracks_csv = os.path.join(train_path, "tracks.csv")
tracks_df = pd.read_csv(tracks_csv)

In [13]:
# Display the track/label table loaded from tracks.csv.
tracks_df

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5
0,103580,"[17, 103]",17,17-103,17-103-0,17-103-0-0,17-103-0-0-0
1,101111,"[2, 46, 502]",2,2-46,2-46-502,2-46-502-0,2-46-502-0-0
2,106704,[15],15,15-0,15-0-0,15-0-0-0,15-0-0-0-0
3,86412,"[4, 97]",4,4-97,4-97-0,4-97-0-0,4-97-0-0-0
4,51008,[2],2,2-0,2-0-0,2-0-0-0,2-0-0-0-0
...,...,...,...,...,...,...,...
104180,59987,"[38, 456]",38,38-456,38-456-0,38-456-0-0,38-456-0-0-0
104181,8370,"[38, 22]",38,38-22,38-22-0,38-22-0-0,38-22-0-0-0
104182,100857,"[1235, 18]",1235,1235-18,1235-18-0,1235-18-0-0,1235-18-0-0-0
104183,140400,[38],38,38-0,38-0-0,38-0-0-0,38-0-0-0-0


In [14]:
# Load the label mappings stored in labels.json for this training job.
labels = __load_json__(labels_file)

In [15]:
# Inspect the 'label2' mapping (genre path string -> integer id).
labels['label2']

{'17-103': 16,
 '2-118': 17,
 '12-66': 18,
 '4-179': 19,
 '38-6': 20,
 '12-88': 21,
 '20-465': 22,
 '38-125': 23,
 '13-0': 24,
 '12-27': 25,
 '15-184': 26,
 '12-25': 27,
 '2-177': 28,
 '21-539': 29,
 '1235-267': 30,
 '17-180': 31,
 '5-441': 32,
 '15-0': 33,
 '20-7': 34,
 '10-76': 35,
 '13-810': 36,
 '38-1': 37,
 '15-236': 38,
 '5-443': 39,
 '12-31': 40,
 '12-45': 41,
 '12-70': 42,
 '17-49': 43,
 '2-117': 44,
 '1235-0': 45,
 '5-659': 46,
 '2-46': 47,
 '3-0': 48,
 '15-337': 49,
 '9-0': 50,
 '5-444': 51,
 '21-580': 52,
 '2-176': 53,
 '2-77': 54,
 '1235-107': 55,
 '12-314': 56,
 '20-374': 57,
 '12-36': 58,
 '13-170': 59,
 '12-440': 60,
 '17-0': 61,
 '4-0': 62,
 '12-359': 63,
 '38-514': 64,
 '4-74': 65,
 '15-185': 66,
 '38-250': 67,
 '38-224': 68,
 '2-92': 69,
 '2-0': 70,
 '15-468': 71,
 '21-100': 72,
 '15-286': 73,
 '2-504': 74,
 '38-22': 75,
 '38-456': 76,
 '12-0': 77,
 '21-693': 78,
 '15-181': 79,
 '14-0': 80,
 '2-130': 81,
 '38-32': 82,
 '1235-18': 83,
 '5-0': 84,
 '20-138': 85,
 '2-86'

In [16]:
# Register tqdm's progress_apply helpers on pandas objects.
tqdm.pandas()

In [None]:
# tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])

# tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])

# tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])

# tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])

# tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])


In [18]:
# Attach each track's embedding. The inner join (pandas' default) keeps only
# tracks that appear in both frames.
# NOTE: not idempotent - re-running this cell merges the embeddings a second time.
tracks_df = tracks_df.merge(
    right=df,
    on='track_id',
    how='inner',
)

In [19]:
# genres_df = tracks_df.drop_duplicates(subset=['labels_5'])[['labels_1','labels_2','labels_3','labels_4','labels_5']]
# genres_df = tracks_df.drop_duplicates(subset=['labels_2'])[['labels_1','labels_2','labels_3','labels_4']]

In [20]:
from sklearn import svm
from sklearn import tree
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [21]:
# Inspect the genre-id column before parsing (dtype object, as read from the CSV).
tracks_df['full_genre_id']

0            [17, 103]
1         [2, 46, 502]
2                 [15]
3              [4, 97]
4                  [2]
              ...     
104164       [38, 456]
104165        [38, 22]
104166      [1235, 18]
104167            [38]
104168     [1235, 107]
Name: full_genre_id, Length: 104169, dtype: object

In [22]:
# Turn the textual genre-id lists read from the CSV (e.g. "[17, 103]") into
# real Python lists. Only string entries are parsed, so the cell can be
# re-run safely: a second run used to hand already-parsed lists to
# ast.literal_eval, which raises.
tracks_df['full_genre_id'] = tracks_df.full_genre_id.apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Hold out 5% of the tracks for evaluation, with a fixed seed.
# NOTE(review): stratification uses the list-valued genre ids directly;
# confirm scikit-learn groups these as intended (a string key may be safer).
embedding_rows = tracks_df.feature.values.tolist()
genre_id_lists = tracks_df.full_genre_id.values

X_train, X_test, y_train, y_test = train_test_split(
    embedding_rows,
    genre_id_lists.tolist(),
    stratify=genre_id_lists,
    test_size=0.05,
    random_state=25,
)


In [25]:
import xgboost as xgb

In [26]:
# Stand-alone XGBoost classifier (seeded, AUC evaluation metric, 20 worker threads).
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric="auc",
    n_jobs=20,
)

In [27]:
# Show all messages, including ones pertaining to debugging
xgb.set_config(verbosity=0)

# Get current value of global configuration
# This is a dict containing all parameters in the global configuration,
# including 'verbosity'
config = xgb.get_config()
assert config['verbosity'] == 0

# Example of using the context manager xgb.config_context().
# The context manager will restore the previous value of the global
# configuration upon exiting.
assert xgb.get_config()['verbosity'] == 0  # old value restored

In [28]:
# xgb_model.fit(df_train.Feature.values.tolist()[:100], df_train.Label.values.tolist()[:100], eval_set=[(df_val.Feature.values.tolist(), df_val.Label.values.tolist())])

In [29]:
# L2-regularised multinomial logistic regression, available as an
# alternative local classifier.
logistic_estimator = LogisticRegression(
    penalty='l2',
    n_jobs=8,
    multi_class='multinomial',
    max_iter=1000,
)

In [30]:
# RBF-kernel SVM with probability estimates enabled, wrapped in a one-step pipeline.
rbf_svc = svm.SVC(kernel="rbf", probability=True)
svm_estimator = make_pipeline(rbf_svc)


In [31]:
# XGBoost classifier passed to LocalClassifierPerNode as its local classifier.
xgb_estimator = xgb.XGBClassifier(
    random_state=42,
    eval_metric="auc",
    n_jobs=20,
)

In [32]:
# Decision tree with at least 7 samples per leaf and sqrt(n_features)
# candidate features per split.
tree_estimator = tree.DecisionTreeClassifier(
    min_samples_leaf=7,
    max_features='sqrt',
)


In [33]:
# Hierarchical classifier that uses xgb_estimator as its local classifier.
classifier = LocalClassifierPerNode(
    local_classifier=xgb_estimator,
    n_jobs=8,
    verbose=1,
)

In [None]:
# Train the hierarchical classifier on the training split.
model = classifier.fit(X_train, y_train)

  array = numpy.asarray(array, order=order, dtype=dtype)
2023-04-24 22:25:17,620 - LCPN - INFO - Creating digraph from 98960 2D labels
2023-04-24 22:25:18,692 - LCPN - INFO - Detected 16 roots
2023-04-24 22:25:18,693 - LCPN - INFO - Initializing local classifiers
2023-04-24 22:25:18,702 - LCPN - INFO - Initializing siblings binary policy
2023-04-24 22:25:18,703 - LCPN - INFO - Fitting local classifiers


In [None]:
# Persist the fitted model inside the training-job directory.
filename = os.path.join(train_path,'lxgb.model')
# The context manager flushes and closes the file even if pickling fails;
# the previous bare open() never closed the handle.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

In [None]:
# Score only the first element of each label sequence, cast to int.
first_level_true = [int(label_path[0]) for label_path in y_test]
first_level_pred = [int(label_path[0]) for label_path in y_pred]
print("Classification Report:\n", classification_report(first_level_true, first_level_pred))