In [32]:
import os
import json

from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from hiclass import LocalClassifierPerNode

import tensorflow as tf
import pandas as pd
import pickle
from tqdm.notebook import tqdm
import logging
import ast


In [2]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "train_id": "hierarchical_local_sample"
})


In [3]:

job_path = "/mnt/disks/data/fma/trains"


# In[15]:


train_path = os.path.join(job_path,args.train_id)


# In[16]:


base_path = os.path.join(args.root_dir,"fma")


# In[17]:


models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")


# In[18]:


metadata_file = os.path.join(train_path,"metadata.json")


labels_file = os.path.join(train_path,"labels.json")


In [4]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [5]:

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
    array = tf.io.serialize_tensor(array)
    return array




def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [6]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [7]:
df = load_dataset(args.dataset_path,dataset=args.embeddings)

2023-07-31 09:47:05.830638: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 1295 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5
2023-07-31 09:47:05.967205: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


In [8]:
print(df)

                                                  feature  track_id
0       [0.0008698702, 0.008201729, 0.018753221, -0.03...    124573
1       [-0.00097459555, -0.0051385164, -0.024011323, ...    124574
2       [0.039855253, 0.0076441965, -0.00922821, -0.04...    124575
3       [0.0029335518, 0.020818433, 0.04269241, -0.016...    124576
4       [0.057992022, -0.0510619, -0.048113894, -0.032...    124577
...                                                   ...       ...
104181  [-0.0080008805, 6.110469e-05, 0.18494046, 0.02...     94245
104182  [0.017404526, 0.0132987695, 0.004312843, 0.048...     94246
104183  [0.01726994, 0.005624622, 0.10627997, 0.007201...     94247
104184  [0.07001238, -3.400445e-05, 0.03528729, 0.0683...     94248
104185  [-0.013616562, -0.012981772, -0.0065422454, -0...     94249

[104186 rows x 2 columns]


In [9]:
print(df.feature.iloc[1].shape)

(1280,)


In [10]:
df.dropna(inplace=True)

In [11]:
df

Unnamed: 0,feature,track_id
0,"[0.0008698702, 0.008201729, 0.018753221, -0.03...",124573
1,"[-0.00097459555, -0.0051385164, -0.024011323, ...",124574
2,"[0.039855253, 0.0076441965, -0.00922821, -0.04...",124575
3,"[0.0029335518, 0.020818433, 0.04269241, -0.016...",124576
4,"[0.057992022, -0.0510619, -0.048113894, -0.032...",124577
...,...,...
104181,"[-0.0080008805, 6.110469e-05, 0.18494046, 0.02...",94245
104182,"[0.017404526, 0.0132987695, 0.004312843, 0.048...",94246
104183,"[0.01726994, 0.005624622, 0.10627997, 0.007201...",94247
104184,"[0.07001238, -3.400445e-05, 0.03528729, 0.0683...",94248


In [12]:
tracks_df = pd.read_csv(os.path.join(train_path,"tracks.csv"))

In [13]:
tracks_df

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5
0,64740,"[12, 66]",12,12-66,12-66-0,12-66-0-0,12-66-0-0-0
1,136037,"[1235, 267]",1235,1235-267,1235-267-0,1235-267-0-0,1235-267-0-0-0
2,139650,"[15, 468]",15,15-468,15-468-0,15-468-0-0,15-468-0-0-0
3,148700,"[38, 247]",38,38-247,38-247-0,38-247-0-0,38-247-0-0-0
4,120570,[1235],1235,1235-0,1235-0-0,1235-0-0-0,1235-0-0-0-0
...,...,...,...,...,...,...,...
10414,62092,"[12, 45, 53]",12,12-45,12-45-53,12-45-53-0,12-45-53-0-0
10415,24623,"[12, 25, 89]",12,12-25,12-25-89,12-25-89-0,12-25-89-0-0
10416,71779,"[1235, 107]",1235,1235-107,1235-107-0,1235-107-0-0,1235-107-0-0-0
10417,26554,"[38, 514]",38,38-514,38-514-0,38-514-0-0,38-514-0-0-0


In [14]:
labels = __load_json__(labels_file)

In [15]:
labels['label2']

{'38-514': 16,
 '5-659': 17,
 '2-130': 18,
 '2-79': 19,
 '12-98': 20,
 '15-185': 21,
 '15-183': 22,
 '15-0': 23,
 '2-118': 24,
 '4-37': 25,
 '38-6': 26,
 '20-138': 27,
 '12-66': 28,
 '3-567': 29,
 '2-176': 30,
 '15-495': 31,
 '21-100': 32,
 '5-441': 33,
 '5-443': 34,
 '17-94': 35,
 '38-224': 36,
 '15-695': 37,
 '20-7': 38,
 '2-86': 39,
 '10-0': 40,
 '12-36': 41,
 '5-0': 42,
 '21-811': 43,
 '12-31': 44,
 '12-440': 45,
 '2-172': 46,
 '15-181': 47,
 '15-468': 48,
 '12-45': 49,
 '17-49': 50,
 '20-378': 51,
 '38-125': 52,
 '14-19': 53,
 '38-41': 54,
 '10-76': 55,
 '20-0': 56,
 '1235-0': 57,
 '14-11': 58,
 '3-0': 59,
 '21-0': 60,
 '15-236': 61,
 '21-542': 62,
 '4-906': 63,
 '4-179': 64,
 '2-46': 65,
 '2-177': 66,
 '38-30': 67,
 '17-33': 68,
 '13-810': 69,
 '12-70': 70,
 '13-0': 71,
 '10-362': 72,
 '38-186': 73,
 '15-42': 74,
 '38-22': 75,
 '15-296': 76,
 '14-0': 77,
 '20-374': 78,
 '4-97': 79,
 '2-171': 80,
 '2-92': 81,
 '15-286': 82,
 '13-311': 83,
 '1235-107': 84,
 '12-85': 85,
 '38-247': 

In [16]:
tqdm.pandas()

In [17]:
# tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])

# tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])

# tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])

# tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])

# tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])


In [18]:
tracks_df = tracks_df.merge(df, on='track_id')

In [19]:
# genres_df = tracks_df.drop_duplicates(subset=['labels_5'])[['labels_1','labels_2','labels_3','labels_4','labels_5']]
# genres_df = tracks_df.drop_duplicates(subset=['labels_2'])[['labels_1','labels_2','labels_3','labels_4']]

In [20]:
from sklearn import svm
from sklearn import tree
from sklearn.preprocessing import Binarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [26]:
tracks_df

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,64740,"[12, 66]",12,12-66,12-66-0,12-66-0-0,12-66-0-0-0,"[0.000762254, 0.072248645, 0.009548098, -0.011..."
1,136037,"[1235, 267]",1235,1235-267,1235-267-0,1235-267-0-0,1235-267-0-0-0,"[-0.010245671, -0.024914384, -0.032503784, 0.0..."
2,139650,"[15, 468]",15,15-468,15-468-0,15-468-0-0,15-468-0-0-0,"[0.05113448, -0.025999209, -0.042326104, -0.05..."
3,148700,"[38, 247]",38,38-247,38-247-0,38-247-0-0,38-247-0-0-0,"[-0.0075327256, 0.007753501, -0.026723444, 0.0..."
4,120570,[1235],1235,1235-0,1235-0-0,1235-0-0-0,1235-0-0-0-0,"[0.08098389, -0.017046234, -0.059169482, 0.037..."
...,...,...,...,...,...,...,...,...
10413,62092,"[12, 45, 53]",12,12-45,12-45-53,12-45-53-0,12-45-53-0-0,"[0.00843513, -0.023941943, -0.018283257, -0.04..."
10414,24623,"[12, 25, 89]",12,12-25,12-25-89,12-25-89-0,12-25-89-0-0,"[0.08232648, 0.092691354, 0.014757127, 0.02020..."
10415,71779,"[1235, 107]",1235,1235-107,1235-107-0,1235-107-0-0,1235-107-0-0-0,"[0.1127581, 0.010836601, 0.00038804612, -0.038..."
10416,26554,"[38, 514]",38,38-514,38-514-0,38-514-0-0,38-514-0-0-0,"[0.02009497, -0.0027926564, 0.015744597, -0.03..."


In [27]:
X_train, X_test, y_train, y_test = train_test_split(
    tracks_df.feature.values.tolist(),
    tracks_df.labels_1.values.tolist(),
    test_size=0.05,
    random_state=25,
    stratify=tracks_df.labels_1.values,
)


In [33]:
xgb_model = xgb.XGBClassifier(random_state=42,eval_metric="auc",n_jobs=20)

In [34]:
# Show all messages, including ones pertaining to debugging
xgb.set_config(verbosity=0)

# Get current value of global configuration
# This is a dict containing all parameters in the global configuration,
# including 'verbosity'
config = xgb.get_config()
assert config['verbosity'] == 0

# Example of using the context manager xgb.config_context().
# The context manager will restore the previous value of the global
# configuration upon exiting.
assert xgb.get_config()['verbosity'] == 0  # old value restored

In [37]:
classifier = LocalClassifierPerNode(local_classifier=xgb_model,verbose=1)

In [None]:
model = classifier.fit(X_train, y_train)

2023-07-31 10:08:00,753 - LCPN - INFO - Creating digraph from 9897 1D labels
2023-07-31 10:08:00,758 - LCPN - INFO - Initializing siblings binary policy
2023-07-31 10:08:00,759 - LCPN - INFO - Detected 16 roots
2023-07-31 10:08:00,760 - LCPN - INFO - Initializing local classifiers
2023-07-31 10:08:00,761 - LCPN - INFO - Fitting local classifiers
2023-07-31 10:08:00,762 - LCPN - INFO - Fitting local classifier for node '10' (1/16)
2023-07-31 10:11:14,111 - LCPN - INFO - Fitting local classifier for node '21' (2/16)
2023-07-31 10:13:58,558 - LCPN - INFO - Fitting local classifier for node '15' (3/16)
2023-07-31 10:17:49,448 - LCPN - INFO - Fitting local classifier for node '1235' (4/16)
2023-07-31 10:21:24,729 - LCPN - INFO - Fitting local classifier for node '38' (5/16)


In [None]:
filename = os.path.join(train_path,'lxgb.model')
pickle.dump(model, open(filename, 'wb'))


In [None]:
y_pred = model.predict(X_test)

In [None]:
print("Classification Report:\n", classification_report([int(x[0]) for x in y_test], [int(x[0]) for x in y_pred]))

In [25]:
tracks_df['full_genre_id'] = tracks_df.full_genre_id.apply(lambda x: ast.literal_eval(x))

ValueError: malformed node or string: [12, 66]

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    tracks_df.feature.values.tolist(),
    tracks_df.full_genre_id.values.tolist(),
    test_size=0.05,
    random_state=25,
    stratify=tracks_df.full_genre_id.values,
)


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [28]:
# xgb_model.fit(df_train.Feature.values.tolist()[:100], df_train.Label.values.tolist()[:100], eval_set=[(df_val.Feature.values.tolist(), df_val.Label.values.tolist())])

In [29]:
# Use random forest classifiers for every node
logistic_estimator = LogisticRegression(penalty='l2',n_jobs=8,multi_class='multinomial',max_iter=1000)

In [30]:
svm_estimator = make_pipeline(
    svm.SVC(
        kernel="rbf",
        probability=True
    ),
)


In [31]:
xgb_estimator = xgb.XGBClassifier(random_state=42,eval_metric="auc",n_jobs=20)

In [32]:
tree_estimator = tree.DecisionTreeClassifier(min_samples_leaf=7,max_features='sqrt')
