In [83]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import ast
import os
import csv
import math
from sklearn.utils import shuffle
from math import ceil

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [4]:
tqdm.pandas()

In [5]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "sequence_size": 1280,
    "train_id": "hierarchical_top_sample",
    'sample_size': 0.1
})


In [74]:

job_path = "/mnt/disks/data/fma/trains"

# In[15]:

train_path = os.path.join(job_path,args.train_id)


tfrecord_path = os.path.join(train_path,"tfrecords")

# In[16]:

base_path = os.path.join(args.root_dir,"fma")

# In[17]:

models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")

# In[18]:

metadata_path = os.path.join(train_path,"metadata.json")


categories_labels_path = os.path.join(train_path,"labels.json")


In [7]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [8]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [9]:
create_dir(train_path)

True

## Load genres file. Contains relationships beetwen genres

In [10]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [11]:
genres_df

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5
...,...,...,...,...,...
158,1032,60,102,Turkish,2
159,1060,30,46,Tango,2
160,1156,26,130,Fado,2
161,1193,72,763,Christmas,38


In [12]:
genres_df[genres_df['genre_id'] == 495]

Unnamed: 0,genre_id,#tracks,parent,title,top_level
136,495,2061,15,Downtempo,15


In [13]:
# Cria um dicionário que associa o ID de cada música aos IDs de seus gêneros musicais
tracks_df = pd.read_csv(os.path.join(metadata_path_fma,'tracks_valid.csv'))

In [14]:
tracks_df = tracks_df.sample(frac=args.sample_size)

In [15]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre
49066,73374,Musta Tupa Ja Perunamaa,"['38', '47', '49']"
93808,140274,Elegant Culture,['15']
37800,58084,Fear,"['27', '314', '362']"
78967,120608,Hyperpiano 4a-4d,"['1', '30', '38', '41', '247']"
40064,60907,Tilt,"['1', '38', '42']"
57887,87091,Farewell,"['26', '38', '107']"
99106,147686,Playground Pigeon,['1235']
81020,123456,Solphadeen - Demo Tape Track 3,['18']
64220,98383,RENEWED RIVALRY,"['1', '4', '5', '38', '41', '322']"
101093,150266,Here,"['18', '107', '1235']"


In [16]:
tracks_df.valid_genre.values

array(["['15', '64']", "['15', '32', '47', '58', '76', '250']",
       "['22', '38']", ..., "['107']", "['38', '183', '236']",
       "['15', '38', '58']"], dtype=object)

In [17]:
tracks_df.track_title

85419                       Knives Cut
18565              Looking in a mirror
5837                              1990
9740     Now What Do You Think of That
2898         spanish lady 00000 ending
                     ...              
42606                     I Don't Care
52163                    Cyborg Sniper
74138                         Thinking
40505                 Doepfer Fantasie
40820               The Final Bro Down
Name: track_title, Length: 10419, dtype: object

In [18]:
## Get complete genre structure
def get_all_structure(estrutura,df_genres):
    ## Get structure from df_genres
    def get_all_structure_from_df(estrutura,df_genres,structure=[]):
        if estrutura == 0:
            return structure
        else:
            structure.append(int(estrutura))
            get_all_structure_from_df(df_genres[df_genres["genre_id"]==int(estrutura)].parent.values[0],df_genres,structure)
            return structure
    
    return get_all_structure_from_df(estrutura,df_genres,structure=[])
    

In [19]:
# tracks_df['valid_genre'] = tracks_df.track_genres.apply(lambda x: x.strip('][').split(', ') if x != '[]' else None)
tracks_df['valid_genre'] = tracks_df.valid_genre.apply(lambda x: ast.literal_eval(x))

In [20]:
tracks_df['last_genre_id'] = tracks_df.valid_genre.apply(lambda x:x[-1] if x != None else None)

In [21]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
75363,115979,Pianochocolate - Smell of the sea,"[5, 1235]",1235
10162,16941,Fields 3,"[38, 47]",47
60347,92180,The Handyman's Lament,"[12, 17, 103]",103
89405,134309,Wild,"[17, 66, 103]",103
53126,80962,Compliance 2,"[1, 38, 188]",188
83885,127294,Autopsy,[10],10
23192,36707,Moja Krypta,[101],101
12749,21069,Call From Restricted,[66],66
86571,130909,Purification,"[15, 240, 297]",297
26739,41615,Track 3,"[31, 32, 38, 47]",47


In [22]:
tracks_df.dropna(inplace=True)

In [23]:
tracks_df

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
85419,129446,Knives Cut,"[15, 64]",64
18565,30107,Looking in a mirror,"[15, 32, 47, 58, 76, 250]",250
5837,10990,1990,"[22, 38]",38
9740,16385,Now What Do You Think of That,[8],8
2898,4895,spanish lady 00000 ending,"[30, 65]",65
...,...,...,...,...
42606,64488,I Don't Care,"[25, 85]",85
52163,79547,Cyborg Sniper,"[15, 38]",38
74138,114374,Thinking,[107],107
40505,61644,Doepfer Fantasie,"[38, 183, 236]",236


In [24]:
tracks_df['full_genre_id'] = tracks_df.last_genre_id.progress_apply(lambda x: get_all_structure(x,genres_df)[::-1])

  0%|          | 0/10419 [00:00<?, ?it/s]

In [25]:
tracks_df.full_genre_id.value_counts()

[38]              645
[1235]            624
[10, 76]          453
[1235, 107]       425
[21]              347
                 ... 
[2, 130, 1156]      2
[20]                1
[2, 117]            1
[2, 232]            1
[2, 86, 174]        1
Name: full_genre_id, Length: 152, dtype: int64

In [26]:
tracks_df.columns

Index(['track_id', 'track_title', 'valid_genre', 'last_genre_id',
       'full_genre_id'],
      dtype='object')

In [27]:
tracks_df = tracks_df[['track_id','full_genre_id']]

In [28]:
tracks_df.full_genre_id.values

array([list([12, 25, 64]), list([38, 250]), list([38]), ...,
       list([1235, 107]), list([15, 236]), list([12, 58])], dtype=object)

In [29]:
tracks_df.full_genre_id.info

<bound method Series.info of 85419    [12, 25, 64]
18565       [38, 250]
5837             [38]
9740              [8]
2898         [20, 65]
             ...     
42606        [12, 85]
52163            [38]
74138     [1235, 107]
40505       [15, 236]
40820        [12, 58]
Name: full_genre_id, Length: 10419, dtype: object>

In [30]:
labels_size = tracks_df.full_genre_id.apply(lambda x: len(x))

In [31]:
labels_size = labels_size.max()

In [32]:
labels_size

5

### Parse of label to structure

In [33]:
### Function for parse label to sctructure of hierarhical scheme

def parse_label(label,label_size=5):
    # label = label.split('-')
    # preencher com 0 no caso de haver menos de 5 níveis
    labels = np.zeros(label_size,dtype=int)
    for i, label in enumerate(label):
        if i == 5:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = label
    return labels

In [34]:
parsed_labels = tracks_df.full_genre_id.apply(lambda x: parse_label(x))

In [35]:
tracks_df['full_genre_id']

85419    [12, 25, 64]
18565       [38, 250]
5837             [38]
9740              [8]
2898         [20, 65]
             ...     
42606        [12, 85]
52163            [38]
74138     [1235, 107]
40505       [15, 236]
40820        [12, 58]
Name: full_genre_id, Length: 10419, dtype: object

In [36]:
def convert_label_to_string(x,level=2):
    return '-'.join([str(value) for value in x[:level]])

In [37]:
tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))
tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))
tracks_df['labels_3'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=3))
tracks_df['labels_4'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=4))
tracks_df['labels_5'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=5))

  0%|          | 0/10419 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))


  0%|          | 0/10419 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))


  0%|          | 0/10419 [00:00<?, ?it/s]

  0%|          | 0/10419 [00:00<?, ?it/s]

  0%|          | 0/10419 [00:00<?, ?it/s]

In [38]:
tracks_df.labels_1.value_counts()

38      2226
12      2145
15      1791
1235    1158
10       699
17       674
21       635
2        324
5        267
4        112
20       106
9         88
8         61
14        58
13        51
3         24
Name: labels_1, dtype: int64

In [39]:
tracks_df['labels_2'].value_counts()

38-0        645
1235-0      624
12-25       532
10-76       453
1235-107    425
           ... 
21-83         3
20-465        2
20-0          1
2-117         1
2-232         1
Name: labels_2, Length: 118, dtype: int64

In [40]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(['3','14'])]

In [41]:
categories_df = pd.DataFrame({'level5':tracks_df.labels_5.unique()})

In [42]:
categories_df['level1'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:1]))
categories_df['level2'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:2]))
categories_df['level3'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:3]))
categories_df['level4'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:4]))

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

  0%|          | 0/152 [00:00<?, ?it/s]

In [43]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4
0,12-25-64-0-0,12,12-25,12-25-64,12-25-64-0
1,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0
2,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0
3,8-0-0-0-0,8,8-0,8-0-0,8-0-0-0
4,20-65-0-0-0,20,20-65,20-65-0,20-65-0-0
...,...,...,...,...,...
147,20-65-43-0-0,20,20-65,20-65-43,20-65-43-0
148,2-117-0-0-0,2,2-117,2-117-0,2-117-0-0
149,2-130-1156-0-0,2,2-130,2-130-1156,2-130-1156-0
150,2-232-0-0-0,2,2-232,2-232-0,2-232-0-0


In [44]:
def get_labels_name(x,genres_df):
    levels = 5
    full_name = []
    last_level = 0
    genre_root = ""
    for genre in x.split('-'):
        genre_df = genres_df[genres_df['genre_id'] == int(genre)]
        if genre_df.empty:
            genre_name = genre_root 
        else:
            genre_name = genre_df.title.values.tolist()[0]
            genre_root = genre_name
        
        full_name.append(genre_name)
    full_name = '>'.join(full_name)
        
    return full_name
    # return genres_df[genres_df['genre_id'] == int(x)].title.values.tolist()[0]

In [45]:
categories_df['level5_name'] = categories_df.level5.apply(lambda x: get_labels_name(x,genres_df))

In [46]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4,level5_name
0,12-25-64-0-0,12,12-25,12-25-64,12-25-64-0,Rock>Punk>Electro-Punk>Electro-Punk>Electro-Punk
1,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0,Experimental>Improv>Improv>Improv>Improv
2,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0,Experimental>Experimental>Experimental>Experim...
3,8-0-0-0-0,8,8-0,8-0-0,8-0-0-0,Old-Time / Historic>Old-Time / Historic>Old-Ti...
4,20-65-0-0-0,20,20-65,20-65-0,20-65-0-0,Spoken>Radio>Radio>Radio>Radio
...,...,...,...,...,...,...
147,20-65-43-0-0,20,20-65,20-65-43,20-65-43-0,Spoken>Radio>Radio Art>Radio Art>Radio Art
148,2-117-0-0-0,2,2-117,2-117-0,2-117-0-0,International>Polka>Polka>Polka>Polka
149,2-130-1156-0-0,2,2-130,2-130-1156,2-130-1156-0,International>Europe>Fado>Fado>Fado
150,2-232-0-0-0,2,2-232,2-232-0,2-232-0-0,International>Flamenco>Flamenco>Flamenco>Flamenco


In [47]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    idx = 0
    
    for id_x, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        idx+=1
        
    for id_x, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        idx+=1
        
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        idx+=1
        
    for cat5,cat1,cat2,cat3,cat4,name5 in categories_df.values:
        
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
        
    return data

In [48]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [49]:
labels  = __create_labels__(categories_df)

In [50]:
labels['label4']

{'12-359-0-0': 283,
 '2-130-1156-0': 284,
 '12-0-0-0': 285,
 '1235-18-538-0': 286,
 '2-130-619-0': 287,
 '20-428-0-0': 288,
 '13-170-0-0': 289,
 '38-250-0-0': 290,
 '12-26-113-0': 291,
 '12-45-53-90': 292,
 '15-296-0-0': 293,
 '12-85-404-0': 294,
 '2-86-0-0': 295,
 '38-22-0-0': 296,
 '5-0-0-0': 297,
 '12-31-439-0': 298,
 '10-76-0-0': 299,
 '15-181-401-0': 300,
 '12-314-0-0': 301,
 '13-810-0-0': 302,
 '38-514-0-0': 303,
 '20-65-43-0': 304,
 '2-79-602-0': 305,
 '15-184-0-0': 306,
 '15-297-0-0': 307,
 '12-70-0-0': 308,
 '2-117-0-0': 309,
 '4-97-0-0': 310,
 '15-297-240-0': 311,
 '13-311-0-0': 312,
 '2-46-1060-0': 313,
 '12-36-0-0': 314,
 '12-25-109-0': 315,
 '12-25-71-0': 316,
 '4-0-0-0': 317,
 '12-25-64-0': 318,
 '20-0-0-0': 319,
 '15-468-0-0': 320,
 '12-25-0-0': 321,
 '20-188-0-0': 322,
 '15-495-0-0': 323,
 '21-693-0-0': 324,
 '1235-267-0-0': 325,
 '2-176-0-0': 326,
 '12-440-0-0': 327,
 '2-102-1032-0': 328,
 '38-32-0-0': 329,
 '12-98-0-0': 330,
 '21-0-0-0': 331,
 '17-94-0-0': 332,
 '2-46

In [51]:
labels['label1_count']

16

In [52]:


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [53]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [54]:
tracks_df['labels_5']

85419      12-25-64-0-0
18565      38-250-0-0-0
5837         38-0-0-0-0
9740          8-0-0-0-0
2898        20-65-0-0-0
              ...      
42606       12-85-0-0-0
52163        38-0-0-0-0
74138    1235-107-0-0-0
40505      15-236-0-0-0
40820       12-58-0-0-0
Name: labels_5, Length: 10419, dtype: object

In [55]:
def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second

In [75]:
def select_dataset(tracks_df):
    
#     dataset_testset_path = os.path.join(tfrecord_path,'test')
#     dataset_validationset_path = os.path.join(tfrecord_path,'val')
#     dataset_trainset_path = os.path.join(tfrecord_path,'train')
    
    df = load_dataset(args.dataset_path,dataset=args.embeddings)
    
    df.dropna(inplace=True)
    
    tracks_df = tracks_df.merge(df, on='track_id')
    
    tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])
    tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])
    tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])
    tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])
    tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])
    
    tests = []
    trains = []
    validations = []
    groups = tracks_df.groupby("labels_5")
    
    
    count = 0
    items_count = 0
    total = len(groups)
    total_items = len(tracks_df)
    oversampling_size = 30  # int(group_sizes.mean() + group_sizes.std() * 2)
    print(f"oversampling_size: {oversampling_size}")

    for code, group in groups:
        test, train_to_split = __split_data__(group, 0.01)  # 10%
        train_to_split = train_to_split
        validation, train = __split_data__(train_to_split, 0.01)  # %1

        tests.append(test)
        validations.append(validation)

        ## this increase the numner of samples when classes has low quantity
        count_train = len(train)
        if count_train < oversampling_size:
            train = train.sample(oversampling_size, replace=True)

        trains.append(train)

        count += 1
        items_count += count_train
        
        
    df_test = pd.concat(tests, sort=False).sample(frac=1).reset_index(drop=True)
    # .to_csv(dataset_testset_path, index=False,quoting=csv.QUOTE_ALL)
    df_val = pd.concat(validations, sort=False).sample(frac=1).reset_index(drop=True)
    df_train = pd.concat(trains, sort=False).sample(frac=1).reset_index(drop=True)

    return df_train,df_test,df_val

In [76]:
df_train,df_test,df_val = select_dataset(tracks_df)

2023-05-08 17:46:59.823325: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


  0%|          | 0/10419 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])


  0%|          | 0/10419 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])


  0%|          | 0/10419 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])


  0%|          | 0/10419 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])


  0%|          | 0/10419 [00:00<?, ?it/s]

  tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])


oversampling_size: 30


In [78]:
df_train

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,21159,"[15, 296]",3,52,193,293,53,"[0.14478458, -0.017100343, 0.008137767, 0.1190..."
1,82443,"[15, 236]",3,113,174,363,147,"[0.0057163932, 0.08835802, 0.08281917, -0.0385..."
2,142588,"[12, 26]",2,131,250,405,129,"[-0.0020183325, -0.027405122, -0.046768416, -0..."
3,92763,"[9, 169]",8,118,139,406,98,"[0.09024087, -0.043394577, 0.093729414, 0.1540..."
4,41130,"[15, 468]",3,122,261,320,39,"[0.024159094, -0.006362796, 0.22809465, 0.2140..."
...,...,...,...,...,...,...,...,...
11633,151874,"[15, 183]",3,90,252,344,74,"[0.032073956, 0.008359601, 0.13315788, -0.0574..."
11634,132059,[1235],5,41,209,355,72,"[-0.010686775, -0.008971523, -0.05980732, 0.02..."
11635,1593,[12],2,78,211,285,143,"[-0.036503952, 0.052119505, 0.03817637, 0.0360..."
11636,129222,"[12, 36]",2,64,208,314,19,"[-0.032589257, 0.05484901, 0.11774453, -0.0373..."


In [91]:

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
  array = tf.io.serialize_tensor(array)
  return array


In [92]:
def parse_single_music(data,labels):
    # cat1, cat2, cat3, cat4, cat5 = data
    track_id, _, cat1, cat2, cat3, cat4, cat5, music = data
    
    label1 = np.array([cat1, labels['label1_count']], np.int64)
    label2 = np.array([cat2, labels['label2_count']], np.int64)
    label3 = np.array([cat3, labels['label3_count']], np.int64)
    label4 = np.array([cat4, labels['label4_count']], np.int64)
    label5 = np.array([cat5, labels['label5_count']], np.int64)
    
    
    #define the dictionary -- the structure -- of our single example
    data = {
        'label1': tf.train.Feature(int64_list=tf.train.Int64List(value=label1)),
        'label2': tf.train.Feature(int64_list=tf.train.Int64List(value=label2)),
        'label3': tf.train.Feature(int64_list=tf.train.Int64List(value=label3)),
        'label4': tf.train.Feature(int64_list=tf.train.Int64List(value=label4)),
        'emb' : _bytes_feature(serialize_array(music)),
        'track_id' : _int64_feature(track_id)
    }
    #create an Example, wrapping the single features
    out = tf.train.Example(features=tf.train.Features(feature=data))

    return out

In [95]:
def generate_tf_record(df,tf_path='tfrecords/val'):
    create_dir(tf_path)
    
    
    batch_size = 1024 * 50 # 50k records from each file batch
    count = 0
    total = math.ceil(len(df) / batch_size)

    for i in range(0, len(df), batch_size):
        batch_df = df[i:i+batch_size]
        
        tfrecords = [parse_single_music(data, labels) for data in batch_df.values]
        
        path = f"{tf_path}/{str(count).zfill(10)}.tfrecord"
        
        #with tf.python_io.TFRecordWriter(path) as writer:
        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        print(f"{count}/{total} batchs / {count * batch_size} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")

    

In [96]:
generate_tf_record(df_val,tf_path=os.path.join(tfrecord_path,'val'))

0 208 /mnt/disks/data/fma/trains/hierarchical_top_sample/tfrecords/val/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 208 processed


In [97]:
generate_tf_record(df_test,tf_path=os.path.join(tfrecord_path,'test'))

0 209 /mnt/disks/data/fma/trains/hierarchical_top_sample/tfrecords/test/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 209 processed


In [98]:
generate_tf_record(df_train,tf_path=os.path.join(tfrecord_path,'train'))

0 11638 /mnt/disks/data/fma/trains/hierarchical_top_sample/tfrecords/train/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 11638 processed


In [62]:
def create_metadata(metadata_path):
    job_path = context['ti'].xcom_pull(task_ids='create_job_id', key="job_path")

    with open(metadata_path, 'w+') as f:
        f.write(json.dumps({
            'sequence_size': args.sequence_size,
            'n_levels': labels_size,
            'labels_size' [labels['label1_count'],labels['label2_count'],
                           labels['label3_count'],labels['label4_count'],
                           labels['label5_count']]
            'trainset_count': ,
            'validationset_count': ,
            'testset_count': 
        }))

SyntaxError: expression expected after dictionary key and ':' (4164871504.py, line 6)

In [141]:
tracks_df.to_csv(os.path.join(train_path,"tracks.csv"),index=False)

In [142]:
tracks_df = pd.read_csv(os.path.join(train_path,"tracks.csv"))