In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import ast
import os
import csv
import math
from sklearn.utils import shuffle
from math import ceil

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

2023-05-15 15:21:16.040651: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tqdm.pandas()

In [3]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "sequence_size": 1280,
    "train_id": "hierarchical_single",
    'sample_size': 1
})


In [4]:

job_path = "/mnt/disks/data/fma/trains"

# In[15]:

train_path = os.path.join(job_path,args.train_id)


tfrecord_path = os.path.join(train_path,"tfrecords")

# In[16]:

base_path = os.path.join(args.root_dir,"fma")

# In[17]:

models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")

# In[18]:

metadata_path = os.path.join(train_path,"metadata.json")


categories_labels_path = os.path.join(train_path,"labels.json")


In [5]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [6]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [7]:
create_dir(train_path)

True

## Load genres file. Contains relationships beetwen genres

In [8]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [9]:
genres_df

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5
...,...,...,...,...,...
158,1032,60,102,Turkish,2
159,1060,30,46,Tango,2
160,1156,26,130,Fado,2
161,1193,72,763,Christmas,38


In [10]:
genres_df[genres_df['genre_id'] == 495]

Unnamed: 0,genre_id,#tracks,parent,title,top_level
136,495,2061,15,Downtempo,15


In [11]:
# Cria um dicionário que associa o ID de cada música aos IDs de seus gêneros musicais
tracks_df = pd.read_csv(os.path.join(metadata_path_fma,'tracks_valid.csv'))

In [12]:
tracks_df = tracks_df.sample(frac=args.sample_size)

In [13]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre
47053,70637,Pentheus phase I,"['70', '107']"
58945,89905,Daybreak [#0517] (CK Re-Edit),"['1', '30', '38', '41', '247']"
101711,151368,horselover fat,"['70', '236']"
92621,138724,Leaving,"['107', '495']"
13766,22548,One Thing,"['12', '25', '85']"
42175,63979,Decay III,"['47', '107']"
47688,71474,Interview,"['66', '103', '111']"
9613,16225,Babu po ebalu,"['11', '15', '181', '182', '296']"
29075,44915,Voices,"['10', '27', '103']"
8035,13943,there!,"['38', '107', '224']"


In [14]:
tracks_df.valid_genre.values

array(["['32', '38']", "['43', '138']", "['468']", ..., "['15', '66']",
       "['12']", "['15', '38', '811']"], dtype=object)

In [15]:
tracks_df.track_title

101695    Emphasis Between Strange Dreams
27876                        In the Night
26114                         Cotrol Clod
103035          good looking instrumental
78687           From Dreams to Melancholy
                       ...               
40842                            Potloque
50295                   Holy Pain's World
49868                             Refugee
10532             Bad Form/American Cream
61353                       I Never Could
Name: track_title, Length: 104186, dtype: object

In [16]:
## Get complete genre structure
def get_all_structure(estrutura,df_genres):
    ## Get structure from df_genres
    def get_all_structure_from_df(estrutura,df_genres,structure=[]):
        if estrutura == 0:
            return structure
        else:
            structure.append(int(estrutura))
            get_all_structure_from_df(df_genres[df_genres["genre_id"]==int(estrutura)].parent.values[0],df_genres,structure)
            return structure
    
    return get_all_structure_from_df(estrutura,df_genres,structure=[])
    

In [17]:
# tracks_df['valid_genre'] = tracks_df.track_genres.apply(lambda x: x.strip('][').split(', ') if x != '[]' else None)
tracks_df['valid_genre'] = tracks_df.valid_genre.apply(lambda x: ast.literal_eval(x))

In [18]:
tracks_df['last_genre_id'] = tracks_df.valid_genre.apply(lambda x:x[-1] if x != None else None)

In [19]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
81984,124652,Ice Cave [Loop],"[18, 66, 240]",240
46971,70500,Help Us,"[21, 38]",38
93995,140548,Funeral con piñata,"[27, 76, 94]",94
37324,57442,Nuclear Beach,[38],38
24712,38973,Dinner Bass,"[11, 15, 19]",19
56638,85444,Drifter,"[27, 66]",66
72567,112400,Streetview,"[10, 15, 17, 66]",66
74444,114790,Желе,"[27, 58, 103]",103
24464,38665,Solid State,"[31, 32, 38, 47]",47
12439,20608,Reflections Sayyid verse 89bpm-109bpm,"[21, 38]",38


In [20]:
tracks_df.dropna(inplace=True)

In [21]:
tracks_df

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
101695,151337,Emphasis Between Strange Dreams,"[32, 38]",38
27876,43111,In the Night,"[43, 138]",138
26114,40830,Cotrol Clod,[468],468
103035,153967,good looking instrumental,"[21, 1235]",1235
78687,120226,From Dreams to Melancholy,"[5, 18, 107]",107
...,...,...,...,...
40842,62140,Potloque,"[15, 21, 468]",468
50295,75220,Holy Pain's World,"[31, 45]",45
49868,74552,Refugee,"[15, 66]",66
10532,17603,Bad Form/American Cream,[12],12


In [22]:
tracks_df['full_genre_id'] = tracks_df.last_genre_id.progress_apply(lambda x: get_all_structure(x,genres_df)[::-1])

  0%|          | 0/104185 [00:00<?, ?it/s]

In [23]:
tracks_df.full_genre_id

101695           [38]
27876       [20, 138]
26114       [15, 468]
103035         [1235]
78687     [1235, 107]
             ...     
40842       [15, 468]
50295        [12, 45]
49868        [12, 66]
10532            [12]
61353       [21, 811]
Name: full_genre_id, Length: 104185, dtype: object

In [24]:
tracks_df.columns

Index(['track_id', 'track_title', 'valid_genre', 'last_genre_id',
       'full_genre_id'],
      dtype='object')

In [25]:
tracks_df = tracks_df[['track_id','full_genre_id']]

In [26]:
tracks_df.full_genre_id.values

array([list([38]), list([20, 138]), list([15, 468]), ..., list([12, 66]),
       list([12]), list([21, 811])], dtype=object)

In [27]:
tracks_df.full_genre_id.info

<bound method Series.info of 101695           [38]
27876       [20, 138]
26114       [15, 468]
103035         [1235]
78687     [1235, 107]
             ...     
40842       [15, 468]
50295        [12, 45]
49868        [12, 66]
10532            [12]
61353       [21, 811]
Name: full_genre_id, Length: 104185, dtype: object>

In [28]:
labels_size = tracks_df.full_genre_id.apply(lambda x: len(x))

In [81]:
labels_size = int(labels_size.max())

In [82]:
type(labels_size)

int

### Parse of label to structure

In [31]:
### Function for parse label to sctructure of hierarhical scheme

def parse_label(label,label_size=5):
    # label = label.split('-')
    # preencher com 0 no caso de haver menos de 5 níveis
    labels = np.zeros(label_size,dtype=int)
    for i, label in enumerate(label):
        if i == 5:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = label
    return labels

In [32]:
parsed_labels = tracks_df.full_genre_id.apply(lambda x: parse_label(x))

In [33]:
tracks_df['full_genre_id']

101695           [38]
27876       [20, 138]
26114       [15, 468]
103035         [1235]
78687     [1235, 107]
             ...     
40842       [15, 468]
50295        [12, 45]
49868        [12, 66]
10532            [12]
61353       [21, 811]
Name: full_genre_id, Length: 104185, dtype: object

In [34]:
def convert_label_to_string(x,level=2):
    return '-'.join([str(value) for value in x[:level]])

In [35]:
tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))
tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))
tracks_df['labels_3'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=3))
tracks_df['labels_4'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=4))
tracks_df['labels_5'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=5))

  0%|          | 0/104185 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))


  0%|          | 0/104185 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))


  0%|          | 0/104185 [00:00<?, ?it/s]

  0%|          | 0/104185 [00:00<?, ?it/s]

  0%|          | 0/104185 [00:00<?, ?it/s]

In [36]:
tracks_df = tracks_df[tracks_df['labels_1'].isin(["38","1235"])]

In [37]:
tracks_df['labels_2'].value_counts()

labels_2
38-0        6508
1235-0      6043
1235-107    3988
38-250      3379
38-247      2749
38-41       1997
38-224      1422
38-32       1207
1235-18      982
38-47        841
38-456       829
38-125       703
38-514       686
38-6         519
38-186       428
38-1         313
38-30        254
38-22        231
1235-267     201
Name: count, dtype: int64

In [38]:
tracks_df['labels_5'].value_counts()

labels_5
38-0-0-0-0          6508
1235-0-0-0-0        6043
1235-107-0-0-0      3988
38-250-0-0-0        3379
38-247-0-0-0        2749
38-41-0-0-0         1997
38-224-0-0-0        1422
38-32-0-0-0         1207
38-47-0-0-0          841
38-456-0-0-0         829
38-125-0-0-0         703
38-514-0-0-0         686
1235-18-0-0-0        659
38-186-0-0-0         428
1235-18-538-0-0      323
38-1-0-0-0           313
38-30-0-0-0          254
38-22-0-0-0          231
1235-267-0-0-0       201
38-6-16-763-0        194
38-6-360-0-0         190
38-6-16-763-1193      72
38-6-0-0-0            63
Name: count, dtype: int64

In [39]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(['3','14'])]

In [40]:
categories_df = pd.DataFrame({'level5':tracks_df.labels_5.unique()})

In [41]:
categories_df['level1'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:1]))
categories_df['level2'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:2]))
categories_df['level3'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:3]))
categories_df['level4'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:4]))

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [42]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4
0,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0
1,1235-0-0-0-0,1235,1235-0,1235-0-0,1235-0-0-0
2,1235-107-0-0-0,1235,1235-107,1235-107-0,1235-107-0-0
3,38-186-0-0-0,38,38-186,38-186-0,38-186-0-0
4,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0
5,38-247-0-0-0,38,38-247,38-247-0,38-247-0-0
6,38-41-0-0-0,38,38-41,38-41-0,38-41-0-0
7,38-224-0-0-0,38,38-224,38-224-0,38-224-0-0
8,38-514-0-0-0,38,38-514,38-514-0,38-514-0-0
9,38-456-0-0-0,38,38-456,38-456-0,38-456-0-0


In [43]:
def get_labels_name(x,genres_df):
    levels = 5
    full_name = []
    last_level = 0
    genre_root = ""
    for genre in x.split('-'):
        genre_df = genres_df[genres_df['genre_id'] == int(genre)]
        if genre_df.empty:
            genre_name = genre_root 
        else:
            genre_name = genre_df.title.values.tolist()[0]
            genre_root = genre_name
        
        full_name.append(genre_name)
    full_name = '>'.join(full_name)
        
    return full_name
    # return genres_df[genres_df['genre_id'] == int(x)].title.values.tolist()[0]

In [44]:
categories_df['level5_name'] = categories_df.level5.apply(lambda x: get_labels_name(x,genres_df))

In [45]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4,level5_name
0,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0,Experimental>Experimental>Experimental>Experim...
1,1235-0-0-0-0,1235,1235-0,1235-0-0,1235-0-0-0,Instrumental>Instrumental>Instrumental>Instrum...
2,1235-107-0-0-0,1235,1235-107,1235-107-0,1235-107-0-0,Instrumental>Ambient>Ambient>Ambient>Ambient
3,38-186-0-0-0,38,38-186,38-186-0,38-186-0-0,Experimental>Sound Poetry>Sound Poetry>Sound P...
4,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0,Experimental>Improv>Improv>Improv>Improv
5,38-247-0-0-0,38,38-247,38-247-0,38-247-0-0,Experimental>Musique Concrete>Musique Concrete...
6,38-41-0-0-0,38,38-41,38-41-0,38-41-0-0,Experimental>Electroacoustic>Electroacoustic>E...
7,38-224-0-0-0,38,38-224,38-224-0,38-224-0-0,Experimental>Sound Collage>Sound Collage>Sound...
8,38-514-0-0-0,38,38-514,38-514-0,38-514-0-0,Experimental>Sound Art>Sound Art>Sound Art>Sou...
9,38-456-0-0-0,38,38-456,38-456-0,38-456-0-0,Experimental>Minimalism>Minimalism>Minimalism>...


In [46]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    idx = 0
    
    for id_x, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        idx+=1
        
    for id_x, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        idx+=1
        
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        idx+=1
        
    for cat5,cat1,cat2,cat3,cat4,name5 in categories_df.values:
        
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
        
    return data

In [47]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [48]:
labels  = __create_labels__(categories_df)

In [49]:
labels['label4']

{'38-22-0-0': 43,
 '38-41-0-0': 44,
 '38-125-0-0': 45,
 '38-247-0-0': 46,
 '38-186-0-0': 47,
 '38-30-0-0': 48,
 '38-6-360-0': 49,
 '38-224-0-0': 50,
 '38-456-0-0': 51,
 '38-6-0-0': 52,
 '1235-0-0-0': 53,
 '38-32-0-0': 54,
 '38-6-16-763': 55,
 '1235-267-0-0': 56,
 '1235-107-0-0': 57,
 '38-1-0-0': 58,
 '38-0-0-0': 59,
 '38-47-0-0': 60,
 '38-514-0-0': 61,
 '38-250-0-0': 62,
 '1235-18-0-0': 63,
 '1235-18-538-0': 64}

In [50]:
labels['label1_count']

2

In [51]:


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [52]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [53]:
tracks_df['labels_5'].value_counts()

labels_5
38-0-0-0-0          6508
1235-0-0-0-0        6043
1235-107-0-0-0      3988
38-250-0-0-0        3379
38-247-0-0-0        2749
38-41-0-0-0         1997
38-224-0-0-0        1422
38-32-0-0-0         1207
38-47-0-0-0          841
38-456-0-0-0         829
38-125-0-0-0         703
38-514-0-0-0         686
1235-18-0-0-0        659
38-186-0-0-0         428
1235-18-538-0-0      323
38-1-0-0-0           313
38-30-0-0-0          254
38-22-0-0-0          231
1235-267-0-0-0       201
38-6-16-763-0        194
38-6-360-0-0         190
38-6-16-763-1193      72
38-6-0-0-0            63
Name: count, dtype: int64

In [54]:
def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second

In [55]:
def select_dataset(tracks_df):
    
#     dataset_testset_path = os.path.join(tfrecord_path,'test')
#     dataset_validationset_path = os.path.join(tfrecord_path,'val')
#     dataset_trainset_path = os.path.join(tfrecord_path,'train')
    
    df = load_dataset(args.dataset_path,dataset=args.embeddings)
    
    df.dropna(inplace=True)
    
    tracks_df = tracks_df.merge(df, on='track_id')
    
    tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])
    tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])
    tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])
    tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])
    tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])
    
    tests = []
    trains = []
    validations = []
    groups = tracks_df.groupby("labels_5")
    
    
    count = 0
    items_count = 0
    total = len(groups)
    total_items = len(tracks_df)
    oversampling_size = 30  # int(group_sizes.mean() + group_sizes.std() * 2)
    print(f"oversampling_size: {oversampling_size}")

    for code, group in groups:
        test, train_to_split = __split_data__(group, 0.01)  # 10%
        train_to_split = train_to_split
        validation, train = __split_data__(train_to_split, 0.01)  # %1

        tests.append(test)
        validations.append(validation)

        ## this increase the numner of samples when classes has low quantity
        count_train = len(train)
        if count_train < oversampling_size:
            train = train.sample(oversampling_size, replace=True)

        trains.append(train)

        count += 1
        items_count += count_train
        
        
    df_test = pd.concat(tests, sort=False).sample(frac=1).reset_index(drop=True)
    # .to_csv(dataset_testset_path, index=False,quoting=csv.QUOTE_ALL)
    df_val = pd.concat(validations, sort=False).sample(frac=1).reset_index(drop=True)
    df_train = pd.concat(trains, sort=False).sample(frac=1).reset_index(drop=True)

    return df_train,df_test,df_val

In [56]:
df_train,df_test,df_val = select_dataset(tracks_df)

2023-05-15 15:23:28.646966: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9574 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5
2023-05-15 15:23:28.877367: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_0' with dtype string and shape [11]
	 [[{{node Placeholder/_0}}]]


  0%|          | 0/33278 [00:00<?, ?it/s]

  0%|          | 0/33278 [00:00<?, ?it/s]

  0%|          | 0/33278 [00:00<?, ?it/s]

  0%|          | 0/33278 [00:00<?, ?it/s]

  0%|          | 0/33278 [00:00<?, ?it/s]

oversampling_size: 30


In [57]:
df_train

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,145309,"[38, 456]",0,5,38,51,21,"[0.055912975, 0.35137907, -0.016573707, -0.024..."
1,41054,"[38, 186]",0,20,26,47,0,"[-0.026045322, 0.027480403, -0.013469517, -0.0..."
2,86979,"[1235, 107]",1,9,22,57,15,"[0.33327603, -0.024481317, 0.18244474, -0.0607..."
3,120714,[1235],1,3,25,53,17,"[0.16110718, 0.060774684, -0.10526278, -0.0568..."
4,118846,[1235],1,3,25,53,17,"[0.045932412, 0.2162286, -0.019199193, -0.0332..."
...,...,...,...,...,...,...,...,...
32588,141261,"[1235, 107]",1,9,22,57,15,"[-0.0023950438, -0.00047282377, -0.0033394992,..."
32589,53322,[38],0,18,21,59,12,"[-0.026444316, 0.031975966, -0.057269026, 0.00..."
32590,40559,"[38, 6, 16, 763]",0,2,28,55,4,"[0.5035694, -0.12227299, -0.08204511, -0.00804..."
32591,145936,[1235],1,3,25,53,17,"[-0.0036885838, -0.00404045, 0.013731887, -0.0..."


In [64]:

def _bytes_feature(value):
    """Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

def _float_feature(value):
  """Returns a floast_list from a float / double."""
  return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
  array = tf.io.serialize_tensor(array)
  return array


In [65]:
def parse_single_music(data,labels):
    # cat1, cat2, cat3, cat4, cat5 = data
    track_id, _, cat1, cat2, cat3, cat4, cat5, music = data
    #define the dictionary -- the structure -- of our single example
    data = {
        'label1': _int64_feature(cat1),
        'label2': _int64_feature(cat2),
        'label3': _int64_feature(cat3),
        'label4': _int64_feature(cat4),
        'label5': _int64_feature(cat5),
        'emb' : _bytes_feature(serialize_array(music)),
        'track_id' : _int64_feature(track_id)
    }
    #create an Example, wrapping the single features
    out = tf.train.Example(features=tf.train.Features(feature=data))

    return out

In [66]:
def generate_tf_record(df,tf_path='val'):
    create_dir(tf_path)
    
    
    batch_size = 1024 * 50 # 50k records from each file batch
    count = 0
    total = math.ceil(len(df) / batch_size)

    for i in range(0, len(df), batch_size):
        batch_df = df[i:i+batch_size]
        
        tfrecords = [parse_single_music(data, labels) for data in batch_df.values]
        
        path = f"{tf_path}/{str(count).zfill(10)}.tfrecord"
        
        #with tf.python_io.TFRecordWriter(path) as writer:
        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        print(f"{count}/{total} batchs / {count * batch_size} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")
    
    return tf_path

    

In [67]:
tfrecord_path

'/mnt/disks/data/fma/trains/hierarchical_single/tfrecords'

In [68]:
!rm -R '/mnt/disks/data/fma/trains/hierarchical_single/tfrecords'

/usr/bin/zsh: /home/bruno/anaconda3/lib/libtinfo.so.6: no version information available (required by /usr/bin/zsh)


In [69]:
val_path = generate_tf_record(df_val,tf_path=os.path.join(tfrecord_path,'val'))

0 340 /mnt/disks/data/fma/trains/hierarchical_single/tfrecords/val/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 340 processed


In [70]:
test_path = generate_tf_record(df_test,tf_path=os.path.join(tfrecord_path,'test'))

0 345 /mnt/disks/data/fma/trains/hierarchical_single/tfrecords/test/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 345 processed


In [71]:
train_path = generate_tf_record(df_train,tf_path=os.path.join(tfrecord_path,'train'))

0 32593 /mnt/disks/data/fma/trains/hierarchical_single/tfrecords/train/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 32593 processed


In [83]:
def create_metadata(metadata_path):

    with open(metadata_path, 'w+') as f:
        f.write(json.dumps({
            
            'sequence_size': args.sequence_size,
            'n_levels': labels_size,
            'labels_size': [labels['label1_count'],labels['label2_count'],
                           labels['label3_count'],labels['label4_count'],
                           labels['label5_count']],
            'val_path': val_path,
            'train_path': train_path,
            'test_path': test_path,
            'trainset_count': len(df_train),
            'validationset_count': len(df_val),
            'testset_count': len(df_test)
        }))

In [84]:
create_metadata(metadata_path)

In [85]:
tracks_df.to_csv(os.path.join(train_path,"tracks.csv"),index=False)