In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import ast
import os
import csv
import math
from sklearn.utils import shuffle
from math import ceil
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

2023-10-09 17:31:20.447955: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
tqdm.pandas()

In [3]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "sequence_size": 1280,
    "train_id": "hierarchical_all",
    'sample_size': 1
})


In [4]:

base_path = "/mnt/disks/data/fma/trains"


job_path = os.path.join(base_path,args.train_id)


tfrecord_path = os.path.join(job_path,"tfrecords")

# In[16]:

base_path = os.path.join(args.root_dir,"fma")

# In[17]:

models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")

# In[18]:

metadata_path = os.path.join(job_path,"metadata.json")


categories_labels_path = os.path.join(job_path,"labels.json")


In [5]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [6]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [7]:
import shutil
shutil.rmtree(job_path)

In [8]:
create_dir(job_path)

True

## Load genres file. Contains relationships beetwen genres

In [9]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [10]:
genres_df

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5
...,...,...,...,...,...
158,1032,60,102,Turkish,2
159,1060,30,46,Tango,2
160,1156,26,130,Fado,2
161,1193,72,763,Christmas,38


In [11]:
genres_df[genres_df['genre_id'] == 495]

Unnamed: 0,genre_id,#tracks,parent,title,top_level
136,495,2061,15,Downtempo,15


In [12]:
# Cria um dicionário que associa o ID de cada música aos IDs de seus gêneros musicais
tracks_df = pd.read_csv(os.path.join(metadata_path_fma,'tracks_valid.csv'))

In [13]:
tracks_df = tracks_df.sample(frac=args.sample_size)

In [14]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre
77088,118368,All I Have,"['15', '38', '811']"
62086,95276,103+108B(2)+223(4)+69D+173(19),"['1', '38', '97']"
42870,64831,Green Stars And An Orange Sky,['58']
49627,74255,So Long To Folk Dance Again,"['12', '42', '66']"
40350,61445,EZ Money + Comancho,['185']
67082,105161,Trollsyn,"['38', '47', '107']"
79095,120755,Feather,"['15', '18', '1235']"
77593,118980,Constellation Blackbird,"['15', '38', '107']"
24219,38385,Breatin,"['98', '240', '297']"
82479,125316,Take A Little Time,"['12', '362']"


In [15]:
tracks_df.valid_genre.values

array(["['70']", "['17', '30', '86']", "['15', '32', '38']", ..., "['8']",
       "['15', '38', '42', '107', '184']", "['15', '32', '38', '41']"],
      dtype=object)

In [16]:
tracks_df.track_title

88008                                           Into Limbo
26522    Exotic Waterfall & Bird Magic, About 35 Km Fro...
32868                          la muerte tiembla la tierra
54872                                           Barceloner
36327                                        Nitrous Oxide
                               ...                        
51691                                               Nimbus
21638                  Microfiche (with Miriam Hanks-Todd)
9736                                 Paddle Your Own Canoe
28459                                                 Onyx
98412                                  Upheaval Version 59
Name: track_title, Length: 104186, dtype: object

In [17]:
## Get complete genre structure
def get_all_structure(estrutura,df_genres):
    ## Get structure from df_genres
    def get_all_structure_from_df(estrutura,df_genres,structure=[]):
        if estrutura == 0:
            return structure
        else:
            structure.append(int(estrutura))
            get_all_structure_from_df(df_genres[df_genres["genre_id"]==int(estrutura)].parent.values[0],df_genres,structure)
            return structure
    
    return get_all_structure_from_df(estrutura,df_genres,structure=[])
    

In [18]:
# tracks_df['valid_genre'] = tracks_df.track_genres.apply(lambda x: x.strip('][').split(', ') if x != '[]' else None)
tracks_df['valid_genre'] = tracks_df.valid_genre.apply(lambda x: ast.literal_eval(x))

In [19]:
tracks_df['last_genre_id'] = tracks_df.valid_genre.apply(lambda x:x[-1] if x != None else None)

In [20]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
94109,140672,Exile,"[21, 38, 70]",70
2345,4089,End of Cloud,[58],58
17209,28306,I Wanna Miss You,"[10, 66, 111]",111
76244,117222,Fraulein,"[9, 12, 85]",85
47076,70676,The Guitarist,"[10, 17, 66]",66
72924,112823,... и....? by Kosta T,"[38, 125, 514]",514
81698,124299,In[ner]cantations,"[1, 30, 38, 41, 247]",247
95566,142967,Red Eyed Troll,"[12, 25, 85]",85
100182,148960,Change,[15],15
73763,113897,130 old drum break,"[15, 811]",811


In [21]:
tracks_df.dropna(inplace=True)

In [22]:
tracks_df

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
88008,132701,Into Limbo,[70],70
26522,41359,"Exotic Waterfall & Bird Magic, About 35 Km Fro...","[17, 30, 86]",86
32868,51082,la muerte tiembla la tierra,"[15, 32, 38]",38
54872,83244,Barceloner,"[12, 27, 76]",76
36327,56012,Nitrous Oxide,"[32, 38, 250]",250
...,...,...,...,...
51691,77859,Nimbus,[15],15
21638,34615,Microfiche (with Miriam Hanks-Todd),"[1, 27, 76]",76
9736,16381,Paddle Your Own Canoe,[8],8
28459,43880,Onyx,"[15, 38, 42, 107, 184]",184


In [23]:
tracks_df['full_genre_id'] = tracks_df.last_genre_id.progress_apply(lambda x: get_all_structure(x,genres_df)[::-1])

  0%|          | 0/104185 [00:00<?, ?it/s]

In [26]:
tracks_df.full_genre_id

88008     [12, 70]
26522      [2, 86]
32868         [38]
54872     [10, 76]
36327    [38, 250]
           ...    
51691         [15]
21638     [10, 76]
9736           [8]
28459    [15, 184]
98412     [38, 41]
Name: full_genre_id, Length: 104185, dtype: object

In [27]:
tracks_df.columns

Index(['track_id', 'track_title', 'valid_genre', 'last_genre_id',
       'full_genre_id'],
      dtype='object')

In [28]:
tracks_df = tracks_df[['track_id','full_genre_id']]

In [29]:
tracks_df.full_genre_id.values

array([list([12, 70]), list([2, 86]), list([38]), ..., list([8]),
       list([15, 184]), list([38, 41])], dtype=object)

In [30]:
tracks_df.full_genre_id.info

<bound method Series.info of 88008     [12, 70]
26522      [2, 86]
32868         [38]
54872     [10, 76]
36327    [38, 250]
           ...    
51691         [15]
21638     [10, 76]
9736           [8]
28459    [15, 184]
98412     [38, 41]
Name: full_genre_id, Length: 104185, dtype: object>

In [31]:
labels_size = tracks_df.full_genre_id.apply(lambda x: len(x))

In [32]:
labels_size = int(labels_size.max())

In [33]:
type(labels_size)

int

### Parse of label to structure

In [34]:
### Function for parse label to sctructure of hierarhical scheme

def parse_label(label,label_size=5):
    # label = label.split('-')
    # preencher com 0 no caso de haver menos de 5 níveis
    labels = np.zeros(label_size,dtype=int)
    for i, label in enumerate(label):
        if i == 5:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = label
    return labels

In [35]:
parsed_labels = tracks_df.full_genre_id.apply(lambda x: parse_label(x))

In [36]:
tracks_df['full_genre_id']

88008     [12, 70]
26522      [2, 86]
32868         [38]
54872     [10, 76]
36327    [38, 250]
           ...    
51691         [15]
21638     [10, 76]
9736           [8]
28459    [15, 184]
98412     [38, 41]
Name: full_genre_id, Length: 104185, dtype: object

In [37]:
def convert_label_to_string(x,level=2):
    return '-'.join([str(value) for value in x[:level]])

In [38]:
tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))
tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))
tracks_df['labels_3'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=3))
tracks_df['labels_4'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=4))
tracks_df['labels_5'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=5))

  0%|          | 0/104185 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))


  0%|          | 0/104185 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))


  0%|          | 0/104185 [00:00<?, ?it/s]

  0%|          | 0/104185 [00:00<?, ?it/s]

  0%|          | 0/104185 [00:00<?, ?it/s]

In [39]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(["38","1235"])]

In [40]:
tracks_df['labels_2'].value_counts()

labels_2
38-0        6508
1235-0      6043
12-25       5706
10-76       4126
1235-107    3988
            ... 
13-0          17
20-374         9
2-117          8
5-444          2
20-7           2
Name: count, Length: 121, dtype: int64

In [41]:
tracks_df['labels_5'].value_counts()

labels_5
38-0-0-0-0        6508
1235-0-0-0-0      6043
10-76-0-0-0       4126
1235-107-0-0-0    3988
17-103-0-0-0      3482
                  ... 
9-651-493-0-0        4
20-65-189-0-0        4
2-86-173-0-0         4
5-444-0-0-0          2
20-7-0-0-0           2
Name: count, Length: 159, dtype: int64

In [42]:
# tracks_df = tracks_df[tracks_df['labels_1'].isin(['3','14'])]

In [43]:
categories_df = pd.DataFrame({'level5':tracks_df.labels_5.unique()})

In [44]:
categories_df['level1'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:1]))
categories_df['level2'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:2]))
categories_df['level3'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:3]))
categories_df['level4'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:4]))

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

  0%|          | 0/159 [00:00<?, ?it/s]

In [45]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4
0,12-70-0-0-0,12,12-70,12-70-0,12-70-0-0
1,2-86-0-0-0,2,2-86,2-86-0,2-86-0-0
2,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0
3,10-76-0-0-0,10,10-76,10-76-0,10-76-0-0
4,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0
...,...,...,...,...,...
154,20-465-0-0-0,20,20-465,20-465-0,20-465-0-0
155,20-65-189-0-0,20,20-65,20-65-189,20-65-189-0
156,2-86-173-0-0,2,2-86,2-86-173,2-86-173-0
157,5-444-0-0-0,5,5-444,5-444-0,5-444-0-0


In [46]:
def get_labels_name(x,genres_df):
    levels = 5
    full_name = []
    last_level = 0
    genre_root = ""
    for genre in x.split('-'):
        genre_df = genres_df[genres_df['genre_id'] == int(genre)]
        if genre_df.empty:
            genre_name = genre_root 
        else:
            genre_name = genre_df.title.values.tolist()[0]
            genre_root = genre_name
        
        full_name.append(genre_name)
    full_name = '>'.join(full_name)
        
    return full_name
    # return genres_df[genres_df['genre_id'] == int(x)].title.values.tolist()[0]

In [47]:
categories_df['level5_name'] = categories_df.level5.apply(lambda x: get_labels_name(x,genres_df))

In [48]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4,level5_name
0,12-70-0-0-0,12,12-70,12-70-0,12-70-0-0,Rock>Industrial>Industrial>Industrial>Industrial
1,2-86-0-0-0,2,2-86,2-86-0,2-86-0-0,International>Indian>Indian>Indian>Indian
2,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0,Experimental>Experimental>Experimental>Experim...
3,10-76-0-0-0,10,10-76,10-76-0,10-76-0-0,Pop>Experimental Pop>Experimental Pop>Experime...
4,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0,Experimental>Improv>Improv>Improv>Improv
...,...,...,...,...,...,...
154,20-465-0-0-0,20,20-465,20-465-0,20-465-0-0,Spoken>Musical Theater>Musical Theater>Musical...
155,20-65-189-0-0,20,20-65,20-65-189,20-65-189-0,Spoken>Radio>Talk Radio>Talk Radio>Talk Radio
156,2-86-173-0-0,2,2-86,2-86-173,2-86-173-0,International>Indian>N. Indian Traditional>N. ...
157,5-444-0-0-0,5,5-444,5-444-0,5-444-0-0,Classical>Symphony>Symphony>Symphony>Symphony


In [49]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    idx = 0
    
    for id_x, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        idx+=1
        
    for id_x, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1
        idx+=1

    for id_x, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        idx+=1
        
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        idx+=1
        
    for cat5,cat1,cat2,cat3,cat4,name5 in categories_df.values:
        
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
        
    return data

In [50]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [51]:
labels  = __create_labels__(categories_df)

In [52]:
labels['label5']

{'38-22-0-0-0': 0,
 '2-86-173-0-0': 1,
 '9-169-0-0-0': 2,
 '9-137-0-0-0': 3,
 '20-188-0-0-0': 4,
 '15-42-0-0-0': 5,
 '12-58-0-0-0': 6,
 '2-177-0-0-0': 7,
 '2-92-0-0-0': 8,
 '12-25-64-0-0': 9,
 '2-118-0-0-0': 10,
 '2-46-1060-0-0': 11,
 '38-41-0-0-0': 12,
 '12-27-0-0-0': 13,
 '8-0-0-0-0': 14,
 '3-0-0-0-0': 15,
 '17-103-0-0-0': 16,
 '38-6-360-0-0': 17,
 '21-0-0-0-0': 18,
 '17-49-0-0-0': 19,
 '4-179-0-0-0': 20,
 '15-182-400-0-0': 21,
 '12-85-404-0-0': 22,
 '15-468-0-0-0': 23,
 '15-183-0-0-0': 24,
 '20-65-189-0-0': 25,
 '5-0-0-0-0': 26,
 '20-374-0-0-0': 27,
 '38-47-0-0-0': 28,
 '21-83-0-0-0': 29,
 '12-26-0-0-0': 30,
 '20-138-0-0-0': 31,
 '12-85-0-0-0': 32,
 '21-580-0-0-0': 33,
 '12-25-71-0-0': 34,
 '15-181-0-0-0': 35,
 '38-6-0-0-0': 36,
 '2-0-0-0-0': 37,
 '20-7-0-0-0': 38,
 '12-25-111-0-0': 39,
 '2-171-0-0-0': 40,
 '12-359-0-0-0': 41,
 '1235-18-538-0-0': 42,
 '2-102-0-0-0': 43,
 '2-92-214-0-0': 44,
 '12-70-0-0-0': 45,
 '2-46-0-0-0': 46,
 '2-46-502-0-0': 47,
 '2-232-0-0-0': 48,
 '12-25-109-0

In [53]:
labels['label1_count']

16

In [54]:


def parse_tfr_element(element):
    #use the same structure as above; it's kinda an outline of the structure we now want to create
    data = {
        'emb' : tf.io.FixedLenFeature([], tf.string),
        'track_id' : tf.io.FixedLenFeature([], tf.int64),
    }
    
    content = tf.io.parse_single_example(element, data)

    track_id = content['track_id']
    emb = content['emb']
    

    #get our 'feature'-- our image -- and reshape it appropriately
    feature = tf.io.parse_tensor(emb, out_type=tf.float32)
    return (feature, track_id)


def get_dataset(filename):
    #create the dataset
    dataset = tf.data.TFRecordDataset(filename)

    #pass every single feature through our mapping function
    dataset = dataset.map(
        parse_tfr_element
    )

    return dataset




In [55]:
import numpy as np


def load_dataset(path,dataset=args.embeddings):
    tfrecords_path = os.path.join(path,'tfrecords',dataset)
    
    
    tfrecords_path = [os.path.join(tfrecords_path,path) for path in os.listdir(tfrecords_path)]
    dataset = get_dataset(tfrecords_path)
    
    df = pd.DataFrame(
        dataset.as_numpy_iterator(),
        columns=['feature', 'track_id']
    )
        
    df.dropna(inplace=True)
    
    
    try:
        df.feature = df.feature.apply(lambda x: x[0] if x.shape[0] != 0 else None)
    except:
        print(x)
    
    return df
    


In [56]:
tracks_df['labels_5'].value_counts()

labels_5
38-0-0-0-0        6508
1235-0-0-0-0      6043
10-76-0-0-0       4126
1235-107-0-0-0    3988
17-103-0-0-0      3482
                  ... 
9-651-493-0-0        4
20-65-189-0-0        4
2-86-173-0-0         4
5-444-0-0-0          2
20-7-0-0-0           2
Name: count, Length: 159, dtype: int64

In [57]:
def __split_data__(group, percentage=0.1):
    if len(group) == 1:
        return group, group

    shuffled = shuffle(group.values)
    finish_test = int(ceil(len(group) * percentage))

    first = pd.DataFrame(shuffled[:finish_test], columns=group.columns)
    second = pd.DataFrame(shuffled[finish_test:], columns=group.columns)

    return first, second

In [58]:
def select_dataset(tracks_df):
    
#     dataset_testset_path = os.path.join(tfrecord_path,'test')
#     dataset_validationset_path = os.path.join(tfrecord_path,'val')
#     dataset_trainset_path = os.path.join(tfrecord_path,'train')
    
    df = load_dataset(args.dataset_path,dataset=args.embeddings)
    
    df.dropna(inplace=True)
    
    tracks_df = tracks_df.merge(df, on='track_id')
    
    tracks_df.loc[:,'labels_1'] = tracks_df.labels_1.astype(str).progress_apply(lambda x: labels['label1'][x])
    tracks_df.loc[:,'labels_2'] = tracks_df.labels_2.astype(str).progress_apply(lambda x: labels['label2'][x])
    tracks_df.loc[:,'labels_3'] = tracks_df.labels_3.astype(str).progress_apply(lambda x: labels['label3'][x])
    tracks_df.loc[:,'labels_4'] = tracks_df.labels_4.astype(str).progress_apply(lambda x: labels['label4'][x])
    tracks_df.loc[:,'labels_5'] = tracks_df.labels_5.astype(str).progress_apply(lambda x: labels['label5'][x])
    
    tests = []
    trains = []
    validations = []
    groups = tracks_df.groupby("labels_5")
    
    
    count = 0
    items_count = 0
    total = len(groups)
    total_items = len(tracks_df)
    oversampling_size = 30  # int(group_sizes.mean() + group_sizes.std() * 2)
    print(f"oversampling_size: {oversampling_size}")

    for code, group in groups:
        test, train_to_split = __split_data__(group, 0.01)  # 10%
        train_to_split = train_to_split
        validation, train = __split_data__(train_to_split, 0.01)  # %1

        tests.append(test)
        validations.append(validation)

        ## this increase the numner of samples when classes has low quantity
        count_train = len(train)
        if count_train < oversampling_size:
            train = train.sample(oversampling_size, replace=True)

        trains.append(train)

        count += 1
        items_count += count_train
        
        
    df_test = pd.concat(tests, sort=False).sample(frac=1).reset_index(drop=True)
    # .to_csv(dataset_testset_path, index=False,quoting=csv.QUOTE_ALL)
    df_val = pd.concat(validations, sort=False).sample(frac=1).reset_index(drop=True)
    df_train = pd.concat(trains, sort=False).sample(frac=1).reset_index(drop=True)

    return df_train,df_test,df_val

In [59]:
df_train,df_test,df_val = select_dataset(tracks_df)

2023-10-09 17:32:27.905047: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9630 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 2080 Ti, pci bus id: 0000:65:00.0, compute capability: 7.5


  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

  0%|          | 0/104169 [00:00<?, ?it/s]

oversampling_size: 30


In [60]:
df_train

Unnamed: 0,track_id,full_genre_id,labels_1,labels_2,labels_3,labels_4,labels_5,feature
0,115627,"[12, 25, 89]",8,135,164,373,84,"[0.021790445, 0.29853275, -0.026587656, 0.0666..."
1,146915,"[38, 41]",1,78,158,371,12,"[0.1287572, -0.04188484, -0.020825902, 0.01465..."
2,144872,"[38, 250]",1,92,218,385,126,"[0.084029794, -7.178386e-05, -0.053328514, -0...."
3,66741,"[1235, 18]",4,105,217,312,72,"[0.11808327, 0.10941779, 0.40051472, -0.049361..."
4,121728,"[1235, 107]",4,25,206,331,93,"[-0.03590433, 0.020767719, -0.03485803, -0.030..."
...,...,...,...,...,...,...,...,...
102226,96841,"[38, 41]",1,78,158,371,12,"[-0.03989613, -0.004323274, 0.02414892, 0.0396..."
102227,19591,[8],5,68,267,338,14,"[0.056298267, -0.018925885, -0.04787774, -0.01..."
102228,5166,[12],8,128,236,442,69,"[0.031009829, 0.012057076, -0.03138266, 0.0563..."
102229,106189,"[15, 286]",2,132,160,393,87,"[0.009615977, 0.12510824, 0.010890921, 0.00987..."


In [61]:
def _bytes_feature(value):
  ### Returns a bytes_list from a string / byte."""
    if isinstance(value, type(tf.constant(0))): # if value ist tensor
        value = value.numpy() # get value of tensor
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=value))

def _float_feature(value):
  ### Returns a floast_list from a float / double."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))

def _int64List_feature(value):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _int64_feature(value):
  ###  Returns an int64_list from a bool / enum / int / uint."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

def serialize_array(array):
    array = tf.io.serialize_tensor(array)
    return array

In [62]:
def parse_single_music(data,labels):
    track_id, _, cat1, cat2, cat3, cat4, cat5, music = data
    
    
    label1 = np.array([cat1, labels['label1_count']], np.int64)
    label2 = np.array([cat2, labels['label2_count']], np.int64)
    label3 = np.array([cat3, labels['label3_count']], np.int64)
    label4 = np.array([cat4, labels['label4_count']], np.int64)
    label5 = np.array([cat5, labels['label5_count']], np.int64)
    
    
    
    #define the dictionary -- the structure -- of our single example
    data = {
        'label1': _int64List_feature(label1),
        'label2': _int64List_feature(label2),
        'label3': _int64List_feature(label3),
        'label4': _int64List_feature(label4),
        'label5': _int64List_feature(label5),
        # 'features' : _bytes_feature(serialize_array(music)),
        'features' : _float_feature(music),
        'track_id' : _int64_feature(track_id)
    }
    #create an Example, wrapping the single features
    out = tf.train.Example(features=tf.train.Features(feature=data))

    return out

In [63]:
def generate_tf_record(df,tf_path='val'):
    create_dir(tf_path)
    
    
    batch_size = 1024 * 50 # 50k records from each file batch
    count = 0
    total = math.ceil(len(df) / batch_size)

    for i in range(0, len(df), batch_size):
        batch_df = df[i:i+batch_size]
        
        tfrecords = [parse_single_music(data, labels) for data in batch_df.values]
        
        path = f"{tf_path}/{str(count).zfill(10)}.tfrecord"
        
        #with tf.python_io.TFRecordWriter(path) as writer:
        with tf.io.TFRecordWriter(path) as writer:
            for tfrecord in tfrecords:
                writer.write(tfrecord.SerializeToString())

        print(f"{count} {len(tfrecords)} {path}")
        count += 1
        print(f"{count}/{total} batchs / {count * batch_size} processed")

    print(f"{count}/{total} batchs / {len(df)} processed")
    
    return tf_path

    

In [64]:
tfrecord_path

'/mnt/disks/data/fma/trains/hierarchical_all/tfrecords'

In [65]:
val_path = generate_tf_record(df_val,tf_path=os.path.join(tfrecord_path,'val'))

0 1114 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/val/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 1114 processed


In [66]:
test_path = generate_tf_record(df_test,tf_path=os.path.join(tfrecord_path,'test'))

0 1125 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/test/0000000000.tfrecord
1/1 batchs / 51200 processed
1/1 batchs / 1125 processed


In [67]:
train_path = generate_tf_record(df_train,tf_path=os.path.join(tfrecord_path,'train'))

0 51200 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/train/0000000000.tfrecord
1/2 batchs / 51200 processed
1 51031 /mnt/disks/data/fma/trains/hierarchical_all/tfrecords/train/0000000001.tfrecord
2/2 batchs / 102400 processed
2/2 batchs / 102231 processed


In [68]:
def create_metadata(metadata_path):

    with open(metadata_path, 'w+') as f:
        f.write(json.dumps({
            'sequence_size': args.sequence_size,
            'n_levels': labels_size,
            'labels_size': [labels['label1_count'],labels['label2_count'],
                           labels['label3_count'],labels['label4_count'],
                           labels['label5_count']],
            'val_path': val_path,
            'train_path': train_path,
            'test_path': test_path,
            'trainset_count': len(df_train),
            'validationset_count': len(df_val),
            'testset_count': len(df_test)
        }))

In [69]:
create_metadata(metadata_path)

In [70]:
job_path

'/mnt/disks/data/fma/trains/hierarchical_all'

In [71]:
tracks_df.to_csv(os.path.join(job_path,"tracks.csv"),index=False)

In [72]:
with open(categories_labels_path, 'r') as f:
    labels = json.loads(f.read())

In [130]:
levels_size = {'level1_size': labels['label1_count']-1,
        'level2_size': labels['label2_count']-1,
        'level3_size': labels['label3_count']-1,
        'level4_size': labels['label4_count']-1,
        'level5_size': labels['label5_count']-1}

In [131]:
levels_size['level1_size']

15