In [1]:
import pandas as pd
import numpy as np
import json
import ast
import os

In [2]:
from tqdm.notebook import tqdm

In [3]:
tqdm.pandas()

In [4]:

args = pd.Series({
    "root_dir":"/mnt/disks/data/",
    "dataset_path":"/mnt/disks/data/fma/fma_large",
    "embeddings":"music_style",
    "train_id": "hierarchical_partition",
    'sample_size':1
})


In [5]:

job_path = "/mnt/disks/data/fma/trains"

# In[15]:

train_path = os.path.join(job_path,args.train_id)

# In[16]:

base_path = os.path.join(args.root_dir,"fma")

# In[17]:

models_path = os.path.join(args.root_dir,"models")


metadata_path_fma = os.path.join(base_path,"fma_metadata")

# In[18]:

metadata_file = os.path.join(train_path,"metadata.json")


categories_labels_path = os.path.join(train_path,"labels.json")


In [6]:


def __load_json__(path):
    with open(path, 'r') as f:
        tmp = json.loads(f.read())

    return tmp




In [7]:

def create_dir(path):
    # checking if the directory demo_folder2 
    # exist or not.
    if not os.path.isdir(path):

        # if the demo_folder2 directory is 
        # not present then create it.
        os.makedirs(path)
    return True



In [8]:
create_dir(train_path)

True

## Load genres file. Contains relationships beetwen genres

In [9]:
genres_df = pd.read_csv(os.path.join(metadata_path_fma,'genres.csv'))


In [10]:
genres_df

Unnamed: 0,genre_id,#tracks,parent,title,top_level
0,1,8693,38,Avant-Garde,38
1,2,5271,0,International,2
2,3,1752,0,Blues,3
3,4,4126,0,Jazz,4
4,5,4106,0,Classical,5
...,...,...,...,...,...
158,1032,60,102,Turkish,2
159,1060,30,46,Tango,2
160,1156,26,130,Fado,2
161,1193,72,763,Christmas,38


In [11]:
genres_df[genres_df['genre_id'] == 495]

Unnamed: 0,genre_id,#tracks,parent,title,top_level
136,495,2061,15,Downtempo,15


In [12]:
# Cria um dicionário que associa o ID de cada música aos IDs de seus gêneros musicais
tracks_df = pd.read_csv(os.path.join(metadata_path_fma,'tracks_valid.csv'))

In [13]:
tracks_df = tracks_df.sample(frac=args.sample_size)

In [14]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre
56573,85364,Affectum Avibus,"['15', '30', '38']"
72844,112739,Stay Lost,"['17', '94', '137']"
1566,1953,Rancid Hearts,['12']
29983,46496,The Coming of Kiegher's Army,"['15', '18', '322']"
77143,118475,Martin,"['76', '79', '362']"
9873,16577,East 105st (AHX Ghetto Mix),"['15', '297']"
75394,116021,Democracia (Sci Fi Industries Remix),['15']
99628,148345,Rico-gabber,['695']
52774,80506,River Part One,"['15', '181', '659']"
80773,123050,Fremen,"['42', '811']"


In [15]:
tracks_df.valid_genre.values

array(["['26', '107', '267']", "['38', '41', '247']", "['76', '362']",
       ..., "['12', '66', '111']", "['12']", "['12', '25', '27']"],
      dtype=object)

In [16]:
tracks_df.track_title

85067                                        "why?"
102949    Une sombre histoire de palourde japonaise
29251                                    GRAVESLAVE
16773                              Lowtom Interlude
84542                                    j a b s 10
                            ...                    
50619                                      4. Slaps
2689                          Under Skin Or By Name
46131                               Sea of Darkness
34680                                   Interview 2
83294                                          Sick
Name: track_title, Length: 104186, dtype: object

In [17]:
## Get complete genre structure
def get_all_structure(estrutura,df_genres):
    ## Get structure from df_genres
    def get_all_structure_from_df(estrutura,df_genres,structure=[]):
        if estrutura == 0:
            return structure
        else:
            structure.append(estrutura)
            get_all_structure_from_df(df_genres[df_genres["genre_id"]==int(estrutura)].parent.values[0],df_genres,structure)
            return structure
    
    return get_all_structure_from_df(estrutura,df_genres,structure=[])
    

In [18]:
# tracks_df['valid_genre'] = tracks_df.track_genres.apply(lambda x: x.strip('][').split(', ') if x != '[]' else None)
tracks_df['valid_genre'] = tracks_df.valid_genre.apply(lambda x: ast.literal_eval(x))

In [19]:
tracks_df['valid_genre']

85067     [26, 107, 267]
102949     [38, 41, 247]
29251          [76, 362]
16773            [4, 38]
84542               [15]
               ...      
50619               [21]
2689                [58]
46131      [12, 66, 111]
34680               [12]
83294       [12, 25, 27]
Name: valid_genre, Length: 104186, dtype: object

In [20]:
tracks_df['last_genre_id'] = tracks_df.valid_genre.apply(lambda x:x[-1] if x != None else None)

In [21]:
tracks_df.sample(20)

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
57313,86231,Lake Of Misplacement,"[36, 38]",38
60416,92308,El esclavo del amor,"[25, 46, 76]",76
21916,35052,Switched On,"[15, 38, 70, 183, 236]",236
47238,70935,When It Rains,[659],659
4073,8203,quarter pounder,[25],25
20094,32459,They Rot,"[31, 58, 90]",90
22978,36416,Horizon (lyrics by Robert Mitchell),"[66, 186, 188, 361]",361
29394,45383,Further...,"[3, 10, 27, 33, 58, 103, 137]",137
55431,83912,Goodbye My Darling,"[1, 38, 186]",186
37621,57852,Texas Stampede,"[10, 89, 109]",109


In [22]:
# tracks_df.dropna(inplace=True)

In [23]:
tracks_df

Unnamed: 0,track_id,track_title,valid_genre,last_genre_id
85067,129006,"""why?""","[26, 107, 267]",267
102949,153877,Une sombre histoire de palourde japonaise,"[38, 41, 247]",247
29251,45120,GRAVESLAVE,"[76, 362]",362
16773,27635,Lowtom Interlude,"[4, 38]",38
84542,128257,j a b s 10,[15],15
...,...,...,...,...
50619,75655,4. Slaps,[21],21
2689,4502,Under Skin Or By Name,[58],58
46131,69261,Sea of Darkness,"[12, 66, 111]",111
34680,53353,Interview 2,[12],12


In [24]:
tracks_df['full_genre_id'] = tracks_df.last_genre_id.apply(lambda x: get_all_structure(x,genres_df)[::-1])

In [25]:
tracks_df.full_genre_id.value_counts()

[38]             6508
[1235]           6043
[10, 76]         4126
[1235, 107]      3988
[17, 103]        3482
                 ... 
[20, 65, 189]       4
[9, 651, 493]       4
[2, 86, 173]        4
[20, 7]             2
[5, 444]            2
Name: full_genre_id, Length: 159, dtype: int64

In [26]:
tracks_df.columns

Index(['track_id', 'track_title', 'valid_genre', 'last_genre_id',
       'full_genre_id'],
      dtype='object')

In [27]:
tracks_df = tracks_df[['track_id','full_genre_id']]

In [28]:
tracks_df.full_genre_id.value_counts()

[38]             6508
[1235]           6043
[10, 76]         4126
[1235, 107]      3988
[17, 103]        3482
                 ... 
[20, 65, 189]       4
[9, 651, 493]       4
[2, 86, 173]        4
[20, 7]             2
[5, 444]            2
Name: full_genre_id, Length: 159, dtype: int64

In [29]:
tracks_df.full_genre_id.info

<bound method Series.info of 85067       [1235, 267]
102949        [38, 247]
29251         [10, 362]
16773              [38]
84542              [15]
              ...      
50619              [21]
2689           [12, 58]
46131     [12, 25, 111]
34680              [12]
83294          [12, 27]
Name: full_genre_id, Length: 104186, dtype: object>

In [30]:
labels_size = tracks_df.full_genre_id.apply(lambda x: len(x))

In [31]:
labels_size = labels_size.max()

In [32]:
labels_size

5

In [33]:
import pandas as pd
import os

### Parse of label to structure

In [34]:
### Function for parse label to sctructure of hierarhical scheme

def parse_label(label,label_size=5):
    # label = label.split('-')
    # preencher com 0 no caso de haver menos de 5 níveis
    labels = np.zeros(label_size,dtype=int)
    for i, label in enumerate(label):
        if i == 5:
            break
        # Aqui você pode fazer a conversão do label em um índice inteiro usando um dicionário ou outro método
        # Neste exemplo, estou apenas usando a posição da label na lista como índice
        labels[i] = label
    return labels

In [35]:
tracks_df.full_genre_id.value_counts()

[38]             6508
[1235]           6043
[10, 76]         4126
[1235, 107]      3988
[17, 103]        3482
                 ... 
[20, 65, 189]       4
[9, 651, 493]       4
[2, 86, 173]        4
[20, 7]             2
[5, 444]            2
Name: full_genre_id, Length: 159, dtype: int64

In [36]:
parsed_labels = tracks_df.full_genre_id.apply(lambda x: parse_label(x))

In [37]:
def convert_label_to_string(x,level=2):
    return '-'.join([str(value) for value in x[:level]])

In [38]:
tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))
tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))
tracks_df['labels_3'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=3))
tracks_df['labels_4'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=4))
tracks_df['labels_5'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=5))

  0%|          | 0/104186 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_1'] = parsed_labels.progress_apply(lambda x: str(x[:1][0]))


  0%|          | 0/104186 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tracks_df['labels_2'] = parsed_labels.progress_apply(lambda x: convert_label_to_string(x,level=2))


  0%|          | 0/104186 [00:00<?, ?it/s]

  0%|          | 0/104186 [00:00<?, ?it/s]

  0%|          | 0/104186 [00:00<?, ?it/s]

In [41]:
tracks_df.labels_1.value_counts()

38      22066
12      21710
15      18493
1235    11214
17       6900
10       6514
21       6453
2        3331
5        2456
20       1183
4        1147
9        1023
8         620
13        466
14        430
3         180
Name: labels_1, dtype: int64

In [42]:
tracks_df = tracks_df[tracks_df['labels_1'].isin(['38','12'])]

In [43]:
tracks_df.to_csv(os.path.join(train_path,"tracks.csv"),index=False)

In [44]:
tracks_df = pd.read_csv(os.path.join(train_path,"tracks.csv"))

In [45]:
categories_df = pd.DataFrame({'level5':tracks_df.labels_5.unique()})

In [46]:
categories_df['level1'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:1]))
categories_df['level2'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:2]))
categories_df['level3'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:3]))
categories_df['level4'] = categories_df.level5.progress_apply(lambda x: '-'.join(x.split('-')[:4]))

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

  0%|          | 0/47 [00:00<?, ?it/s]

In [47]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4
0,38-247-0-0-0,38,38-247,38-247-0,38-247-0-0
1,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0
2,12-26-113-0-0,12,12-26,12-26-113,12-26-113-0
3,12-25-89-0-0,12,12-25,12-25-89,12-25-89-0
4,12-0-0-0-0,12,12-0,12-0-0,12-0-0-0
5,12-36-0-0-0,12,12-36,12-36-0,12-36-0-0
6,12-27-0-0-0,12,12-27,12-27-0,12-27-0-0
7,12-66-0-0-0,12,12-66,12-66-0,12-66-0-0
8,38-224-0-0-0,38,38-224,38-224-0,38-224-0-0
9,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0


In [48]:
def get_labels_name(x,genres_df):
    levels = 5
    full_name = []
    last_level = 0
    genre_root = ""
    for genre in x.split('-'):
        genre_df = genres_df[genres_df['genre_id'] == int(genre)]
        if genre_df.empty:
            genre_name = genre_root 
        else:
            genre_name = genre_df.title.values.tolist()[0]
            genre_root = genre_name
        
        full_name.append(genre_name)
    full_name = '>'.join(full_name)
        
    return full_name
    # return genres_df[genres_df['genre_id'] == int(x)].title.values.tolist()[0]

In [49]:
categories_df['level5_name'] = categories_df.level5.apply(lambda x: get_labels_name(x,genres_df))

In [50]:
categories_df

Unnamed: 0,level5,level1,level2,level3,level4,level5_name
0,38-247-0-0-0,38,38-247,38-247-0,38-247-0-0,Experimental>Musique Concrete>Musique Concrete...
1,38-0-0-0-0,38,38-0,38-0-0,38-0-0-0,Experimental>Experimental>Experimental>Experim...
2,12-26-113-0-0,12,12-26,12-26-113,12-26-113-0,Rock>Post-Rock>Space-Rock>Space-Rock>Space-Rock
3,12-25-89-0-0,12,12-25,12-25-89,12-25-89-0,Rock>Punk>Post-Punk>Post-Punk>Post-Punk
4,12-0-0-0-0,12,12-0,12-0-0,12-0-0-0,Rock>Rock>Rock>Rock>Rock
5,12-36-0-0-0,12,12-36,12-36-0,12-36-0-0,Rock>Krautrock>Krautrock>Krautrock>Krautrock
6,12-27-0-0-0,12,12-27,12-27-0,12-27-0-0,Rock>Lo-Fi>Lo-Fi>Lo-Fi>Lo-Fi
7,12-66-0-0-0,12,12-66,12-66-0,12-66-0-0,Rock>Indie-Rock>Indie-Rock>Indie-Rock>Indie-Rock
8,38-224-0-0-0,38,38-224,38-224-0,38-224-0-0,Experimental>Sound Collage>Sound Collage>Sound...
9,38-250-0-0-0,38,38-250,38-250-0,38-250-0-0,Experimental>Improv>Improv>Improv>Improv


In [51]:
def __create_labels__(categories_df):
    data = {
        "label1": {},
        "label2": {},
        "label3": {},
        "label4": {},
        "label5": {},
        "label1_inverse": [],
        "label2_inverse": [],
        "label3_inverse": [],
        "label4_inverse": [],
        "label5_inverse": [],
        "label1_name": {},
        "label2_name": {},
        "label3_name": {},
        "label4_name": {},
        "label5_name": {},
    }

    for idx, cat in enumerate(set(categories_df.level1.values.tolist())):
        data['label1'][cat] = idx
        data['label1_inverse'].append(cat)
        data['label1_count'] = idx + 1

    for idx, cat in enumerate(set(categories_df.level2.values.tolist())):
        data['label2'][cat] = idx
        data['label2_inverse'].append(cat)
        data['label2_count'] = idx + 1
        
    for idx, cat in enumerate(set(categories_df.level3.values.tolist())):
        data['label3'][cat] = idx
        data['label3_inverse'].append(cat)
        data['label3_count'] = idx + 1

    for idx, cat in enumerate(set(categories_df.level4.values.tolist())):
        data['label4'][cat] = idx
        data['label4_inverse'].append(cat)
        data['label4_count'] = idx + 1
        
    for idx, cat in enumerate(set(categories_df.level5.values.tolist())):
        data['label5'][cat] = idx
        data['label5_inverse'].append(cat)
        data['label5_count'] = idx + 1
        
    for cat5,cat1,cat2,cat3,cat4,name5 in categories_df.values:
        
        name1 = '>'.join(name5.split('>')[:1])
        name2 = '>'.join(name5.split('>')[:2])
        name3 = '>'.join(name5.split('>')[:3])
        name4 = '>'.join(name5.split('>')[:4])
        
        
        data['label1_name'][cat1] = name1
        data['label2_name'][cat2] = name2
        data['label3_name'][cat3] = name3
        data['label4_name'][cat4] = name4
        data['label5_name'][cat5] = name5
        
    return data

In [52]:
with open(categories_labels_path, 'w+') as f:
    f.write(json.dumps(__create_labels__(categories_df)))

In [53]:
labels  = __create_labels__(categories_df)

In [54]:
labels['label3_name']

{'38-247-0': 'Experimental>Musique Concrete>Musique Concrete',
 '38-0-0': 'Experimental>Experimental>Experimental',
 '12-26-113': 'Rock>Post-Rock>Space-Rock',
 '12-25-89': 'Rock>Punk>Post-Punk',
 '12-0-0': 'Rock>Rock>Rock',
 '12-36-0': 'Rock>Krautrock>Krautrock',
 '12-27-0': 'Rock>Lo-Fi>Lo-Fi',
 '12-66-0': 'Rock>Indie-Rock>Indie-Rock',
 '38-224-0': 'Experimental>Sound Collage>Sound Collage',
 '38-250-0': 'Experimental>Improv>Improv',
 '12-45-53': 'Rock>Loud-Rock>Noise-Rock',
 '12-58-0': 'Rock>Psych-Rock>Psych-Rock',
 '38-30-0': 'Experimental>Field Recordings>Field Recordings',
 '12-314-0': 'Rock>Goth>Goth',
 '38-32-0': 'Experimental>Noise>Noise',
 '38-125-0': 'Experimental>Unclassifiable>Unclassifiable',
 '38-456-0': 'Experimental>Minimalism>Minimalism',
 '12-85-0': 'Rock>Garage>Garage',
 '38-41-0': 'Experimental>Electroacoustic>Electroacoustic',
 '12-359-0': 'Rock>Shoegaze>Shoegaze',
 '38-514-0': 'Experimental>Sound Art>Sound Art',
 '12-25-0': 'Rock>Punk>Punk',
 '12-25-71': 'Rock>Punk