In [43]:
# Show the kernel's working directory; the relative '../data' and '../split' paths used below are resolved against it.
%pwd

'/mnt/c/Users/kubar/Documents/Sem7/labelthissongforme/notebooks'

In [44]:
import numpy as np
import pandas as pd
import random
import torch
import os

from sklearn.model_selection import train_test_split

In [45]:
def set_seed(seed: int = 42) -> None:
    """Seed the random number generators used by this notebook.

    Seeds NumPy's global generator, Python's ``random`` module, and PyTorch
    (via ``torch.manual_seed`` and ``torch.cuda.manual_seed``), exports
    ``PYTHONHASHSEED`` and prints a confirmation message.

    Args:
        seed: Integer seed handed to every generator. Defaults to 42.

    Returns:
        None.
    """
    # Same seeding calls, applied in the same order, just driven by a loop.
    seeders = (
        np.random.seed,
        random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Python reads PYTHONHASHSEED at interpreter start-up, so this export only
    # influences processes launched after this point, not the hashing of the
    # kernel that is already running.
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")


set_seed(123456)

Random seed set as 123456


In [46]:
# Directory (relative to the working directory) that receives every .npy file written by this notebook.
split_path = "../split/mtat/"
# Create the directory up front; exist_ok=True makes the call a no-op when it already exists.
os.makedirs(split_path, exist_ok=True)
# Number of most frequent tags to keep as labels.
n_tags = 50

In [47]:
# mp3_path values of clips to exclude: rows with these paths are filtered out
# of the annotation table right after it is loaded.
# NOTE(review): presumably these files cannot be decoded - the reason is not
# recorded in this notebook; confirm.
broken_tracks = [
    "6/norine_braun-now_and_zen-08-gently-117-146.mp3",
    "8/jacob_heringman-josquin_des_prez_lute_settings-19-gintzler__pater_noster-204-233.mp3",
    "9/american_baroque-dances_and_suites_of_rameau_and_couperin-25-le_petit_rien_xiveme_ordre_couperin-88-117.mp3",
    "c/domased-new_memories-07-first_sun_rays-233-262.mp3",
    "d/tim_rayborn-chordae-04-sempr_alegria-784-813.mp3"
]

In [48]:
# Read the tab-separated annotation table indexed by clip_id and, in the same
# expression, discard every row whose mp3_path is listed in `broken_tracks`.
adnotations = (
    pd.read_csv('../data/mtat/annotations_final.csv', sep='\t', index_col="clip_id")
    .loc[lambda frame: ~frame["mp3_path"].isin(broken_tracks)]
)
adnotations

Unnamed: 0_level_0,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,clasical,...,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path
clip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-56-la_bres...
58906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...
58907,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...
58908,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...


In [49]:
# Per-clip metadata (title, artist, album, segment bounds, ...), tab-separated.
# NOTE(review): `clip_info` is only displayed here and is not referenced by any
# later cell visible in this notebook - confirm whether it is still needed.
clip_info = pd.read_csv(
    '../data/mtat/clip_info_final.csv',
    sep='\t',
)
clip_info

Unnamed: 0,clip_id,track_number,title,artist,album,url,segmentStart,segmentEnd,original_url,mp3_path
0,2,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,30,59,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,146,175,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,262,291,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,291,320,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,320,349,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...,...,...,...,...,...,...,...,...
31377,58899,56,La Bressanina,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,88,117,http://he3.magnatune.com/all/56-La%20Bressanin...,8/jacob_heringman-blame_not_my_lute-56-la_bres...
31378,58906,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,0,29,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...
31379,58907,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,30,59,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...
31380,58908,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,59,88,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...


# Top `n_tags` most frequent tags in the dataset

In [50]:
# Count how many clips carry each tag and keep the `n_tags` most frequent ones.
# The only non-tag column, `mp3_path`, is dropped by name. The earlier
# filter(regex='[^mp3_path]') was a negated character class, not a name
# exclusion: it would also have discarded any tag spelled solely with the
# characters m, p, 3, _, a, t, h.
tags = adnotations.drop(columns="mp3_path").sum().nlargest(n_tags)
tags

guitar          4851
classical       4269
slow            3547
techno          2953
strings         2726
drums           2598
electronic      2519
rock            2371
fast            2304
piano           2056
ambient         1956
beat            1906
violin          1826
vocal           1729
synth           1716
female          1474
indian          1395
opera           1296
male            1279
singing         1211
vocals          1184
no vocals       1158
harpsichord     1092
loud            1086
quiet           1055
flute           1024
woman           1016
male vocal      1002
no vocal         995
pop              995
soft             985
sitar            925
solo             826
man              741
classic          690
choir            688
voice            665
new age          650
dance            649
male voice       644
female vocal     644
beats            634
harp             621
cello            575
no voice         573
weird            557
country          541
female voice 

In [51]:
# Keep only the tag names (the index of the counts Series) as a NumPy string
# array and persist them; the same array selects the label columns later on.
tags_file = os.path.join(split_path, 'tags.npy')
tags = tags.index.to_numpy(dtype='str')
np.save(tags_file, tags)
tags

array(['guitar', 'classical', 'slow', 'techno', 'strings', 'drums',
       'electronic', 'rock', 'fast', 'piano', 'ambient', 'beat', 'violin',
       'vocal', 'synth', 'female', 'indian', 'opera', 'male', 'singing',
       'vocals', 'no vocals', 'harpsichord', 'loud', 'quiet', 'flute',
       'woman', 'male vocal', 'no vocal', 'pop', 'soft', 'sitar', 'solo',
       'man', 'classic', 'choir', 'voice', 'new age', 'dance',
       'male voice', 'female vocal', 'beats', 'harp', 'cello', 'no voice',
       'weird', 'country', 'female voice', 'metal', 'choral'],
      dtype='<U12')

# Binary tags

In [52]:
# One 0/1 column per selected tag, in the order given by `tags`, for every clip.
binary = adnotations.loc[:, tags]
binary

Unnamed: 0_level_0,guitar,classical,slow,techno,strings,drums,electronic,rock,fast,piano,...,female vocal,beats,harp,cello,no voice,weird,country,female voice,metal,choral
clip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58899,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58906,1,0,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
58907,1,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58908,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
# A clip is kept only if at least one of the selected tags is set for it,
# i.e. its row sum over the tag columns is non-zero.
has_any_tag = binary.sum(axis=1).ne(0)
indexes_to_keep = binary.loc[has_any_tag].index.values
indexes_to_keep

array([    2,     6,    10, ..., 58907, 58908, 58915])

In [54]:
# Restrict the label matrix to the clips that have at least one selected tag.
# The explicit copy makes `binary` an independent frame, so the column insert
# below never acts on a view of the previous selection.
binary = binary.loc[binary.index.isin(indexes_to_keep)].copy()
# Put the clip id in the first column so it survives the conversion to a plain
# NumPy array. The guard keeps this cell re-runnable: an unconditional insert
# raises ValueError once the column already exists.
if "clip_id" not in binary.columns:
    binary.insert(0, "clip_id", binary.index)
binary

Unnamed: 0_level_0,clip_id,guitar,classical,slow,techno,strings,drums,electronic,rock,fast,...,female vocal,beats,harp,cello,no voice,weird,country,female voice,metal,choral
clip_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,2,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10,10,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12,12,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58899,58899,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58906,58906,1,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
58907,58907,1,1,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
58908,58908,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
# Persist the label matrix: column 0 holds the clip id, the remaining columns
# are the 0/1 tag indicators in the order of `tags`.
binary_file = os.path.join(split_path, 'binary.npy')
binary_np = binary.to_numpy()
np.save(binary_file, binary_np)
binary_np

array([[    2,     0,     1, ...,     0,     0,     0],
       [    6,     0,     1, ...,     0,     0,     0],
       [   10,     0,     1, ...,     0,     0,     0],
       ...,
       [58907,     1,     1, ...,     0,     0,     0],
       [58908,     1,     1, ...,     0,     0,     0],
       [58915,     1,     1, ...,     0,     0,     0]])

In [56]:
# From here on only the file path of each retained clip is needed: select the
# kept rows and the `mp3_path` column in one step (the result is a Series).
adnotations = adnotations.loc[adnotations.index.isin(indexes_to_keep), 'mp3_path']
adnotations

clip_id
2        f/american_bach_soloists-j_s__bach_solo_cantat...
6        f/american_bach_soloists-j_s__bach_solo_cantat...
10       f/american_bach_soloists-j_s__bach_solo_cantat...
11       f/american_bach_soloists-j_s__bach_solo_cantat...
12       f/american_bach_soloists-j_s__bach_solo_cantat...
                               ...                        
58899    8/jacob_heringman-blame_not_my_lute-56-la_bres...
58906    8/jacob_heringman-blame_not_my_lute-57-lost_is...
58907    8/jacob_heringman-blame_not_my_lute-57-lost_is...
58908    8/jacob_heringman-blame_not_my_lute-57-lost_is...
58915    8/jacob_heringman-blame_not_my_lute-58-a_toy_f...
Name: mp3_path, Length: 21106, dtype: object

# Splitting the dataset into train, validation and test sets

In [57]:
# Two-column frame (idx, mp3_path): turn the path Series into a frame, then
# place a copy of the clip_id index in front as the `idx` column.
dataset = adnotations.to_frame()
dataset.insert(0, "idx", dataset.index)
dataset

Unnamed: 0_level_0,idx,mp3_path
clip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,2,f/american_bach_soloists-j_s__bach_solo_cantat...
6,6,f/american_bach_soloists-j_s__bach_solo_cantat...
10,10,f/american_bach_soloists-j_s__bach_solo_cantat...
11,11,f/american_bach_soloists-j_s__bach_solo_cantat...
12,12,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...
58899,58899,8/jacob_heringman-blame_not_my_lute-56-la_bres...
58906,58906,8/jacob_heringman-blame_not_my_lute-57-lost_is...
58907,58907,8/jacob_heringman-blame_not_my_lute-57-lost_is...
58908,58908,8/jacob_heringman-blame_not_my_lute-57-lost_is...


In [58]:
# Preview the (idx, mp3_path) rows as an object array - the layout in which the split files are saved below.
dataset.to_numpy()

array([[2,
        'f/american_bach_soloists-j_s__bach_solo_cantatas-01-bwv54__i_aria-30-59.mp3'],
       [6,
        'f/american_bach_soloists-j_s__bach_solo_cantatas-01-bwv54__i_aria-146-175.mp3'],
       [10,
        'f/american_bach_soloists-j_s__bach_solo_cantatas-01-bwv54__i_aria-262-291.mp3'],
       ...,
       [58907,
        '8/jacob_heringman-blame_not_my_lute-57-lost_is_my_lyberty-30-59.mp3'],
       [58908,
        '8/jacob_heringman-blame_not_my_lute-57-lost_is_my_lyberty-59-88.mp3'],
       [58915,
        '8/jacob_heringman-blame_not_my_lute-58-a_toy_for_monica_bishop-0-29.mp3']],
      dtype=object)

In [59]:
# Target fractions of the filtered dataset assigned to each split.
train_ratio = 0.75
valid_ratio = 0.15
test_ratio = 0.1

# The split cell derives its `test_size` arguments from these values on the
# assumption that they sum to 1. Fail loudly here instead of silently
# producing splits whose sizes differ from the fractions declared above.
assert abs(train_ratio + valid_ratio + test_ratio - 1.0) < 1e-9, (
    "train/valid/test ratios must sum to 1"
)

In [60]:
# Stage 1: hold out everything that is not training data.
# No random_state is passed, so the shuffle draws from NumPy's global RNG and
# the result depends on the seed set earlier and on what ran in between.
# NOTE(review): re-running this cell without re-seeding yields a different split.
# NOTE(review): rows are individual clips and several clips share one track, so
# a row-level shuffle can place clips of the same track in different splits -
# confirm this is acceptable.
holdout_fraction = 1 - train_ratio
train, test = train_test_split(dataset, test_size=holdout_fraction)

# Stage 2: divide the hold-out set into validation and test so that the test
# part matches `test_ratio` of the whole dataset.
test_share_of_holdout = test_ratio/(test_ratio + valid_ratio)
valid, test = train_test_split(test, test_size=test_share_of_holdout)

In [61]:
# Save the training split as rows of (clip id, mp3 path).
# The array has dtype=object, so np.save stores it via pickle and loading it
# back requires np.load(..., allow_pickle=True).
train_file = os.path.join(split_path, 'train.npy')
dataset_train = train.to_numpy()
np.save(train_file, dataset_train)
dataset_train

array([[47962, 'f/strojovna_07-dirnix-12-ezachrum-0-29.mp3'],
       [45240,
        'f/heavy_mellow-acoustic_abstracts-11-bright_side-0-29.mp3'],
       [54127,
        '1/vito_paternoster-cd1bach_cello_suites-16-suite_vi_in_re_magiore__sarabande-146-175.mp3'],
       ...,
       [8501,
        'c/jay_kishor-the_payans_concert-02-nat_malhar-1074-1103.mp3'],
       [58382,
        'c/edward_martin-art_of_the_lute_in_renaissance_france-28-premier_branle_de_bourgogne_le_roy-30-59.mp3'],
       [16974,
        '4/american_bach_soloists-j_s__bach__favorite_cantatas-04-chorale__zion_hort_die_wachter_singen-175-204.mp3']],
      dtype=object)

In [62]:
# Save the validation split as rows of (clip id, mp3 path).
# The array has dtype=object, so np.save stores it via pickle and loading it
# back requires np.load(..., allow_pickle=True).
valid_file = os.path.join(split_path, 'valid.npy')
dataset_valid = valid.to_numpy()
np.save(valid_file, dataset_valid)
dataset_valid

array([[5952,
        '9/suzanne_teng-enchanted_wind-02-above_the_clouds-146-175.mp3'],
       [8392,
        '9/various_artists-the_2007_magnatune_records_sampler-02-moonrise_yongen-0-29.mp3'],
       [5312,
        '7/dac_crowell-the_sea_and_the_sky-01-tidal_motion-958-987.mp3'],
       ...,
       [58146,
        'd/katherine_roberts_perl-j_s__bach_french_suites-26-suite_no_5_in_g_major_bwv_816_gigue-146-175.mp3'],
       [18635, 'a/tilopa-by_the_way-04-moving_on-30-59.mp3'],
       [21251,
        '4/john_williams-long_ride_home-05-anything_at_all-175-204.mp3']],
      dtype=object)

In [63]:
# Save the test split as rows of (clip id, mp3 path).
# The array has dtype=object, so np.save stores it via pickle and loading it
# back requires np.load(..., allow_pickle=True).
test_file = os.path.join(split_path, 'test.npy')
dataset_test = test.to_numpy()
np.save(test_file, dataset_test)
dataset_test

array([[33148,
        '8/magnatune-red_hat_summit_compilation-07-shane_jackman__what_i_did_on_my_vacation-204-233.mp3'],
       [53033,
        '3/jan_hanford-24_preludes_for_solo_piano-15-prelude_no__15_in_a_minor-59-88.mp3'],
       [44098, 'b/solar_cycle-sunlight-10-shaman_night-0-29.mp3'],
       ...,
       [37862,
        'e/c_layne-the_sun_will_come_out_to_blind_you-08-the_king-175-204.mp3'],
       [10024,
        'c/vito_paternoster-cd1bach_sonatas_and_partitas_for_solo_violin-02-sonata_prima_in_do_minore__fuga_allegro-146-175.mp3'],
       [34890, 'e/shira_kammen-music_of_waters-08-boathag-117-146.mp3']],
      dtype=object)