In [1]:
import numpy as np
import pandas as pd
import random
import torch
import os

from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int = 42) -> None:
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Set a fixed value for the hash seed
    os.environ["PYTHONHASHSEED"] = str(seed)
    print(f"Random seed set as {seed}")

set_seed(123456)

Random seed set as 123456


In [3]:
split_path = "../split/mtat/"
os.makedirs(split_path, exist_ok=True)

In [4]:
broken_tracks = [
    "6/norine_braun-now_and_zen-08-gently-117-146.mp3",
    "8/jacob_heringman-josquin_des_prez_lute_settings-19-gintzler__pater_noster-204-233.mp3",
    "9/american_baroque-dances_and_suites_of_rameau_and_couperin-25-le_petit_rien_xiveme_ordre_couperin-88-117.mp3",
    "c/domased-new_memories-07-first_sun_rays-233-262.mp3",
    "d/tim_rayborn-chordae-04-sempr_alegria-784-813.mp3"
]

In [5]:
adnotations = pd.read_csv('../data/mtat/annotations_final.csv', sep='\t')
adnotations = adnotations[~adnotations["mp3_path"].isin(broken_tracks)]
adnotations

Unnamed: 0,clip_id,no voice,singer,duet,plucking,hard rock,world,bongos,harpsichord,female singing,...,rap,metal,hip hop,quick,water,baroque,women,fiddle,english,mp3_path
0,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25858,58899,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-56-la_bres...
25859,58906,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...
25860,58907,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...
25861,58908,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8/jacob_heringman-blame_not_my_lute-57-lost_is...


In [6]:
clip_info = pd.read_csv('../data/mtat/clip_info_final.csv', sep='\t')
clip_info

Unnamed: 0,clip_id,track_number,title,artist,album,url,segmentStart,segmentEnd,original_url,mp3_path
0,2,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,30,59,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
1,6,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,146,175,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
2,10,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,262,291,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
3,11,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,291,320,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
4,12,1,BWV54 - I Aria,American Bach Soloists,J.S. Bach Solo Cantatas,http://www.magnatune.com/artists/albums/abs-so...,320,349,http://he3.magnatune.com/all/01--BWV54%20-%20I...,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...,...,...,...,...,...,...,...,...
31377,58899,56,La Bressanina,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,88,117,http://he3.magnatune.com/all/56-La%20Bressanin...,8/jacob_heringman-blame_not_my_lute-56-la_bres...
31378,58906,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,0,29,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...
31379,58907,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,30,59,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...
31380,58908,57,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,http://www.magnatune.com/artists/albums/hering...,59,88,http://he3.magnatune.com/all/57-Lost%20is%20my...,8/jacob_heringman-blame_not_my_lute-57-lost_is...


# n tags with the largest quantity in dataset

In [7]:
n = 50

tags = adnotations.filter(regex='[^(clip_id|mp3_path)]').sum().nlargest(20)
tags

guitar        4851
classical     4269
slow          3547
techno        2953
strings       2726
drums         2598
electronic    2519
rock          2371
fast          2304
piano         2056
ambient       1956
beat          1906
violin        1826
vocal         1729
synth         1716
female        1474
indian        1395
opera         1296
male          1279
singing       1211
dtype: int64

In [8]:
tags = tags.keys().to_numpy('str')
np.save(open(os.path.join(split_path, 'tags.npy'), 'wb'), tags)
tags

array(['guitar', 'classical', 'slow', 'techno', 'strings', 'drums',
       'electronic', 'rock', 'fast', 'piano', 'ambient', 'beat', 'violin',
       'vocal', 'synth', 'female', 'indian', 'opera', 'male', 'singing'],
      dtype='<U10')

# Binary tags

In [9]:
binary = adnotations[tags]
binary

Unnamed: 0,guitar,classical,slow,techno,strings,drums,electronic,rock,fast,piano,ambient,beat,violin,vocal,synth,female,indian,opera,male,singing
0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25858,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25859,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25860,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25861,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
binary[binary.sum(axis=1) == 0].index

Int64Index([    6,     9,    13,    20,    21,    25,    26,    28,    52,
               53,
            ...
            25794, 25806, 25808, 25811, 25812, 25823, 25832, 25833, 25836,
            25853],
           dtype='int64', length=6140)

In [11]:
indexes_to_keep = binary[binary.sum(axis=1) != 0].index.values
binary = binary[binary.index.isin(indexes_to_keep)]
adnotations = adnotations[adnotations.index.isin(indexes_to_keep)]
binary.reset_index()
adnotations.reset_index()
binary

Unnamed: 0,guitar,classical,slow,techno,strings,drums,electronic,rock,fast,piano,ambient,beat,violin,vocal,synth,female,indian,opera,male,singing
0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25858,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25859,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25860,1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
25861,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [12]:
binary[binary.sum(axis=1) != 0].index

Int64Index([    0,     1,     2,     3,     4,     5,     7,     8,    10,
               11,
            ...
            25852, 25854, 25855, 25856, 25857, 25858, 25859, 25860, 25861,
            25862],
           dtype='int64', length=19718)

In [13]:
binary_np = binary.to_numpy()
np.save(open(os.path.join(split_path, 'binary.npy'), 'wb'), binary_np)
binary_np

array([[0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

# Spliting dataset to train, valid and test datasets

In [14]:
dataset = pd.DataFrame(adnotations.mp3_path, columns=['idx', 'mp3_path'])
dataset["idx"] = dataset.index
dataset

Unnamed: 0,idx,mp3_path
0,0,f/american_bach_soloists-j_s__bach_solo_cantat...
1,1,f/american_bach_soloists-j_s__bach_solo_cantat...
2,2,f/american_bach_soloists-j_s__bach_solo_cantat...
3,3,f/american_bach_soloists-j_s__bach_solo_cantat...
4,4,f/american_bach_soloists-j_s__bach_solo_cantat...
...,...,...
25858,25858,8/jacob_heringman-blame_not_my_lute-56-la_bres...
25859,25859,8/jacob_heringman-blame_not_my_lute-57-lost_is...
25860,25860,8/jacob_heringman-blame_not_my_lute-57-lost_is...
25861,25861,8/jacob_heringman-blame_not_my_lute-57-lost_is...


In [15]:
dataset.to_numpy()

array([[0,
        'f/american_bach_soloists-j_s__bach_solo_cantatas-01-bwv54__i_aria-30-59.mp3'],
       [1,
        'f/american_bach_soloists-j_s__bach_solo_cantatas-01-bwv54__i_aria-146-175.mp3'],
       [2,
        'f/american_bach_soloists-j_s__bach_solo_cantatas-01-bwv54__i_aria-262-291.mp3'],
       ...,
       [25860,
        '8/jacob_heringman-blame_not_my_lute-57-lost_is_my_lyberty-30-59.mp3'],
       [25861,
        '8/jacob_heringman-blame_not_my_lute-57-lost_is_my_lyberty-59-88.mp3'],
       [25862,
        '8/jacob_heringman-blame_not_my_lute-58-a_toy_for_monica_bishop-0-29.mp3']],
      dtype=object)

In [16]:
train_ratio = 0.75
valid_ratio = 0.15
test_ratio = 0.1

In [17]:
train, test = train_test_split(dataset, test_size=1 - train_ratio)
valid, test = train_test_split(test, test_size=test_ratio/(test_ratio + valid_ratio))

In [18]:
dataset_train = train.to_numpy()
np.save(open(os.path.join(split_path, 'train.npy'), 'wb'), dataset_train)
dataset_train

array([[3035,
        '6/dr_kuch-analog_disease-02-completly_mysticism-117-146.mp3'],
       [3340, 'd/tim_rayborn-chordae-02-fors_souvenir-59-88.mp3'],
       [5354,
        '4/magnatune-romantic_dinner_classical_compilation-03-asteria__vive_ma_dame-262-291.mp3'],
       ...,
       [4780,
        '5/human_response-survival-02-the_rainy_season-146-175.mp3'],
       [4129,
        '2/maryse_carlin-rameau__pieces_de_clavecin_en_concerts__forqueray_suites_4_and_5-02-premier_concert__la_livri_rameau-30-59.mp3'],
       [8282,
        'a/asteria-soyes_loyal-04-jatendray_tant_guillaume_dufay-30-59.mp3']],
      dtype=object)

In [19]:
dataset_valid = valid.to_numpy()
np.save(open(os.path.join(split_path, 'valid.npy'), 'wb'), dataset_valid)
dataset_valid

array([[8046, '1/jeff_wahl-guinevere-04-freedom-146-175.mp3'],
       [15950,
        '8/kourosh_zolani-peaceful_planet-08-butterflies-117-146.mp3'],
       [12132,
        'b/seismic_anamoly-dead_mans_hand-06-deep_blue_eee-59-88.mp3'],
       ...,
       [1425,
        '7/jeni_melia-the_last_of_old_england-01-o_waly_waly_trad-146-175.mp3'],
       [7780,
        'd/ambient_teknology-the_all_seeing_eye_project-04-confusion_says-0-29.mp3'],
       [15718,
        '1/jacob_heringman-holburns_passion-08-a_french_toy_cittern-0-29.mp3']],
      dtype=object)

In [20]:
dataset_test = test.to_numpy()
np.save(open(os.path.join(split_path, 'test.npy'), 'wb'), dataset_test)
dataset_test

array([[6328, '2/indidginus-seismic-03-master_of_masters-349-378.mp3'],
       [14314,
        '7/american_bach_soloists-j_s__bach__mass_in_b_minor_cd2-07-et_in_spiritum_sanctum_dominum-262-291.mp3'],
       [2669,
        '0/jeffrey_luck_lucas-what_we_whisper-01-you_knew_it_well-175-204.mp3'],
       ...,
       [21626, '0/apa_ya-apa_ya-12-african_wedding_song-146-175.mp3'],
       [16219,
        'f/kenji_williams-faces_of_epiphany-08-free_energy-88-117.mp3'],
       [14077, '3/jag-four_strings-07-country_romp-30-59.mp3']],
      dtype=object)