## DadaGP generator of separate token files

The goal is a program that outputs only the tokens of the desired instruments.

In [57]:
from aux_fun import get_tokens_inst_iter_folders
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import guitarpro as pygp
import glob

# Root folder of the DadaGP dataset (read) and root folder of the generated
# token files (write). Raw strings keep the Windows backslashes literal instead
# of relying on unrecognised escape sequences such as "\d", which recent Python
# versions warn about. The resulting string values are unchanged.
path_to_general_read_folder = r"..\data\DadaGP\DadaGP-v1.1"
path_to_general_write_folder = r"..\data\BGTG\BGTG-v0.1"

### Arbitrary file generator

This first version outputs the tokens of the selected instruments.

In [58]:
# Instruments whose tokens are written out: pass True for each one to keep.
instrument_to_keep = dict(
    distorted0=False,
    distorted1=False,
    distorted2=False,
    clean0=False,
    clean1=False,
    bass=True,
    leads=False,
    pads=False,
    drums=False,
)

# The generation itself is left commented out; uncomment to run it.
# get_tokens_inst_iter_folders(instrument_to_keep, path_to_general_read_folder, path_to_general_write_folder)

# Timings observed for a full generation run:
#   - 8min10s : bass + clean0 token files
#   - 5min30s : bass tokens, without checking whether the file contains a bass
#   - 4min49s : bass tokens, checking bass presence once per file
#   - 6min37s : bass tokens, checking bass presence per track
#               (11 230 tracks without bass)
#   - 8min12s : guitar token files (clean and distorted)

### Automatic output of the rhythmic guitar's tokens

This version links the CSV computed by Alexandre (which records, for each measure, which guitar is rhythmic) to the DadaGP files.
The goal is to find, for each track, which guitar is rhythmic overall, and to output the tokens of that instrument specifically.

#### Adding column DadaGP_name to the CSV

In [59]:
# Load the CSV provided by Alexandre (rhythm-guitar predictions) and display its
# dimensions. The path is relative to the notebook folder and uses Windows
# separators.
# NOTE(review): judging by the `Mesure` column visible in later outputs, each row
# presumably describes one measure of one part -- confirm with the CSV's author.

df_rg = pd.read_csv('..\\data\\function_guitar-ext-with-predictions.csv')
df_rg.shape

(4165789, 40)

In [60]:
# Inspect one track picked at random among the distinct Track_Name values.
# The generator is seeded so that "Restart & Run All" always selects the same
# track; change the seed to look at another one.

track_sample = np.random.default_rng(0).choice(df_rg['Track_Name'].unique())

# All CSV rows belonging to the selected track
df_sample = df_rg[df_rg['Track_Name'] == track_sample]

df_sample.shape

(73, 40)

In [61]:
# Hand-picked tracks to inspect. Only the LAST assignment is effective; the first
# one is kept as a reminder of another useful example.
track_sample = 'Arrows Fly\nEdguy' # Good example (overwritten by the next line)
track_sample = 'Falling Down\nEdguy'

# All CSV rows belonging to the selected track
df_sample = df_rg[df_rg['Track_Name'] == track_sample]

# Show the distinct instrument classes and the distinct part names of the track.
# Note: the second label reads "Number of parts" but what is displayed is the
# array of part names, not a count.
display("Instruments in the track:", df_sample['Instrument'].unique(),
        "Number of parts:", df_sample['Partie'].unique())

# We need to link each part (Partie) to its instrument class (Instrument)

df_inst_part = df_sample[['Partie', 'Instrument']].drop_duplicates()

display(df_inst_part)

# Initial hope: the order of the parts in DadaGP is the same as in the CSV
# --> The order is not the same, so a different solution is used

# To retrieve the real track name from the file name, split on the dash (-).
# Also clean the file name by removing the extension and the beginning of the path.

# The idea:
# Iterate on the gp5 files, iterate on the track names, find the CSV rows that correspond to the instrument,
# look at the value in the Instrument column, and add a column DadaGP_name that is the instrument name with a number appended.
# The number is reset at each new track


'Instruments in the track:'

array(['bass', 'clean', 'distorted'], dtype=object)

'Number of parts:'

array(['Bass', 'Clean Guitar', 'Lead Guitar 1', 'Lead Guitar 2',
       'Rhytm Guitar'], dtype=object)

Unnamed: 0,Partie,Instrument
1192065,Bass,bass
1192188,Clean Guitar,clean
1192204,Lead Guitar 1,distorted
1192264,Lead Guitar 2,distorted
1192301,Rhytm Guitar,distorted


In [62]:
# Distinct (file, part, instrument) combinations of the sample track.
# Note: df_inst_part is reassigned from the full dataframe in a later cell.
df_inst_part = df_sample[['Fichier', 'Partie', 'Instrument']].drop_duplicates()

def get_track(str):
    """Extract the track title from a per-part file path of the CSV.

    The last '/'-separated component is split on '-' and the second-to-last
    field is returned with surrounding whitespace removed, e.g.
    '../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif' -> 'Crash'.

    This is a heuristic: names that themselves contain dashes are split at the
    wrong place (the stored outputs of this notebook show 'AC-DC - ...' files
    ending up with Track 'DC').

    Args:
        str: file path taken from the 'Fichier' column. The parameter shadows
            the builtin of the same name; it is kept for backward compatibility.

    Returns:
        The stripped track field, or the sentinel string 'error' when the file
        name has fewer than two dash-separated fields.
    """
    file_name = str.split('/')[-1]
    try:
        return file_name.split('-')[-2].strip()
    except IndexError:
        # Only the "not enough dash-separated fields" case is expected here;
        # anything else (e.g. KeyboardInterrupt) must not be swallowed.
        return 'error'

def get_author(str):
    """Extract the author (band) name from a per-part file path of the CSV.

    The last '/'-separated component is split on '-' and the third-to-last
    field is returned with surrounding whitespace removed, e.g.
    '../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif' -> '12 Stones'.

    This is a heuristic: names that themselves contain dashes are split at the
    wrong place, and a file name starting with ' - ' yields an empty author.

    Args:
        str: file path taken from the 'Fichier' column. The parameter shadows
            the builtin of the same name; it is kept for backward compatibility.

    Returns:
        The stripped author field, or the sentinel string 'error' when the file
        name has fewer than three dash-separated fields.
    """
    file_name = str.split('/')[-1]
    try:
        # strip() removes the whitespace surrounding the dash separators
        return file_name.split('-')[-3].strip()
    except IndexError:
        # Only the "not enough dash-separated fields" case is expected here;
        # anything else (e.g. KeyboardInterrupt) must not be swallowed.
        return 'error'

def get_author_track(row):
    """Build the '<Author> - <Track>' key of a row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) exposing the string
            fields 'Author' and 'Track'.

    Returns:
        The two fields joined by ' - '.
    """
    return ' - '.join((row['Author'], row['Track']))

# Disabled experiment, kept for reference: rebuild the DadaGP path from the author and track.
# def get_dada_gp_path(row, general_path):
#     try:
#         path = general_path + '\\' + row['Author'][0] + '\\' + row['Author'] + '\\' + row['Author'] + ' - ' + row['Track']
#         # Don't add the extension because it can be gp3 or gp4
#     except:
#         path = 'error'
#     return path

# Derive Track, Author and Author_Track from the file path, dropping the rows
# whose file name cannot be split (sentinel 'error'). Every step returns a new
# frame: df_rg itself is left untouched, and no column is ever assigned on a
# filtered slice (which is what triggers pandas' SettingWithCopyWarning).
# Author_Track is needed because several artists have tracks with the same name.
df_rg_cleaned = (
    df_rg
    .assign(Track=lambda frame: frame['Fichier'].apply(get_track))
    .loc[lambda frame: frame['Track'] != 'error']
    .assign(Author=lambda frame: frame['Fichier'].apply(get_author))
    .loc[lambda frame: frame['Author'] != 'error']
    .assign(Author_Track=lambda frame: frame.apply(get_author_track, axis=1))
)

# Disabled experiment, kept for reference:
# df_rg['Dada_gp_path'] = df_rg.apply(lambda row: get_dada_gp_path(row, path_to_general_read_folder), axis=1)
# df_rg = df_rg[df_rg['Dada_gp_path'] != 'error']

# Shapes before / after cleaning and number of rows removed
print(df_rg.shape, df_rg_cleaned.shape, 'removed:', df_rg.shape[0] - df_rg_cleaned.shape[0])
# df_inst_part['Track'] = df_inst_part['Fichier'].apply(get_track)
# df_inst_part['Author'] = df_inst_part['Fichier'].apply(get_author)
# df_inst_part['Dada_gp_path'] = df_inst_part.apply(lambda row: get_dada_gp_path(row, path_to_general_read_folder), axis=1)


(4165789, 40) (3912938, 43) removed: 252851


In [74]:
# Collapse the measure-level rows: keep one row per (Track_Name, Partie, Instrument).
# Because 'Fichier' is not part of the subset, only the first file path met for
# each combination is retained.
df_inst_part = df_rg_cleaned[['Fichier', 'Track_Name', 'Partie', 'Instrument']].drop_duplicates(subset=['Track_Name', 'Partie', 'Instrument'])
# Disabled: add a column Num_Parts holding the number of distinct parts of each track
# df_inst_part['Num_Parts'] = df_inst_part.groupby('Track_Name')['Partie'].transform('nunique')
df_inst_part.head()

Unnamed: 0,Fichier,Track_Name,Partie,Instrument
131,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean
341,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 2....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 2,clean
865,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass
977,../data/DadaGP8-gpif/12 Stones - Crash-Clean G...,Crash\n12 Stones,Clean Guitar,clean
1096,../data/DadaGP8-gpif/12 Stones - Crash-Distort...,Crash\n12 Stones,Distortion Guitar,distorted


In [76]:
# Quick manual check of get_filename on a file name that starts with a dash.
# NOTE(review): get_filename is defined in a cell located BELOW this one, so this
# cell fails on a fresh kernel run top-to-bottom; it only works after the
# defining cell has been executed.
test = 'path/- Tudo Que Vai-Trilha 1.gp4'

get_filename(test)

'- Tudo Que Vai'

In [77]:
# Sample paths illustrating the difficult cases for file-name parsing:
# dots inside the title, a missing author, and a part name appended after a dash.
# They are not used by the rest of this cell.
track = 'path/AC-DC - C.O.D..gp4'
track2 = 'path/ -Isolated.gp4'
track3 = 'path/Muse - Isolated System - Electric Guitar.gp4'

# An implementation based on num_parts was attempted first, but counting the
# number of parts of a track turned out to be too complex.
def get_filename(fichier):
    """Reduce a per-part file path to the '<author> - <track>' file stem.

    Steps:
      1. keep what follows the last '/';
      2. keep what precedes the FIRST '.' (so a title containing dots is cut
         at its first dot, not only stripped of its extension);
      3. if the result holds two dashes or more, drop the last dash and
         everything after it (assumed to be the part name).

    Args:
        fichier: file path, with '/' as separator.

    Returns:
        The reduced file stem (not stripped of surrounding whitespace).
    """
    base_name = fichier.rsplit('/', 1)[-1]
    stem = base_name.partition('.')[0]

    # Fewer than two dashes: there is no part suffix to remove
    if stem.count('-') < 2:
        return stem

    return stem.rpartition('-')[0]

# Add the column File_Name: the file stem computed by get_filename from each
# row's 'Fichier' path.

df_inst_part['File_Name'] = df_inst_part['Fichier'].apply(get_filename)

df_inst_part.head()

Unnamed: 0,Fichier,Track_Name,Partie,Instrument,File_Name
131,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,- Tudo Que Vai
341,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 2....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 2,clean,- Tudo Que Vai
865,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,12 Stones - Crash
977,../data/DadaGP8-gpif/12 Stones - Crash-Clean G...,Crash\n12 Stones,Clean Guitar,clean,12 Stones - Crash
1096,../data/DadaGP8-gpif/12 Stones - Crash-Distort...,Crash\n12 Stones,Distortion Guitar,distorted,12 Stones - Crash


In [78]:
# print(df_inst_part['Num_Parts'].unique())
# df_inst_part[df_inst_part['Num_Parts'] > 25].head()

In [83]:
# Attach File_Name to every measure row of the main dataframe.
#
# File_Name is computed directly from each row's own 'Fichier' value. Going
# through df_inst_part (which keeps a single 'Fichier' per
# (Track_Name, Partie, Instrument)) would leave File_Name empty for every file
# path that the de-duplication dropped.
# get_filename is evaluated once per distinct path, then mapped onto the rows.
file_name_by_fichier = {
    fichier: get_filename(fichier)
    for fichier in df_rg_cleaned['Fichier'].unique()
}

df_rg_cleaned_aug = (
    df_rg_cleaned
    .assign(File_Name=df_rg_cleaned['Fichier'].map(file_name_by_fichier))
    .reset_index(drop=True)  # same fresh 0..n-1 index a merge would produce
)

# Same number of rows, one extra column
print(df_rg_cleaned_aug.shape, df_rg_cleaned.shape)

df_rg_cleaned_aug.head()

(3912938, 44) (3912938, 43)


Unnamed: 0.1,Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,Has_SingleNotes,Has_Chords,Has_2_note_chords,Has_3_note_chords,...,Presence_VibratoMG,Nb_Whammy_Bar,Presence_bends,has_triad,has_fifth,rg-estimation,Track,Author,Author_Track,File_Name
0,131,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,2,1,0,0,0,...,0,0,0,1,0,3.2e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
1,132,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,3,1,0,0,0,...,0,0,0,1,1,4.2e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
2,133,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,4,1,0,0,0,...,0,0,0,1,0,3.8e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
3,134,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,5,1,0,0,0,...,0,0,0,1,1,4.1e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
4,135,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,6,1,0,0,0,...,0,0,0,1,0,2.2e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai


##### First method, iterating on the tracks in the CSV


In [15]:
# NOTE(review): the stored output of this cell shows the columns Track, Author,
# Author_Track and Dada_gp_path on df_rg, which the active code of this notebook
# never adds to df_rg (only commented-out lines do) -- it presumably comes from
# an earlier kernel state; confirm before relying on it.
df_rg.head()

Unnamed: 0.1,Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,Has_SingleNotes,Has_Chords,Has_2_note_chords,Has_3_note_chords,...,Presence_VibratoMG,Nb_Whammy_Bar,Presence_bends,has_triad,has_fifth,rg-estimation,Track,Author,Author_Track,Dada_gp_path
865,865,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,9,1,0,0,0,...,0,0,0,0,0,0.953768,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
866,866,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,10,1,0,0,0,...,0,0,0,0,0,0.964169,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
867,867,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,11,1,0,0,0,...,0,0,0,0,0,0.962112,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
868,868,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,12,1,0,0,0,...,0,0,0,1,0,0.986466,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
869,869,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,13,1,0,0,0,...,0,0,0,0,0,0.983138,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...


In [13]:
# First method: iterate on the tracks listed in the CSV, open the matching
# DadaGP file and number the clean / distorted guitars in the order in which
# their parts appear in the file (clean0, clean1, ..., distorted0, ...).
# NOTE(review): this cell expects df_rg to carry the columns Dada_gp_path,
# Author, Track and Author_Track -- verify they exist in the current kernel.
df_inst_part = df_rg[['Dada_gp_path', 'Author', 'Track', 'Author_Track', 'Partie', 'Instrument']]
df_inst_part = df_inst_part.drop_duplicates(subset=['Dada_gp_path', 'Partie'])
track_frames = []  # one frame per processed track, concatenated once at the end
i=0

# For each track we add the DadaGP_name column
for author_track in tqdm(df_inst_part['Author_Track'].unique()):
    df_track = df_inst_part[df_inst_part['Author_Track'] == author_track].copy() # We copy to avoid SettingWithCopyWarning

    # The source file is either a gp3 or a gp4: try both converted names
    dada_gp_path = df_track['Dada_gp_path'].values[0]
    path_gp3 = dada_gp_path + '.gp3.pygp.gp5'
    path_gp4 = dada_gp_path + '.gp4.pygp.gp5'

    gp_file = None
    for candidate_path in (path_gp3, path_gp4):
        try:
            gp_file = pygp.parse(candidate_path)
            break
        except Exception:
            # Missing or unreadable file: try the next candidate.
            # (KeyboardInterrupt is deliberately NOT caught.)
            continue
    if gp_file is None:
        print('Error with file:', dada_gp_path)
        continue

    # (part name, instrument class) pairs, in the order of the gp file,
    # restricted to the parts known to the CSV
    inst_list = []
    for part in gp_file.tracks:
        part_name = part.name

        if part_name not in df_track['Partie'].values:
            continue
        inst_name = df_track[df_track['Partie'] == part_name]['Instrument'].values[0]
        inst_list.append((part_name, inst_name))

    # Create the DadaGP_name column filled with 'not_filled_yet'
    df_track['DadaGP_name'] = 'not_filled_yet'
    nb_clean = 0
    nb_distorted = 0
    for part, inst in inst_list:
        if inst == 'clean':
            df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'clean' + str(nb_clean)
            nb_clean += 1
        elif inst == 'distorted':
            df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'distorted' + str(nb_distorted)
            nb_distorted += 1
        else:
            # Other classes (e.g. bass) are not numbered
            df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = inst

    track_frames.append(df_track)
    i+=1
    if i == 100: # Test on the first 100 tracks that could be opened
        break

# Single concatenation (growing a DataFrame inside the loop is quadratic)
total_df_track = pd.concat(track_frames) if track_frames else pd.DataFrame()

df_inst_part.shape, total_df_track.shape

# Iterating on the csv is an awful method, we now try to iterate on the gp5 files

  0%|          | 18/20330 [00:04<54:49,  6.17it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - Harmonious Distruption
Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - Overwound Music Box
Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - To Hell And Back
Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - Zinfandel 2006


  0%|          | 35/20330 [00:06<35:01,  9.66it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 1
Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 2
Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 3
Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 4


  0%|          | 46/20330 [00:08<35:48,  9.44it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\A\All Mixed Up (2)\All Mixed Up (2) - P


  0%|          | 51/20330 [00:09<50:32,  6.69it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\B\Beautiful Disaster (3)\Beautiful Disaster (3) - P
Error with file: ..\data\DadaGP\DadaGP-v1.1\B\Beautiful Disaster (3)\Beautiful Disaster (3) - [p.m. for gtr. 1
Error with file: ..\data\DadaGP\DadaGP-v1.1\B\Beautiful Disaster\Beautiful Disaster - [p.m. for gtr. 1


  0%|          | 55/20330 [00:09<42:37,  7.93it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\C\Come Original\Come Original - 5
Error with file: ..\data\DadaGP\DadaGP-v1.1\D\Don't Stay Home\Don't Stay Home - P


  0%|          | 68/20330 [00:11<27:40, 12.20it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\H\Homebrew (2)\Homebrew (2) - P
Error with file: ..\data\DadaGP\DadaGP-v1.1\H\Homebrew\Homebrew - P
Error with file: ..\data\DadaGP\DadaGP-v1.1\H\Hydroponic\Hydroponic - p


  0%|          | 70/20330 [00:11<36:31,  9.25it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\P\Prisoner\Prisoner - P


  0%|          | 79/20330 [00:12<34:03,  9.91it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\Y\You Wouldn_'t Believe\You Wouldn_'t Believe - P


  1%|          | 117/20330 [00:19<47:30,  7.09it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\R\Romance\Romance - Rock mix
Error with file: ..\data\DadaGP\DadaGP-v1.1\T\Tuoi da buon\Tuoi da buon - Rock remix


  1%|          | 119/20330 [00:20<37:04,  9.09it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - 181
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Beast
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 1
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 141
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 191
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 335
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 383
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 97


  1%|          | 128/20330 [00:20<54:46,  6.15it/s]


((51606, 6), (258, 7))

In [None]:
# Perform a left join on (Track_Name, Partie) to add the column DadaGP_name to the original dataframe.
# NOTE(review): this requires total_df_track to contain a 'Track_Name' column.
# The frame built by the "first method" cell only selects Dada_gp_path, Author,
# Track, Author_Track, Partie and Instrument (plus DadaGP_name), so it would not
# qualify -- confirm which total_df_track this cell is meant to consume.
# NOTE(review): if total_df_track holds several rows per (Track_Name, Partie),
# the join multiplies the rows of df_rg; compare the two printed shapes.
df_rg_augmented = df_rg.merge(total_df_track[['Track_Name', 'Partie', 'DadaGP_name']], on=['Track_Name', 'Partie'], how='left')

print(df_rg.shape, df_rg_augmented.shape)

(4165789, 40) (4165789, 41)


##### Second method: iterating on the pygp.gp5 files in DadaGP

In [95]:
# Manual check of get_filename on a name whose title is truncated and ends with
# a dangling ' - ': the last dash and what follows it are removed.
a='path/AC-DC - Ain"t No Fun (Wai - .gp4'

get_filename(a)

'AC-DC - Ain"t No Fun (Wai '

In [89]:
# Second method: iterate on the *.pygp.gp5 files of DadaGP and look each of them
# up in the CSV through File_Name.
n_found_in_csv = 0
n_not_found_in_csv = 0
total_df_track = pd.DataFrame()
list_tracks_not_found = []

max_files = 300         # test limit on the number of files examined
limit_reached = False   # lets the limit stop the three nested loops, not only the innermost
track_frames = []       # one frame per matched file, concatenated once at the end

# Row positions of every File_Name, computed once instead of scanning the whole
# dataframe for each file (rows with a missing File_Name are left out).
rows_by_file_name = df_rg_cleaned_aug.groupby('File_Name').indices

with tqdm(total=4871, desc="Scores computed") as pbar:

    # glob.escape protects folder names containing glob metacharacters ('[', '*', '?')
    for read_alphabetical_folder in glob.glob(f"{path_to_general_read_folder}/*"):

        for read_group_folder in glob.glob(f"{glob.escape(read_alphabetical_folder)}/*"):

            for current_file in glob.glob(f"{glob.escape(read_group_folder)}/*.pygp.gp5"):

                if n_found_in_csv + n_not_found_in_csv == max_files:
                    limit_reached = True
                    break

                # File stem: last path component, cut at its FIRST dot
                # (the same truncation get_filename applies on the CSV side).
                file_name = os.path.basename(current_file).split('.')[0]

                # Look in the CSV if we find the rows that correspond to the current file
                positions = rows_by_file_name.get(file_name)

                if positions is None:
                    n_not_found_in_csv += 1
                    list_tracks_not_found.append(file_name)
                    continue

                df_track = df_rg_cleaned_aug.iloc[positions].copy()

                # Counted as found even if the file cannot be parsed afterwards
                n_found_in_csv += 1
                # Open the file with guitarpro
                try:
                    gp_file = pygp.parse(current_file)
                except Exception:
                    # Unreadable file: report it and move on
                    # (KeyboardInterrupt is deliberately NOT caught).
                    print('Could not parse file:', current_file)
                    continue

                # (part name, instrument class) pairs, in the order of the gp
                # file, restricted to the parts known to the CSV
                inst_list = []
                for part in gp_file.tracks:
                    part_name = part.name

                    if part_name not in df_track['Partie'].values:
                        continue
                    inst_name = df_track[df_track['Partie'] == part_name]['Instrument'].values[0]
                    inst_list.append((part_name, inst_name))

                # Create the DadaGP_name column filled with 'not_filled_yet'
                df_track.loc[:, 'DadaGP_name'] = 'not_filled_yet'
                nb_clean = 0
                nb_distorted = 0
                for part, inst in inst_list:
                    if inst == 'clean':
                        df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'clean' + str(nb_clean)
                        nb_clean += 1
                    elif inst == 'distorted':
                        df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'distorted' + str(nb_distorted)
                        nb_distorted += 1
                    else:
                        # Other classes (e.g. bass) are not numbered
                        df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = inst

                track_frames.append(df_track)

            if limit_reached:
                break

            # One progress step per group folder
            pbar.update(1)

        if limit_reached:
            break

# Single concatenation (growing a DataFrame inside the loop is quadratic)
if track_frames:
    total_df_track = pd.concat(track_frames)


Scores computed:   0%|          | 0/4871 [00:00<?, ?it/s]

Scores computed:   1%|          | 59/4871 [02:34<3:30:21,  2.62s/it]


ValueError: 255 is not a valid NoteType

In [93]:
# Counters and unmatched file names of the second method.
# Note: only the last expression of a cell is displayed, so this tuple is
# evaluated but not shown.
n_found_in_csv, n_not_found_in_csv, list_tracks_not_found

# It doesn't work better at all...
# Example of the parsing problem: rows whose Author was parsed as 'AC'
# (the dash of 'AC-DC' is taken for a field separator).
df_rg_cleaned_aug[df_rg_cleaned_aug['Author'] == 'AC']

Unnamed: 0.1,Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,Has_SingleNotes,Has_Chords,Has_2_note_chords,Has_3_note_chords,...,Presence_VibratoMG,Nb_Whammy_Bar,Presence_bends,has_triad,has_fifth,rg-estimation,Track,Author,Author_Track,File_Name
29886,31058,../data/DadaGP8-gpif/AC-DC - Ain't No Fun (Wai...,Ain't no fun (waiting round to be a millionair...,Piste 1,distorted,1,1,1,0,1,...,0,0,0,1,1,1.031175e-05,DC,AC,AC - DC,AC-DC
29887,31059,../data/DadaGP8-gpif/AC-DC - Ain't No Fun (Wai...,Ain't no fun (waiting round to be a millionair...,Piste 1,distorted,2,0,1,0,1,...,0,0,0,1,0,3.334402e-04,DC,AC,AC - DC,AC-DC
29888,31060,../data/DadaGP8-gpif/AC-DC - Ain't No Fun (Wai...,Ain't no fun (waiting round to be a millionair...,Piste 1,distorted,3,1,0,0,0,...,0,0,0,1,1,3.699338e-04,DC,AC,AC - DC,AC-DC
29889,31061,../data/DadaGP8-gpif/AC-DC - Ain't No Fun (Wai...,Ain't no fun (waiting round to be a millionair...,Piste 1,distorted,4,1,0,0,0,...,0,0,0,1,0,6.140505e-05,DC,AC,AC - DC,AC-DC
29890,31062,../data/DadaGP8-gpif/AC-DC - Ain't No Fun (Wai...,Ain't no fun (waiting round to be a millionair...,Piste 1,distorted,5,0,1,1,0,...,0,0,0,1,0,1.241184e-05,DC,AC,AC - DC,AC-DC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34391,35563,../data/DadaGP8-gpif/AC-DC - Night Of The Long...,Night of the long knives\nAC/DC,Piste 1,distorted,65,0,1,0,0,...,0,0,0,1,0,4.072296e-08,DC,AC,AC - DC,AC-DC
34392,35564,../data/DadaGP8-gpif/AC-DC - Night Of The Long...,Night of the long knives\nAC/DC,Piste 1,distorted,66,0,1,0,0,...,0,0,0,1,1,2.512243e-07,DC,AC,AC - DC,AC-DC
34393,35565,../data/DadaGP8-gpif/AC-DC - Night Of The Long...,Night of the long knives\nAC/DC,Piste 1,distorted,67,0,1,0,0,...,0,0,0,1,0,1.642829e-08,DC,AC,AC - DC,AC-DC
34394,35566,../data/DadaGP8-gpif/AC-DC - Night Of The Long...,Night of the long knives\nAC/DC,Piste 1,distorted,68,0,1,0,0,...,0,0,0,1,1,1.946738e-07,DC,AC,AC - DC,AC-DC


#### Assigning the rhythmic guitar to each track

In [86]:
# Now let's take a look at the instruments that are generally rythmic
df_rg_augmented.columns
# We drop the feature columns of the dataset and keep only the identification
# columns, the measure number and the prediction.
# .copy() makes the result an independent frame, so that columns can be added
# to it afterwards without pandas' SettingWithCopyWarning.

df_rg_aug_essentials = df_rg_augmented[['Fichier', 'Track_Name', 'Partie', 'Instrument', 'DadaGP_name', 'Mesure', 'rg-estimation']].copy()

df_rg_aug_essentials.head()

Unnamed: 0,Fichier,Track_Name,Partie,Instrument,DadaGP_name,Mesure,rg-estimation
0,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,1,2.570728e-07
1,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,2,4.580104e-07
2,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,3,6.601997e-06
3,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,4,0.0004111985
4,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,5,4.352387e-05


In [None]:
# Now let's loop again on the tracks and take a look at which instruments are generally rythmic

# First describe column rg-estimation
# (not displayed: only the last expression of a cell is rendered)
df_rg_aug_essentials['rg-estimation'].describe()

# We could loop on tracks, within tracks loop on measures, and for each measure take the instrument that has the highest rg-estimation
# But that would be very long?
# Maybe we can just say if rg-estimation < thr then the instrument is rythmic

thr = 0.5 # General decision threshold in the paper that is used to determine if an instrument is rythmic or not
# When rg-estimation is near 1, the instrument is lead, when it is near 0 the instrument is rythmic.
# NOTE(review): the stored output of the last cell of this notebook shows rows
# with rg-estimation ~ 1e-07 flagged is_rythmic=False and rows with
# rg-estimation ~ 1 flagged True, i.e. the opposite of this comparison.
# Confirm the meaning of rg-estimation and which direction is intended.
df_rg_aug_essentials.loc[:, 'is_rythmic'] = df_rg_aug_essentials['rg-estimation'] < thr

display(df_rg_aug_essentials['is_rythmic'].value_counts())


is_rythmic
False    3494864
True      670925
Name: count, dtype: int64

In [None]:
# Build a dataframe holding, for each track and each instrument, the proportion
# of rythmic measures, and flag the instrument(s) reaching the track's maximum.
# NOTE(review): tracks are identified by Track_Name only; stored outputs show
# several 'Fichier' values under one Track_Name, whose rows are then mixed.

rythmic_frames = []  # one frame per track, concatenated once at the end
i=0

# Group once instead of scanning the whole dataframe for every track.
# sort=False keeps the tracks in their order of first appearance.
rows_by_track = df_rg_aug_essentials.groupby('Track_Name', sort=False)

for track, track_rows in tqdm(rows_by_track, total=rows_by_track.ngroups):
    i=i+1
    df_rythmic = track_rows.copy() # We copy to avoid SettingWithCopyWarning
    max_nb_measures = df_rythmic['Mesure'].max()

    # Make sure the column exists even when no row is matched below
    # (a missing DadaGP_name never compares equal to itself).
    df_rythmic['prop_rythmic'] = float('nan')

    for instrument in df_rythmic['DadaGP_name'].unique():
        instrument_rows = df_rythmic['DadaGP_name'] == instrument

        # Proportion of rythmic measures for the instrument (in the whole track and not only in the measures where the instrument plays)
        prop_rythmic = df_rythmic.loc[instrument_rows, 'is_rythmic'].sum() / max_nb_measures
        df_rythmic.loc[instrument_rows, 'prop_rythmic'] = prop_rythmic

    # The instrument with the highest proportion of rythmic measures is the rythmic instrument
    if df_rythmic['prop_rythmic'].isnull().all(): # No proportion could be computed for the track
        df_rythmic.loc[:, 'is_track_rythmic'] = False

    else:
        # Add column is_track_rythmic set to True at the rows where prop_rythmic is its max
        df_rythmic.loc[:, 'is_track_rythmic'] = df_rythmic['prop_rythmic'] == df_rythmic['prop_rythmic'].max()

    rythmic_frames.append(df_rythmic)

    if i==100: # Test on 100 tracks
        break

# Single concatenation (growing a DataFrame inside the loop is quadratic)
total_df_rythmic = pd.concat(rythmic_frames) if rythmic_frames else pd.DataFrame()

# The original per-track filtering version was estimated by tqdm at around an hour for all tracks.

  0%|          | 99/21859 [00:15<58:15,  6.22it/s]  


In [115]:
# Inspect the result for one track whose Track_Name covers two different files
total_df_rythmic[total_df_rythmic['Track_Name'] == 'Isolated\nDr. Sin']

Unnamed: 0,Fichier,Track_Name,Partie,Instrument,DadaGP_name,Mesure,rg-estimation,is_rythmic,prop_rythmic,is_track_rythmic
0,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,1,2.570728e-07,False,0.519084,False
1,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,2,4.580104e-07,False,0.519084,False
2,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,3,6.601997e-06,False,0.519084,False
3,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,4,4.111985e-04,False,0.519084,False
4,../data/DadaGP8-gpif/ - Isolated.gpif,Isolated\nDr. Sin,E. Ardanuy,distorted,distorted0,5,4.352387e-05,False,0.519084,False
...,...,...,...,...,...,...,...,...,...,...
1142920,../data/DadaGP8-gpif/Dr Sin - Isolated.gpif,Isolated\nDr. Sin,Trilha 1,distorted,distorted1,26,9.999995e-01,True,1.000000,True
1142921,../data/DadaGP8-gpif/Dr Sin - Isolated.gpif,Isolated\nDr. Sin,Trilha 1,distorted,distorted1,27,9.999988e-01,True,1.000000,True
1142922,../data/DadaGP8-gpif/Dr Sin - Isolated.gpif,Isolated\nDr. Sin,Trilha 1,distorted,distorted1,28,9.999994e-01,True,1.000000,True
1142923,../data/DadaGP8-gpif/Dr Sin - Isolated.gpif,Isolated\nDr. Sin,Trilha 1,distorted,distorted1,29,9.999986e-01,True,1.000000,True
