## DadaGP generator of separate token files

The goal is to have a programme that outputs the tokens only of the desired instruments

In [41]:
from aux_fun import get_tokens_inst_iter_folders
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import guitarpro as pygp
import glob

path_to_general_read_folder = "..\data\DadaGP\DadaGP-v1.1"
path_to_general_write_folder = "..\data\BGTG\BGTG-v0.1"

### Arbitrary file generator

This first version outputs the tokens of the selected instruments

In [42]:
# Select the instruments to keep by setting the value to True
instrument_to_keep = {
        "distorted0": False,
        "distorted1": False,
        "distorted2": False,
        "clean0": False,
        "clean1": False,
        "bass": True,
        "leads": False,    
        "pads": False,
        "drums": False,
    }

# get_tokens_inst_iter_folders(instrument_to_keep, path_to_general_read_folder, path_to_general_write_folder)

# 8min10s to generate all the token files with only bass and clean0

# 5min30s to generate all the bass tokens without checking if there is bass or not
# 4min49s for bass tokens if we check in the whole file if there is bass or not
# 6min37s to generate the bass tokens when checking at track-level the presence of bass (11 230 tracks without bass)

# 8min12s to generate the token files with the guitars (clean and distorted)

### Automatic output of the rythmic guitar's tokens

This version implements a link between the CSV Alexandre computed (which contains the results of what guitar is rythmic at each measure) and dadaGP's files.
The goal is for each track to find what guitar is rythmic overall on the track, and to output the tokens of this instrument specifically.

#### Adding column DadaGP_name to the CSV

In [43]:
# Take a look at the csv provided by Alexandre

df_rg = pd.read_csv('..\\data\\function_guitar-ext-with-predictions.csv')
df_rg.shape

(4165789, 40)

In [44]:
# We take a look at a specific track

track_sample = np.random.choice(df_rg['Track_Name'].unique())

df_sample = df_rg[df_rg['Track_Name'] == track_sample]

df_sample.shape

(298, 40)

In [45]:
track_sample = 'Arrows Fly\nEdguy' # Good example
track_sample = 'Falling Down\nEdguy'

df_sample = df_rg[df_rg['Track_Name'] == track_sample]

display("Instruments in the track:", df_sample['Instrument'].unique(),
        "Number of parts:", df_sample['Partie'].unique())

# We need to link the part to the instrument

df_inst_part = df_sample[['Partie', 'Instrument']].drop_duplicates()

display(df_inst_part)

# Let's hope the order in DadaGP is the same as in the csv
# --> The order is not the same, we use a different solution

# To retrieve the real track name from the file name, perform a split on the dash (-)
# Also clean file name by removing the extension and the beginning of the path.

# The idea:
# Iterate on the files in gp5, iterate on tracks names, find the csv rows that correspond to the instrument,
# look at the value in the Instrument column, add column Dadagp_name that is instrument name with number appended.
# The number is reset at each new track


'Instruments in the track:'

array(['bass', 'clean', 'distorted'], dtype=object)

'Number of parts:'

array(['Bass', 'Clean Guitar', 'Lead Guitar 1', 'Lead Guitar 2',
       'Rhytm Guitar'], dtype=object)

Unnamed: 0,Partie,Instrument
1192065,Bass,bass
1192188,Clean Guitar,clean
1192204,Lead Guitar 1,distorted
1192264,Lead Guitar 2,distorted
1192301,Rhytm Guitar,distorted


In [46]:
df_inst_part = df_sample[['Fichier', 'Partie', 'Instrument']].drop_duplicates()

def get_track(str):
    aux = str.split('/')[-1]
    try:
        aux = aux.split('-')[-2]
        aux = aux.strip()
    except:
        aux = 'error'
    return aux

def get_author(str):
    aux = str.split('/')[-1]
    try:
        aux = aux.split('-')[-3]
        # remove trailing whitespace
        aux = aux.strip()
    except:
        aux = 'error'
    return aux

def get_author_track(row):
    author_track = row['Author'] + ' - ' + row['Track']
    return author_track

# def get_dada_gp_path(row, general_path):
#     try:
#         path = general_path + '\\' + row['Author'][0] + '\\' + row['Author'] + '\\' + row['Author'] + ' - ' + row['Track']
#         # Don't add the extension because it can be gp3 or gp4
#     except:
#         path = 'error'
#     return path

df_rg_cleaned = df_rg.copy()

df_rg_cleaned['Track'] = df_rg_cleaned['Fichier'].apply(get_track)
df_rg_cleaned = df_rg_cleaned[df_rg_cleaned['Track'] != 'error']


df_rg_cleaned['Author'] = df_rg_cleaned['Fichier'].apply(get_author)
df_rg_cleaned = df_rg_cleaned[df_rg_cleaned['Author'] != 'error']

df_rg_cleaned['Author_Track'] = df_rg_cleaned.apply(get_author_track, axis=1)
# Needed because several artists have same track name

# df_rg['Dada_gp_path'] = df_rg.apply(lambda row: get_dada_gp_path(row, path_to_general_read_folder), axis=1)
# df_rg = df_rg[df_rg['Dada_gp_path'] != 'error']

print(df_rg.shape, df_rg_cleaned.shape, 'removed:', df_rg.shape[0] - df_rg_cleaned.shape[0])
# df_inst_part['Track'] = df_inst_part['Fichier'].apply(get_track)
# df_inst_part['Author'] = df_inst_part['Fichier'].apply(get_author)
# df_inst_part['Dada_gp_path'] = df_inst_part.apply(lambda row: get_dada_gp_path(row, path_to_general_read_folder), axis=1)


(4165789, 40) (3912938, 43) removed: 252851


In [47]:
# Remove measures
df_inst_part = df_rg_cleaned[['Fichier', 'Track_Name', 'Partie', 'Instrument']].drop_duplicates(subset=['Track_Name', 'Partie', 'Instrument'])
# Add column Num_Parts which contains the number of parts for each file in Fichier
# df_inst_part['Num_Parts'] = df_inst_part.groupby('Track_Name')['Partie'].transform('nunique')
df_inst_part.head()

Unnamed: 0,Fichier,Track_Name,Partie,Instrument
131,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean
341,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 2....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 2,clean
865,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass
977,../data/DadaGP8-gpif/12 Stones - Crash-Clean G...,Crash\n12 Stones,Clean Guitar,clean
1096,../data/DadaGP8-gpif/12 Stones - Crash-Distort...,Crash\n12 Stones,Distortion Guitar,distorted


In [48]:
track = 'path/AC-DC - C.O.D..gp4'
track2 = 'path/ -Isolated.gp4'
track3 = 'path/Muse - Isolated System - Electric Guitar.gp4'

# I tried an implementation with num_parts but it is too complex to count the number of parts in a track
def get_filename(fichier):
    filename = fichier
    aux = filename.split('/')[-1] # Remove the path
    aux = aux.split('.')[0] # Remove the extension
    
    # If there are 2 dashes or more in aux
    if aux.count('-') >= 2:
        aux = aux.split('-')[0:-1]
        aux = '-'.join(aux)
        
    return aux

def get_filename_alexandre(name):
     name = name.split('/')[-1]
     if name.count('-') < 2 and not 'AC-DC' in name:
             # On garde le nom complet on vire juste l'extension
             return name[:-5]
     else:
             splits = name.split('-')
             name_without_track_name = '-'.join(splits[:-1])
             while name_without_track_name.count('-') >= 2 and (not 'AC-DC' in name_without_track_name):
                 splits = name_without_track_name.split('-')
                 name_without_track_name = '-'.join(splits[:-1])
             return name_without_track_name

# Generate row File_Name

df_inst_part['File_Name'] = df_inst_part['Fichier'].apply(get_filename_alexandre)

df_inst_part.head()

Unnamed: 0,Fichier,Track_Name,Partie,Instrument,File_Name
131,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,- Tudo Que Vai
341,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 2....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 2,clean,- Tudo Que Vai
865,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,12 Stones - Crash
977,../data/DadaGP8-gpif/12 Stones - Crash-Clean G...,Crash\n12 Stones,Clean Guitar,clean,12 Stones - Crash
1096,../data/DadaGP8-gpif/12 Stones - Crash-Distort...,Crash\n12 Stones,Distortion Guitar,distorted,12 Stones - Crash


In [49]:
# print(df_inst_part['Num_Parts'].unique())
# df_inst_part[df_inst_part['Num_Parts'] > 25].head()

In [50]:
# Merge back Num_Parts to the main dataframe

df_rg_cleaned_aug = df_rg_cleaned.merge(df_inst_part[['Fichier', 'File_Name']], on='Fichier', how='left')

print(df_rg_cleaned_aug.shape, df_rg_cleaned.shape)

# Put Num_Parts as int
df_rg_cleaned_aug.head()

(3912938, 44) (3912938, 43)


Unnamed: 0.1,Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,Has_SingleNotes,Has_Chords,Has_2_note_chords,Has_3_note_chords,...,Presence_VibratoMG,Nb_Whammy_Bar,Presence_bends,has_triad,has_fifth,rg-estimation,Track,Author,Author_Track,File_Name
0,131,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,2,1,0,0,0,...,0,0,0,1,0,3.2e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
1,132,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,3,1,0,0,0,...,0,0,0,1,1,4.2e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
2,133,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,4,1,0,0,0,...,0,0,0,1,0,3.8e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
3,134,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,5,1,0,0,0,...,0,0,0,1,1,4.1e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai
4,135,../data/DadaGP8-gpif/ - Tudo Que Vai-Trilha 1....,Tudo Que Vai\nCapital Inicial by Bifaum,Trilha 1,clean,6,1,0,0,0,...,0,0,0,1,0,2.2e-05,Tudo Que Vai,,- Tudo Que Vai,- Tudo Que Vai


##### First method, iterating on the tracks in the CSV


In [15]:
df_rg.head()

Unnamed: 0.1,Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,Has_SingleNotes,Has_Chords,Has_2_note_chords,Has_3_note_chords,...,Presence_VibratoMG,Nb_Whammy_Bar,Presence_bends,has_triad,has_fifth,rg-estimation,Track,Author,Author_Track,Dada_gp_path
865,865,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,9,1,0,0,0,...,0,0,0,0,0,0.953768,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
866,866,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,10,1,0,0,0,...,0,0,0,0,0,0.964169,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
867,867,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,11,1,0,0,0,...,0,0,0,0,0,0.962112,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
868,868,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,12,1,0,0,0,...,0,0,0,1,0,0.986466,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...
869,869,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,13,1,0,0,0,...,0,0,0,0,0,0.983138,Crash,12 Stones,12 Stones - Crash,..\data\DadaGP\DadaGP-v1.1\1\12 Stones\12 Ston...


In [13]:
df_inst_part = df_rg[['Dada_gp_path', 'Author', 'Track', 'Author_Track', 'Partie', 'Instrument']]
df_inst_part = df_inst_part.drop_duplicates(subset=['Dada_gp_path', 'Partie'])
total_df_track = pd.DataFrame()
i=0

# For each track we add the DadaGP_name column
for author_track in tqdm(df_inst_part['Author_Track'].unique()):
    df_track = df_inst_part[df_inst_part['Author_Track'] == author_track].copy() # We copy to avoid SettingWithCopyWarning
    
    dada_gp_path = df_track['Dada_gp_path'].values[0]
    path_gp3 = dada_gp_path + '.gp3.pygp.gp5'
    path_gp4 = dada_gp_path + '.gp4.pygp.gp5'
    
    try:
        gp_file = pygp.parse(path_gp3)
        
    except:
        try:
            gp_file = pygp.parse(path_gp4)
        except:
            print('Error with file:', dada_gp_path)
            continue
        
    inst_list = []
    for part in gp_file.tracks:
        part_name = part.name
        
        if part_name not in df_track['Partie'].values:
            continue
        inst_name = df_track[df_track['Partie'] == part_name]['Instrument'].values[0]
        inst_list.append((part_name, inst_name))
    
    # Create the DadaGP_name column filled with 'not_filled_yet'
    df_track['DadaGP_name'] = 'not_filled_yet'
    nb_clean = 0
    nb_distorted = 0
    for part_inst_tuple in inst_list:
        part = part_inst_tuple[0]
        inst = part_inst_tuple[1]
        if inst == 'clean':
            df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'clean' + str(nb_clean)
            nb_clean += 1
        elif inst == 'distorted':
            df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'distorted' + str(nb_distorted)
            nb_distorted += 1
        else:
            df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = inst
    
    # Join df_track with df_inst_part
    total_df_track = pd.concat([total_df_track, df_track])
    i+=1
    if i == 100:
        break

df_inst_part.shape, total_df_track.shape

# Iterating on the csv is an awful method, we now try to iterate on the gp5 files

  0%|          | 18/20330 [00:04<54:49,  6.17it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - Harmonious Distruption
Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - Overwound Music Box
Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - To Hell And Back
Error with file: ..\data\DadaGP\DadaGP-v1.1\7\70\70 - Zinfandel 2006


  0%|          | 35/20330 [00:06<35:01,  9.66it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 1
Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 2
Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 3
Error with file: ..\data\DadaGP\DadaGP-v1.1\N\Not Enough\Not Enough - Guitar 4


  0%|          | 46/20330 [00:08<35:48,  9.44it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\A\All Mixed Up (2)\All Mixed Up (2) - P


  0%|          | 51/20330 [00:09<50:32,  6.69it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\B\Beautiful Disaster (3)\Beautiful Disaster (3) - P
Error with file: ..\data\DadaGP\DadaGP-v1.1\B\Beautiful Disaster (3)\Beautiful Disaster (3) - [p.m. for gtr. 1
Error with file: ..\data\DadaGP\DadaGP-v1.1\B\Beautiful Disaster\Beautiful Disaster - [p.m. for gtr. 1


  0%|          | 55/20330 [00:09<42:37,  7.93it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\C\Come Original\Come Original - 5
Error with file: ..\data\DadaGP\DadaGP-v1.1\D\Don't Stay Home\Don't Stay Home - P


  0%|          | 68/20330 [00:11<27:40, 12.20it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\H\Homebrew (2)\Homebrew (2) - P
Error with file: ..\data\DadaGP\DadaGP-v1.1\H\Homebrew\Homebrew - P
Error with file: ..\data\DadaGP\DadaGP-v1.1\H\Hydroponic\Hydroponic - p


  0%|          | 70/20330 [00:11<36:31,  9.25it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\P\Prisoner\Prisoner - P


  0%|          | 79/20330 [00:12<34:03,  9.91it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\Y\You Wouldn_'t Believe\You Wouldn_'t Believe - P


  1%|          | 117/20330 [00:19<47:30,  7.09it/s]  

Error with file: ..\data\DadaGP\DadaGP-v1.1\R\Romance\Romance - Rock mix
Error with file: ..\data\DadaGP\DadaGP-v1.1\T\Tuoi da buon\Tuoi da buon - Rock remix


  1%|          | 119/20330 [00:20<37:04,  9.09it/s]

Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - 181
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Beast
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 1
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 141
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 191
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 335
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 383
Error with file: ..\data\DadaGP\DadaGP-v1.1\9\96\96 - Exercises 97


  1%|          | 128/20330 [00:20<54:46,  6.15it/s]


((51606, 6), (258, 7))

In [None]:
# Perform a join on Track_Name and Partie to add the column DadaGP_name to the original dataframe
df_rg_augmented = df_rg.merge(total_df_track[['Track_Name', 'Partie', 'DadaGP_name']], on=['Track_Name', 'Partie'], how='left')

print(df_rg.shape, df_rg_augmented.shape)

(4165789, 40) (4165789, 41)


##### Second method: iterating on the pygp.gp5 files in DadaGP

In [53]:
a='path/AC-DC - Ain"t No Fun (Wai - guitar.gp4'
b='path/ACDC - Classic riffs.gp4'
get_filename_alexandre(b)

'ACDC - Classic riffs'

In [53]:
n_found_in_csv = 0
n_not_found_in_csv = 0
total_df_track = pd.DataFrame()
list_tracks_not_found = []
list_tracks_found_but_not_parsed = []

with tqdm(total=4871, desc="Scores computed") as pbar:
        
    for read_alphabetical_folder in glob.glob(f"{path_to_general_read_folder}/*"):
        
        # Retrieve the alphabetical folder name (last part of the path)
        first_char = read_alphabetical_folder.split("\\")[-1]
                
        for read_group_folder in glob.glob(f"{read_alphabetical_folder}/*"):
            
            group = read_group_folder.split('\\')[-1]
            
            for current_file in glob.glob(f"{read_group_folder}/*.pygp.gp5"):
                
                if n_found_in_csv + n_not_found_in_csv == 2000:
                    break
                
                file_name = current_file.split('\\')[-1]
                file_name = file_name.split('.')[0]

                # Look in the CSV if we find the file that corresponds to the current file
                df_track = df_rg_cleaned_aug[df_rg_cleaned_aug['File_Name'] == file_name].copy()
                
                if df_track.shape[0] == 0:
                    n_not_found_in_csv += 1
                    list_tracks_not_found.append(file_name)
                    continue
                
                n_found_in_csv += 1
                # Open the file with guitarpro
                try:
                    gp_file = pygp.parse(current_file)
                except:
                    print('Could not parse file:', current_file)
                    list_tracks_found_but_not_parsed.append(file_name)
                    continue
                
                inst_list = []
                for part in gp_file.tracks:
                    part_name = part.name
                    
                    if part_name not in df_track['Partie'].values:
                        continue
                    inst_name = df_track[df_track['Partie'] == part_name]['Instrument'].values[0]
                    inst_list.append((part_name, inst_name))
                
                # Create the DadaGP_name column filled with 'not_filled_yet'
                df_track.loc[:, 'DadaGP_name'] = 'not_filled_yet'
                nb_clean = 0
                nb_distorted = 0
                for part_inst_tuple in inst_list:
                    part = part_inst_tuple[0]
                    inst = part_inst_tuple[1]
                    if inst == 'clean':
                        df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'clean' + str(nb_clean)
                        nb_clean += 1
                    elif inst == 'distorted':
                        df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = 'distorted' + str(nb_distorted)
                        nb_distorted += 1
                    else:
                        df_track.loc[df_track['Partie'] == part, 'DadaGP_name'] = inst
                
                # Join df_track with df_inst_part
                total_df_track = pd.concat([total_df_track, df_track])

            pbar.update(1)


Scores computed:   1%|          | 59/4871 [00:42<38:50,  2.06it/s]  

Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\AC-DC\ACDC - Back In Black (4).gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\AC-DC\ACDC - Hells Bells.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\AC-DC\ACDC - Hell_'s Bells (2).gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\AC-DC\ACDC - Meltdown.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\AC-DC\ACDC - You Shook Me All Night Long (2).gp3.pygp.gp5


Scores computed:   2%|▏         | 82/4871 [01:33<1:05:11,  1.22it/s] 

Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\Aerosmith\Aerosmith - Livin' on the Edge.gp3.pygp.gp5


Scores computed:   3%|▎         | 142/4871 [02:50<1:35:57,  1.22s/it]

Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\ALMAFUERTE\ALMAFUERTE - Aguante Bonavena.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\ALMAFUERTE\ALMAFUERTE - Ceibo.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\ALMAFUERTE\ALMAFUERTE - En Este Viaje.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\ALMAFUERTE\ALMAFUERTE - Hombre Peste.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\ALMAFUERTE\ALMAFUERTE - La Llaga.gp3.pygp.gp5
Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\ALMAFUERTE\ALMAFUERTE - TC.gp3.pygp.gp5


Scores computed:   4%|▍         | 183/4871 [03:33<30:51,  2.53it/s]  

Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\Anathema\Anathema - Forgotten Hopes.gp3.pygp.gp5


Scores computed:   6%|▋         | 306/4871 [06:07<1:46:21,  1.40s/it]

Could not parse file: ..\data\DadaGP\DadaGP-v1.1\A\Atkins, Chet\Atkins, Chet - All I Ever Need Is You.gp3.pygp.gp5


Scores computed:   7%|▋         | 348/4871 [06:58<32:53,  2.29it/s]  

Could not parse file: ..\data\DadaGP\DadaGP-v1.1\B\Bach, Johann Sebastian\Bach, Johann Sebastian - Air on a G string (metal arrangement).gp3.pygp.gp5


Scores computed: 100%|██████████| 4871/4871 [07:17<00:00, 11.13it/s]  


In [54]:
n_found_in_csv, n_not_found_in_csv, list_tracks_not_found

# It doesn't work better at all...
# df_rg_cleaned_aug[df_rg_cleaned_aug['Author'] == 'AC']

(1381,
 619,
 ['1 God - Grace',
  '1 God - Loom',
  '1 God - Sativa Arabica',
  '1 God - Tali',
  '1 God - Wierdo',
  '704 exercises for guitar, 1-96 - 181-190',
  '704 exercises for guitar, 1-96 - Beast',
  '704 exercises for guitar, 1-96 - Exercises 1-96',
  '704 exercises for guitar, 1-96 - Exercises 141-180',
  '704 exercises for guitar, 1-96 - Exercises 191-334',
  '704 exercises for guitar, 1-96 - Exercises 335-382',
  '704 exercises for guitar, 1-96 - Exercises 383-416',
  '704 exercises for guitar, 1-96 - Exercises 97-140',
  '20-70 - Harmonious Distruption',
  '20-70 - Overwound Music Box',
  '20-70 - To Hell And Back',
  '20-70 - Zinfandel 2006',
  '23 P',
  '2pac - Do For Love',
  '3 Doors Down - By My Side',
  '3 Doors Down - Going Down In Flames',
  '3 Doors Down - Here Without You',
  '3 Doors Down - Kryptonite (4)',
  '3 Doors Down - Kryptonite (4)',
  '31 Minutos - Mico El Micofono',
  '311 - All Mixed Up',
  '311 - Beautiful Disaster',
  '311 - Light Years',
  '311 - L

In [58]:
total_df_track.shape

(298881, 45)

#### Alexandre method

In [9]:
import pathlib

df = df_rg.copy()
DADAGPPATH = pathlib.Path("../data/DadaGP/DadaGP-v1.1")

def get_filename_alexandre(name):
     name = name.split('/')[-1]
     if name.count('-') < 2 and not 'AC-DC' in name:
             # On garde le nom complet on vire juste l'extension
             return name[:-5]
     else:
             splits = name.split('-')
             name_without_track_name = '-'.join(splits[:-1])
             while name_without_track_name.count('-') >= 2 and (not 'AC-DC' in name_without_track_name):
                 splits = name_without_track_name.split('-')
                 name_without_track_name = '-'.join(splits[:-1])
             return name_without_track_name


In [10]:
a='path/AC-DC - Ain"t No Fun (Wai - .gp4'
b='path/ACDC - Classic riffs.gp4'
get_filename_alexandre(b)

'ACDC - Classic riff'

In [11]:

df['nameNoTrack'] = df['Fichier'].apply(get_filename_alexandre)
files = df['nameNoTrack'].unique()

all_files = list(DADAGPPATH.rglob('**/*.gp3'))
all_files += list(DADAGPPATH.rglob('**/*.gp4'))
all_files += list(DADAGPPATH.rglob('**/*.gp5'))
all_filenames = [f.name for f in all_files]
print("Data was loaded!")

counter = 0
fail = 0
fail_name_list = []
for filename in tqdm(files):
    # retrieve ALL filenames in DadaGP 
    # get only the filename
    if filename + '.gp3' in all_filenames:
        index = all_filenames.index(filename + '.gp3')
        actual_file = all_files[index]
    elif filename + '.gp4' in all_filenames:
        index = all_filenames.index(filename + '.gp4')
        actual_file = all_files[index]
    elif filename + '.gp5' in all_filenames:
        index = all_filenames.index(filename + '.gp5')
        actual_file = all_files[index]
    else:
        # print("Cannot find corresponding file for ", filename)
        fail += 1
        fail_name_list.append(filename)
        continue
    counter += 1

print(f"{counter} files found!")
print(f"{fail} files missing...") # 604 with Alexandre method, 1806 with mine

Data was loaded!


100%|██████████| 23728/23728 [00:34<00:00, 697.00it/s]

23124 files found!
604 files missing...





In [86]:
fail_name_list

[' - Isolated',
 ' - Tudo Que Vai',
 '*',
 '20-70 ',
 '69ers (The) - Romance ',
 '69ers (The) - Tuoi da buon ',
 '704 exercises for guitar, 1-96 ',
 'A-YUE & FreeNight ',
 'A-ha ',
 'AA Le Composer - Ouverture ',
 'AC-DC ',
 'ALMAFUERTE - Alma Fuerte ',
 'AZ-DEAN ',
 'Abelha, Kid - L',
 'Accept - Screaming For A Love',
 "Acheron's Idol - Interlud ",
 "Acheron's Idol - Lost Idol ",
 'Advanced Technique Exercises - Two',
 'Aerosmith - Remember ',
 'Age Of Empires - Age of Empires ',
 'Aguado, Dionso - EtAde in A',
 'Aikawa, Nanase - Daria ',
 'Air - Air ',
 'Akela - FelkCszAltem A Hal',
 'Alan Jackson - Alan Jackson ',
 'Albeniz, Isaac - Rumores De La Caleta (Malaguena ',
 'All-American Rejects ',
 'Allwright, Graeme - Emmene',
 'Alo-A ',
 'Always-Peaceful ',
 'American Hi-Fi ',
 'Amorphis - Amorphis ',
 'Ange - Au',
 'Anti-Flag ',
 'Anti-Silence ',
 'Antoinenadim - Cu',
 'Arch Enemy - I Am Legend ',
 'At The Drive-In ',
 'Ataris (The) - 1',
 'Atherakia - Burning Down The Opera_ Lamento,

In [89]:
df_dadagp_files = pd.DataFrame(all_filenames, columns=['filename'])
df_dadagp_files[df_dadagp_files['filename'].str.contains('Abelha, Kid - L')] # a replaced by a dash...

Unnamed: 0,filename
13926,"Abelha, Kid - L-grimas e chuva.gp4"
26552,"Abelha, Kid - L-grimas e chuva.gp4.gp2tokens2g..."
26553,"Abelha, Kid - L-grimas e chuva.gp4.pygp.gp5"


In [30]:
df.head()

df_not_found = df[df['nameNoTrack'].isin(fail_name_list)]
df_not_found.shape, df_not_found['nameNoTrack'].nunique()

((130548, 41), 604)

#### Assigning the rythmic guitar to each track

In [62]:
# # now let's take a look at the instruments that generally are rythmic
# df_rg_augmented.columns
# # We remove the features columns of the dataset

# df_rg_aug_essentials = df_rg_augmented[['Fichier', 'Track_Name', 'Partie', 'Instrument', 'DadaGP_name', 'Mesure', 'rg-estimation']]

df_rg_aug_essentials = total_df_track[['Fichier', 'Track_Name', 'Partie', 'Instrument', 'Mesure', 'DadaGP_name', 'File_Name', 'rg-estimation']]
display(df_rg_aug_essentials.head())
df_rg_aug_essentials.shape, df_rg_aug_essentials['DadaGP_name'].nunique()

Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,DadaGP_name,File_Name,rg-estimation
255,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,9,bass,12 Stones - Crash,0.953768
256,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,10,bass,12 Stones - Crash,0.964169
257,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,11,bass,12 Stones - Crash,0.962112
258,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,12,bass,12 Stones - Crash,0.986466
259,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,13,bass,12 Stones - Crash,0.983138


((298881, 8), 10)

In [64]:
# Now let's loop again on the tracks and take a look at what instruments are generally rythmic

# First describe column rg-estimation
df_rg_aug_essentials['rg-estimation'].describe()

# We could loop on tracks, within tracks loop on measures and for each measure take the instrument that has the highest rg-estimation
# But that would be very long?
# Maybe we can just say if rg-estimation < thr then the instrument is rythmic

thr = 0.5 # General decision threshold in the paper that is used to determine if an instrument is rythmic or not
# When rg-estimation is near 1, the instrument is lead, when it is near 0 the instrument is rythmic.
df_rg_aug_essentials.loc[:, 'is_rythmic'] = df_rg_aug_essentials['rg-estimation'] < thr

display(df_rg_aug_essentials['is_rythmic'].value_counts())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_rg_aug_essentials.loc[:, 'is_rythmic'] = df_rg_aug_essentials['rg-estimation'] < thr


is_rythmic
True     254041
False     44840
Name: count, dtype: int64

In [65]:
# However, now that I need this I need to build a dataframe with for each track and each instrument its proportion of rythmic measures

total_df_rythmic = pd.DataFrame()
i=0
# For each track we add the DadaGP_name column
for track in tqdm(df_rg_aug_essentials['File_Name'].unique()):
    i=i+1
    df_rythmic = df_rg_aug_essentials[df_rg_aug_essentials['File_Name'] == track].copy() # We copy to avoid SettingWithCopyWarning
    max_nb_measures = df_rythmic['Mesure'].max()
    
    for instrument in df_rythmic['DadaGP_name'].unique():
        df_inst = df_rythmic[df_rythmic['DadaGP_name'] == instrument]
        
        # Proportion of rythmic measures for the instrument (in the whole track and not only in the measures where the instrument plays)
        prop_rythmic = df_inst['is_rythmic'].sum() / max_nb_measures
        if prop_rythmic > 1:
            print(track, max_nb_measures, prop_rythmic)
        df_rythmic.loc[df_rythmic['DadaGP_name'] == instrument, 'prop_rythmic'] = prop_rythmic  
    
    # The instrument with the highest proportion of rythmic measures is the rythmic instrument
    if df_rythmic['prop_rythmic'].isnull().all(): # No rythmic instrument at all in the track
        df_rythmic.loc[:, 'is_track_rythmic'] = False
    
    else:
        # Add column is_track_rythmic set to True at the rows where prop_rythmic is its max
        df_rythmic.loc[:, 'is_track_rythmic'] = df_rythmic['prop_rythmic'] == df_rythmic['prop_rythmic'].max()
          
    # Concatenate the dataframe
    total_df_rythmic = pd.concat([total_df_rythmic, df_rythmic])
    
    # if i==100: # Test on 100 tracks
    #     break

# Around an hour of computation according to tqdm...

  2%|▏         | 24/1315 [00:00<00:17, 73.58it/s]

3 Doors Down - Be Like That 91 2.5824175824175826


  4%|▍         | 56/1315 [00:00<00:18, 69.88it/s]

311 - Homebrew (2) 73 4.780821917808219
311 - Homebrew (2) 73 1.9452054794520548
311 - Homebrew (2) 73 1.8356164383561644


  6%|▌         | 78/1315 [00:01<00:17, 69.95it/s]

4 Non Blondes - What's Up 69 1.826086956521739
4 Non Blondes - What's Up 69 2.0
5th Prokurator - Turkish Delight 65 1.9692307692307693
5th Prokurator - Turkish Delight 65 1.9384615384615385


  8%|▊         | 99/1315 [00:01<00:17, 68.86it/s]

69ers (The) - From The Heart Of My Bottom 49 1.5918367346938775
69ers (The) - From The Heart Of My Bottom 49 1.7551020408163265
69ers (The) - Go Aways From Me 74 1.837837837837838
69ers (The) - Go Aways From Me 74 1.6756756756756757
69ers (The) - Go Aways From Me 74 1.8108108108108107
69ers (The) - Pass The Past 96 1.2708333333333333
69ers (The) - Pass The Past 96 1.8958333333333333
69ers (The) - Solo bass 41 1.5121951219512195
69ers (The) - Way to nirvana 48 1.9166666666666667
69ers (The) - Way to nirvana 48 1.6666666666666667


 16%|█▌        | 204/1315 [00:03<00:17, 64.80it/s]

ACDC - Hard As A Rock (2) 125 1.744


 19%|█▊        | 246/1315 [00:03<00:17, 62.15it/s]

ACDC - You Shook Me All Night Long 110 2.881818181818182
ACDC - You Shook Me All Night Long 110 1.8545454545454545
ACDC - You Shook Me All Night Long 110 2.0
Accept - Fast As A Shark (2) 121 1.7024793388429753
Accept - Fast As A Shark (2) 121 1.9504132231404958
Accept - Fast As A Shark (2) 121 1.6528925619834711


 20%|█▉        | 260/1315 [00:03<00:17, 60.61it/s]

Accept - Head Over Heels 101 1.8415841584158417
Accept - Head Over Heels 101 2.9702970297029703
Accept - Head Over Heels 101 1.5742574257425743
Accept - I Don't Wanna Be Like You (2) 124 1.9193548387096775
Accept - I Don't Wanna Be Like You (2) 124 1.9193548387096775
Accept - I Don't Wanna Be Like You (2) 124 2.0
Accept - Love Child 146 1.8767123287671232
Accept - Love Child 146 2.0
Accept - Love Child 146 1.6027397260273972


 21%|██        | 273/1315 [00:04<00:17, 58.60it/s]

Accept - Shake Your Heads 92 1.7826086956521738
Accept - Shake Your Heads 92 1.826086956521739
Accept - Shake Your Heads 92 1.4565217391304348
Accept - Shake Your Heads 92 1.4565217391304348


 27%|██▋       | 361/1315 [00:05<00:17, 55.69it/s]

Aerosmith - Mama Kin 169 1.2544378698224852
Aerosmith - Mama Kin 169 1.7396449704142012
Aerosmith - Mama Kin 169 1.5384615384615385


 30%|███       | 397/1315 [00:06<00:16, 55.11it/s]

AFI - Total Immortal 70 1.6571428571428573
AFI - Total Immortal 70 3.4285714285714284
AFI - Total Immortal 70 1.7714285714285714


 31%|███       | 409/1315 [00:06<00:16, 55.43it/s]

Agnostic Front - Crucified 55 1.6727272727272726
Agnostic Front - Crucified 55 2.0


 37%|███▋      | 481/1315 [00:07<00:15, 54.11it/s]

Alice In Chains - Bleed The Freak 83 1.9759036144578312
Alice In Chains - Frogs 60 1.9333333333333333
Alice In Chains - Godsmack 46 2.108695652173913
Alice In Chains - Got Me Wrong (Unplugged) 56 1.8928571428571428
Alice In Chains - Got Me Wrong (Unplugged) 56 2.0
Alice In Chains - Got Me Wrong 88 1.0454545454545454


 37%|███▋      | 493/1315 [00:08<00:15, 52.57it/s]

Alice In Chains - I Stay Away 84 1.4285714285714286
Alice In Chains - I Stay Away 84 1.3333333333333333
Alice In Chains - I Stay Away 84 1.2857142857142858
Alice In Chains - Junkhead 57 1.087719298245614
Alice In Chains - Junkhead 57 1.9649122807017543
Alice In Chains - No Excuses 123 1.934959349593496
Alice In Chains - No Excuses 123 1.934959349593496


 39%|███▉      | 517/1315 [00:08<00:15, 52.56it/s]

Alice In Chains - Would_ 87 1.9080459770114941
Alien Ant Farm - Smooth Criminal (2) 83 2.0
Alien Ant Farm - Smooth Criminal (2) 83 1.7108433734939759


 41%|████      | 535/1315 [00:08<00:14, 53.57it/s]

Aliustaoglu, Cuneyt - Hard 54 1.7222222222222223
Aliustaoglu, Cuneyt - Hard 54 1.3333333333333333
Aliustaoglu, Cuneyt - Hard 54 1.6851851851851851


 43%|████▎     | 571/1315 [00:09<00:14, 50.27it/s]

Allegro - The Betrayer's Song 113 1.9115044247787611
Allegro - The Betrayer's Song 113 1.2035398230088497


 61%|██████    | 799/1315 [00:14<00:10, 46.93it/s]

Angra - Carry On (4) 203 1.2807881773399015


 66%|██████▌   | 869/1315 [00:15<00:10, 44.04it/s]

Annihilator - Mending 32 2.0


 71%|███████   | 929/1315 [00:17<00:08, 44.05it/s]

Anthrax - Metal Thrashing Mad 129 2.0
Anthrax - Metal Thrashing Mad 129 4.0
Anthrax - Metal Thrashing Mad 129 2.0
Anthrax - Taking The Music Back 109 1.963302752293578
Anthrax - Taking The Music Back 109 1.4128440366972477


 73%|███████▎  | 959/1315 [00:17<00:08, 44.23it/s]

Apocalypse Orchestra Band be Maxime Peltier (The) - Hot like 61 1.9672131147540983
Apocalypse Orchestra Band be Maxime Peltier (The) - Hot like 61 1.2459016393442623
Apocalypse Orchestra Band be Maxime Peltier (The) - Hot like 61 1.9672131147540983
Apulanta - Hiekka 102 4.392156862745098
Apulanta - Hiekka 102 1.7352941176470589
Apulanta - Hiekka 102 1.8137254901960784


 75%|███████▌  | 989/1315 [00:18<00:07, 43.73it/s]

Arch Enemy - Bridge Of Destiny 237 3.0126582278481013
Arch Enemy - Bridge Of Destiny 237 1.0253164556962024


 76%|███████▋  | 1004/1315 [00:18<00:07, 43.43it/s]

Arch Enemy - The Immortal (2) 112 1.1607142857142858
Arch Enemy - The Immortal (2) 112 1.8928571428571428
Arch Enemy - The Immortal (2) 112 2.0
Archive - Again 25 1.92


 85%|████████▍ | 1114/1315 [00:21<00:04, 40.75it/s]

Ash - Burn Baby Burn 131 1.9389312977099236
Ash - Burn Baby Burn 131 1.6030534351145038


 87%|████████▋ | 1139/1315 [00:22<00:04, 40.70it/s]

At The Gates - Forever Blind (2) 180 1.0888888888888888
At The Gates - Forever Blind (2) 180 1.9333333333333333


 87%|████████▋ | 1149/1315 [00:22<00:04, 39.85it/s]

At The Gates - World Of Lies 113 1.5044247787610618
At The Gates - World Of Lies 113 2.0


 89%|████████▉ | 1176/1315 [00:23<00:03, 40.31it/s]

Atkins, Chet - Caravan 66 1.2424242424242424


 91%|█████████ | 1193/1315 [00:23<00:03, 39.25it/s]

attaque 77 - Amigo 137 2.562043795620438


 92%|█████████▏| 1211/1315 [00:24<00:02, 39.50it/s]

Audioslave - Set It Off 66 1.4545454545454546
Audioslave - Set It Off 66 1.878787878787879


 95%|█████████▍| 1244/1315 [00:24<00:01, 38.36it/s]

Axel Rudi Pell, Axel - Fool Fool 170 1.8705882352941177
Axel Rudi Pell, Axel - Fool Fool 170 1.5176470588235293
Axel Rudi Pell, Axel - Fool Fool 170 1.5176470588235293


 97%|█████████▋| 1276/1315 [00:25<00:01, 37.97it/s]

Lavigne, Avril - Things I'll Never Say 91 1.4725274725274726


100%|██████████| 1315/1315 [00:26<00:00, 49.17it/s]


In [68]:
total_df_rythmic_track_level = total_df_rythmic.copy()
total_df_rythmic_track_level = total_df_rythmic_track_level.drop_duplicates(subset=['Track_Name', 'File_Name'])
total_df_rythmic_track_level.head(20)

Unnamed: 0,Fichier,Track_Name,Partie,Instrument,Mesure,DadaGP_name,File_Name,rg-estimation,is_rythmic,prop_rythmic,is_track_rythmic
255,../data/DadaGP8-gpif/12 Stones - Crash-Bass.gpif,Crash\n12 Stones,Bass,bass,9,bass,12 Stones - Crash,0.9537677,False,0.674074,False
634,../data/DadaGP8-gpif/1349 - Deathmarch-Guitar....,Deathmarch\n1349,Guitar,distorted,1,distorted0,1349 - Deathmarch,1.147128e-06,True,1.0,True
742,../data/DadaGP8-gpif/1349 - Riders Of The Apoc...,Riders Of The Apocalypse\n1349,Bass,bass,3,bass,1349 - Riders Of The Apocalypse,1.927962e-11,True,0.973822,False
1115,"../data/DadaGP8-gpif/1974 AD, Pahilo Junema - ...",Jungi Nishan\nSajal Pradhan,Bass,bass,2,bass,"1974 AD, Pahilo Junema - Jungi Nishan",2.114279e-08,True,0.93617,False
1389,"../data/DadaGP8-gpif/1974 ad, pahilo junema - ...",Manta Mero Nepali\n1974 AD,Track 1,distorted,1,distorted0,"1974 ad, pahilo junema - Manta Mero Nepali",2.982967e-11,True,0.75,True
1205,"../data/DadaGP8-gpif/1974 AD, Pahilo Junema - ...",Sisire jhari\nAdrian pradhan,Track 1,clean,1,clean0,"1974 AD, Pahilo Junema - Sisire Jhai",2.234772e-07,True,1.0,True
1445,../data/DadaGP8-gpif/1Kempfi1 - 4 Seasons- 4....,4 Seasons\nKempfi,4,clean,1,clean0,1Kempfi1 - 4 Seasons,3.403829e-07,True,1.0,True
1761,../data/DadaGP8-gpif/1Kempfi1 - Short & Stupid...,Short & Stupid\nMCA,Spur 1,distorted,1,distorted0,1Kempfi1 - Short & Stupid,5.604797e-05,True,0.84,False
1895,../data/DadaGP8-gpif/1Kempfi1 - Sunrise-Spur 1...,Sunrise\nKempfi,Spur 1,bass,1,bass,1Kempfi1 - Sunrise,1.273318e-12,True,0.989796,True
2112,../data/DadaGP8-gpif/1st Man For Her - A Peach...,A Peach Fantasy\nBernard Gervais,Bass,bass,19,bass,1st Man For Her - A Peach Fantasy,4.222868e-05,True,0.527273,True
