In [1]:
from aux_fun import get_tokens_inst_iter_folders
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
import guitarpro as pygp
import glob

# Relative locations of the DadaGP v1.1 folder (presumably the input corpus)
# and of the BGTG v0.1 folder (presumably where results are written --
# TODO confirm against the cells that use these names).
#
# Raw strings are used on purpose: in a normal string literal `\.`, `\d`, `\D`
# and `\B` are invalid escape sequences, which Python reports with a warning
# and plans to reject outright. The resulting values are byte-identical to
# what the non-raw literals produced.
# NOTE(review): backslash separators only resolve on Windows.
path_to_general_read_folder = r"..\..\data\DadaGP\DadaGP-v1.1"
path_to_general_write_folder = r"..\..\data\BGTG\BGTG-v0.1"

#### Assigning the rhythmic guitar to each track

In [7]:
# Load the per-measure table that carries the `rg-estimation` score used below
# to find out which instruments are generally rhythmic.
df_rg_2 = pd.read_csv('../../data/df_rg_2.csv')

# Preview the first rows, then report the frame's shape together with the
# number of distinct files it covers.
display(df_rg_2.head())
(df_rg_2.shape, df_rg_2['File_Name'].nunique())

Unnamed: 0,Fichier,Dadagp_Path,File_Name,Track_Name,Partie,Instrument,Mesure,rg-estimation,Dadagp_Name
0,../data/DadaGP8-gpif/ - Isolated.gpif,..\..\data\DadaGP\DadaGP-v1.1\U\Unknown\ - Iso...,- Isolated,Isolated\nDr. Sin,E. Ardanuy,distorted,1,2.570728e-07,distorted0
1,../data/DadaGP8-gpif/ - Isolated.gpif,..\..\data\DadaGP\DadaGP-v1.1\U\Unknown\ - Iso...,- Isolated,Isolated\nDr. Sin,E. Ardanuy,distorted,2,4.580104e-07,distorted0
2,../data/DadaGP8-gpif/ - Isolated.gpif,..\..\data\DadaGP\DadaGP-v1.1\U\Unknown\ - Iso...,- Isolated,Isolated\nDr. Sin,E. Ardanuy,distorted,3,6.601997e-06,distorted0
3,../data/DadaGP8-gpif/ - Isolated.gpif,..\..\data\DadaGP\DadaGP-v1.1\U\Unknown\ - Iso...,- Isolated,Isolated\nDr. Sin,E. Ardanuy,distorted,4,0.0004111985,distorted0
4,../data/DadaGP8-gpif/ - Isolated.gpif,..\..\data\DadaGP\DadaGP-v1.1\U\Unknown\ - Iso...,- Isolated,Isolated\nDr. Sin,E. Ardanuy,distorted,5,4.352387e-05,distorted0


((4026353, 9), 23123)

In [8]:
# Goal of this cell: flag, measure by measure, whether an instrument is rhythmic.

# Summary statistics of the `rg-estimation` column.
# display() is required: a notebook only renders a bare expression when it is
# the last statement of the cell, so without it this summary was silently
# discarded.
display(df_rg_2['rg-estimation'].describe())

# Alternative considered: loop over the tracks and, within each track, over the
# measures, comparing the instruments' rg-estimation measure by measure.
# That would likely be very slow, so a single global cut-off is applied instead:
# rg-estimation < thr  =>  the instrument is rhythmic on that measure.

thr = 0.5  # General decision threshold used in the paper to decide whether an instrument is rhythmic
# When rg-estimation is near 1 the instrument is lead; near 0 it is rhythmic.
# Note: a missing (NaN) rg-estimation compares as False, i.e. "not rhythmic".
# The column name keeps its historical spelling ('is_rythmic') because later
# cells read it under that name.
df_rg_2.loc[:, 'is_rythmic'] = df_rg_2['rg-estimation'] < thr

# Number of rhythmic vs. non-rhythmic measures.
display(df_rg_2['is_rythmic'].value_counts())


is_rythmic
True     3379154
False     647199
Name: count, dtype: int64

In [10]:
# Build `total_df`: a copy of `df_rg_2` enriched with two columns computed
# file by file:
#   - prop_rythmic:     number of rhythmic measures of the instrument divided by
#                       the highest measure number found in the file;
#   - is_track_rythmic: True on the rows of the instrument(s) reaching the
#                       highest prop_rythmic of the file (ties are all kept).
#
# The per-file frames are collected in a list and concatenated once at the end,
# which avoids the quadratic cost of growing a DataFrame inside the loop.
dataframes = []

# One groupby pass over 'File_Name' instead of re-filtering the full frame for
# every unique file name.
for track, df_track in tqdm(df_rg_2.groupby('File_Name')):
    # Denominator is the file's highest measure number, not the number of
    # measures the instrument actually plays: an instrument that only plays a
    # few measures therefore cannot reach a high proportion.
    max_nb_measures = df_track['Mesure'].max()

    # Proportion of rhythmic measures for every instrument of the file.
    df_instrument_props = (
        df_track.groupby('Dadagp_Name')['is_rythmic'].sum() / max_nb_measures
    ).reset_index(name='prop_rythmic')

    # Bass must never be selected, so its proportion is forced to 0.
    is_bass_prop = df_instrument_props['Dadagp_Name'] == 'bass'
    df_instrument_props.loc[is_bass_prop, 'prop_rythmic'] = 0

    # Broadcast the per-instrument proportion back onto every measure row.
    df_track = df_track.merge(
        df_instrument_props, on='Dadagp_Name', how='left'
    )

    if df_track['prop_rythmic'].notnull().any():
        max_prop = df_track['prop_rythmic'].max()
        # Fix: bass rows are excluded explicitly. Forcing their proportion to 0
        # is not enough, because when the file's maximum is itself 0 (bass-only
        # file, or no rhythmic measure at all) the equality test below would
        # flag the bass as the rhythmic track.
        # NOTE(review): non-bass instruments tied at a maximum of 0 are still
        # flagged True, as before -- confirm this is the wanted behavior.
        df_track['is_track_rythmic'] = (
            (df_track['prop_rythmic'] == max_prop)
            & (df_track['Dadagp_Name'] != 'bass')
        )
    else:
        # No usable proportion in this file: nothing can be flagged.
        df_track['is_track_rythmic'] = False

    dataframes.append(df_track)

# Single concatenation; the per-file indexes produced by merge() are discarded.
total_df = pd.concat(dataframes, ignore_index=True)


100%|██████████| 23123/23123 [04:01<00:00, 95.85it/s] 


In [9]:
# TERRIBLE UNOPTIMIZED VERSION (superseded by the groupby-based cell above; kept commented out for reference only)

# # However, since this is now needed, build a dataframe that holds, for each track and each instrument, its proportion of rhythmic measures

# total_df = pd.DataFrame()
# i=0
# # For each track we add the DadaGP_name column
# for track in tqdm(df_rg_2['File_Name'].unique()):
#     i=i+1
#     df_track = df_rg_2[df_rg_2['File_Name'] == track].copy() # We copy to avoid SettingWithCopyWarning
#     max_nb_measures = df_track['Mesure'].max()
#     # Maybe a count should be used instead of the max, because an instrument may, for instance, start at measure 10 and end at measure 20.
#     # But using the max avoids the case where an instrument plays only a few measures and gets a high proportion.
    
#     for instrument in df_track['Dadagp_Name'].unique():
#         df_inst = df_track[df_track['Dadagp_Name'] == instrument]
        
#         if instrument == 'bass':
#             df_track.loc[df_track['Dadagp_Name'] == instrument, 'prop_rythmic'] = 0
#             continue
#             # Bass should never be flagged as rhythmic, as we don't want to extract the bass to condition the generation of the bass...
    
        
#         # Proportion of rythmic measures for the instrument (in the whole track and not only in the measures where the instrument plays)
#         prop_rythmic = df_inst['is_rythmic'].sum() / max_nb_measures
#         if prop_rythmic > 1:
#             print(track, max_nb_measures, prop_rythmic)
#         df_track.loc[df_track['Dadagp_Name'] == instrument, 'prop_rythmic'] = prop_rythmic
        
#     # The instrument with the highest proportion of rythmic measures is the rythmic instrument
#     if df_track['prop_rythmic'].isnull().all(): # No rythmic instrument at all in the track
#         df_track.loc[:, 'is_track_rythmic'] = False
    
#     else:
#         # Add column is_track_rythmic set to True at the rows where prop_rythmic is its max
#         df_track.loc[:, 'is_track_rythmic'] = df_track['prop_rythmic'] == df_track['prop_rythmic'].max()
          
#     # Concatenate the dataframe
#     total_df = pd.concat([total_df, df_track])
    
#     # if i==100: # Test on 100 tracks
#     #     break

# # Around 2 hours of computation according to tqdm...

  0%|          | 54/23123 [00:30<3:38:23,  1.76it/s]


KeyboardInterrupt: 

In [11]:
# Persist the enriched per-measure table; the row index is not written out.
total_df.to_csv(path_or_buf='../../data/df_rg_3.csv', index=False)