### Loading vocab dictionary (mapping tokens to an id)

In this notebook we perform the conversion of the txt tokens to a 2D file containing for each song the sequence of ids of its tokens.
For a song to be added to the dataset, it needs to contain bass guitar and a rhythmic guitar.

In [64]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import guitarpro as pygp
import pathlib
import pickle


In [65]:
event2word = pickle.load(open("vocab_song_artist.pkl", 'rb')) # fulldataset non-splitted
vocab_size = len(event2word)
print("event2word size: ", vocab_size)

event2word size:  3020


In [66]:

path_to_rg_folder = pathlib.Path("..\..\data\BGTG\BGTG_RG")
path_to_bass_folder = pathlib.Path("..\..\data\BGTG\BGTG_Bass")
# Iterate over all the alphabetical and group folders within each folders 
# For the first implementation we will assume that if a song has bass it also has RG.

In [67]:
# Loop on the files of the bass folder, for each file check if there is a corresponding file in the RG folder
# Then generate the bass sequence and the RG sequence 
path_errors=0
path_errors_list=[]
big_id_file = []
for bass_file_path in tqdm(path_to_bass_folder.rglob("*.txt"), total=14480, desc="Generating sequences of ids"):
    # Replace _bass with _rythmic and BGTG_Bass by BGTG_RG to get the corresponding RG file
    rg_file_path = pathlib.Path((str(bass_file_path).replace("_bass.txt", "_rythmic.txt")).replace("BGTG_Bass", "BGTG_RG"))
    song_name = bass_file_path.stem.split("_")[0]
    
    if rg_file_path.exists():
        bass_sequence = []
        rg_sequence = []
        
        # Open the bass file
        with open(bass_file_path, 'r') as bass_file:
            bass_lines = bass_file.readlines()
            for line in bass_lines:
                if line.strip() in event2word:
                    bass_sequence.append(event2word[line.strip()])
                else:
                    # Add it to the vocab
                    event2word[line.strip()] = len(event2word)

            
        with open(rg_file_path, 'r') as rg_file:
            rg_lines = rg_file.readlines()
            for line in rg_lines:
                if line.strip() in event2word:
                    rg_sequence.append(event2word[line.strip()])
                else:
                    # Add it to the vocab
                    event2word[line.strip()] = len(event2word)
        
        big_id_file.append((song_name, bass_sequence, rg_sequence))
        
        
    
    else:
        path_errors+=1
        path_errors_list.append((song_name, bass_file_path, rg_file_path))

vocab_errors = len(event2word) - vocab_size
print("Path errors: ", path_errors)
print("We added", vocab_errors, "new words to the vocab", "Total vocab size: ", len(event2word))

Generating sequences of ids: 100%|██████████| 14480/14480 [00:28<00:00, 515.65it/s]

Path errors:  724
We added 168 new words to the vocab Total vocab size:  3188





In [68]:
# After looking it up, the vocab errors are due to new wait tokens that come from the sum of consecutive wait tokens.

pd_dataset = pd.DataFrame(big_id_file, columns=["Song_Name", "Decoder_Bass", "Encoder_RG"])

In [69]:
# Truncate the sequences to 597 tokens for the decoder and 545 tokens for the encoder
# If the sequence is shorter than the truncation length, remove it
truncation = True
encoder_truncation = 545
decoder_truncation = 597

def truncate_sequence(sequence, max_length):
    seq_length = len(sequence)
    if seq_length < max_length:
        # Remove the sequence
        return None
    sequence = sequence[:max_length]
    return sequence

if truncation:
    pd_dataset["Decoder_Bass"] = pd_dataset["Decoder_Bass"].apply(lambda x: truncate_sequence(x, decoder_truncation))
    pd_dataset["Encoder_RG"] = pd_dataset["Encoder_RG"].apply(lambda x: truncate_sequence(x, encoder_truncation))
    
    # Remove the rows with None values
    filtered_pd_dataset = pd_dataset.dropna()
    
    print("We removed", pd_dataset.shape[0] - filtered_pd_dataset.shape[0], "sequences")


We removed 3118 sequences


In [70]:
filtered_pd_dataset.to_csv("..\..\data\BGTG_dataset.csv", index=False)

In [71]:
n_encoder = 545
n_decoder = 597
n = filtered_pd_dataset.shape[0]

encoder_input = np.zeros((n, n_encoder), dtype=int)
decoder_input = np.zeros((n, n_decoder), dtype=int)

filtered_pd_dataset = filtered_pd_dataset.reset_index(drop=True)

for i in tqdm(range(n)):
    for j in range(n_encoder):
        encoder_input[i][j] = filtered_pd_dataset['Encoder_RG'][i][j]
    
    for j in range(n_decoder):
        decoder_input[i][j] = filtered_pd_dataset['Decoder_Bass'][i][j]
        
encoder_input.shape, decoder_input.shape

  0%|          | 0/10638 [00:00<?, ?it/s]

100%|██████████| 10638/10638 [00:42<00:00, 249.69it/s]


((10638, 545), (10638, 597))

In [72]:
# Save the numpy arrays
np.save("..\..\data\encoder_input.npy", encoder_input)
np.save("..\..\data\decoder_input.npy", decoder_input)