In [1]:
#Importing the libraries
import json

import numpy as np
import pandas as pd
import torch as th
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel

In [2]:
# Tokenize and encode the lyrics with the pretrained BERT tokenizer

# Prefer the GPU when one is available, otherwise fall back to the CPU
device = th.device('cuda' if th.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Pretrained tokenizer and encoder (the encoder is placed on `device`)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model = model.to(device)

# Read the dataset and pull the cleaned lyrics out as a plain Python list
path=r"C:\Users\JOSDA\Desktop\Autonomous systems\Second semester\NLP\Project\GIT\NLP_project\Data_set\best_200songs_per25artist.csv"
data = pd.read_csv(path)
lyrics = data['clean_lyrics'].tolist()

# For every lyric keep both its word-piece tokens and the matching token IDs
tokenized_lyrics = []
input_ids = []
for text in lyrics:
    tokens = tokenizer.tokenize(text)
    tokenized_lyrics.append(tokens)
    input_ids.append(tokenizer.convert_tokens_to_ids(tokens))
data['tokenized_lyrics'] = tokenized_lyrics

# Encode the whole corpus at once (padded and truncated) as PyTorch tensors
batch_encoding = tokenizer(lyrics, padding=True, truncation=True, return_tensors='pt')

# Move every encoded tensor onto the selected device
encoded_inputs = {}
for name, tensor in batch_encoding.items():
    encoded_inputs[name] = tensor.to(device)

# Store the input IDs in the DataFrame (copied back to the CPU so they can be
# converted to plain Python lists)
data['encoded_inputs'] = encoded_inputs['input_ids'].cpu().tolist()

# Write the updated DataFrame back to the same CSV file
data.to_csv(path, index=False)


Using device: cuda


In [3]:
batch_size = 16  # Tune to the available GPU memory
# Ceiling division: how many batches are needed to cover every lyric
num_batches = (len(lyrics) + batch_size - 1) // batch_size

cls_chunks = []

# Run the encoder batch by batch; gradients are not needed for feature extraction
with th.no_grad():
    for batch_no in range(num_batches):
        lo = batch_no * batch_size
        hi = lo + batch_size  # slicing clamps at the end of the tensor
        chunk = {name: tensor[lo:hi] for name, tensor in encoded_inputs.items()}

        hidden = model(**chunk).last_hidden_state
        cls_chunks.append(hidden[:, 0, :].cpu())  # position 0 is the [CLS] token; keep the result on the CPU

        # Release cached GPU memory between batches
        th.cuda.empty_cache()

# Concatenate the per-batch results into a single tensor
all_embeddings = th.cat(cls_chunks, dim=0)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [4]:
# Attach the embeddings to the DataFrame and persist it

# One plain Python list of floats per row
embedding_rows = all_embeddings.tolist()
data['embeddings'] = embedding_rows

# Note: a CSV file stores each list as its text representation
data.to_csv(path, index=False)

In [5]:
# Create integer class labels from the artist names and store them in the DataFrame


# Convert the in-memory embeddings column to a tensor
embeddings = th.tensor(data['embeddings'].tolist())

# Build the artist -> index mapping from the *sorted* unique names.
# Iterating a bare set of strings has no stable order between Python sessions
# (string hashing is randomized per process), so without sorting the same
# artist could receive a different label id after every kernel restart.
# key=str keeps the sort well-defined even if a non-string value slipped in.
artist_labels = data['artist'].tolist()
unique_artists = sorted(set(artist_labels), key=str)
artist_to_index = {artist: idx for idx, artist in enumerate(unique_artists)}
labels = th.tensor([artist_to_index[artist] for artist in artist_labels])

# Save the labels in the DataFrame and write it back to disk
data['labels'] = labels.tolist()
data.to_csv(path, index=False)


In [10]:
# Split the data into training and testing sets (for new trainings, start from here)
# NOTE: `path` must already be defined; it is assigned in the data-loading cell.

# Load the data
data = pd.read_csv(path)


def _parse_embedding(cell):
    """Return one embedding as a list of floats.

    A CSV file stores each embedding list as its text form
    (e.g. "[0.1, -0.2, ...]"), so after `pd.read_csv` the column holds strings.
    String cells are decoded with `json.loads`; cells that are already
    sequences are returned unchanged.
    """
    return json.loads(cell) if isinstance(cell, str) else cell


# Convert the embeddings to a tensor (decoding the CSV text form first)
embeddings = th.tensor([_parse_embedding(cell) for cell in data['embeddings']])

# Convert the labels to a tensor
labels = th.tensor(data['labels'].tolist())


# Split into training and test sets, keeping the label proportions in both
train_embeddings, test_embeddings, train_labels, test_labels = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)

# Check the shape of each split
print("Train embeddings shape:", train_embeddings.shape)
print("Test embeddings shape:", test_embeddings.shape)
print("Train labels shape:", train_labels.shape)
print("Test labels shape:", test_labels.shape)

Train embeddings shape: torch.Size([2832, 768])
Test embeddings shape: torch.Size([708, 768])
Train labels shape: torch.Size([2832])
Test labels shape: torch.Size([708])
