In [11]:
import re
import numpy as np

In [12]:
import pandas as pd
df = pd.read_csv(r"C:\Users\nunez\Facultad\Factor Data\Proyecto NLP canciones\Datasets Scrapeados\Rock_Nacional.csv")

In [13]:
## Cleansing
df.dropna(subset="Lyrics",inplace=True)
def clean(text):
    cleaned_text = re.sub(r"^.*?Lyrics", "", text)
    cleaned_text = re.sub(r'\[.*?\]', '', cleaned_text)
    cleaned_text = re.sub(r'\([^)]*\)', '', cleaned_text)
    cleaned_text = re.sub(r'\\n', '', cleaned_text)
    cleaned_text = re.sub(r'\n', ' ', cleaned_text)
    cleaned_text = re.sub(r'Embed', '', cleaned_text)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    cleaned_text = re.sub(r'you might also like', '', cleaned_text)
    cleaned_text = re.sub(r'you', '', cleaned_text)
    cleaned_text = re.sub(r'might', '', cleaned_text)
    cleaned_text = re.sub(r'also', '', cleaned_text)
    cleaned_text = re.sub(r'like', '', cleaned_text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text

df["Lyrics"] = df["Lyrics"].astype(str).apply(clean)
df["Artist_genres"] = df["Artist_genres"].replace("[]",np.nan)
df.dropna(subset="Artist_genres",inplace=True)
df = df.drop_duplicates(subset=["Lyrics","Artist_ID","Track"])
df["Lyrics"]

0          la espera me agotó no sé nada de vos dejast...
1          hoy te busqué en la rima que duerme con tod...
2          suspiraban lo mismo los dos y hoy son parte...
3          cuando no hay más que decirnos habla el hum...
4          un lago en el cielo quiero ser suave para e...
                              ...                        
3994    do  remember when in winter we reached the isl...
3995    ohhhh na na na na na na nananananana eiiieee c...
3996     lyrics te convertí en una reina pa cualquier ...
3997       yeah yeah yeah vine a romper un beat que pu...
3998       me gustas tú me gusta la lluvia me gustas t...
Name: Lyrics, Length: 2569, dtype: object

In [14]:
import pickle
from sklearn.model_selection import train_test_split

# Divido y guardo en train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


with open(r'C:\Users\nunez\Facultad\Factor Data\Proyecto NLP canciones\pkl dataframe\train_rock.pkl', 'wb') as file:
    pickle.dump(train_df, file)

with open(r'C:\Users\nunez\Facultad\Factor Data\Proyecto NLP canciones\pkl dataframe\test_rock.pkl', 'wb') as file:
    pickle.dump(test_df, file)

## Transformers

In [15]:
# Cargo el modelo general a usar
from transformers import AutoModelForMaskedLM

model_checkpoint = "dccuchile/distilbert-base-spanish-uncased"
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [16]:
# Cargo el Tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Veamos q tan argentinizado está el modelo

import torch

text = "Argentina [MASK]"

inputs = tokenizer(text, return_tensors="pt")
token_logits = model(**inputs).logits
# Find the location of [MASK] and extract its logits
mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Pick the [MASK] candidates with the highest logits
top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

for token in top_5_tokens:
    print(f"'>>> {text.replace(tokenizer.mask_token, tokenizer.decode([token]))}'")

In [17]:
#Cargo mi dataset
from datasets import load_dataset
data_files = {"train":
                  r'C:\Users\nunez\Facultad\Factor Data\Proyecto NLP canciones\pkl dataframe\train_rock.pkl',
              "test":
                  r'C:\Users\nunez\Facultad\Factor Data\Proyecto NLP canciones\pkl dataframe\test_rock.pkl'}

rock = load_dataset("pandas", data_files=data_files)
rock

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'Artist', 'Artist_ID', 'Artist_genres', 'Artist_popularity', 'Track', 'Track_ID', 'Track_release_date', 'Track_popularity', 'Lyrics', 'features_dict', '__index_level_0__'],
        num_rows: 2055
    })
    test: Dataset({
        features: ['Unnamed: 0', 'Artist', 'Artist_ID', 'Artist_genres', 'Artist_popularity', 'Track', 'Track_ID', 'Track_release_date', 'Track_popularity', 'Lyrics', 'features_dict', '__index_level_0__'],
        num_rows: 514
    })
})

In [18]:
def tokenize_function(examples):
    result = tokenizer(examples['Lyrics'])
    if tokenizer.is_fast:
        result["word_ids"] = [result.word_ids(i) for i in range(len(result["input_ids"]))]
    return result


# Use batched=True to activate fast multithreading!
tokenized_datasets = rock.map(
    tokenize_function, batched=True, remove_columns= ['Unnamed: 0', 'Artist', 'Artist_ID', 'Artist_genres', 'Artist_popularity', 'Track', 'Track_ID', 'Track_release_date', 'Track_popularity', 'Lyrics', 'features_dict', '__index_level_0__']
)
tokenized_datasets

Map:   0%|          | 0/2055 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (753 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/514 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 2055
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 514
    })
})

# Slicing produces a list of lists for each feature
tokenized_samples = tokenized_datasets["train"][:3]

for idx, sample in enumerate(tokenized_samples["input_ids"]):
    print(f"'>>> Review {idx} length: {len(sample)}'")

concatenated_examples = {
    k: sum(tokenized_samples[k], []) for k in tokenized_samples.keys()
}
total_length = len(concatenated_examples["input_ids"])
print(f"'>>> Concatenated reviews length: {total_length}'")

chunk_size = 128

chunks = {
    k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
    for k, t in concatenated_examples.items()
}

for chunk in chunks["input_ids"]:
    print(f"'>>> Chunk length: {len(chunk)}'")

In [19]:
chunk_size = 128

def group_texts(examples):
    # Concatenate all texts
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    # Compute length of concatenated texts
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the last chunk if it's smaller than chunk_size
    total_length = (total_length // chunk_size) * chunk_size
    # Split by chunks of max_len
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    # Create a new labels column
    result["labels"] = result["input_ids"].copy()
    return result

In [20]:
lm_datasets = tokenized_datasets.map(group_texts, batched=True)
lm_datasets

Map:   0%|          | 0/2055 [00:00<?, ? examples/s]

Map:   0%|          | 0/514 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 40822
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 10468
    })
})

tokenizer.decode(lm_datasets["train"][10]["input_ids"])

tokenizer.decode(lm_datasets["train"][10]["labels"])

In [23]:
import collections
import numpy as np

from transformers import default_data_collator

wwm_probability = 0.2


def whole_word_masking_data_collator(features):
    for feature in features:
        word_ids = feature.pop("word_ids")

        # Create a map between words and corresponding token indices
        mapping = collections.defaultdict(list)
        current_word_index = -1
        current_word = None
        for idx, word_id in enumerate(word_ids):
            if word_id is not None:
                if word_id != current_word:
                    current_word = word_id
                    current_word_index += 1
                mapping[current_word_index].append(idx)

        # Randomly mask words
        mask = np.random.binomial(1, wwm_probability, (len(mapping),))
        input_ids = feature["input_ids"]
        labels = feature["labels"]
        new_labels = [-100] * len(labels)
        for word_id in np.where(mask)[0]:
            word_id = word_id.item()
            for idx in mapping[word_id]:
                new_labels[idx] = labels[idx]
                input_ids[idx] = tokenizer.mask_token_id
        feature["labels"] = new_labels

    return default_data_collator(features)

train_size = 10_000
test_size = int(0.1 * train_size)

downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
downsampled_dataset

In [24]:
from transformers import TrainingArguments

batch_size = 64
# Show the training loss with every epoch
logging_steps = len(lm_datasets["train"]) // batch_size
model_name = model_checkpoint.split("/")[-1]

training_args = TrainingArguments(
    output_dir="LuisAlBERTo",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=True,
    fp16=True,
    logging_steps=logging_steps,
)

In [25]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [26]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [27]:
import math

eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


>>> Perplexity: 484.81


trainer.train()


eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

trainer.push_to_hub()

## Accelerator

In [28]:
def insert_random_mask(batch):
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    masked_inputs = data_collator(features)
    # Create a new "masked" column for each column in the dataset
    return {"masked_" + k: v.numpy() for k, v in masked_inputs.items()}

In [29]:
downsampled_dataset = lm_datasets.remove_columns(["word_ids"])
eval_dataset = downsampled_dataset["test"].map(
    insert_random_mask,
    batched=True,
    remove_columns=downsampled_dataset["test"].column_names,
)
eval_dataset = eval_dataset.rename_columns(
    {
        "masked_input_ids": "input_ids",
        "masked_attention_mask": "attention_mask",
        "masked_labels": "labels",
    }
)

Map:   0%|          | 0/10468 [00:00<?, ? examples/s]

In [30]:
from torch.utils.data import DataLoader
from transformers import default_data_collator

batch_size = 64
train_dataloader = DataLoader(
    downsampled_dataset["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    eval_dataset, batch_size=batch_size, collate_fn=default_data_collator
)

In [31]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

In [32]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [33]:
from transformers import get_scheduler

num_train_epochs = 3
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [34]:
from huggingface_hub import get_full_repo_name

model_name = "distilbert-base-spanish-uncased-finetuned-rock-argentino-factor-data"
repo_name = get_full_repo_name(model_name)
repo_name

'SantiRimedio/distilbert-base-spanish-uncased-finetuned-rock-argentino-factor-data'

In [35]:
from huggingface_hub import Repository

output_dir = model_name
repo = Repository(output_dir, clone_from=repo_name)

C:\Users\nunez\Facultad\Factor Data\distilbert-base-spanish-uncased-finetuned-rock-argentino-factor-data is already a clone of https://huggingface.co/SantiRimedio/distilbert-base-spanish-uncased-finetuned-rock-argentino-factor-data. Make sure you pull the latest changes with `repo.git_pull()`.


In [36]:
from tqdm.auto import tqdm
import torch
import math

progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

    # Evaluation
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)

        loss = outputs.loss
        losses.append(accelerator.gather(loss.repeat(batch_size)))

    losses = torch.cat(losses)
    losses = losses[: len(eval_dataset)]
    try:
        perplexity = math.exp(torch.mean(losses))
    except OverflowError:
        perplexity = float("inf")

    print(f">>> Epoch {epoch}: Perplexity: {perplexity}")

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )

  0%|          | 0/1914 [00:00<?, ?it/s]

KeyboardInterrupt: 