## 2.17- Exercícios *Fine-Tuning*


1.   Alterar alguns parâmetros de treinamento, como: batch_size, learning rate e número de épocas. Avaliar o impacto, positivo ou negativo, da alteração desses parâmetros.
2.   Utilizar alguma técnica de balanceamento de *dataset* e avaliar os resultados. Ex.: *Oversampling* e *Undersampling*.

**Importante:**

*   Todas as alterações devem ser registradas no Wandb para que seja possível realizar comparações entre os experimentos.

### Bibliotecas

In [None]:
%pip install transformers
%pip install datasets
%pip install torch
%pip install evaluate
%pip install wandb

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
from datasets import load_dataset, DatasetDict
import evaluate
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import wandb

## Constants

In [None]:
# Model / dataset identifiers and tokenization settings.
MODEL_ID = "adalbertojunior/distilbert-portuguese-cased"
DATASET_ID = "verissimomanoel/olist_customers_review"
MAX_LENGTH = 512
NUM_LABELS = 3

# Output locations.
RESULTS_PATH = "./results"
PRETRAINED_PATH = "./sentiment-analysis-bert-portuguese"

# Weights & Biases project and run name.
WANDB_PROJECT = "aula2_finetuning"
WANDB_NAME = "experimentos_balanceamento"

# The google.colab package is only importable inside Colab, so a successful
# import is used as the environment check.
try:
    import google.colab
except ImportError:
    IS_GOOGLE_COLAB = False
else:
    IS_GOOGLE_COLAB = True
    print("Running in Google Colab")

# Row caps applied to each split when running in Colab.
GOOGLE_COLAB_TRAIN_SIZE = 5000
GOOGLE_COLAB_TEST_SIZE = 1000
GOOGLE_COLAB_VAL_SIZE = 800

### Start Wandb

In [None]:
# Authenticate with Weights & Biases and start a run named WANDB_NAME
# inside the WANDB_PROJECT project.
wandb.login()
wandb.init(project=WANDB_PROJECT, name=WANDB_NAME)

### Carregando o Dataset

In [None]:
# Download the dataset and hold out 20% of its training split for validation.
dataset = load_dataset(DATASET_ID, trust_remote_code=True)
ds_train_split = dataset["train"].train_test_split(test_size=0.2)

# Bind the three working splits first, then regroup them under one DatasetDict.
train_dataset = ds_train_split["train"]
test_dataset = dataset["test"]
val_dataset = ds_train_split["test"]

dataset = DatasetDict({
    "train": train_dataset,
    "test": test_dataset,
    "val": val_dataset,
})

print(dataset)

### Mostra informações do dataset

In [None]:
def show_info_dataset(dataset, title):
    """Plot a bar chart with the number of examples per class.

    Args:
        dataset: Object exposing ``to_pandas()``; the resulting frame must
            contain a ``label`` column.
        title: Title shown above the chart.
    """
    df = dataset.to_pandas()

    # value_counts() orders by frequency, which differs between splits. Sorting
    # by class id keeps every class in the same position across charts.
    label_counts = df['label'].value_counts().sort_index()
    label_names = {0: 'Negativo', 1: 'Positivo', 2: 'Neutro'}
    # Colours are tied to the class id (not to the bar position) so a class
    # always has the same colour.
    label_colors = {0: 'red', 1: 'green', 2: 'blue'}

    # Unknown class ids fall back to their raw value / a neutral colour instead
    # of raising KeyError.
    labels = [label_names.get(label, str(label)) for label in label_counts.index]
    colors = [label_colors.get(label, 'gray') for label in label_counts.index]

    plt.figure(figsize=(10, 6))
    bars = plt.bar(labels, label_counts, color=colors)

    # Write the count above each bar, centred on the bar.
    for bar in bars:
        yval = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2, yval, int(yval), ha='center', va='bottom')

    plt.title(title)
    plt.xlabel('Label')
    plt.ylabel('Total')
    plt.show()

# One class-distribution chart per split, in train / validation / test order.
for _split, _title in (
    (train_dataset, 'Distribuição por Classe - Treino'),
    (val_dataset, 'Distribuição por Classe - Validação'),
    (test_dataset, 'Distribuição por Classe - Teste'),
):
    show_info_dataset(_split, _title)

### Preparação do Dataset

#### Tokenização

In [None]:
tokenizer = BertTokenizer.from_pretrained(MODEL_ID)
# Pads each batch to the length of its longest sequence at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    """Tokenize the ``text`` column, truncating to at most MAX_LENGTH tokens.

    Padding is deliberately left to ``data_collator``: padding every example
    to MAX_LENGTH here would make the collator's dynamic padding a no-op and
    force the model to process MAX_LENGTH tokens for every row.

    Args:
        examples: Batch mapping that contains a ``text`` entry.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=MAX_LENGTH)


# Shuffle every split so the optional subsampling below takes a random subset.
train_dataset = train_dataset.shuffle()
test_dataset = test_dataset.shuffle()
val_dataset = val_dataset.shuffle()

if IS_GOOGLE_COLAB:
    # Cap each split for Colab. min() avoids requesting more rows than a split has.
    train_dataset = train_dataset.select(range(min(GOOGLE_COLAB_TRAIN_SIZE, len(train_dataset))))
    test_dataset = test_dataset.select(range(min(GOOGLE_COLAB_TEST_SIZE, len(test_dataset))))
    # The validation subset must come from val_dataset itself. Taking it from
    # test_dataset would make validation and test share rows.
    val_dataset = val_dataset.select(range(min(GOOGLE_COLAB_VAL_SIZE, len(val_dataset))))

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

#### Formatação

In [None]:
# Return only the listed columns, as torch tensors, from every split.
for _split in (train_dataset, test_dataset, val_dataset):
    _split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

### Carregando o modelo

In [None]:
# Load the MODEL_ID checkpoint as a sequence classifier with NUM_LABELS outputs.
# NOTE(review): MODEL_ID names a DistilBERT checkpoint while the Bert* classes
# are used here — confirm the checkpoint's architecture matches.
model = BertForSequenceClassification.from_pretrained(MODEL_ID, num_labels=NUM_LABELS)

### Definição de métricas

In [None]:
# Metric objects are loaded once and reused on every evaluation call.
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax over the last axis of ``logits``.

    Returns:
        Dict with the keys ``accuracy`` and ``f1``.
    """
    logits, references = eval_pred
    predictions = np.argmax(logits, axis=-1)

    accuracy_result = accuracy.compute(predictions=predictions, references=references)
    f1_result = f1.compute(predictions=predictions, references=references, average="weighted")

    return {
        "accuracy": accuracy_result["accuracy"],
        "f1": f1_result["f1"],
    }

### Definição de parâmetros

In [None]:
# Hyperparameter grid: one (batch_size, epochs, learning_rate) row per run.
train_params = [
    {"batch_size": batch_size, "epochs": epochs, "learning_rate": learning_rate}
    for batch_size, epochs, learning_rate in (
        (28, 3, 3e-5),
        (50, 3, 3e-5),
        (28, 5, 3e-5),
        (28, 3, 5e-5),
        (50, 5, 5e-5),
    )
]

### Definição do treino

In [None]:
def train(dataset, prefix: str = ""):
    """Fine-tune one freshly loaded model per entry of ``train_params``.

    Every configuration starts from the pretrained MODEL_ID checkpoint, so the
    metrics of different configurations (and of different ``train`` calls) are
    independent of each other and can be compared.

    Args:
        dataset: Training split passed to ``Trainer`` as ``train_dataset``.
        prefix: Tag inserted in the output directory and run name of each run.

    Returns:
        A list with one dict per configuration containing the ``trainer``, the
        ``params`` used and the ``metrics`` returned by ``trainer.evaluate()``
        on ``val_dataset``.
    """
    # NOTE(review): a W&B run is already opened at notebook start; the
    # per-configuration `run_name` below may therefore not produce separate
    # W&B runs — confirm in the W&B UI.
    # NOTE(review): each result keeps its trainer (and model) alive; watch
    # accelerator memory when many configurations are trained.
    results = []

    for index, params in enumerate(train_params):
        run_dir = f"{RESULTS_PATH}/run_{prefix}_{index:02d}_{params['batch_size']}_ep{params['epochs']}_lr{params['learning_rate']}_wloss"

        # Load a new model for this configuration. Reusing one model object
        # across runs would make each run continue from the weights already
        # fine-tuned by the previous runs.
        run_model = BertForSequenceClassification.from_pretrained(MODEL_ID, num_labels=NUM_LABELS)

        training_args = TrainingArguments(
            output_dir=run_dir,
            eval_strategy="epoch",
            learning_rate=params["learning_rate"],
            per_device_train_batch_size=params["batch_size"],
            per_device_eval_batch_size=params["batch_size"],
            num_train_epochs=params["epochs"],
            weight_decay=0.01,
            metric_for_best_model="f1",
            report_to=["wandb"],      # send logs to WandB
            run_name=f"run_{prefix}_{index:02d}")

        trainer = Trainer(
            model=run_model,
            args=training_args,
            train_dataset=dataset,
            eval_dataset=val_dataset,
            data_collator=data_collator,
            compute_metrics=compute_metrics
        )

        trainer.train()

        # Final evaluation on the validation split after the last epoch.
        metrics = trainer.evaluate()
        trainer.save_metrics(f"eval_{index}", metrics)

        results.append({"trainer": trainer, "params": params, "metrics": metrics})

    return results

### Executa o treino do dataset base

In [None]:
# Train every configuration in train_params on the prepared training split
# and show the collected results.
base_result = train(train_dataset, prefix="base")
display("Resultado base", base_result)

### Definição de como pegar o melhor modelo entre os treinados

In [None]:
def result_to_sorted_pandas(dataset):
    """Flatten training results into a DataFrame ordered by best ``eval_f1``.

    Args:
        dataset: Iterable of result dicts, each with a ``params`` mapping and
            a ``metrics`` mapping (the latter must contain ``eval_f1``).

    Returns:
        DataFrame with one row per result (params columns first, then metrics),
        sorted by ``eval_f1`` in descending order.
    """
    rows = []
    for entry in dataset:
        # Params first, then metrics; a metric overrides a param of the same name.
        row = dict(entry["params"])
        row.update(entry["metrics"])
        rows.append(row)

    return pd.DataFrame(rows).sort_values(by="eval_f1", ascending=False)

def print_best_result(df: "pd.Series | pd.DataFrame", name: str):
    """Print the hyperparameters of the best run for the dataset ``name``.

    Args:
        df: The best result as a row (Series) with ``batch_size``, ``epochs``
            and ``learning_rate`` entries. A DataFrame is also accepted, in
            which case its first row is used.
        name: Label of the dataset the result belongs to.
    """
    best = df.iloc[0] if isinstance(df, pd.DataFrame) else df

    # A row extracted from a mixed int/float frame is upcast to float; cast the
    # integer hyperparameters back so they print as 28 rather than 28.0.
    batch_size = int(best["batch_size"])
    epochs = int(best["epochs"])

    # Single quotes inside the f-string keep this valid on Python < 3.12.
    print(
        f"Melhor resultado com o dataset {name}: batch_size={batch_size}, "
        f"epochs={epochs}, learning_rate={best['learning_rate']}"
    )

### Pega o melhor modelo treinado no dataset base

In [None]:
# Results come back sorted by eval_f1 (descending), so row 0 is the best run.
df_base_result = result_to_sorted_pandas(base_result)
best_df_base_result = df_base_result.iloc[0]

print_best_result(df=best_df_base_result, name="base")

### Gera oversample e treina o mesmo

In [None]:
def oversample(ds, label_col, seed=None):
    """Balance ``ds`` by duplicating random examples of the smaller classes.

    Every original example is kept. Each class with fewer rows than the
    largest class receives extra rows, drawn with replacement from that class,
    until all classes have the same count. The final order is shuffled.

    Args:
        ds: Dataset supporting ``ds[label_col]`` (all labels) and
            ``ds.select(indices)``.
        label_col: Name of the column holding the class labels.
        seed: Optional seed for the random generator; pass a value to get a
            reproducible result.

    Returns:
        The result of ``ds.select`` on the balanced indices, or ``ds`` itself
        when it has no rows.
    """
    y = np.array(ds[label_col])
    if y.size == 0:
        # Nothing to balance; counts.max() below would fail on an empty array.
        return ds

    rng = np.random.default_rng(seed)
    classes, counts = np.unique(y, return_counts=True)
    max_n = counts.max()

    parts = []
    for cls, count in zip(classes, counts):
        members = np.where(y == cls)[0]
        # Keep every original row of the class...
        parts.append(members)
        # ...and only draw the missing rows, so no original example is lost.
        if count < max_n:
            parts.append(rng.choice(members, size=max_n - count, replace=True))

    idxs = np.concatenate(parts)
    rng.shuffle(idxs)
    return ds.select(idxs)

# Build the oversampled training split and train every configuration on it.
balanced_train = oversample(train_dataset, "label")

balanced_result = train(balanced_train, prefix="balanced")

display("Resultado balanceado", balanced_result)

### Pega o melhor modelo com oversample

In [None]:
# Results come back sorted by eval_f1 (descending), so row 0 is the best run.
df_balanced_result = result_to_sorted_pandas(balanced_result)
best_df_balanced_result = df_balanced_result.iloc[0]
print_best_result(df=best_df_balanced_result, name="balanced")



### Pega o melhor modelo entre todos

In [None]:
# Put the two finalists side by side. `keys` labels each one with the dataset
# it was trained on, so the winner's origin survives the sort.
df_best_results = pd.concat(
    [best_df_base_result, best_df_balanced_result],
    axis=1,
    keys=["base", "balanced"],
).T
df_best_results = df_best_results.sort_values(by="eval_f1", ascending=False)
best_df_best_results = df_best_results.iloc[0]
# The row name is "base" or "balanced"; include it in the report.
print_best_result(df=best_df_best_results, name=f"melhor de todos ({best_df_best_results.name})")

### Finaliza o wandb

In [None]:
# Close the active Weights & Biases run.
wandb.finish()