# POS TECH - IA PARA DEVS
### Tech Challenge - Fase 03

**Aluno:** Inacio Ribeiro - RM362328



O objetivo deste Tech Challenge é executar o processo de fine-tuning de um modelo de fundação (BERT) utilizando o dataset "The Amazon Titles-1.3MM".

In [1]:
!pip install torch transformers datasets evaluate scikit-learn -q


In [34]:
import json
import torch
import transformers
import accelerate
from datasets import Dataset
from transformers import BertTokenizer
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    pipeline
)
import torch

### 1. Importando o Dataset

In [3]:
file_path = '/home/trn.json'

MAX_RECORDS = 10000

data = []
with open(file_path, "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        if i >= MAX_RECORDS:
            break
        line = line.strip()
        if not line:
            continue
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError as e:
            continue

records = []
for entry in data:
    title = entry.get("title", "").strip()
    content = entry.get("content", "").strip()

    if not content:
        continue

    question = f"What is the product '{title}'?"
    answer = content
    records.append({"question": question, "answer": answer})

dataset = Dataset.from_list(records)
dataset.to_json("/home/data.json")
print(dataset)
print(dataset[0])




Creating json from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Dataset({
    features: ['question', 'answer'],
    num_rows: 7529
})
{'question': "What is the product 'Girls Ballet Tutu Neon Pink'?", 'answer': 'High quality 3 layer ballet tutu. 12 inches in length'}


In [13]:
import json

input_path = "/home/data.json"
output_path = "/home/data_list.json"

with open(input_path, "r") as infile:
    raw = infile.read()

entries = []
for item in raw.strip().split("}\n{"):
    item = item.strip()
    if not item.startswith("{"):
        item = "{" + item
    if not item.endswith("}"):
        item = item + "}"
    try:
        entries.append(json.loads(item))
    except json.JSONDecodeError as e:
        print(f"Skipping invalid entry: {e}")
        continue


with open(output_path, "w") as outfile:
    json.dump(entries, outfile, indent=2)


print(f"Converted {len(entries)} entries to JSON list at: {output_path}")

dataset = load_dataset("json", data_files="/home/data_list.json", split="train")
print(dataset[0])


Converted 7529 entries to JSON list at: /home/data_list.json


Generating train split: 0 examples [00:00, ? examples/s]

{'question': "What is the product 'Girls Ballet Tutu Neon Pink'?", 'answer': 'High quality 3 layer ballet tutu. 12 inches in length'}


In [14]:
input_path = "/home/data_list.json"
output_path = "/home/data_list.json"

with open(input_path, "r") as f:
    data = json.load(f)

for entry in data:
    entry["context"] = entry["answer"]

with open(output_path, "w") as f:
    json.dump(data, f, indent=2)

### 2. BERT Tokenizer

In [15]:
from datasets import load_dataset, DatasetDict
from transformers import BertTokenizerFast

dataset = load_dataset("json", data_files="/home/data_list.json", split="train")
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def prepare_train_features(example):
    context = example["context"]
    question = example["question"]
    answer_text = example["answer"]

    start_char = context.find(answer_text)
    if start_char == -1:
        start_char = 0
        end_char = 0
    else:
        end_char = start_char + len(answer_text)

    tokenized_example = tokenizer(
        question,
        context,
        truncation=True,
        max_length=384,
        stride=128,
        return_overflowing_tokens=False,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offsets = tokenized_example["offset_mapping"]

    start_token = end_token = 0
    for idx, (start, end) in enumerate(offsets):
        if start <= start_char < end:
            start_token = idx
        if start < end_char <= end:
            end_token = idx

    tokenized_example["start_positions"] = start_token
    tokenized_example["end_positions"] = end_token
    tokenized_example.pop("offset_mapping")

    return tokenized_example

tokenized_dataset = dataset.map(prepare_train_features)

print(tokenized_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/7529 [00:00<?, ? examples/s]

Dataset({
    features: ['question', 'answer', 'context', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 7529
})


### 3. Argumentos de Treinamento

In [23]:
from transformers import BertForQuestionAnswering, TrainingArguments, Trainer
import numpy as np
from datasets import load_dataset
import evaluate

model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

squad_metric = evaluate.load("squad")

def compute_metrics(p):
    start_preds = np.argmax(p.predictions[0], axis=1)
    end_preds = np.argmax(p.predictions[1], axis=1)

    start_labels, end_labels = p.label_ids

    return {
        "start_accuracy": (start_preds == start_labels).mean(),
        "end_accuracy": (end_preds == end_labels).mean()
    }



Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 4. Treinamento Setup

In [24]:
import os
from transformers import TrainingArguments, Trainer

os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./bert_finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_dir="./logs",
    logging_steps=100,
    fp16=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("./bert_finetuned")


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Start Accuracy,End Accuracy
1,0.0054,0.004931,1.0,0.999203
2,0.0094,0.000757,1.0,0.999867
3,0.001,0.000671,1.0,0.999867


### 5. Testando antes e depois do treinamento

In [33]:
from transformers import BertTokenizerFast, BertForQuestionAnswering
import torch

tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

model_base = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

model_finetuned = BertForQuestionAnswering.from_pretrained("./bert_finetuned")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_base.to(device)
model_finetuned.to(device)

question = "What features the 'Girls Ballet Tutu Neon Pink' have?"
context = "The Girls Ballet Tutu Neon Pink is a high-quality 3-layer ballet tutu. 12 inches in length, soft material for comfort."

inputs = tokenizer(question, context, return_tensors="pt").to(device)

with torch.no_grad():
    outputs_base = model_base(**inputs)
    outputs_finetuned = model_finetuned(**inputs)

start_base = torch.argmax(outputs_base.start_logits, dim=1)
end_base = torch.argmax(outputs_base.end_logits, dim=1)

start_finetuned = torch.argmax(outputs_finetuned.start_logits, dim=1)
end_finetuned = torch.argmax(outputs_finetuned.end_logits, dim=1)

answer_base = tokenizer.decode(inputs["input_ids"][0][start_base:end_base+1])
answer_finetuned = tokenizer.decode(inputs["input_ids"][0][start_finetuned:end_finetuned+1])

print("Pretrained model answer: ", answer_base)
print("Fine-tuned model answer: ", answer_finetuned)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pretrained model answer:  
Fine-tuned model answer:  the girls ballet tutu neon pink is a high - quality 3 - layer ballet tutu. 12 inches in length, soft material for comfort.
