## Bibliotecas

In [7]:
import sys
!{sys.executable} -m pip install datasets matplotlib transformers


from datasets import Dataset

import matplotlib.pyplot as plt
from transformers import BertTokenizerFast
import time as tm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




## Data

In [8]:
# Load the full text of the book.
with open("a_revolucao_dos_bichos.txt", "r", encoding="utf-8") as f:
    livro = f.read()

# Split on line breaks and keep only stripped lines longer than 50 characters,
# which discards blank lines and short fragments.
paragrafos = [trecho for trecho in (p.strip() for p in livro.split("\n")) if len(trecho) > 50]
if not paragrafos:
    raise ValueError("No paragraph longer than 50 characters was found in the book text.")

# Preview of the first excerpts (only rendered when this expression ends a cell).
paragrafos[:2]

resposta_texto = "Major"

# A SQuAD-style example needs the answer to occur verbatim in the context.
# Use the first paragraph that contains it (paragrafos[0] when possible) and
# derive the character offset from the text instead of hard-coding it.
contexto = next((p for p in paragrafos if resposta_texto in p), None)
if contexto is None:
    raise ValueError(f"Answer text {resposta_texto!r} does not occur in any paragraph.")

data = {
    "id": ["1"],
    "title": ["A Revolução dos Bichos"],
    "context": [contexto],
    "question": ["Quem inspirou os animais com um discurso revolucionário?"],
    "answers": [{"text": [resposta_texto], "answer_start": [contexto.index(resposta_texto)]}],
}


## Pipeline

In [11]:

# Wall-clock timestamp taken before the pipeline is built; nothing in the
# visible cells reads `init` afterwards.
init = tm.time()
# Wrap the hand-written example dict in a Hugging Face Dataset (one row).
dataset = Dataset.from_dict(data)
# Fast tokenizer for the "bert-base-uncased" checkpoint; `preprocess` below
# asks it for `return_offsets_mapping=True`.
# NOTE(review): this is an English checkpoint while the book text is
# Portuguese — confirm that this is intentional.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

def preprocess(example):
    """Tokenize one question/context pair and attach answer-span token labels.

    Args:
        example: A single dataset row with string fields ``question`` and
            ``context`` and an ``answers`` dict whose ``text`` and
            ``answer_start`` lists hold the answer string and its character
            offset inside ``context`` (only the first entry of each is used).

    Returns:
        The tokenizer output (truncated/padded to 384 tokens) with
        ``start_positions`` and ``end_positions`` added and ``offset_mapping``
        removed. Both positions are 0 when the answer span cannot be located
        completely among the context tokens.
    """
    inputs = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        padding="max_length",
        max_length=384,
        return_offsets_mapping=True
    )

    # Character span of the answer inside the context string.
    answer = example["answers"]["text"][0]
    start_char = example["answers"]["answer_start"][0]
    end_char = start_char + len(answer)

    # Offset mapping: (start, end) character span of every token in the string
    # it came from. Question tokens carry offsets relative to the question, so
    # only tokens of the second sequence (the context) may be matched against
    # the answer span; sequence_ids() is 1 for those and 0/None otherwise.
    offsets = inputs["offset_mapping"]
    sequence_ids = inputs.sequence_ids()

    start_pos = end_pos = None
    for i, (start, end) in enumerate(offsets):
        if sequence_ids[i] != 1:
            continue
        if start <= start_char < end:
            start_pos = i
        if start < end_char <= end:
            end_pos = i
            break

    # Fallback when the span is missing or cut off by truncation: point both
    # labels at index 0 so the span stays consistent (never start > end).
    if start_pos is None or end_pos is None:
        start_pos = end_pos = 0

    inputs["start_positions"] = start_pos
    inputs["end_positions"] = end_pos
    inputs.pop("offset_mapping")  # not a model input; drop it to avoid errors downstream

    return inputs


from transformers import BertForQuestionAnswering, Trainer, TrainingArguments, pipeline

# Run `preprocess` over every row to obtain model-ready features.
encoded_dataset = dataset.map(preprocess)

# Question-answering model initialised from the "bert-base-uncased" checkpoint.
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Training configuration: two epochs, one example per device batch.
training_args = TrainingArguments(
    output_dir="./bert-qa-revolucao_001",
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=1,
    logging_steps=10,
)

trainer = Trainer(model, training_args, train_dataset=encoded_dataset)
trainer.train()

# Inference pipeline built from the model object trained above and the same tokenizer.
qa_pipeline = pipeline(task="question-answering", model=model, tokenizer=tokenizer)

Map: 100%|██████████| 1/1 [00:00<00:00,  4.46 examples/s]


ImportError: 
BertForQuestionAnswering requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


## Questions

In [None]:




# Time a single question-answering call. The clock starts right before the
# pipeline is invoked and stops right after it returns, so only inference is
# measured. The `time` module is bound as `tm` in this notebook.
start_time = tm.perf_counter()
resposta = qa_pipeline(
    question="Napoleão morre?",
    context=paragrafos[0],
)
end_time = tm.perf_counter()
execution_time = end_time - start_time

print("1_Resposta:", resposta["answer"])
print(f"1_Tempo de execução: {execution_time:.4f} segundos")


# step = 250
# teto = 15000
# min = 5000
# tokens = [min+x*step for x in range((int) ((teto-min)/step+1))]
# for qtdTokens in tokens:
#     qtd = (int) (qtdTokens)
#     tempo = takeTime(filecontent[0:qtd],qtd)
#     tempos.append(tempo)
#     print(f"Tokens: {qtd}; Tempo: {tempo}s")

# plt.plot(tokens,tempos,marker="")
# plt.xlabel(f"Quantidade de Tokens {min} -> {teto} step {step}")
# plt.ylabel("Tempo de Geração (s)")
# plt.title("Tempo de Geração (s) vs. Quantidade de Tokens c/ armazenamento")
# plt.grid(True)
# plt.show()