In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import os
# Download and load the small GPT-2 model, switched to inference mode.
model_name = "distilgpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
model.eval()

# Load the matching tokenizer and define its padding token by reusing the
# end-of-sequence token.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
class DialogDataset(Dataset):
    """Dataset of (prompt, response) pairs read from a plain-text dialogue file.

    The file is read as UTF-8 and grouped into blocks separated by one or more
    blank (or whitespace-only) lines. The first two non-empty lines of each
    block become the prompt and the response; blocks with fewer than two
    non-empty lines are skipped.

    Args:
        file_path: Path of the dialogue text file.
        tokenizer: Optional tokenizer callable. When ``None`` (default), the
            module-level ``tokenizer`` is looked up each time an item is read.
        max_length: Fixed length every example is padded/truncated to.

    Attributes:
        dialogues: List of ``(prompt, response)`` string tuples.
    """

    def __init__(self, file_path, tokenizer=None, max_length=128):
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.dialogues = self._parse_dialogues(text)

    @staticmethod
    def _parse_dialogues(text):
        """Group non-empty lines into blocks and return (prompt, response) pairs."""
        pairs = []
        current = []
        # The trailing sentinel flushes the last block when the file does not
        # end with a blank line.
        for raw_line in text.split("\n") + [""]:
            line = raw_line.strip()
            if line:
                current.append(line)
                continue
            if len(current) >= 2:
                pairs.append((current[0], current[1]))
            current = []
        return pairs

    def __len__(self):
        return len(self.dialogues)

    def __getitem__(self, idx):
        """Tokenize one dialogue and return a dict of 1-D tensors.

        The returned dict holds every tensor produced by the tokenizer (batch
        axis removed) plus ``labels``: a copy of ``input_ids`` in which padded
        positions are set to -100 so they are excluded from the loss.
        """
        prompt, response = self.dialogues[idx]
        tok = self.tokenizer if self.tokenizer is not None else tokenizer
        # The pad token may be the same as the EOS token, so masking padding
        # would also hide every EOS from the loss. Appending one explicit EOS
        # keeps a real end-of-sequence target inside the attended region.
        eos = getattr(tok, "eos_token", None) or ""
        full_text = f"{prompt}\n{response}{eos}"
        encoded = tok(
            full_text,
            return_tensors="pt",
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
        )
        # squeeze(0) removes only the batch axis, never a length-1 sequence axis.
        item = {key: val.squeeze(0) for key, val in encoded.items()}
        # clone() keeps labels independent of input_ids (no shared storage).
        labels = item["input_ids"].clone()
        mask = item.get("attention_mask")
        if mask is not None:
            labels[mask == 0] = -100
        item["labels"] = labels
        return item

In [None]:
# Datasets and data loader: both dialogue files are concatenated into a single
# training set and served in shuffled mini-batches.
dataset1 = DialogDataset("treino_dialogo.txt")
dataset2 = DialogDataset("treino_dialogo2.txt")
dataset_combinado = torch.utils.data.ConcatDataset([dataset1, dataset2])
loader = DataLoader(dataset_combinado, batch_size=2, shuffle=True)

# Move the model to the target device first, then build the optimizer over the
# parameters as they exist on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Optimizer configuration
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Simple training loop
NUM_EPOCHS = 10  # number of full passes over the combined dataset
model.train()
for epoch in range(NUM_EPOCHS):
    for batch in loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Época {epoch + 1} finalizada.")

In [None]:
def gerar_resposta(prompt, max_length=100):
    """Generate a bot reply for one user message.

    The message is wrapped as ``"Pessoa: <prompt>\\nBot:"`` and the
    module-level ``model`` samples a continuation from it.

    Args:
        prompt: The user's message (plain text).
        max_length: Passed to ``model.generate`` as ``max_length``.

    Returns:
        The text generated after the prompt, cut at the first following
        ``"Pessoa:"`` or ``"Bot:"`` marker and stripped of surrounding
        whitespace.
    """
    texto = f"Pessoa: {prompt}\nBot:"  # use the same prefixes as the dataset
    enc = tokenizer(texto, return_tensors="pt")
    input_ids = enc["input_ids"].to(device)
    # Pass the mask explicitly: pad and EOS share the same id here, so it
    # cannot be derived from the ids alone.
    attention_mask = enc["attention_mask"].to(device)

    # The training cell leaves the model in train mode; switch to eval so
    # dropout is disabled while sampling, then restore the previous mode.
    was_training = model.training
    model.eval()
    try:
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.9,
        )
    finally:
        model.train(was_training)

    # Decode only the newly generated tokens, so a "Bot:" typed by the user
    # inside the prompt can never be mistaken for the start of the reply.
    novos_tokens = outputs[0][input_ids.shape[-1]:]
    resposta = tokenizer.decode(novos_tokens, skip_special_tokens=True)

    # Keep just the first turn: drop anything from the next speaker marker on.
    for marcador in ("Pessoa:", "Bot:"):
        resposta = resposta.split(marcador)[0]
    return resposta.strip()


In [None]:
# Interactive chat loop: read a message, generate a reply, print both.
# Typing "sair" or "exit" (any case, surrounding spaces ignored) ends the chat.
while True:
    try:
        entrada = input("Você: ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or interrupted prompt: end the chat cleanly
        # instead of leaving a traceback in the cell output.
        break
    if entrada.lower() in ('sair', 'exit'):
        break
    if not entrada:
        # Nothing typed; ask again rather than sending an empty message.
        continue
    resposta = gerar_resposta(entrada)
    print("Você:", entrada)
    print("Bot:", resposta)

NameError: name 'tokenizer' is not defined