In [6]:
# Instalação das bibliotecas
!pip install transformers torch sentencepiece pandas accelerate bitsandbytes

# Atualização do transformers para garantir compatibilidade
!pip install --upgrade transformers

# Necessário para usar o Phi-3
!pip install 'optimum[onnxruntime]'

import sys
import os
import re
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Garante a codificação UTF-8
try:
    sys.stdout.reconfigure(encoding='utf-8')
except Exception:
    pass

print("Setup de bibliotecas concluído.")

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Downloading transformers-4.57.1-py3-none-any.whl (12.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m99.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m111.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.4
    Uninstalling tokenizers-0.21.4:
      Successfully uninstalled tokenizer

Collecting transformers>=4.29 (from optimum[onnxruntime])
  Using cached transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers>=4.29->optimum[onnxruntime])
  Using cached tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.55.4-py3-none-any.whl (11.3 MB)
Using cached tokenizers-0.21.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.22.1
    Uninstalling tokenizers-0.22.1:
      Successfully uninstalled tokenizers-0.22.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.57.1
    Uninstalling transformers-4.57.1:
      Successfully uninstalled transformers-4.57.1
^C
Setup de bibliotecas concluído.


In [2]:
# CÉLULA 2: Upload e Carregamento de Dados (Para Lista Completa)

# Importações essenciais
import sys
import os
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# (As outras importações serão carregadas na próxima célula)


# =========================================================================
# 1. CARREGAMENTO DOS DADOS LOCAIS (Lê todos os exemplos)
# =========================================================================

DATA_FILE_PATH = "/content/data/train.json"
data_lines = [] # Inicializa a lista de todos os exemplos

try:
    print(f"Iniciando o carregamento manual de TODAS AS LINHAS JSON no ambiente Colab: {DATA_FILE_PATH}...")

    # Criamos o diretório de dados
    OUTPUT_DATA_DIR = os.path.dirname(DATA_FILE_PATH)
    os.makedirs(OUTPUT_DATA_DIR, exist_ok=True)

    # Verifica se o arquivo está na raiz e o move, se necessário
    if os.path.exists("/content/train.json") and not os.path.exists(DATA_FILE_PATH):
        os.rename("/content/train.json", DATA_FILE_PATH)
        print("Arquivo 'train.json' movido da raiz para /content/data/.")


    with open(DATA_FILE_PATH, 'r', encoding='utf-8') as f:
        # Lê o formato JSON Lines (JSONL)
        data_lines = [json.loads(line) for line in f]

    if not data_lines:
        raise ValueError("O arquivo JSON está vazio ou o formato está incorreto.")

    print(f"\n✅ SUCESSO! {len(data_lines)} exemplos carregados do arquivo local do Colab.")

except (FileNotFoundError, ValueError, Exception) as e:
    print(f"\n❌ ERRO FATAL: Falha no carregamento dos dados locais no Colab.")
    print(f"Erro: {e}")
    sys.exit()


# =========================================================================
# VARIÁVEIS GLOBAIS DE DADOS
# =========================================================================

TARGET_INFERENCE = "pronominal bridging"
# 'data_lines' agora contém a lista completa de exemplos

Iniciando o carregamento manual de TODAS AS LINHAS JSON no ambiente Colab: /content/data/train.json...
Arquivo 'train.json' movido da raiz para /content/data/.

✅ SUCESSO! 8548 exemplos carregados do arquivo local do Colab.


In [3]:
# CÉLULA 3: Carregamento dos Modelos (Phi-3 e BERT) - CORRIGIDA

# =========================================================================
# 3. CARREGAMENTO DE MODELOS (PHI-3 e BERT)
# =========================================================================

# Importações Essenciais (Repetidas por segurança do escopo no Colab)
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

# Ajuste para usar a GPU (se disponível) ou CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Usamos float16 para economia de VRAM (essencial para o Phi-3)
MODEL_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32

print(f"\nCarregando gerador de perguntas (PHI-3-mini-4k-instruct) para dispositivo: {device}...")
qg_model_id = "microsoft/Phi-3-mini-4k-instruct"

try:
    # Phi-3 usa o pipeline 'text-generation'
    qg = pipeline(
        "text-generation",
        model=qg_model_id,
        device=device,
        model_kwargs={"torch_dtype": MODEL_DTYPE}
    )
except Exception as e:
    print(f"❌ ERRO ao carregar o modelo Phi-3: {e}")
    sys.exit() # sys.exit é seguro pois sys foi importado na Célula 1


print("Carregando classificador de skills (Modelo BERT)...")
cls_model_id = "curious008/BertForStorySkillClassification"
cls_tokenizer = AutoTokenizer.from_pretrained(cls_model_id, use_fast=False)
cls_model = AutoModelForSequenceClassification.from_pretrained(
    cls_model_id, ignore_mismatched_sizes=True
)
if torch.cuda.is_available():
    cls_model.to("cuda")

skill_labels = [
    "Character", "Setting", "Feeling", "Action",
    "Causal Relationship", "Outcome Resolution", "Prediction"
]

print("Carregamento de modelos concluído.")


Carregando gerador de perguntas (PHI-3-mini-4k-instruct) para dispositivo: cuda:0...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/599 [00:00<?, ?B/s]

Device set to use cuda:0


Carregando classificador de skills (Modelo BERT)...


tokenizer_config.json:   0%|          | 0.00/434 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at curious008/BertForStorySkillClassification and are newly initialized because the shapes did not match:
- bert.embeddings.word_embeddings.weight: found shape torch.Size([30524, 768]) in the checkpoint and torch.Size([30523, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Carregamento de modelos concluído.


In [4]:
# CÉLULA 4: Geração Controlada (QG), Classificação e Salvamento - COM LOOP

# Importações necessárias
import re
import os
import json
import torch
from tqdm.notebook import tqdm # Para barra de progresso no Colab

# =========================================================================
# CONFIGURAÇÃO DE PROCESSAMENTO
# =========================================================================

# **AJUSTE AQUI:** Limite o número de exemplos a serem processados.
# Remova esta linha ou defina para 'len(data_lines)' para processar o dataset completo.
MAX_EXAMPLES_TO_PROCESS = 5

# Lista para armazenar todos os resultados
all_results = []

# =========================================================================
# FUNÇÕES E TEMPLATES (MANTIDOS)
# =========================================================================

TARGET_INFERENCE = "pronominal bridging"

FEW_SHOT_EXAMPLE_TEMPLATE = f"""
[INSTRUCTION]
TASK: Generate a multiple-choice question for '{TARGET_INFERENCE}' inference type.
RULES: The output must adhere strictly to the format: NEW_TYPE, REASONING, QUESTION, OPTIONS.

Example:
TYPE: pronominal bridging
CONTEXT: A greenhouse is a building where plants such as flowers and vegetables are grown. It usually has a glass or translucent plastic roof.
ANSWER: greenhouses
REASONING: The pronoun 'it' refers to 'greenhouse' in the previous sentence, bridging the concepts.
QUESTION: According to the passage, what can have translucent plastic roofs?
OPTIONS: backyards; living spaces; greenhouses; botanic gardens

NEW_CONTEXT: {{context}}
NEW_ANSWER: {{answer}}
NEW_TYPE: pronominal bridging
REASONING: (Generate the reasoning for the new question here)
QUESTION: (Generate the question stem here)
OPTIONS: (Generate 4 options, separated by semicolons)
"""

def create_phi3_prompt_for_inference_type(context, answer, template):
    """Cria o prompt Few-Shot."""
    prompt = template.replace("{{context}}", context) \
                     .replace("{{answer}}", answer)
    return f"<|user|>{prompt}<|end|><|assistant|>"

# =========================================================================
# 4. LOOP PRINCIPAL DE GERAÇÃO E AVALIAÇÃO
# =========================================================================

print("\n=== INICIANDO PROCESSAMENTO EM LOTE (QG PHI-3 e CLASSIFICAÇÃO) ===")
print(f"Processando {min(MAX_EXAMPLES_TO_PROCESS, len(data_lines))} de {len(data_lines)} exemplos...")


for i, example_row in enumerate(tqdm(data_lines[:MAX_EXAMPLES_TO_PROCESS])):

    # 1. Extração de Contexto e Resposta para o Exemplo Atual
    context = example_row.get('content', 'ERRO').strip()
    expected_answer = example_row.get('answer', 'ERRO').strip()

    # Ignora exemplos onde o contexto é inválido
    if context == 'ERRO' or expected_answer == 'ERRO':
        continue

    # 2. Geração Controlada (QG)
    qg_input = create_phi3_prompt_for_inference_type(context, expected_answer, FEW_SHOT_EXAMPLE_TEMPLATE)

    try:
        qg_output_list = qg(qg_input, max_new_tokens=512, do_sample=False, return_full_text=False)
        generated_output_full = qg_output_list[0]['generated_text'].strip()
    except Exception as e:
        print(f"\n[ERRO GERAÇÃO]: Falha na Geração QG para o índice {i}. Erro: {e}")
        generated_output_full = "ERRO NA GERAÇÃO DO MODELO"

    # 3. Parsing (RQ3 e RQ1)
    reasoning = "N/A"
    generated_q = generated_output_full

    if generated_output_full != "ERRO NA GERAÇÃO DO MODELO":
        try:
            reasoning_match = re.search(r"REASONING:\s*(.*?)(?=\s*QUESTION:|$)", generated_output_full, re.DOTALL | re.IGNORECASE)
            question_match = re.search(r"QUESTION:\s*(.*?)(?=\s*OPTIONS:|$)", generated_output_full, re.DOTALL | re.IGNORECASE)

            reasoning = reasoning_match.group(1).strip() if reasoning_match else "ERRO: Parsing do Raciocínio Falhou."
            generated_q = question_match.group(1).strip() if question_match else "ERRO: Parsing da Pergunta Falhou."

            if "ERRO" in generated_q:
                # Fallback se o parsing falhar: usa o output completo como pergunta.
                generated_q = generated_output_full.strip()
                reasoning = "Parsing falhou; usando output completo como pergunta."
        except Exception:
            generated_q = generated_output_full.strip()
            reasoning = "Parsing Geral Falhou."

    # 4. Classificação (RQ2)
    try:
        enc = cls_tokenizer(generated_q, return_tensors="pt", truncation=True, padding=True, max_length=512)
        if torch.cuda.is_available():
            enc = {k:v.to("cuda") for k,v in enc.items()}

        logits = cls_model(**enc).logits
        probs = torch.softmax(logits, dim=-1).cpu().detach().numpy().flatten()
        pred_idx = int(probs.argmax())
        pred_label = skill_labels[pred_idx]
    except Exception:
        pred_label = "ERRO NA CLASSIFICAÇÃO"
        probs = [0.0] * len(skill_labels) # Garante que a lista de probs tenha o mesmo tamanho

    # 5. Coleta de Resultados
    results_entry = {
        "id": i,
        "story_name": example_row.get('story_name', 'N/A'),
        "context_snippet": context,
        "expected_answer": expected_answer,
        "input_prompt_full": qg_input,
        "generated_question_stem_rq1": generated_q,
        "generated_reasoning_cot_rq3": reasoning,
        "predicted_skill_label_rq2": pred_label,
        "probabilities_by_class": {lbl: float(p) for lbl, p in zip(skill_labels, probs)}
    }
    all_results.append(results_entry)

print("\nProcessamento em lote concluído.")

# =========================================================================
# 5. SALVAMENTO DOS OUTPUTS
# =========================================================================

OUTPUT_DIR = "/content/output"
OUTPUT_FILENAME = "all_qg_classification_results_phi3.json"
outpath = os.path.join(OUTPUT_DIR, OUTPUT_FILENAME)

os.makedirs(OUTPUT_DIR, exist_ok=True)

try:
    with open(outpath, "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print(f"\n✅ RESULTADOS FINAIS salvos com sucesso em: {outpath}")

except Exception as e:
    print(f"\n❌ ERRO ao salvar o arquivo JSON.")
    print(f"Erro: {e}")


=== INICIANDO PROCESSAMENTO EM LOTE (QG PHI-3 e CLASSIFICAÇÃO) ===
Processando 5 de 8548 exemplos...


  0%|          | 0/5 [00:00<?, ?it/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Processamento em lote concluído.

✅ RESULTADOS FINAIS salvos com sucesso em: /content/output/all_qg_classification_results_phi3.json
