# SR-Automation — Análise de Resultados

Notebook para análise completa dos resultados do pipeline de revisão sistemática automatizada.
Ref: Capítulo 4 da dissertação.

In [None]:
# Célula 1: Setup
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import yaml
from sklearn.metrics import cohen_kappa_score

# Global plot styling applied to every figure drawn after this point.
sns.set_theme(style="whitegrid")
plt.rcParams["figure.dpi"] = 150

# Paths: the project root is the parent of the current working directory
# (presumably the notebook is launched from a sub-folder of the project).
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
# Decode the config as UTF-8 explicitly so non-ASCII values do not depend on
# the platform's default encoding.
with open(os.path.join(PROJECT_ROOT, "config.yaml"), "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

OUTPUTS = os.path.join(PROJECT_ROOT, config["paths"]["outputs"])
DATA = os.path.join(PROJECT_ROOT, "data")

# The plotting cells save into OUTPUTS/figures; create it up front so
# plt.savefig does not fail when the directory has not been generated yet.
os.makedirs(os.path.join(OUTPUTS, "figures"), exist_ok=True)

def load_jsonl(path):
    """Load a JSON Lines file into a DataFrame.

    Args:
        path: Location of a text file holding one JSON document per line.

    Returns:
        A pandas DataFrame built from the parsed records, in file order.
        Whitespace-only lines are skipped; a file without any record
        yields an empty DataFrame.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly: relying on the platform default corrupts
    # accented text on systems whose default encoding is not UTF-8.
    with open(path, "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Echo the resolved locations so a wrong working directory is easy to spot.
print(f"Project root: {PROJECT_ROOT}", f"Outputs dir: {OUTPUTS}", sep="\n")

In [None]:
# Cell 2: Corpus statistics (§4.1)
# Reads the corpus CSV named in the config, prints its size and per-source
# counts, then draws two panels: articles per year and abstract length.
corpus_path = os.path.join(PROJECT_ROOT, config["paths"]["corpus"])
corpus = pd.read_csv(corpus_path)

print(f"Total de artigos: {len(corpus)}")
print("\nArtigos por fonte:")
print(corpus["source"].value_counts())

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
year_ax, length_ax = axes

# Left panel: article count per year, ordered by year.
articles_per_year = corpus["year"].value_counts().sort_index()
articles_per_year.plot(kind="bar", ax=year_ax, color="steelblue")
year_ax.set_title("Distribuição por Ano")
year_ax.set_xlabel("Ano")
year_ax.set_ylabel("Artigos")

# Right panel: histogram of abstract lengths in characters. The length is
# stored as a new column on `corpus` before being plotted.
corpus["abstract_len"] = corpus["abstract"].str.len()
corpus["abstract_len"].hist(
    bins=30,
    ax=length_ax,
    color="steelblue",
    edgecolor="white",
)
length_ax.set_title("Tamanho dos Abstracts (caracteres)")
length_ax.set_xlabel("Caracteres")
length_ax.set_ylabel("Frequência")

plt.tight_layout()
corpus_fig_path = os.path.join(OUTPUTS, "figures", "corpus_stats.png")
plt.savefig(corpus_fig_path, dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Cell 3: Gold standard (§4.1)
# Loads data/gold_standard.csv (when present), reports inter-annotator
# agreement between the two reviewer columns and plots the consensus split.
gold_path = os.path.join(DATA, "gold_standard.csv")
if os.path.exists(gold_path):
    gold = pd.read_csv(gold_path)
    print(f"Gold standard: {len(gold)} artigos")

    # Inter-annotator agreement
    if "reviewer_a" in gold.columns and "reviewer_b" in gold.columns:

        def to_bin(x):
            """Map one reviewer label to 1 (include) or 0 (anything else)."""
            label = str(x).strip().upper()
            if label in ("YES", "INCLUDE", "1"):
                return 1
            # A 0/1 column that contains blanks is read by pandas as float,
            # so an include vote arrives as 1.0 and stringifies to "1.0".
            try:
                return int(float(label) == 1.0)
            except ValueError:
                return 0

        # NOTE(review): blank annotations are mapped to 0 (exclude), so rows
        # missing a reviewer still enter the kappa — confirm this is intended.
        ra = gold["reviewer_a"].apply(to_bin)
        rb = gold["reviewer_b"].apply(to_bin)
        kappa = cohen_kappa_score(ra, rb)
        print(f"Cohen's Kappa inter-anotador: {kappa:.4f}")
        print(f"  Alvo: ≥ {config['gold_standard']['min_kappa']}")

    # Proportion of each consensus label
    if "consensus" in gold.columns:
        counts = gold["consensus"].value_counts()
        print("\nDistribuição do consenso:")
        print(counts)

        fig, ax = plt.subplots(figsize=(5, 5))
        counts.plot(kind="pie", autopct="%1.1f%%", ax=ax, colors=["#2196F3", "#FF5722"])
        ax.set_title("Gold Standard: Inclusão vs Exclusão")
        ax.set_ylabel("")
        plt.savefig(os.path.join(OUTPUTS, "figures", "gold_standard_dist.png"), dpi=300, bbox_inches="tight")
        plt.show()
else:
    print("Gold standard não encontrado. Preencha data/gold_standard.csv.")

In [None]:
# Cell 4: Screening (triage) results (§4.2)
# Prints the decision distribution from triage_results.jsonl and, when
# metrics.json exists, the evaluation metrics, a confusion-matrix heatmap
# and the false-negative table.
triage_path = os.path.join(OUTPUTS, "triage_results.jsonl")
# Guard the input like the other cells do, so a missing triage file does not
# abort the cell before the metrics section below runs.
if os.path.exists(triage_path):
    triage = load_jsonl(triage_path)
    print(f"Total triados: {len(triage)}")
    # An empty results file produces a DataFrame without any columns.
    if "decision" in triage.columns:
        print("\nDistribuição de decisões:")
        print(triage["decision"].value_counts())
    else:
        print("Coluna 'decision' ausente nos resultados da triagem.")
else:
    print(f"Resultados da triagem não encontrados: {triage_path}")

# If metrics exist
metrics_path = os.path.join(OUTPUTS, "metrics.json")
if os.path.exists(metrics_path):
    with open(metrics_path) as f:
        metrics = json.load(f)

    print("\n=== Métricas ===")
    print(f"Recall: {metrics['recall']:.4f} (alvo: ≥{config['evaluation']['recall_target']})")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Specificity: {metrics['specificity']:.4f}")
    print(f"F1-Score: {metrics['f1_score']:.4f}")
    print(f"Workload Reduction: {metrics['workload_reduction_pct']:.1f}%")
    print(f"Recall target met: {metrics['recall_target_met']}")

    # Confusion matrix heatmap: rows follow the gold standard, columns the
    # system prediction, both ordered include / exclude.
    cm = metrics["confusion_matrix"]
    matrix = np.array([[cm["tp"], cm["fn"]], [cm["fp"], cm["tn"]]])
    labels = np.array([[f'VP\n{cm["tp"]}', f'FN\n{cm["fn"]}'],
                       [f'FP\n{cm["fp"]}', f'VN\n{cm["tn"]}']])

    fig, ax = plt.subplots(figsize=(6, 5))
    # fmt="" because the annotations are pre-formatted strings, not numbers.
    sns.heatmap(matrix, annot=labels, fmt="", cmap="Blues",
                xticklabels=["Incluir", "Excluir"],
                yticklabels=["Incluir", "Excluir"], ax=ax)
    ax.set_xlabel("Predição do Sistema")
    ax.set_ylabel("Gold Standard")
    ax.set_title("Matriz de Confusão")
    plt.savefig(os.path.join(OUTPUTS, "figures", "confusion_matrix_notebook.png"), dpi=300, bbox_inches="tight")
    plt.show()

    # False negatives
    fn_path = os.path.join(OUTPUTS, "false_negatives_analysis.csv")
    if os.path.exists(fn_path):
        fn = pd.read_csv(fn_path)
        print(f"\nFalsos Negativos ({len(fn)}):")
        display(fn)
else:
    print("\nMétricas não disponíveis. Execute: python main.py --step metrics")

In [None]:
# Cell 5: Cross-validation (§4.5)
# Reports the pairwise and mean kappa values stored in cross_validation.json
# and lists any recorded disagreements.
cv_path = os.path.join(OUTPUTS, "cross_validation.json")
if not os.path.exists(cv_path):
    print("Cross-validation não disponível. Execute: python main.py --step crossval")
else:
    with open(cv_path) as f:
        cv = json.load(f)

    print("=== Cross-Validation ===")
    print(f"Kappa R1-R2 (sinônimos): {cv['kappa_run1_run2']:.4f}")
    print(f"Kappa R1-R3 (ordem inv.): {cv['kappa_run1_run3']:.4f}")
    print(f"Kappa R2-R3: {cv['kappa_run2_run3']:.4f}")
    print(f"Kappa médio: {cv['kappa_mean']:.4f} (esperado: ≥{cv['expected_kappa']})")
    print(f"Concordância total: {cv['agreement_pct']:.1f}%")
    print(f"Disagreements: {len(cv.get('disagreements', []))}")

    # Kappa table: one row per run pair plus the mean.
    pair_keys = [
        ("R1-R2", "kappa_run1_run2"),
        ("R1-R3", "kappa_run1_run3"),
        ("R2-R3", "kappa_run2_run3"),
        ("Média", "kappa_mean"),
    ]
    kappa_data = {
        "Par": [label for label, _ in pair_keys],
        "Kappa": [cv[key] for _, key in pair_keys],
    }
    display(pd.DataFrame(kappa_data))

    # Show the disagreement records only when the list is present and non-empty.
    if cv.get("disagreements"):
        print("\nDisagreements:")
        display(pd.DataFrame(cv["disagreements"]))

In [None]:
# Cell 6: Extraction and summarisation (§4.3)
# Prints parse-error and "NOT MENTIONED" rates for the extraction results,
# plus one example record from the extractions and from the summaries.
ext_path = os.path.join(OUTPUTS, "extraction_results.jsonl")
sum_path = os.path.join(OUTPUTS, "summaries.jsonl")

if os.path.exists(ext_path):
    extractions = load_jsonl(ext_path)
    print(f"Extrações: {len(extractions)} artigos")

    # An empty file has no rows to rate or sample: skip the statistics rather
    # than dividing by zero or indexing row 0.
    if not extractions.empty:
        # parse_error rate (a missing column counts as zero errors)
        parse_errors = extractions.get("parse_error", pd.Series([False]*len(extractions))).sum()
        print(f"Parse errors: {parse_errors} ({parse_errors/len(extractions)*100:.1f}%)")

        # Fields reported as NOT MENTIONED
        fields = ["study_objective", "methodology", "main_results",
                  "conclusions_limitations", "sample_data"]
        for field in fields:
            if field in extractions.columns:
                nm = (extractions[field] == "NOT MENTIONED").sum()
                print(f"  {field}: {nm} NOT MENTIONED ({nm/len(extractions)*100:.1f}%)")

        # Example
        print("\n--- Exemplo de extração ---")
        sample = extractions.iloc[0]
        for field in fields:
            if field in sample:
                # Stringify before measuring AND slicing, so a non-string
                # value (NaN, list, dict) is truncated the same way as text.
                text = str(sample[field])
                if len(text) > 100:
                    print(f"  {field}: {text[:100]}...")
                else:
                    print(f"  {field}: {text}")

if os.path.exists(sum_path):
    summaries = load_jsonl(sum_path)
    print(f"\nSumarizações: {len(summaries)} artigos")

    # Same empty-file guard as for the extractions.
    if not summaries.empty:
        # Example
        print("\n--- Exemplo de sumarização ---")
        sample = summaries.iloc[0]
        print(f"  Problem: {sample.get('problem', 'N/A')}")
        print(f"  Solution: {sample.get('solution', 'N/A')}")
        print(f"  Findings: {sample.get('findings', 'N/A')}")

In [None]:
# Cell 7: Operational efficiency (§4.4)
# Compares the pipeline's run time against the manual baseline stored in
# metrics.json and plots both as a bar chart.
# Resolve the path here instead of relying on a variable left behind by an
# earlier cell, so this cell also works when run on its own.
metrics_path = os.path.join(OUTPUTS, "metrics.json")
if os.path.exists(metrics_path):
    with open(metrics_path) as f:
        metrics = json.load(f)

    total_time = metrics["total_time_seconds"]
    manual_time = metrics["manual_baseline_seconds"]
    reduction = metrics["time_reduction_pct"]
    tokens = metrics["total_tokens"]

    print("=== Eficiência Operacional ===")
    print(f"Tempo pipeline: {total_time:.1f}s ({total_time/60:.1f} min)")
    print(f"Baseline manual: {manual_time:.1f}s ({manual_time/60:.1f} min)")
    print(f"Redução de tempo: {reduction:.1f}%")
    print(f"Tokens consumidos: {tokens:,}")

    # Comparison chart (seconds converted to minutes)
    fig, ax = plt.subplots(figsize=(6, 4))
    methods = ["Pipeline\nAutomatizado", "Processo\nManual"]
    times_min = [total_time / 60, manual_time / 60]
    colors = ["#2196F3", "#FF5722"]
    ax.bar(methods, times_min, color=colors)
    ax.set_ylabel("Tempo (minutos)")
    ax.set_title("Comparação de Tempo: Automatizado vs Manual")
    # Value label slightly above the top of each bar.
    for i, v in enumerate(times_min):
        ax.text(i, v + 0.5, f"{v:.1f} min", ha="center", fontweight="bold")
    plt.savefig(os.path.join(OUTPUTS, "figures", "time_comparison.png"), dpi=300, bbox_inches="tight")
    plt.show()
else:
    print("Métricas não disponíveis.")

In [None]:
# Cell 8: Hallucination (§4.5)
# Summarises the manually classified claims in hallucination_sample.csv:
# overall label counts, hallucination rate, and a per-module breakdown.
hall_path = os.path.join(OUTPUTS, "hallucination_sample.csv")
if os.path.exists(hall_path):
    hall = pd.read_csv(hall_path)
    # Normalise the labels once (trimmed, upper-case) and use that single
    # series for the counts AND the chart, so differently-cased or padded
    # spellings of one label are not split into separate categories.
    labels = hall["classification"].fillna("").astype(str).str.strip().str.upper()
    is_classified = labels != ""
    classified = hall[is_classified]
    labels = labels[is_classified]

    if not classified.empty:
        print("=== Taxa de Alucinação ===")
        counts = labels.value_counts()
        print(counts)

        total = len(classified)
        hall_count = counts.get("HALLUCINATED", 0)
        print(f"\nHallucination rate: {hall_count/total*100:.2f}%")

        # Per module
        for module in ["triage", "extraction", "summarization"]:
            mod_df = classified[classified["module"] == module]
            if not mod_df.empty:
                mod_counts = labels.loc[mod_df.index].value_counts()
                print(f"\n{module}:")
                print(mod_counts)

        # Chart: claims per module, one bar per normalised label
        fig, ax = plt.subplots(figsize=(8, 4))
        by_module = classified.groupby(["module", labels]).size().unstack(fill_value=0)
        by_module.plot(kind="bar", ax=ax, colormap="Set2")
        ax.set_title("Classificação de Claims por Módulo")
        ax.set_xlabel("Módulo")
        ax.set_ylabel("Count")
        plt.xticks(rotation=0)
        plt.legend(title="Classificação")
        plt.savefig(os.path.join(OUTPUTS, "figures", "hallucination_by_module.png"), dpi=300, bbox_inches="tight")
        plt.show()
    else:
        print(f"Amostra gerada ({len(hall)} claims) mas sem classificação.")
        print("Preencha a coluna 'classification' em hallucination_sample.csv")
        print("Valores: GROUNDED | INFERRED | HALLUCINATED")
else:
    print("Amostra de alucinação não encontrada. Execute: python main.py --step hallcheck")

In [None]:
# Cell 9: Summary evaluation (§4.3)
# Reports mean/std of the Likert scores per dimension, the linear-weighted
# kappa between the first two evaluators, and a box plot per dimension.
eval_path = os.path.join(OUTPUTS, "summary_evaluation_template.csv")
if os.path.exists(eval_path):
    eval_df = pd.read_csv(eval_path)
    # Keep only the rows that have been filled in (judged by "clarity")
    filled = eval_df[eval_df["clarity"].notna()]

    if not filled.empty:
        dims = ["clarity", "completeness", "accuracy", "utility"]
        # Use only the dimensions the file actually contains, for the
        # statistics, the kappa and the box plot alike.
        present = [dim for dim in dims if dim in filled.columns]

        print("=== Avaliação de Resumos (Likert 1-5) ===")
        for dim in present:
            print(f"  {dim}: média={filled[dim].mean():.2f}, std={filled[dim].std():.2f}")

        # Inter-rater kappa (only the first two evaluators are compared)
        evaluators = filled["evaluator"].unique()
        if len(evaluators) >= 2:
            e1 = filled[filled["evaluator"] == evaluators[0]]
            e2 = filled[filled["evaluator"] == evaluators[1]]
            merged = e1.merge(e2, on="article_id", suffixes=("_1", "_2"))
            for dim in present:
                # Rows are filtered on "clarity" only, so another dimension
                # may still be blank; drop those pairs before the int cast.
                pairs = merged[[f"{dim}_1", f"{dim}_2"]].dropna()
                if pairs.empty:
                    continue
                kappa = cohen_kappa_score(
                    pairs[f"{dim}_1"].astype(int),
                    pairs[f"{dim}_2"].astype(int),
                    weights="linear"
                )
                print(f"  Kappa {dim}: {kappa:.4f}")

        # Box plot
        fig, ax = plt.subplots(figsize=(8, 5))
        filled[present].boxplot(ax=ax)
        ax.set_title("Avaliação de Resumos por Dimensão (Likert 1-5)")
        ax.set_ylabel("Score")
        ax.set_ylim(0.5, 5.5)
        plt.savefig(os.path.join(OUTPUTS, "figures", "summary_evaluation.png"), dpi=300, bbox_inches="tight")
        plt.show()
    else:
        print("Template de avaliação encontrado mas sem dados.")
        print("Preencha outputs/summary_evaluation_template.csv com scores Likert 1-5.")
else:
    print("Template de avaliação não encontrado.")

In [None]:
# Cell 10: Export
# Puts the project root on sys.path so the `src` package can be imported,
# then runs the report generators when metrics.json is available.
sys.path.insert(0, PROJECT_ROOT)
from src.report_generator import (
    generate_confusion_matrix_plot,
    generate_latex_tables,
    generate_metrics_bar_chart,
)

metrics_path = os.path.join(OUTPUTS, "metrics.json")
crossval_path = os.path.join(OUTPUTS, "cross_validation.json")

if not os.path.exists(metrics_path):
    print("Métricas não disponíveis para exportação.")
    print("Execute o pipeline completo primeiro.")
else:
    # LaTeX tables first, then the two figures, each fed from metrics.json.
    generate_latex_tables(metrics_path, crossval_path, config)
    for make_figure in (generate_confusion_matrix_plot, generate_metrics_bar_chart):
        make_figure(metrics_path, config)
    print("Exportação concluída!")
    print(f"  Tabelas LaTeX: {os.path.join(OUTPUTS, 'latex_tables.tex')}")
    print(f"  Figuras: {os.path.join(OUTPUTS, 'figures/')}")