# In [None]:
import json
import torch
import pandas as pd
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM
import os
from huggingface_hub import login

# =================== HUGGINGFACE TOKEN ===================
# Use a token that is already exported in the environment when there is one.
# "**" is only the redacted placeholder kept in this copy of the script; it is
# applied solely when HUGGINGFACE_TOKEN is not set, so a real token is never
# overwritten by the placeholder.
os.environ.setdefault("HUGGINGFACE_TOKEN", "**")
login(token=os.environ["HUGGINGFACE_TOKEN"]) # Hidden

# =================== CONFIG ===================
# Hugging Face model identifier; passed to both the tokenizer and the model loader below.
model_id = "Qwen/Qwen2.5-72B-Instruct"
# Excel workbook read with pandas further down (sheet "Sheet1").
input_file = "./Arxiv-papers2000/Final-50.xlsx"
# Destination path of the JSON list of judgments written at the end of the script.
output_file = "llm_judgments_qwen2.5.json"

# =================== PROMPT FUNCTION ===================
# Fixed rubric that opens every prompt; it contains no per-example content.
_JUDGE_RUBRIC = """
You are an expert in content alignment. You are given a Table (<Table>) and a Paragraph generated by an LLM (<LLM-generated>).

Your task is to evaluate the generated paragraph against the table content across the following four dimensions:

D1 – Factual Correctness:
Is the LLM-generated paragraph factually accurate based on the table content?
Score: Binary (0 or 1)
1 = All factual claims in the paragraph are correct based on the table.
0 = The paragraph includes any incorrect or misstated information.

D2 – Reasoning Depth:
Does the paragraph demonstrate reasoning beyond simply restating facts?
Score: Binary (0 or 1)
1 = Shows inference, interpretation, or synthesis.
0 = Only repeats or paraphrases information from the table.

D3 – Reasoning Type:
What is the primary type of reasoning demonstrated?
Score: One of the following categories only
Comparative, No Reasoning

D4 – Relevance:
Does the paragraph stay focused and include only relevant, meaningful information from the table?
Score: Binary (0 or 1)
1 = Shows relevance with the table's contents.
0 = doesn't Show any relevance with the table's content

Return only your scores in the following format (no explanation):

Factual_correctness: <your-answer>,
Reasoning_depth: <your-answer>,
Reasoning_type: <your-answer>,
Relevance: <your-answer>
"""


def build_prompt(table: str, gen_para: str) -> str:
    """Return the judge prompt for one table / generated-paragraph pair.

    The prompt is the fixed scoring rubric followed by a ``<Table>`` section
    holding ``table`` and an ``<LLM-generated>`` section holding ``gen_para``.
    Both values are interpolated with default string formatting.
    """
    example_section = f"\n<Table>\n{table}\n\n<LLM-generated>\n{gen_para}\n"
    return _JUDGE_RUBRIC + example_section

# =================== PARSE FUNCTION ===================
def extract_structured_scores(response):
    """Parse ``key: value`` lines of the judge's reply into a dict.

    Each line is split on its first colon. Lines without a colon, with an
    empty key, or with an empty value are skipped. When a key occurs more than
    once, the last occurrence wins.

    Markdown decoration around the key (backticks, ``*``, list dashes, ``#``,
    ``>``) and around the value (backticks, ``*``, trailing punctuation) is
    removed, so replies such as ``- Relevance: 1`` or ``**Relevance:** 1``
    produce the plain key ``Relevance`` and value ``1``.

    Args:
        response: Raw text generated by the judge model.

    Returns:
        dict mapping the cleaned key to the cleaned value, both as strings.
    """
    key_decoration = "`*-#> \t"
    value_decoration = "`*,.;: \t"

    scores = {}
    for line in response.splitlines():
        key, colon, value = line.partition(":")
        if not colon:
            continue
        key = key.strip(key_decoration)
        value = value.strip(value_decoration)
        # An empty key cannot be matched to a rubric dimension; drop it.
        if key and value:
            scores[key] = value
    return scores

# =================== LOAD MODEL ===================
# Tokenizer and causal-LM weights are both resolved from `model_id`.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Loader options: half-precision weights; device placement is delegated to the
# library via device_map="auto".
_model_load_options = {
    "device_map": "auto",
    "torch_dtype": torch.float16,
    "trust_remote_code": True,
}
model = AutoModelForCausalLM.from_pretrained(model_id, **_model_load_options)
model.eval()

# =================== LOAD DATA ===================
# Column names assigned positionally to the sheet's leading columns; "_drop"
# marks a column that is discarded right after renaming.
cols = [
    "source_models", "filename", "table_label", "table_caption", "table_content", "referencing_paragraphs_cleaned", "_drop",
    "llama_factual", "llama_Reasoningdepth", "llama_Reasoningtype", "llama_relevance", "llama3.3_70b_sentence", "llama_score",
    "mistral_score", "mistral_factual", "mistral_Reasoningdepth", "mistral_Reasoningtype", "mistral_relevance", "Mistral24B_sentence",
    "gemma_score", "gemma_factual", "gemma_Reasoningdepth", "gemma_Reasoningtype", "gemma_relevance", "Gemma_sentence_gemma"
]

df = pd.read_excel(input_file, sheet_name="Sheet1", skiprows=1)
# The first data row is used as a header and then removed from the data.
df.columns = df.iloc[0]
df = df.iloc[1:].reset_index(drop=True).head(50)

# Keep only as many leading columns as there are names, then rename them.
df = df.iloc[:, :len(cols)]
df.columns = cols
df = df.drop(columns=["_drop"])

# =================== RUN JUDGMENT ===================
# One entry is appended per (row, generated-paragraph column) pair.
results = []

# Maps each generated-paragraph column to the prefix of the columns that hold
# its reference scores (e.g. "llama" -> "llama_factual", "llama_relevance").
model_to_col_prefix = {
    "llama3.3_70b_sentence": "llama",
    "Mistral24B_sentence": "mistral",
    "Gemma_sentence_gemma": "gemma"
}

# Token budgets for the judge: prompt length cap and reply length cap.
MAX_PROMPT_TOKENS = 2048
MAX_NEW_TOKENS = 100

# NOTE(review): the prompt is sent as raw text, without the tokenizer's chat
# template. Confirm this is intended for an instruction-tuned checkpoint.

for _, row in tqdm(df.iterrows(), total=len(df)):
    table = row["table_content"]
    for model_key, prefix in model_to_col_prefix.items():
        model_output = row[model_key]
        prompt = build_prompt(table, model_output)
        encoded = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=MAX_PROMPT_TOKENS,
        ).to(model.device)
        prompt_len = encoded["input_ids"].shape[-1]

        # The paragraph being judged sits at the very end of the prompt, so a
        # prompt that hits the cap has probably lost part (or all) of it
        # (assuming the tokenizer truncates on the right -- confirm).
        if prompt_len >= MAX_PROMPT_TOKENS:
            tqdm.write(
                f"Warning: prompt for {row['filename']} / {model_key} reached "
                f"{MAX_PROMPT_TOKENS} tokens and was likely truncated."
            )

        with torch.no_grad():
            # Greedy decoding. `temperature` is only used when sampling, so it
            # is not passed; passing it with do_sample=False just raises a
            # generation-config warning.
            output = model.generate(
                **encoded,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
            )

        # Decode only the newly generated tokens, not the echoed prompt.
        gen_text = tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)
        scores = extract_structured_scores(gen_text)

        # human/model reference columns
        factual_col = f"{prefix}_factual"
        reasoningdepth_col = f"{prefix}_Reasoningdepth"
        reasoningtype_col = f"{prefix}_Reasoningtype"
        relevance_col = f"{prefix}_relevance"

        results.append({
            "filename": row["filename"],
            # Stored as the column name itself; none of the keys contains
            # "_output", so no suffix stripping applies.
            "model": model_key,
            "llm_output": model_output,
            "llm_judge_scores": scores,
            "raw_response": gen_text,
            "original_by_human_scores": {
                "factual": row.get(factual_col, ""),
                "reasoningdepth": row.get(reasoningdepth_col, ""),
                "reasoningtype": row.get(reasoningtype_col, ""),
                "relevance": row.get(relevance_col, "")
            }
        })

# =================== SAVE ===================
# Serialize before opening the file: opening with "w" truncates it, so a
# serialization error raised afterwards would leave an empty or partial file
# and destroy the results of any earlier run.
serialized_results = json.dumps(results, indent=2, ensure_ascii=False)
with open(output_file, "w", encoding="utf-8") as f:
    f.write(serialized_results)

print(f"Saved {len(results)} judgments to {output_file}")
