## Package Import

In [None]:
import os
import re
import math
import json
import glob
import argparse
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from huggingface_hub import login

login(token="YourToken")

  from .autonotebook import tqdm as notebook_tqdm


## Data Inspection

In [2]:
# 项目根目录（如需在别处运行，改这里）
PROJECT_ROOT = "/Users/penglishao/Desktop/DS5500/project"
PROC_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
SENT_DIR = os.path.join(PROC_DIR, "sentiment_data")
SARC_DIR = os.path.join(PROC_DIR, "sarcasm_data")

def normalize_split(s):
    s = str(s).strip().lower()
    if s in {"val", "valid", "validation", "dev"}:
        return "validation"
    if s in {"train", "training"}:
        return "train"
    if s in {"test", "testing"}:
        return "test"
    return s  # 其余保持原样，方便你发现异常取值

def load_and_merge(root, kind):
    """
    kind="sentiment" 需要列: text, sentiment, split
    kind="sarcasm"   需要列: text, sarcasm,  split
    """
    need = {"text", kind, "split"}
    frames = []
    for p in sorted(glob.glob(os.path.join(root, "*.csv"))):
        df = pd.read_csv(p)
        missing = need - set(df.columns)
        if missing:
            raise ValueError(f"File {os.path.basename(p)} 缺少列: {missing}. 现有列: {list(df.columns)}")
        sub = df[list(need)].copy()
        sub["split"] = sub["split"].map(normalize_split)
        sub["source"] = os.path.basename(p)
        frames.append(sub)
    if not frames:
        raise RuntimeError(f"目录为空: {root}")
    merged = pd.concat(frames, ignore_index=True)
    return merged

def describe(df, label_col, title):
    print(f"\n===== {title} =====")
    # 总体 split 统计
    split_counts = df["split"].value_counts(dropna=False).sort_index()
    print("Split counts:")
    print(split_counts.to_string())
    # 各数据源 × split
    ctab = df.pivot_table(index="source", columns="split", values=label_col, aggfunc="count", fill_value=0)
    print("\nBy source × split:")
    print(ctab.to_string())
    # 简要样本量
    print(f"\nTotal rows: {len(df)} (columns: {list(df.columns)})")

def main():
    os.makedirs(PROC_DIR, exist_ok=True)

    # 1) 合并情感
    sent_df = load_and_merge(SENT_DIR, kind="sentiment")
    out_sent_csv = os.path.join(PROC_DIR, "sentiment_df.csv")
    sent_df.to_csv(out_sent_csv, index=False, encoding="utf-8")
    describe(sent_df, label_col="sentiment", title="SENTIMENT MERGED")

    # 2) 合并讽刺
    sarc_df = load_and_merge(SARC_DIR, kind="sarcasm")
    out_sarc_csv = os.path.join(PROC_DIR, "sarcasm_df.csv")
    sarc_df.to_csv(out_sarc_csv, index=False, encoding="utf-8")
    describe(sarc_df, label_col="sarcasm", title="SARCASM MERGED")

    print(f"\n[OK] Saved:\n  {out_sent_csv}\n  {out_sarc_csv}")

if __name__ == "__main__":
    main()


===== SENTIMENT MERGED =====
Split counts:
split
test           19494
train         254159
validation      8101

By source × split:
split                       test   train  validation
source                                              
sentiment_df_TE.csv        12284   45615        2000
sentiment_marc_mapped.csv   5000  200000        5000
sentiment_sst5_mapped.csv   2210    8544        1101

Total rows: 281754 (columns: ['sentiment', 'text', 'split', 'source'])

===== SARCASM MERGED =====
Split counts:
split
test           2414
train         11009
validation      955

By source × split:
split                        test  train  validation
source                                              
isarcasmeval_merged.csv      1630   4326           0
sarcasm_df_TE.csv             784   2862         955
semeval2018_irony_train.csv     0   3821           0

Total rows: 14378 (columns: ['sarcasm', 'text', 'split', 'source'])

[OK] Saved:
  /Users/penglishao/Desktop/DS5500/project/data/process

## Sentiment and Sarvasm Baseline Run

In [20]:
# %% [markdown]
# Llama 3.1 8B zero-shot eval on validation split for sentiment & sarcasm

# %%
import os, json, re, glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# --------- Paths (auto-resolve for running from baseline/ or project/) ----------
def resolve_path(*candidates):
    for p in candidates:
        if p and os.path.exists(p):
            return p
    raise FileNotFoundError(f"None of these paths exist:\n{candidates}")

SENT_CSV = resolve_path(
    "../data/processed/sentiment_df.csv",   # running from baseline/
    "data/processed/sentiment_df.csv"       # running from project/
)
SARC_CSV = resolve_path(
    "../data/processed/sarcasm_df.csv",
    "data/processed/sarcasm_df.csv"
)
OUT_DIR  = "runs/llama31_zeroshot_eval"    # under baseline/

os.makedirs(OUT_DIR, exist_ok=True)
print("Sentiment CSV:", SENT_CSV)
print("Sarcasm  CSV:", SARC_CSV)
print("Out dir     :", OUT_DIR)

# --------- Device/Dtype ----------
if torch.cuda.is_available():
    DEVICE = "cuda"
    DTYPE  = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
# elif torch.backends.mps.is_available():
#     DEVICE = "mps"
#     DTYPE  = torch.float16
else:
    DEVICE = "cpu"
    DTYPE  = torch.float32
print(f"Device={DEVICE}, dtype={DTYPE}")

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"

SENTIMENT_LABELS = {0: "negative", 1: "neutral", 2: "positive"}
SARCASM_LABELS   = {0: "non-sarcastic", 1: "sarcastic"}

def normalize_split(s: str) -> str:
    if s is None:
        return ""
    t = str(s).strip().lower()
    if t in {"val","valid","validation","dev"}: return "validation"
    if t in {"train","training"}:               return "train"
    if t in {"test","testing"}:                 return "test"
    return t


Sentiment CSV: ../data/processed/sentiment_df.csv
Sarcasm  CSV: ../data/processed/sarcasm_df.csv
Out dir     : runs/llama31_zeroshot_eval
Device=cpu, dtype=torch.float32


In [21]:
# %%
def build_chat_prompt_sentiment(text: str):
    system = (
        "You are a precise sentiment classifier. "
        "Classify the user's message into one of three classes: "
        "0=negative, 1=neutral, 2=positive. "
        "Return exactly one digit (0 or 1 or 2). Output nothing else."
    )
    user = f"Message:\n{text}\n\nReturn only one digit (0/1/2)."
    return system, user

def build_chat_prompt_sarcasm(text: str):
    system = (
        "You are a precise sarcasm/irony detector. "
        'Decide if the user\'s message is sarcastic. '
        "Use 0=non-sarcastic, 1=sarcastic. "
        "Return exactly one digit (0 or 1). Output nothing else."
    )
    user = f"Message:\n{text}\n\nReturn only one digit (0/1)."
    return system, user

_digit012 = re.compile(r"([012])")
_digit01  = re.compile(r"([01])")

def parse_digit_012(s: str) -> int | None:
    if s is None: return None
    t = s.strip()
    if t in {"0","1","2"}: return int(t)
    m = _digit012.search(t);  0
    if m: return int(m.group(1))
    lo = t.lower()
    if "negative" in lo: return 0
    if "neutral"  in lo: return 1
    if "positive" in lo: return 2
    return None

def parse_digit_01(s: str) -> int | None:
    if s is None: return None
    t = s.strip()
    if t in {"0","1"}: return int(t)
    m = _digit01.search(t)
    if m: return int(m.group(1))
    lo = t.lower()
    if "non-sarcastic" in lo or "non sarcastic" in lo: return 0
    if "sarcastic" in lo: return 1
    return None


In [22]:
# %%
def load_model(model_id: str = MODEL_ID):
    print(f"[INFO] Loading: {model_id}")
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        dtype=DTYPE,           # 新版用 dtype
        device_map="auto",     # 自动放置；MPS/CPU/单卡CUDA都可
        low_cpu_mem_usage=True
    )
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token
    return tok, model

@torch.no_grad()
def generate_batch(tokenizer, model, systems, users, max_new_tokens=2):
    prompts = []
    for sys, usr in zip(systems, users):
        messages = [{"role":"system","content":sys}, {"role":"user","content":usr}]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        prompts.append(prompt)

    inputs = tokenizer(
        prompts, return_tensors="pt", padding=True, truncation=True, max_length=512
    ).to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,                  # 贪心；稳定输出单字符
        eos_token_id=tokenizer.eos_token_id,
    )
    gen = out[:, inputs["input_ids"].shape[1]:]
    return tokenizer.batch_decode(gen, skip_special_tokens=True)


In [None]:
# %%
def run_eval_on_validation(df: pd.DataFrame, task: str, tokenizer, model, batch_size=8):
    """
    只评测 validation；若没有 validation，则返回空结果并提示。
    列要求：
      sentiment: text,sentiment,split
      sarcasm:   text,sarcasm,split
    """
    assert task in {"sentiment","sarcasm"}
    label_col = "sentiment" if task=="sentiment" else "sarcasm"
    id2name   = SENTIMENT_LABELS if task=="sentiment" else SARCASM_LABELS
    parser    = parse_digit_012 if task=="sentiment" else parse_digit_01
    build     = build_chat_prompt_sentiment if task=="sentiment" else build_chat_prompt_sarcasm

    # 只取 validation
    sub = df.copy()
    sub["split_norm"] = sub["split"].map(normalize_split)
    sub = sub[sub["split_norm"]=="validation"][["text", label_col, "split"]].reset_index(drop=True)

    if sub.empty:
        print(f"[Skip] No validation rows for task={task}.")
        return None

    preds, gts = [], []
    systems, users, idx_buf = [], [], []
    pbar = tqdm(total=len(sub), desc=f"{task.capitalize()} (validation)", unit="sample")

    for i, row in sub.iterrows():
        sys, usr = build(row["text"])
        systems.append(sys); users.append(usr); idx_buf.append(i)

        if len(systems) >= batch_size:
            outs = generate_batch(tokenizer, model, systems, users)
            for j, out in enumerate(outs):
                p = parser(out)
                if p is None: p = 1 if task=="sentiment" else 0
                preds.append(p); gts.append(int(sub.iloc[idx_buf[j]][label_col]))
                pbar.update(1)
            systems, users, idx_buf = [], [], []

    if systems:
        outs = generate_batch(tokenizer, model, systems, users)
        for j, out in enumerate(outs):
            p = parser(out)
            if p is None: p = 1 if task=="sentiment" else 0
            preds.append(p); gts.append(int(sub.iloc[idx_buf[j]][label_col]))
            pbar.update(1)

    pbar.close()

    preds = np.array(preds); gts = np.array(gts)
    labels = sorted(set(gts) | set(preds))
    cm = confusion_matrix(gts, preds, labels=labels)
    acc = accuracy_score(gts, preds)
    macro_f1 = f1_score(gts, preds, average="macro")
    rep = classification_report(gts, preds, target_names=[id2name[i] for i in labels], digits=4)

    # >>> 把 NumPy 类型都转成纯 Python 可 JSON 化的类型
    labels_list = [int(x) for x in labels]
    cm_list = cm.astype(int).tolist()

    return {
        "labels": labels_list,           # 纯 Python int
        "confusion_matrix": cm_list,     # 纯 Python list[list[int]]
        "accuracy": float(acc),          # 纯 Python float
        "macro_f1": float(macro_f1),     # 纯 Python float
        "report": str(rep),              # 字符串
        "n": int(len(sub)),              # 纯 Python int
    }


In [24]:
# %%
sent_df = pd.read_csv(SENT_CSV)
sarc_df = pd.read_csv(SARC_CSV)
# 基础列检查
assert {"text","sentiment","split"}.issubset(sent_df.columns), f"Sentiment cols wrong: {sent_df.columns}"
assert {"text","sarcasm","split"}.issubset(sarc_df.columns),  f"Sarcasm cols wrong: {sarc_df.columns}"

# 查看各自 validation 数量
def count_val(df):
    x = df["split"].map(normalize_split).value_counts()
    return int(x.get("validation", 0)), x.to_dict()

sent_val_n, sent_split_counts = count_val(sent_df)
sarc_val_n, sarc_split_counts = count_val(sarc_df)
print("Sentiment split counts:", sent_split_counts)
print("Sarcasm  split counts:", sarc_split_counts)
print(f"Sentiment validation size: {sent_val_n}")
print(f"Sarcasm  validation size: {sarc_val_n}")


Sentiment split counts: {'train': 254159, 'test': 19494, 'validation': 8101}
Sarcasm  split counts: {'train': 11009, 'test': 2414, 'validation': 955}
Sentiment validation size: 8101
Sarcasm  validation size: 955


In [None]:
# %%
tokenizer, model = load_model(MODEL_ID)

print("\n########## EVAL: SENTIMENT (validation only) ##########")
res_sent = run_eval_on_validation(sent_df, "sentiment", tokenizer, model, batch_size=8)
if res_sent is not None:
    print(f"\n=== SENTIMENT | validation | N={res_sent['n']} ===")
    print(f"Accuracy: {res_sent['accuracy']:.4f} | Macro-F1: {res_sent['macro_f1']:.4f}")
    print("Confusion matrix (rows=true, cols=pred):")
    print(np.array(res_sent["confusion_matrix"]))
    print(res_sent["report"])

    with open(os.path.join(OUT_DIR, "sentiment_results_validation.json"), "w") as f:
        json.dump(res_sent, f, indent=2)
    print("[OK] saved:", os.path.join(OUT_DIR, "sentiment_results_validation.json"))


[INFO] Loading: meta-llama/Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [03:20<00:00, 50.16s/it] 
Some parameters are on the meta device because they were offloaded to the disk.



########## EVAL: SENTIMENT (validation only) ##########


Sentiment (validation):   0%|          | 0/8101 [00:00<?, ?sample/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Sentiment (validation):   0%|          | 1/8101 [07:42<1040:14:33, 462.33s/sample]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


In [None]:
# %%
print("\n########## EVAL: SARCASM (validation only) ##########")
res_sarc = run_eval_on_validation(sarc_df, "sarcasm", tokenizer, model, batch_size=8)
if res_sarc is not None:
    print(f"\n=== SARCASM | validation | N={res_sarc['n']} ===")
    print(f"Accuracy: {res_sarc['accuracy']:.4f} | Macro-F1: {res_sarc['macro_f1']:.4f}")
    print("Confusion matrix (rows=true, cols=pred):")
    print(np.array(res_sarc["confusion_matrix"]))
    print(res_sarc["report"])

    with open(os.path.join(OUT_DIR, "sarcasm_results_validation.json"), "w") as f:
        json.dump(res_sarc, f, indent=2)
    print("[OK] saved:", os.path.join(OUT_DIR, "sarcasm_results_validation.json"))


In [2]:
# eval_llama_validation.py
# 读取 data/processed/sentiment_df.csv & sarcasm_df.csv
# 只在 split=validation 上评测 sentiment(0/1/2) 和 sarcasm(0/1)
# 结果保存到 --out_dir 下的 *_results_validation.json

import os
import re
import json
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# -----------------------
# Config: device & dtype
# -----------------------
DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"

# 检测顺序：CUDA > MPS > CPU
if torch.cuda.is_available():
    DEVICE = "cuda"; DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
elif torch.backends.mps.is_available():
    DEVICE = "mps";  DTYPE = torch.float16
else:
    DEVICE = "cpu";  DTYPE = torch.float32

SENTIMENT_LABELS = {0: "negative", 1: "neutral", 2: "positive"}
SARCASM_LABELS   = {0: "non-sarcastic", 1: "sarcastic"}


# -----------------------
# Split helpers
# -----------------------
def normalize_split(s: str) -> str:
    if s is None:
        return ""
    t = str(s).strip().lower()
    if t in {"val", "valid", "validation", "dev"}: return "validation"
    if t in {"train", "training"}:                 return "train"
    if t in {"test", "testing"}:                   return "test"
    return t

def filter_validation(df: pd.DataFrame) -> pd.DataFrame:
    if "split" not in df.columns:
        return df.iloc[0:0].copy()
    sub = df.copy()
    sub["split_norm"] = sub["split"].map(normalize_split)
    sub = sub[sub["split_norm"] == "validation"].copy()
    return sub.reset_index(drop=True)


# -----------------------
# Prompts
# -----------------------
def build_chat_prompt_sentiment(text: str):
    system = (
        "You are a precise sentiment classifier. "
        "Classify the user's message into one of three classes: "
        "0=negative, 1=neutral, 2=positive. "
        "Return exactly one digit (0 or 1 or 2). Output nothing else."
    )
    user = f"Message:\n{text}\n\nReturn only one digit (0/1/2)."
    return system, user

def build_chat_prompt_sarcasm(text: str):
    system = (
        "You are a precise sarcasm detector. "
        "Decide if the user's message is sarcastic. "
        "Use 0=non-sarcastic, 1=sarcastic. "
        "Return exactly one digit (0 or 1). Output nothing else."
    )
    user = f"Message:\n{text}\n\nReturn only one digit (0/1)."
    return system, user


# -----------------------
# Parsers
# -----------------------
_digit012 = re.compile(r"([012])")
_digit01  = re.compile(r"([01])")

def parse_digit_012(s: str) -> int | None:
    if s is None: return None
    t = s.strip()
    if t in {"0","1","2"}: return int(t)
    m = _digit012.search(t)
    if m: return int(m.group(1))
    lo = t.lower()
    if "negative" in lo: return 0
    if "neutral"  in lo: return 1
    if "positive" in lo: return 2
    return None

def parse_digit_01(s: str) -> int | None:
    if s is None: return None
    t = s.strip()
    if t in {"0","1"}: return int(t)
    m = _digit01.search(t)
    if m: return int(m.group(1))
    lo = t.lower()
    if "non-sarcastic" in lo or "non sarcastic" in lo: return 0
    if "sarcastic" in lo: return 1
    return None


# -----------------------
# Model loading & generate
# -----------------------
def load_model(model_name: str = DEFAULT_MODEL):
    print(f"[INFO] Loading model: {model_name}")
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # 关键：decoder-only 用左填充 + 左截断
    tok.padding_side = "left"
    tok.truncation_side = "left"
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token

    # 明确 device_map，MPS 上用 eager 更稳（不要 flash_attn）
    if DEVICE == "cuda":
        device_map = "auto"
        attn_impl = "flash_attention_2"  # 仅 CUDA 可用；如果报错就换 "eager"
    elif DEVICE == "mps":
        device_map = {"": "mps"}
        attn_impl = "eager"
    else:
        device_map = {"": "cpu"}
        attn_impl = "eager"

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        dtype=DTYPE,
        device_map=device_map,
        low_cpu_mem_usage=True,
        attn_implementation=attn_impl,
    )
    return tok, model

@torch.no_grad()
def generate_batch(tokenizer, model, systems, users, max_new_tokens=2):
    prompts = []
    for sys, usr in zip(systems, users):
        messages = [{"role":"system","content":sys}, {"role":"user","content":usr}]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        prompts.append(prompt)

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,             # 现在会用左填充
        truncation=True,
        max_length=512,
    ).to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,          # 贪心；不打印 top_p/temperature 的 warn 了
        pad_token_id=tokenizer.pad_token_id,   # 防止每次 fallback 提示
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    gen_tokens = out[:, inputs["input_ids"].shape[1]:]
    return tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)


# -----------------------
# Eval (validation only)
# -----------------------
def run_eval_validation(df: pd.DataFrame, task: str, tokenizer, model, batch_size=8, desc=""):
    """
    只评测 validation。
      sentiment 需要列: text, sentiment, split
      sarcasm   需要列: text, sarcasm,  split
    """
    assert task in {"sentiment","sarcasm"}
    label_col = "sentiment" if task=="sentiment" else "sarcasm"
    id2name   = SENTIMENT_LABELS if task=="sentiment" else SARCASM_LABELS
    parser    = parse_digit_012 if task=="sentiment" else parse_digit_01
    build     = build_chat_prompt_sentiment if task=="sentiment" else build_chat_prompt_sarcasm

    sub = filter_validation(df)
    if sub.empty:
        print(f"[Skip] No validation rows for task={task}.")
        return None

    preds, gts = [], []
    systems, users, idx_buf = [], [], []
    pbar = tqdm(total=len(sub), desc=desc or f"{task.capitalize()} (validation)", unit="sample")

    for i, row in sub.iterrows():
        sys, usr = build(row["text"])
        systems.append(sys); users.append(usr); idx_buf.append(i)

        if len(systems) >= batch_size:
            outs = generate_batch(tokenizer, model, systems, users)
            for j, out in enumerate(outs):
                p = parser(out)
                if p is None: p = 1 if task=="sentiment" else 0
                preds.append(p)
                gts.append(int(sub.iloc[idx_buf[j]][label_col]))
                pbar.update(1)
            systems, users, idx_buf = [], [], []

    if systems:
        outs = generate_batch(tokenizer, model, systems, users)
        for j, out in enumerate(outs):
            p = parser(out)
            if p is None: p = 1 if task=="sentiment" else 0
            preds.append(p)
            gts.append(int(sub.iloc[idx_buf[j]][label_col]))
            pbar.update(1)

    pbar.close()

    preds = np.array(preds); gts = np.array(gts)
    labels = sorted(set(gts) | set(preds))
    cm = confusion_matrix(gts, preds, labels=labels)
    acc = accuracy_score(gts, preds)
    macro_f1 = f1_score(gts, preds, average="macro")
    rep = classification_report(gts, preds, target_names=[id2name[i] for i in labels], digits=4)

    return {
        "labels": labels,
        "confusion_matrix": cm.tolist(),
        "accuracy": acc,
        "macro_f1": macro_f1,
        "report": rep,
        "n": len(sub),
    }


# -----------------------
# Main
# -----------------------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", type=str, default=DEFAULT_MODEL)
    ap.add_argument("--bsz", type=int, default=8)
    ap.add_argument("--sentiment_csv", type=str, default="../data/processed/sentiment_df.csv")
    ap.add_argument("--sarcasm_csv",  type=str, default="../data/processed/sarcasm_df.csv")
    ap.add_argument("--out_dir", type=str, default="runs/llama31_zeroshot_eval")
    args, _ = ap.parse_known_args()

    # Load data
    sent_df = pd.read_csv(args.sentiment_csv)
    sarc_df = pd.read_csv(args.sarcasm_csv)
    assert {"text","sentiment","split"}.issubset(sent_df.columns), f"Bad columns in {args.sentiment_csv}: {sent_df.columns}"
    assert {"text","sarcasm","split"}.issubset(sarc_df.columns),  f"Bad columns in {args.sarcasm_csv}: {sarc_df.columns}"

    # Show split counts
    def split_counts(df):
        return df["split"].map(normalize_split).value_counts().to_dict()
    print("Sentiment split counts:", split_counts(sent_df))
    print("Sarcasm  split counts:", split_counts(sarc_df))

    # Load model
    tokenizer, model = load_model(args.model)
    os.makedirs(args.out_dir, exist_ok=True)

    # Sentiment (validation only)
    print("\n########## EVAL: SENTIMENT (validation only) ##########")
    res_sent = run_eval_validation(
        sent_df, "sentiment", tokenizer, model,
        batch_size=args.bsz, desc="Sentiment (validation)"
    )
    if res_sent is not None:
        print(f"\n=== SENTIMENT | validation | N={res_sent['n']} ===")
        print(f"Accuracy: {res_sent['accuracy']:.4f} | Macro-F1: {res_sent['macro_f1']:.4f}")
        print("Confusion matrix (rows=true, cols=pred):")
        print(np.array(res_sent["confusion_matrix"]))
        print(res_sent["report"])
        out_sent = os.path.join(args.out_dir, "sentiment_results_validation.json")
        with open(out_sent, "w") as f:
            json.dump(res_sent, f, indent=2)
        print("[OK] saved:", out_sent)

    # Sarcasm (validation only)
    print("\n########## EVAL: SARCASM (validation only) ##########")
    res_sarc = run_eval_validation(
        sarc_df, "sarcasm", tokenizer, model,
        batch_size=args.bsz, desc="Sarcasm (validation)"
    )
    if res_sarc is not None:
        print(f"\n=== SARCASM | validation | N={res_sarc['n']} ===")
        print(f"Accuracy: {res_sarc['accuracy']:.4f} | Macro-F1: {res_sarc['macro_f1']:.4f}")
        print("Confusion matrix (rows=true, cols=pred):")
        print(np.array(res_sarc["confusion_matrix"]))
        print(res_sarc["report"])
        out_sarc = os.path.join(args.out_dir, "sarcasm_results_validation.json")
        with open(out_sarc, "w") as f:
            json.dump(res_sarc, f, indent=2)
        print("[OK] saved:", out_sarc)


if __name__ == "__main__":
    main()


Sentiment split counts: {'train': 254159, 'test': 19494, 'validation': 8101}
Sarcasm  split counts: {'train': 11009, 'test': 2414, 'validation': 955}
[INFO] Loading model: meta-llama/Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:11<00:00,  2.79s/it]



########## EVAL: SENTIMENT (validation only) ##########


Sentiment (validation):   0%|          | 0/8101 [00:00<?, ?sample/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Sentiment (validation): 100%|██████████| 8101/8101 [56:18<00:00,  2.40sample/s]  



=== SENTIMENT | validation | N=8101 ===
Accuracy: 0.5586 | Macro-F1: 0.4956
Confusion matrix (rows=true, cols=pred):
[[ 878  483 1379]
 [ 114  573 1411]
 [   8  181 3074]]
              precision    recall  f1-score   support

    negative     0.8780    0.3204    0.4695      2740
     neutral     0.4632    0.2731    0.3436      2098
    positive     0.5242    0.9421    0.6736      3263

    accuracy                         0.5586      8101
   macro avg     0.6218    0.5119    0.4956      8101
weighted avg     0.6281    0.5586    0.5191      8101



TypeError: Object of type int64 is not JSON serializable

In [2]:
# eval_llama_validation_nosave.py
# 读取 ../data/processed/sentiment_df.csv 和 ../data/processed/sarcasm_df.csv
# 仅评测 split=validation 的样本；打印 Accuracy / Macro-F1 / 混淆矩阵与分类报告
# 不保存任何 json 到磁盘

import os
import re
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report, f1_score, accuracy_score

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# -----------------------
# Config: device & dtype
# -----------------------
DEFAULT_MODEL = "meta-llama/Llama-3.1-8B-Instruct"

# 检测顺序：CUDA > MPS > CPU
if torch.cuda.is_available():
    DEVICE = "cuda"; DTYPE = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
elif torch.backends.mps.is_available():
    DEVICE = "mps";  DTYPE = torch.float16
else:
    DEVICE = "cpu";  DTYPE = torch.float32

SENTIMENT_LABELS = {0: "negative", 1: "neutral", 2: "positive"}
SARCASM_LABELS   = {0: "non-sarcastic", 1: "sarcastic"}

# -----------------------
# Split helpers
# -----------------------
def normalize_split(s: str) -> str:
    if s is None:
        return ""
    t = str(s).strip().lower()
    if t in {"val", "valid", "validation", "dev"}: return "validation"
    if t in {"train", "training"}:                 return "train"
    if t in {"test", "testing"}:                   return "test"
    return t

def filter_validation(df: pd.DataFrame) -> pd.DataFrame:
    if "split" not in df.columns:
        return df.iloc[0:0].copy()
    sub = df.copy()
    sub["split_norm"] = sub["split"].map(normalize_split)
    sub = sub[sub["split_norm"] == "validation"].copy()
    return sub.reset_index(drop=True)

# -----------------------
# Prompts
# -----------------------
def build_chat_prompt_sentiment(text: str):
    system = (
        "You are a precise sentiment classifier. "
        "Classify the user's message into one of three classes: "
        "0=negative, 1=neutral, 2=positive. "
        "Return exactly one digit (0 or 1 or 2). Output nothing else."
    )
    user = f"Message:\n{text}\n\nReturn only one digit (0/1/2)."
    return system, user

def build_chat_prompt_sarcasm(text: str):
    system = (
        "You are a precise sarcasm detector. "
        "Decide if the user's message is sarcastic. "
        "Use 0=non-sarcastic, 1=sarcastic. "
        "Return exactly one digit (0 or 1). Output nothing else."
    )
    user = f"Message:\n{text}\n\nReturn only one digit (0/1)."
    return system, user

# -----------------------
# Parsers
# -----------------------
_digit012 = re.compile(r"([012])")
_digit01  = re.compile(r"([01])")

def parse_digit_012(s: str) -> int | None:
    if s is None: return None
    t = s.strip()
    if t in {"0","1","2"}: return int(t)
    m = _digit012.search(t)
    if m: return int(m.group(1))
    lo = t.lower()
    if "negative" in lo: return 0
    if "neutral"  in lo: return 1
    if "positive" in lo: return 2
    return None

def parse_digit_01(s: str) -> int | None:
    if s is None: return None
    t = s.strip()
    if t in {"0","1"}: return int(t)
    m = _digit01.search(t)
    if m: return int(m.group(1))
    lo = t.lower()
    if "non-sarcastic" in lo or "non sarcastic" in lo: return 0
    if "sarcastic" in lo: return 1
    return None

# -----------------------
# Model loading & generate
# -----------------------
def load_model(model_name: str = DEFAULT_MODEL):
    print(f"[INFO] Loading model: {model_name}")
    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True)
    # decoder-only: 左填充 + 左截断
    tok.padding_side = "left"
    tok.truncation_side = "left"
    if tok.pad_token_id is None and tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token

    # attn_implementation 在不同版本 transformers 里可选
    if DEVICE == "cuda":
        device_map = "auto"
        attn_impl = "flash_attention_2"
    elif DEVICE == "mps":
        device_map = {"": "mps"}
        attn_impl = "eager"
    else:
        device_map = {"": "cpu"}
        attn_impl = "eager"

    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            dtype=DTYPE,
            device_map=device_map,
            low_cpu_mem_usage=True,
            attn_implementation=attn_impl,
        )
    except TypeError:
        # 兼容老版本 transformers（没有 attn_implementation 参数）
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            dtype=DTYPE,
            device_map=device_map,
            low_cpu_mem_usage=True,
        )
    return tok, model

@torch.no_grad()
def generate_batch(tokenizer, model, systems, users, max_new_tokens=2):
    prompts = []
    for sys, usr in zip(systems, users):
        messages = [{"role":"system","content":sys}, {"role":"user","content":usr}]
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        prompts.append(prompt)

    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,      # 左填充
        truncation=True,
        max_length=512,
    ).to(model.device)

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        use_cache=True,
    )
    gen_tokens = out[:, inputs["input_ids"].shape[1]:]
    return tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

# -----------------------
# Eval (validation only)
# -----------------------
def run_eval_validation(df: pd.DataFrame, task: str, tokenizer, model, batch_size=8, desc=""):
    """
    只评测 validation。
      sentiment 需要列: text, sentiment, split
      sarcasm   需要列: text, sarcasm,  split
    """
    assert task in {"sentiment","sarcasm"}
    label_col = "sentiment" if task=="sentiment" else "sarcasm"
    id2name   = SENTIMENT_LABELS if task=="sentiment" else SARCASM_LABELS
    parser    = parse_digit_012 if task=="sentiment" else parse_digit_01
    build     = build_chat_prompt_sentiment if task=="sentiment" else build_chat_prompt_sarcasm

    sub = filter_validation(df)
    if sub.empty:
        print(f"[Skip] No validation rows for task={task}.")
        return None

    preds, gts = [], []
    systems, users, idx_buf = [], [], []
    pbar = tqdm(total=len(sub), desc=desc or f"{task.capitalize()} (validation)", unit="sample")

    for i, row in sub.iterrows():
        sys, usr = build(row["text"])
        systems.append(sys); users.append(usr); idx_buf.append(i)

        if len(systems) >= batch_size:
            outs = generate_batch(tokenizer, model, systems, users)
            for j, out in enumerate(outs):
                p = parser(out)
                if p is None: p = 1 if task=="sentiment" else 0
                preds.append(p)
                gts.append(int(sub.iloc[idx_buf[j]][label_col]))
                pbar.update(1)
            systems, users, idx_buf = [], [], []

    if systems:
        outs = generate_batch(tokenizer, model, systems, users)
        for j, out in enumerate(outs):
            p = parser(out)
            if p is None: p = 1 if task=="sentiment" else 0
            preds.append(p)
            gts.append(int(sub.iloc[idx_buf[j]][label_col]))
            pbar.update(1)

    pbar.close()

    preds = np.array(preds); gts = np.array(gts)
    labels = sorted(set(gts) | set(preds))
    cm = confusion_matrix(gts, preds, labels=labels)
    acc = accuracy_score(gts, preds)
    macro_f1 = f1_score(gts, preds, average="macro")
    rep = classification_report(gts, preds, target_names=[id2name[i] for i in labels], digits=4)

    # 直接返回可打印对象（不做 JSON 序列化）
    return {
        "labels": [int(x) for x in labels],
        "confusion_matrix": cm.astype(int),
        "accuracy": float(acc),
        "macro_f1": float(macro_f1),
        "report": rep,
        "n": int(len(sub)),
    }

# -----------------------
# Main
# -----------------------
def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--model", type=str, default=DEFAULT_MODEL)
    ap.add_argument("--bsz", type=int, default=8)
    ap.add_argument("--sentiment_csv", type=str, default="../data/processed/sentiment_df.csv")
    ap.add_argument("--sarcasm_csv",  type=str, default="../data/processed/sarcasm_df.csv")
    args, _ = ap.parse_known_args()

    # Load data
    sent_df = pd.read_csv(args.sentiment_csv)
    sarc_df = pd.read_csv(args.sarcasm_csv)
    assert {"text","sentiment","split"}.issubset(sent_df.columns), f"Bad columns in {args.sentiment_csv}: {sent_df.columns}"
    assert {"text","sarcasm","split"}.issubset(sarc_df.columns),  f"Bad columns in {args.sarcasm_csv}: {sarc_df.columns}"

    # Show split counts
    def split_counts(df):
        return df["split"].map(normalize_split).value_counts().to_dict()
    print("Sentiment split counts:", split_counts(sent_df))
    print("Sarcasm  split counts:", split_counts(sarc_df))

    # Load model
    tokenizer, model = load_model(args.model)

    # Sentiment (validation only)
    print("\n########## EVAL: SENTIMENT (validation only) ##########")
    res_sent = run_eval_validation(
        sent_df, "sentiment", tokenizer, model,
        batch_size=args.bsz, desc="Sentiment (validation)"
    )
    if res_sent is not None:
        print(f"\n=== SENTIMENT | validation | N={res_sent['n']} ===")
        print(f"Accuracy: {res_sent['accuracy']:.4f} | Macro-F1: {res_sent['macro_f1']:.4f}")
        print("Confusion matrix (rows=true, cols=pred):")
        print(np.array(res_sent["confusion_matrix"]))
        print(res_sent["report"])

    # Sarcasm (validation only)
    print("\n########## EVAL: SARCASM (validation only) ##########")
    res_sarc = run_eval_validation(
        sarc_df, "sarcasm", tokenizer, model,
        batch_size=args.bsz, desc="Sarcasm (validation)"
    )
    if res_sarc is not None:
        print(f"\n=== SARCASM | validation | N={res_sarc['n']} ===")
        print(f"Accuracy: {res_sarc['accuracy']:.4f} | Macro-F1: {res_sarc['macro_f1']:.4f}")
        print("Confusion matrix (rows=true, cols=pred):")
        print(np.array(res_sarc["confusion_matrix"]))
        print(res_sarc["report"])

if __name__ == "__main__":
    main()


Sentiment split counts: {'train': 254159, 'test': 19494, 'validation': 8101}
Sarcasm  split counts: {'train': 11009, 'test': 2414, 'validation': 955}
[INFO] Loading model: meta-llama/Llama-3.1-8B-Instruct


Loading checkpoint shards: 100%|██████████| 4/4 [00:10<00:00,  2.56s/it]



########## EVAL: SENTIMENT (validation only) ##########


Sentiment (validation):   0%|          | 0/8101 [00:00<?, ?sample/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Sentiment (validation): 100%|██████████| 8101/8101 [56:30<00:00,  2.39sample/s]  



=== SENTIMENT | validation | N=8101 ===
Accuracy: 0.5586 | Macro-F1: 0.4956
Confusion matrix (rows=true, cols=pred):
[[ 878  483 1379]
 [ 114  573 1411]
 [   8  181 3074]]
              precision    recall  f1-score   support

    negative     0.8780    0.3204    0.4695      2740
     neutral     0.4632    0.2731    0.3436      2098
    positive     0.5242    0.9421    0.6736      3263

    accuracy                         0.5586      8101
   macro avg     0.6218    0.5119    0.4956      8101
weighted avg     0.6281    0.5586    0.5191      8101


########## EVAL: SARCASM (validation only) ##########


Sarcasm (validation): 100%|██████████| 955/955 [04:58<00:00,  3.20sample/s]


=== SARCASM | validation | N=955 ===
Accuracy: 0.5110 | Macro-F1: 0.3978
Confusion matrix (rows=true, cols=pred):
[[ 37 462]
 [  5 451]]
               precision    recall  f1-score   support

non-sarcastic     0.8810    0.0741    0.1368       499
    sarcastic     0.4940    0.9890    0.6589       456

     accuracy                         0.5110       955
    macro avg     0.6875    0.5316    0.3978       955
 weighted avg     0.6962    0.5110    0.3861       955




