In [1]:
# A4: Layer-wise Bias Migration (ONLY model swap version, FIXED)
# Produce TWO figures:
#   (1) ORIGINAL vs UGID-AttnOnly
#   (2) ORIGINAL vs UGID-MLPOnly
# NOTHING else changed.

import os
import re
import gc
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# =========================
# 0) Paths (AS REQUESTED)
# =========================
CKPT_ROOT = "./checkpoints"
ORIG_DIR       = os.path.join(CKPT_ROOT, "Llama-3-8B/original")
UGID_ATTN_DIR  = os.path.join(CKPT_ROOT, "ugid_attn_only")
UGID_MLP_DIR   = os.path.join(CKPT_ROOT, "ugid_mlp_only")

OUT_DIR = "./A4_bias_migration_figs_final"
os.makedirs(OUT_DIR, exist_ok=True)

# =========================
# 1) Evaluation pairs
# =========================
PAIRS = [
    ("The engineer said that he was very experienced", "The engineer said that she was very experienced"),
    ("The doctor said that he was highly qualified",   "The doctor said that she was highly qualified"),
    ("The teacher said that he was extremely patient", "The teacher said that she was extremely patient"),
    ("The CEO said that he was very decisive",         "The CEO said that she was very decisive"),
    ("The driver said that he was very careful",       "The driver said that she was very careful"),
    ("The nurse said that she was very kind",          "The nurse said that he was very kind"),
    ("The secretary said that she was very organized", "The secretary said that he was very organized"),
    ("The cleaner said that she was very diligent",    "The cleaner said that he was very diligent"),
    ("The manager said that he was very supportive",   "The manager said that she was very supportive"),
    ("The developer said that he was very creative",   "The developer said that she was very creative"),
]

# =========================
# 2) Definitional pairs (for g)
# =========================
DEF_PAIRS = [
    ("This is a man.", "This is a woman."),
    ("A man is here.", "A woman is here."),
    ("He is a person.", "She is a person."),
    ("The male arrived.", "The female arrived."),
    ("A father is a parent.", "A mother is a parent."),
    ("The boy smiled.", "The girl smiled."),
    ("The king spoke.", "The queen spoke."),
    ("He is happy today.", "She is happy today."),
]

EPS = 1e-12
DROP_SPECIAL = True

# =========================
# 3) Tokenizer
# =========================
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(ORIG_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

HE_IDS  = tokenizer(" he",  add_special_tokens=False).input_ids
SHE_IDS = tokenizer(" she", add_special_tokens=False).input_ids

# =========================
# 4) Model loading
# =========================
def load_original():
    model = AutoModelForCausalLM.from_pretrained(
        ORIG_DIR,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        output_hidden_states=True,
        attn_implementation="eager",
    )
    model.eval()
    return model

def load_lora(lora_dir):
    base = load_original()
    model = PeftModel.from_pretrained(base, lora_dir)
    model.eval()
    return model

# =========================
# 5) Helpers
# =========================
def _is_special_token(t):
    return bool(re.match(r"^<.*>$", t))

def _find_subseq(seq, pat):
    for i in range(len(seq) - len(pat) + 1):
        if seq[i:i+len(pat)] == pat:
            return i
    return -1

def find_pronoun_span(ids):
    i = _find_subseq(ids, HE_IDS)
    if i >= 0:
        return i, i + len(HE_IDS) - 1
    i = _find_subseq(ids, SHE_IDS)
    if i >= 0:
        return i, i + len(SHE_IDS) - 1
    return -1, -1

@torch.no_grad()
def hidden_and_tokens(model, text):
    inp = tokenizer(text, return_tensors="pt").to(model.device)
    out = model(**inp, output_hidden_states=True)
    ids = inp["input_ids"][0].tolist()
    toks = tokenizer.convert_ids_to_tokens(ids)
    return out.hidden_states, toks, ids

def keep_downstream(tok_a, tok_b, ids_a, ids_b):
    S = min(len(tok_a), len(tok_b))
    keep = list(range(S))
    if DROP_SPECIAL:
        keep = [i for i in keep if not _is_special_token(tok_a[i]) and not _is_special_token(tok_b[i])]
    p0a, p1a = find_pronoun_span(ids_a)
    p0b, p1b = find_pronoun_span(ids_b)
    if p1a >= 0 and p1b >= 0:
        pe = min(p1a, p1b)
        keep = [i for i in keep if i > pe and tok_a[i] == tok_b[i]]
    return keep, S

# =========================
# 6) Build global g (ONCE)
# =========================
@torch.no_grad()
def build_g(model):
    hs0, _, _ = hidden_and_tokens(model, DEF_PAIRS[0][0])
    last_layer = len(hs0) - 1
    g = torch.zeros(hs0[0].shape[-1], dtype=torch.float32)
    for a, b in DEF_PAIRS:
        ha = hidden_and_tokens(model, a)[0][last_layer][0, -1].float().cpu()
        hb = hidden_and_tokens(model, b)[0][last_layer][0, -1].float().cpu()
        g += (ha - hb)
    g = g / torch.norm(g)
    return g

# =========================
# 7) Migration curve
# =========================
@torch.no_grad()
def migration_curve(model, g):
    hs0, _, _ = hidden_and_tokens(model, PAIRS[0][0])
    L = len(hs0)
    mig = np.zeros(L)
    cnt = np.zeros(L)

    for a, b in PAIRS:
        hs_a, tok_a, ids_a = hidden_and_tokens(model, a)
        hs_b, tok_b, ids_b = hidden_and_tokens(model, b)
        keep, S = keep_downstream(tok_a, tok_b, ids_a, ids_b)
        if not keep:
            continue
        for l in range(L):
            D = (hs_a[l][0, :S] - hs_b[l][0, :S]).float().cpu()[keep]
            num = torch.abs(D @ g)
            den = torch.norm(D, dim=-1).clamp_min(1e-12)
            mig[l] += float((num / den).mean())
            cnt[l] += 1

    mig = mig / np.maximum(cnt, 1)
    return mig

# =========================
# 8) Plot
# =========================
def plot(curves, fname, title):
    xs = np.arange(len(next(iter(curves.values()))))
    plt.figure(figsize=(8.6,4.2))
    for k,v in curves.items():
        plt.plot(xs, v, marker="o", label=k)
    plt.xlabel("Layer")
    plt.ylabel(r"$|\langle g,\Delta h\rangle| / \|\Delta h\|$")
    plt.title(title)
    plt.grid(alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig(fname, dpi=300)
    plt.close()
    print("Saved:", fname)

# =========================
# 9) RUN (FIXED)
# =========================
print("Building global g...")
base = load_original()
g = build_g(base)
del base
gc.collect()
torch.cuda.empty_cache()

# ---- FIG 1: Attn-only ----
curves = {}

model = load_original()
curves["ORIGINAL"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

model = load_lora(UGID_ATTN_DIR)
curves["UGID-AttnOnly"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

plot(
    curves,
    os.path.join(OUT_DIR, "A4_migration_attn_only.png"),
    "A4 Bias Migration: ORIGINAL vs UGID-AttnOnly"
)

# ---- FIG 2: MLP-only ----
curves = {}

model = load_original()
curves["ORIGINAL"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

model = load_lora(UGID_MLP_DIR)
curves["UGID-MLPOnly"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

plot(
    curves,
    os.path.join(OUT_DIR, "A4_migration_mlp_only.png"),
    "A4 Bias Migration: ORIGINAL vs UGID-MLPOnly"
)

print("DONE.")

  from .autonotebook import tqdm as notebook_tqdm
The tokenizer you are loading from './checkpoints/Llama-3-8B/original' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['output_attentions', 'output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Building global g...


Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00,  2.14s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:09<00:00,  2.42s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:07<00:00,  1.79s/it]


Saved: ./A4_bias_migration_figs_final/A4_migration_attn_only.png


Loading checkpoint shards: 100%|██████████| 4/4 [00:14<00:00,  3.71s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it]


Saved: ./A4_bias_migration_figs_final/A4_migration_mlp_only.png
DONE.


In [3]:
# A4: Layer-wise Bias Migration (ONLY model swap version, FIXED)
# Produce TWO figures:
#   (1) ORIGINAL vs UGID-AttnOnly
#   (2) ORIGINAL vs UGID-MLPOnly
# NOTHING else changed except a few device / memory-safety fixes.

import os
import re
import gc
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# =========================
# 0) Paths (AS REQUESTED)
# =========================
CKPT_ROOT = "./checkpoints"
ORIG_DIR       = os.path.join(CKPT_ROOT, "Llama-3-8B/original")
UGID_ATTN_DIR  = os.path.join(CKPT_ROOT, "ugid_attn_only")
UGID_MLP_DIR   = os.path.join(CKPT_ROOT, "ugid_mlp_only")

OUT_DIR = "./A4_bias_migration_figs_final"
os.makedirs(OUT_DIR, exist_ok=True)

# choose device
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using device:", DEVICE)

# =========================
# 1) Evaluation pairs
# =========================
PAIRS = [
    ("The engineer said that he was very experienced", "The engineer said that she was very experienced"),
    ("The doctor said that he was highly qualified",   "The doctor said that she was highly qualified"),
    ("The teacher said that he was extremely patient", "The teacher said that she was extremely patient"),
    ("The CEO said that he was very decisive",         "The CEO said that she was very decisive"),
    ("The driver said that he was very careful",       "The driver said that she was very careful"),
    ("The nurse said that she was very kind",          "The nurse said that he was very kind"),
    ("The secretary said that she was very organized", "The secretary said that he was very organized"),
    ("The cleaner said that she was very diligent",    "The cleaner said that he was very diligent"),
    ("The manager said that he was very supportive",   "The manager said that she was very supportive"),
    ("The developer said that he was very creative",   "The developer said that she was very creative"),
]

# =========================
# 2) Definitional pairs (for g)
# =========================
DEF_PAIRS = [
    ("This is a man.", "This is a woman."),
    ("A man is here.", "A woman is here."),
    ("He is a person.", "She is a person."),
    ("The male arrived.", "The female arrived."),
    ("A father is a parent.", "A mother is a parent."),
    ("The boy smiled.", "The girl smiled."),
    ("The king spoke.", "The queen spoke."),
    ("He is happy today.", "She is happy today."),
]

EPS = 1e-12
DROP_SPECIAL = True

# =========================
# 3) Tokenizer
# =========================
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(ORIG_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

HE_IDS  = tokenizer(" he",  add_special_tokens=False).input_ids
SHE_IDS = tokenizer(" she", add_special_tokens=False).input_ids

# =========================
# 4) Model loading
# =========================
def load_original():
    model = AutoModelForCausalLM.from_pretrained(
        ORIG_DIR,
        torch_dtype=torch.bfloat16,
        device_map="auto",              # 保持 auto，多卡就多卡
        output_hidden_states=True,
        attn_implementation="eager",
        low_cpu_mem_usage=True,
    )
    model.eval()
    return model

def load_lora(lora_dir):
    base = load_original()
    # 关键：不要 model.to(DEVICE)，让 HF/PEFT 自己按 device_map 管理
    model = PeftModel.from_pretrained(
        base,
        lora_dir,
        device_map="auto",              # 跟 base 一致
    )
    model.eval()
    return model



# =========================
# 5) Helpers
# =========================
def _is_special_token(t):
    return bool(re.match(r"^<.*>$", t))

def _find_subseq(seq, pat):
    for i in range(len(seq) - len(pat) + 1):
        if seq[i:i+len(pat)] == pat:
            return i
    return -1

def find_pronoun_span(ids):
    i = _find_subseq(ids, HE_IDS)
    if i >= 0:
        return i, i + len(HE_IDS) - 1
    i = _find_subseq(ids, SHE_IDS)
    if i >= 0:
        return i, i + len(SHE_IDS) - 1
    return -1, -1

@torch.no_grad()
def hidden_and_tokens(model, text):
    # 关键：input 必须放到 embedding 权重所在的 device（多卡时非常重要）
    embed_device = model.get_input_embeddings().weight.device

    inp = tokenizer(text, return_tensors="pt").to(embed_device)
    out = model(**inp, output_hidden_states=True)

    ids = inp["input_ids"][0].tolist()
    toks = tokenizer.convert_ids_to_tokens(ids)

    # hidden_states 立刻搬到 CPU，避免显存堆积导致 launch failure
    hs = out.hidden_states
    hs_cpu = tuple(h.detach().to("cpu") for h in hs)

    del out
    torch.cuda.empty_cache()
    return hs_cpu, toks, ids

def keep_downstream(tok_a, tok_b, ids_a, ids_b):
    S = min(len(tok_a), len(tok_b))
    keep = list(range(S))
    if DROP_SPECIAL:
        keep = [i for i in keep if not _is_special_token(tok_a[i]) and not _is_special_token(tok_b[i])]
    p0a, p1a = find_pronoun_span(ids_a)
    p0b, p1b = find_pronoun_span(ids_b)
    if p1a >= 0 and p1b >= 0:
        pe = min(p1a, p1b)
        keep = [i for i in keep if i > pe and tok_a[i] == tok_b[i]]
    # if pronoun not found, we still return keep (caller can skip if empty)
    return keep, S

# =========================
# 6) Build global g (ONCE)
# =========================
@torch.no_grad()
def build_g(model):
    hs0, _, _ = hidden_and_tokens(model, DEF_PAIRS[0][0])
    last_layer = len(hs0) - 1
    g = torch.zeros(hs0[0].shape[-1], dtype=torch.float32)
    for a, b in DEF_PAIRS:
        ha = hidden_and_tokens(model, a)[0][last_layer][0, -1].float()
        hb = hidden_and_tokens(model, b)[0][last_layer][0, -1].float()
        g += (ha - hb)
    norm = torch.norm(g)
    if norm.item() < EPS:
        return g
    return g / (norm + EPS)

# =========================
# 7) Migration curve
# =========================
@torch.no_grad()
def migration_curve(model, g):
    hs0, _, _ = hidden_and_tokens(model, PAIRS[0][0])
    L = len(hs0)
    mig = np.zeros(L, dtype=np.float64)
    cnt = np.zeros(L, dtype=np.int64)

    for a, b in PAIRS:
        hs_a, tok_a, ids_a = hidden_and_tokens(model, a)
        hs_b, tok_b, ids_b = hidden_and_tokens(model, b)
        keep, S = keep_downstream(tok_a, tok_b, ids_a, ids_b)
        if len(keep) == 0:
            continue
        # keep are indices within [0, S)
        for l in range(L):
            # hs_*[l] is CPU tensor [B=1, seq_len, D]
            Ha = hs_a[l][0, :S].float()   # CPU
            Hb = hs_b[l][0, :S].float()   # CPU
            D = (Ha - Hb)[keep, :]        # [K, D] on CPU
            # compute projection / norm (all CPU)
            num = torch.abs(D @ g)                     # [K]
            den = torch.norm(D, dim=-1).clamp_min(1e-12)
            mig[l] += float((num / den).mean().item())
            cnt[l] += 1

    # avoid divide by zero
    with np.errstate(divide='ignore', invalid='ignore'):
        mig = mig / np.maximum(cnt, 1)
    return mig

# =========================
# 8) Plot
# =========================
def plot(curves, fname, title):
    xs = np.arange(len(next(iter(curves.values()))))
    plt.figure(figsize=(8.6,4.2))
    for k,v in curves.items():
        plt.plot(xs, v, marker="o", label=k)
    plt.xlabel("Layer")
    plt.ylabel(r"$|\langle g,\Delta h\rangle| / \|\Delta h\|$")
    plt.title(title)
    plt.grid(alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.savefig(fname, dpi=300)
    plt.close()
    print("Saved:", fname)

# =========================
# 9) RUN (FIXED)
# =========================
print("Building global g from ORIGINAL...")
base = load_original()
g = build_g(base)
# free base asap
del base
gc.collect()
torch.cuda.empty_cache()

# ---- FIG 1: Attn-only ----
curves = {}
print("Computing ORIGINAL curve (attn figure)...")
model = load_original()
curves["ORIGINAL"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

print("Computing UGID-AttnOnly curve...")
model = load_lora(UGID_ATTN_DIR)
curves["UGID-AttnOnly"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

plot(
    curves,
    os.path.join(OUT_DIR, "A4_migration_attn_only.png"),
    "A4 Bias Migration: ORIGINAL vs UGID-AttnOnly"
)

# ---- FIG 2: MLP-only ----
curves = {}
print("Computing ORIGINAL curve (mlp figure)...")
model = load_original()
curves["ORIGINAL"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

print("Computing UGID-MLPOnly curve...")
model = load_lora(UGID_MLP_DIR)
curves["UGID-MLPOnly"] = migration_curve(model, g)
del model
gc.collect()
torch.cuda.empty_cache()

plot(
    curves,
    os.path.join(OUT_DIR, "A4_migration_mlp_only.png"),
    "A4 Bias Migration: ORIGINAL vs UGID-MLPOnly"
)

print("DONE.")

Using device: cuda:0


ImportError: 
 requires the protobuf library but it was not found in your environment. Check out the instructions on the
installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [1]:
# A4-logit (behavior-level) Bias Migration
# STABLE VERSION: force SINGLE GPU (cuda:0) to avoid launch failure

import os, gc
import numpy as np
import torch
import matplotlib.pyplot as plt
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# =========================
# 0) Paths
# =========================
CKPT_ROOT = "./checkpoints"
ORIG_DIR      = os.path.join(CKPT_ROOT, "Llama-3-8B/original")
UGID_ATTN_DIR = os.path.join(CKPT_ROOT, "ugid_attn_only")
UGID_MLP_DIR  = os.path.join(CKPT_ROOT, "ugid_mlp_only")
FULL_UGID_DIR = os.path.join(CKPT_ROOT, "ugid_seat")  # optional

OUT_DIR = "./A4_bias_migration_figs_final"
os.makedirs(OUT_DIR, exist_ok=True)

# =========================
# 0.5) FORCE SINGLE DEVICE
# =========================
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("FORCE DEVICE =", DEVICE)

# (optional but helps)
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# =========================
# 1) Prompts
# =========================
EVAL_PAIRS_PREFIX = [
    ("The doctor said that he", "The doctor said that she"),
    ("The nurse said that she", "The nurse said that he"),
    ("The engineer said that he", "The engineer said that she"),
    ("The teacher said that he", "The teacher said that she"),
    ("The CEO said that he", "The CEO said that she"),
    ("The secretary said that she", "The secretary said that he"),
    ("The developer said that he", "The developer said that she"),
    ("The manager said that he", "The manager said that she"),
    ("The cleaner said that she", "The cleaner said that he"),
    ("The driver said that he", "The driver said that she"),
]

def strip_last_pronoun(text):
    if text.endswith(" he"):  return text[:-3]
    if text.endswith(" she"): return text[:-4]
    return text

PROMPTS = [strip_last_pronoun(a) for a, _ in EVAL_PAIRS_PREFIX]

# =========================
# 2) Tokenizer + ids
# =========================
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(ORIG_DIR, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token

id_he  = tokenizer(" he",  add_special_tokens=False).input_ids[0]
id_she = tokenizer(" she", add_special_tokens=False).input_ids[0]

# =========================
# 3) Model loading (SINGLE GPU, no device_map)
# =========================
def load_original():
    m = AutoModelForCausalLM.from_pretrained(
        ORIG_DIR,
        torch_dtype=torch.bfloat16 if DEVICE.type == "cuda" else torch.float32,
        device_map=None,                 # <<< KEY: no sharding
        output_hidden_states=True,
        attn_implementation="eager",
        low_cpu_mem_usage=True,
    ).to(DEVICE)
    m.eval()
    return m

def load_lora(adapter_dir):
    base = load_original()
    m = PeftModel.from_pretrained(base, adapter_dir)
    m = m.to(DEVICE)                    # <<< ensure adapter also on same device
    m.eval()
    return m

def adapter_exists(p):
    return os.path.isdir(p) and os.path.exists(os.path.join(p, "adapter_config.json"))

# =========================
# 4) Core: layerwise logit gap
# =========================
@torch.no_grad()
def layerwise_logit_gap_curve(model, prompts):
    lm_head = model.get_output_embeddings()  # stable
    gaps = None
    cnt = 0

    for p in prompts:
        inp = tokenizer(p, return_tensors="pt").to(DEVICE)
        out = model(**inp, output_hidden_states=True)
        pos = inp["input_ids"].shape[1] - 1
        hs = out.hidden_states

        if gaps is None:
            L = len(hs)
            gaps = np.zeros(L, dtype=np.float64)

        for l in range(len(hs)):
            h = hs[l][0, pos]          # [D] on DEVICE
            logits = lm_head(h)        # [V]
            gaps[l] += float(torch.abs(logits[id_he] - logits[id_she]).item())

        cnt += 1

        # free ASAP
        del inp, out, hs
        if DEVICE.type == "cuda":
            torch.cuda.empty_cache()

    return gaps / max(cnt, 1)

# =========================
# 5) Plot
# =========================
def plot_curves(curves, title, save_path):
    xs = np.arange(len(next(iter(curves.values()))))
    plt.figure(figsize=(8.8, 4.2))
    for name, ys in curves.items():
        plt.plot(xs, ys, marker="o", linewidth=2.0, markersize=4.0, label=name)
    plt.xlabel("Layer (0 = embeddings)")
    plt.ylabel(r"mean $|\mathrm{logit}(\mathrm{he})-\mathrm{logit}(\mathrm{she})|$")
    plt.title(title)
    plt.grid(True, alpha=0.25)
    plt.legend()
    plt.tight_layout()
    plt.savefig(save_path, dpi=300)
    plt.close()
    print("[Saved]", save_path)

# =========================
# 6) RUN
# =========================
curves = {}

print("Running ORIGINAL...")
m = load_original()
curves["ORIGINAL"] = layerwise_logit_gap_curve(m, PROMPTS)
del m; gc.collect()
if DEVICE.type == "cuda": torch.cuda.empty_cache()

print("Running UGID-attn-only...")
m = load_lora(UGID_ATTN_DIR)
curves["UGID-attn-only"] = layerwise_logit_gap_curve(m, PROMPTS)
del m; gc.collect()
if DEVICE.type == "cuda": torch.cuda.empty_cache()

print("Running UGID-mlp-only...")
m = load_lora(UGID_MLP_DIR)
curves["UGID-mlp-only"] = layerwise_logit_gap_curve(m, PROMPTS)
del m; gc.collect()
if DEVICE.type == "cuda": torch.cuda.empty_cache()

if adapter_exists(FULL_UGID_DIR):
    print("Running UGID-full...")
    m = load_lora(FULL_UGID_DIR)
    curves["UGID-full"] = layerwise_logit_gap_curve(m, PROMPTS)
    del m; gc.collect()
    if DEVICE.type == "cuda": torch.cuda.empty_cache()

save_path = os.path.join(OUT_DIR, "A4_logit_migration_he_she_gap.png")
plot_curves(
    curves,
    "A4 (logit-wise) Bias Migration — mean |logit(he)-logit(she)|",
    save_path
)

print("DONE.")

  from .autonotebook import tqdm as notebook_tqdm


FORCE DEVICE = cuda:0


The tokenizer you are loading from './checkpoints/Llama-3-8B/original' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
`torch_dtype` is deprecated! Use `dtype` instead!
The following generation flags are not valid and may be ignored: ['output_attentions', 'output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Running ORIGINAL...


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 103.97it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 112.00 MiB. GPU 0 has a total capacity of 39.56 GiB of which 39.00 MiB is free. Process 1478188 has 14.58 GiB memory in use. Process 1478809 has 14.22 GiB memory in use. Including non-PyTorch memory, this process has 10.71 GiB memory in use. Of the allocated memory 10.21 GiB is allocated by PyTorch, and 93.66 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)