In [None]:
# Vars: run-level settings for the experiment.

# Property the task vector targets. Accepted values:
# charge_pH7, aromaticity, instability_index, gravy, iso_point
TARGET_PROP = "gravy"

# Number of sequences generated per model state.
N_SAMPLES = 500

# Target length of every generated sequence.
SEQ_LEN = 500

# Sampling controls: top-k cutoff and softmax temperature.
TOP_K = 5
TEMP = 1.0

In [None]:
!pip install -q biopython transformers tokenizers accelerate datasets tqdm

import os, math, random, warnings
from pathlib import Path

import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from tqdm import tqdm

import torch
import torch.nn.functional as F

from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings("ignore")

# Seed every RNG this notebook draws from (python, numpy, torch CPU and CUDA).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16  # runtime weights for base/negated

if torch.cuda.is_available():
    # Trade a little matmul precision / determinism for throughput on GPU.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

# Experiment params
N_TRIALS = 1

# The "#Vars" cell at the top of the notebook is where these are meant to be
# edited. Re-assigning them unconditionally here silently discarded those
# edits, so keep any value that is already defined and only fall back to the
# defaults when this cell is run on its own.
N_SAMPLES = globals().get("N_SAMPLES", 500)
SEQ_LEN = globals().get("SEQ_LEN", 500)
TOP_K = globals().get("TOP_K", 5)
TEMP = globals().get("TEMP", 1.0)

# Fine-tune params (forwarded to finetune_on_non_examples further down).
MAX_TOKENS = 96      # token budget per training sequence
EPOCHS = 1
BSZ = 1              # DataLoader batch size
GRAD_ACCUM = 16      # micro-batches accumulated per optimizer step
LR = 2e-5
WARMUP_STEPS = 50
MAX_STEPS = None     # None -> no explicit cap on optimizer steps

# All artefacts (CSV files, fine-tuned checkpoint) are written under this directory.
SAVE_DIR = Path("/workspace/progen2_taskvec")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# One-letter codes of the 20 amino acids used for sampling and filtering.
AMINO_ACIDS = [*"ACDEFGHIKLMNPQRSTVWY"]

print(f"Device: {DEVICE}, dtype: {DTYPE}, samples/state: {N_SAMPLES}, length: {SEQ_LEN}")


In [None]:
from transformers import AutoModelForCausalLM
from tokenizers import Tokenizer

MODEL_NAME = "hugohrban/progen2-large"

# This is the HF "tokenizers" Tokenizer, as required by this model card.
# Padding is switched off; the training dataset class pads explicitly.
tokenizer = Tokenizer.from_pretrained(MODEL_NAME)
tokenizer.no_padding()

# Let the loader place the weights automatically when a GPU is present.
_base_device_map = "auto" if torch.cuda.is_available() else None
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=DTYPE,
    device_map=_base_device_map,
)
base_model.eval()
print("Loaded base ProGen2-large.")


In [None]:
def score_property(seq: str, prop: str) -> float:
    """Return one ProteinAnalysis-derived property of ``seq``.

    Args:
        seq: Protein sequence handed to ``ProteinAnalysis``.
        prop: One of ``mol_weight``, ``charge_pH7``, ``aromaticity``,
            ``instability_index``, ``gravy`` or ``iso_point``.

    Returns:
        The value computed by the matching ``ProteinAnalysis`` method.

    Raises:
        ValueError: If ``prop`` is not one of the names above.
    """
    analysis = ProteinAnalysis(seq)
    # Property name -> zero-argument callable producing the score.
    scorers = {
        "mol_weight": analysis.molecular_weight,
        "charge_pH7": lambda: analysis.charge_at_pH(7.0),
        "aromaticity": analysis.aromaticity,
        "instability_index": analysis.instability_index,
        "gravy": analysis.gravy,
        "iso_point": analysis.isoelectric_point,
    }
    scorer = scorers.get(prop)
    if scorer is None:
        raise ValueError(f"Unknown property: {prop}")
    return scorer()


In [None]:
# Fetch the "train" split of protolyze/plminterp and keep it as a DataFrame
# (df_all) for the scoring / selection steps below.
print("Downloading dataset once...")
dataset = load_dataset("protolyze/plminterp", split="train")
df_all = pd.DataFrame(dataset)
# build_non_examples_for_property reads df["Sequence"], so fail early if the column is absent.
assert "Sequence" in df_all.columns, "Expected 'Sequence' column in protolyze/plminterp."

def build_non_examples_for_property(df: pd.DataFrame, prop: str, pct: float = 0.20) -> pd.DataFrame:
    """Select the ``pct`` fraction of sequences used as non-examples for ``prop``.

    Every row of ``df["Sequence"]`` is scored with ``score_property``; rows whose
    scoring raises are dropped. For ``aromaticity`` and ``instability_index``
    the highest-scoring rows are kept (steer downward), for every other
    property the lowest-scoring rows are kept (steer upward).

    Args:
        df: Frame with a ``Sequence`` column. It is not modified.
        prop: Property name understood by ``score_property``.
        pct: Fraction of the successfully scored rows to return.

    Returns:
        A frame with columns ``Sequence`` and ``score``, ordered from the most
        extreme score, holding ``max(1, int(pct * n_scored))`` rows (fewer if
        fewer rows were scored).
    """
    def _score_or_nan(sequence):
        # Best effort: a sequence that cannot be scored becomes NaN and is dropped below.
        try:
            return score_property(sequence, prop)
        except Exception:
            return np.nan

    values = [_score_or_nan(seq) for seq in tqdm(df["Sequence"], desc=f"Scoring ({prop})")]

    # Properties steered downward take their non-examples from the top of the ranking.
    take_highest = prop in ("aromaticity", "instability_index")
    ranked = (
        df.assign(score=values)
        .dropna(subset=["score"])
        .reset_index(drop=True)
        .sort_values("score", ascending=not take_highest)
    )

    keep = max(1, int(pct * len(ranked)))
    return ranked.head(keep).reset_index(drop=True)[["Sequence", "score"]]


In [None]:
class ProGenCausalDataset(torch.utils.data.Dataset):
    """Fixed-length causal-LM examples built from raw sequence strings.

    Each item is tokenized with ``tokenizer.encode(...).ids``, truncated to
    ``max_tokens`` and right-padded with token id 0 up to ``max_tokens``.
    """

    def __init__(self, sequences, tokenizer, max_tokens=256):
        self.sequences = sequences
        self.tok = tokenizer
        self.max_tokens = max_tokens

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``.

        All three are 1-D ``torch.long`` tensors of length ``max_tokens``;
        padded positions carry mask 0 and label -100.
        """
        token_ids = self.tok.encode(str(self.sequences[idx])).ids[: self.max_tokens]
        n_real = len(token_ids)

        # Start from an all-pad (id 0) row and write the real tokens at the front.
        input_ids = torch.zeros(self.max_tokens, dtype=torch.long)
        input_ids[:n_real] = torch.tensor(token_ids, dtype=torch.long)

        attention_mask = (torch.arange(self.max_tokens) < n_real).to(torch.long)

        # -100 marks the padded positions to be ignored.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

def _unfreeze_last_n_blocks(model, n_last: int = 4):
    """Freeze ``model`` except its last ``n_last`` blocks and its ``lm_head``.

    The block list is looked up as ``h``, ``layers`` or ``blocks`` on the first
    of ``model.transformer``, ``model.model``, ``model.gpt_neox`` or
    ``model.backbone`` that has one. If no block list is found, only
    ``lm_head`` (when present) ends up trainable.

    Args:
        model: Module whose ``requires_grad`` flags are rewritten in place.
        n_last: Number of trailing blocks to leave trainable. Values <= 0
            leave every block frozen.
    """
    for p in model.parameters():
        p.requires_grad = False

    # Locate the list of transformer blocks.
    blocks = None
    for container_name in ("transformer", "model", "gpt_neox", "backbone"):
        container = getattr(model, container_name, None)
        if container is None:
            continue
        for list_name in ("h", "layers", "blocks"):
            if hasattr(container, list_name):
                blocks = getattr(container, list_name)
                break
        if blocks is not None:
            break

    # Guard n_last <= 0 explicitly: list(blocks)[-0:] is the WHOLE list, which
    # would unfreeze every block instead of none.
    if blocks is not None and hasattr(blocks, "__iter__") and n_last > 0:
        for block in list(blocks)[-n_last:]:
            for p in block.parameters():
                p.requires_grad = True

    if hasattr(model, "lm_head"):
        for p in model.lm_head.parameters():
            p.requires_grad = True

def finetune_on_non_examples(base_model, sequences, epochs=1, max_tokens=96, lr=2e-5, bsz=1, grad_accum=16,
                             warmup_steps=50, max_steps=None, save_path=None, n_last_trainable_blocks: int = 4):
    """Fine-tune a fresh copy of ``MODEL_NAME`` on ``sequences`` and return it.

    A new bf16 model is loaded, everything except the last
    ``n_last_trainable_blocks`` blocks and the LM head is frozen, and the model
    is trained with gradient accumulation, gradient clipping at norm 1.0 and a
    linear warmup / linear decay learning-rate schedule.

    Args:
        base_model: Unused; kept so existing call sites keep working.
        sequences: Indexable collection of training sequences.
        epochs: Number of passes over ``sequences``.
        max_tokens: Fixed token length of each training example.
        lr: Peak learning rate.
        bsz: DataLoader batch size (incomplete final batches are dropped).
        grad_accum: Micro-batches accumulated per optimizer step.
        warmup_steps: Optimizer steps of linear warmup (falsy disables warmup).
        max_steps: Optional cap on optimizer steps; also used as the schedule
            length when set.
        save_path: If given, directory the model is saved to via
            ``save_pretrained``.
        n_last_trainable_blocks: Forwarded to ``_unfreeze_last_n_blocks``.

    Returns:
        The fine-tuned model, switched to eval mode.
    """
    from transformers import AutoModelForCausalLM

    # Load the training copy in bf16.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        torch_dtype=torch.bfloat16,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    if getattr(model.config, "use_cache", None) is not None:
        model.config.use_cache = False

    # Freeze all but the last few blocks (and the LM head).
    _unfreeze_last_n_blocks(model, n_last=n_last_trainable_blocks)
    model.train()

    ds = ProGenCausalDataset(sequences, tokenizer, max_tokens=max_tokens)
    dl = torch.utils.data.DataLoader(
        ds, batch_size=bsz, shuffle=True, drop_last=True,
        pin_memory=torch.cuda.is_available(),  # pinning only helps host->GPU copies
    )

    trainable = [p for p in model.parameters() if p.requires_grad]
    try:
        # 8-bit Adam keeps optimizer state small.
        import bitsandbytes as bnb
        opt = bnb.optim.Adam8bit(trainable, lr=lr)
    except ImportError:
        # bitsandbytes is not among the packages this notebook installs, so
        # degrade to the stock optimizer rather than failing the whole run.
        print("bitsandbytes not available; falling back to torch.optim.Adam.")
        opt = torch.optim.Adam(trainable, lr=lr)

    # One optimizer step per accumulation window; the last window of an epoch
    # may be shorter than grad_accum and is still stepped (see loop below).
    n_batches = len(dl)
    total_steps = max_steps or (epochs * math.ceil(n_batches / grad_accum))

    def lr_lambda(step):
        if warmup_steps and step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        if warmup_steps and step >= warmup_steps:
            return max(0.0, float(total_steps - step) / float(max(1, total_steps - warmup_steps)))
        return max(0.0, float(total_steps - step) / float(max(1, total_steps)))

    sched = torch.optim.lr_scheduler.LambdaLR(opt, lr_lambda)

    step = 0
    for epoch in range(epochs):
        pbar = tqdm(dl, desc=f"Finetuning (epoch {epoch+1}/{epochs})", leave=False)
        opt.zero_grad(set_to_none=True)
        for i, batch in enumerate(pbar):
            for k in batch:
                batch[k] = batch[k].to(model.device, non_blocking=True)

            # Size of the accumulation window this micro-batch belongs to, so
            # the loss is averaged correctly even in a short trailing window.
            window_start = (i // grad_accum) * grad_accum
            window_size = min(grad_accum, n_batches - window_start)

            with torch.autocast("cuda", dtype=torch.bfloat16, enabled=torch.cuda.is_available()):
                out = model(**batch)
                loss = out.loss / window_size

            loss.backward()

            # Step at the end of every window AND on the final batch of the
            # epoch; otherwise the trailing micro-batches' gradients were
            # computed and then thrown away by the next zero_grad.
            if (i + 1) % grad_accum == 0 or (i + 1) == n_batches:
                torch.nn.utils.clip_grad_norm_(trainable, 1.0)
                opt.step()
                opt.zero_grad(set_to_none=True)
                sched.step()
                step += 1

            pbar.set_postfix({"loss": f"{out.loss.item():.4f}", "step": step})
            if max_steps and step >= max_steps:
                break
        if max_steps and step >= max_steps:
            break

    model.eval()
    if save_path:
        save_path = Path(save_path)
        save_path.mkdir(parents=True, exist_ok=True)
        model.save_pretrained(save_path)
        print(f"Saved finetuned model to: {save_path}")
    return model


In [None]:
def build_negated_model(base_model, finetuned_model, scale: float = 1.0):
    """Build ``base - scale * (finetuned - base)`` as a freshly loaded model.

    Args:
        base_model: Model providing the reference weights.
        finetuned_model: Model whose difference from the base is the task vector.
        scale: Multiplier applied to the task vector before it is subtracted.
            The default of 1.0 reproduces plain negation.

    Returns:
        A new ``MODEL_NAME`` model in ``DTYPE`` with the negated weights
        loaded, in eval mode.

    Notes:
        * Only floating-point tensors take part in the arithmetic. Other
          state-dict entries (e.g. bool / integer buffers) are copied from the
          base unchanged, because tensor subtraction is not defined for bool
          tensors.
        * The two models may be held in different dtypes. The delta is taken
          against the base weights rounded to the fine-tuned dtype, so tensors
          that were never trained yield a zero delta rather than the rounding
          difference between the two dtypes. Arithmetic runs in float32 and is
          cast to ``DTYPE`` once at the end.
    """
    with torch.no_grad():
        ft_sd = {k: v.to("cpu") for k, v in finetuned_model.state_dict().items()}
        neg_sd = {}
        for k, v in base_model.state_dict().items():
            bv = v.to("cpu")
            fv = ft_sd.get(k, None)

            if not torch.is_floating_point(bv):
                # Non-float buffers carry no task vector; keep them as they are.
                neg_sd[k] = bv
                continue
            if fv is None or fv.shape != bv.shape or not torch.is_floating_point(fv):
                neg_sd[k] = bv.to(DTYPE)
                continue

            base_as_ft = bv.to(fv.dtype).to(torch.float32)
            delta = fv.to(torch.float32) - base_as_ft
            neg_sd[k] = (bv.to(torch.float32) - scale * delta).to(DTYPE)

    neg_model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        trust_remote_code=True,
        torch_dtype=DTYPE,
        device_map="auto" if torch.cuda.is_available() else None,
    )
    missing, unexpected = neg_model.load_state_dict(neg_sd, strict=False)
    neg_model.eval()
    print("Negated task-vector model built.",
          f"\n missing keys: {len(missing)} | unexpected keys: {len(unexpected)}")
    return neg_model


In [None]:
# Build the amino-acid token id tensor once: one id per letter in AMINO_ACIDS
# that the tokenizer knows, placed on the base model's device.
_aa_token_ids = (tokenizer.token_to_id(aa) for aa in AMINO_ACIDS)
AA_IDs = torch.tensor(
    [tid for tid in _aa_token_ids if tid is not None],
    device=base_model.device if hasattr(base_model, 'device') else 0,
)

def generate_protein_sequence(model, length: int = 100, temperature: float = 1.0, top_k: int = 5, seed_prompt: str = "M") -> str:
    """Sample a protein sequence of exactly ``length`` residues from ``model``.

    Starting from ``seed_prompt``, one token is sampled per step. Sampling is
    restricted to the ids in ``AA_IDs``: logits are divided by ``temperature``,
    the ``top_k`` largest are kept and one is drawn from their softmax.

    Args:
        model: Causal LM; called as ``model(x).logits`` and must expose ``.device``.
        length: Number of residues in the returned string.
        temperature: Softmax temperature (clamped below at 1e-6).
        top_k: Number of highest-scoring amino acids to sample from.
        seed_prompt: Text the generation starts from.

    Returns:
        The generated residues. Tokens that are not single amino-acid letters
        are dropped, and the result is right-padded with ``"A"`` or truncated
        so that it has exactly ``length`` characters.
    """
    prompt_ids = tokenizer.encode(seed_prompt).ids
    x = torch.tensor(prompt_ids, device=model.device, dtype=torch.long).unsqueeze(0)
    n_new = max(0, length - len(seed_prompt))

    with torch.no_grad():
        for _ in range(n_new):
            logits = model(x).logits[:, -1, :]

            # AA_IDs was created on whichever device the base model had at that
            # time; align it with this model's logits so index_select cannot
            # fail on a device mismatch. (No-op when already co-located.)
            aa_ids = AA_IDs.to(logits.device)

            # Sample in float32: half-precision softmax/multinomial is less
            # accurate and not available on every backend.
            aa_logits = logits.index_select(dim=-1, index=aa_ids).float()
            aa_logits = aa_logits / max(1e-6, temperature)

            k = min(top_k, aa_logits.shape[-1])
            topk_vals, topk_idx = torch.topk(aa_logits, k=k, dim=-1)
            probs = F.softmax(topk_vals, dim=-1)
            pick = torch.multinomial(probs, num_samples=1)

            # Map: position within top-k -> position within AA_IDs -> vocabulary id.
            next_token = aa_ids[topk_idx.gather(1, pick)].view(1, 1)
            x = torch.cat([x, next_token.to(x.device)], dim=1)

    tokens = [tokenizer.id_to_token(t) for t in x[0].tolist()]
    seq_only = "".join(t for t in tokens if len(t) == 1 and t in AMINO_ACIDS)
    if len(seq_only) < length:
        seq_only = seq_only + ("A" * (length - len(seq_only)))
    elif len(seq_only) > length:
        seq_only = seq_only[:length]
    return seq_only


In [None]:
# Resolve which property this run targets. A TARGET_PROP that already exists in
# the notebook namespace wins; otherwise fall back to the PROP environment
# variable, and finally to "gravy".
if "TARGET_PROP" not in globals():
    TARGET_PROP = os.environ.get("PROP", "").strip() or "gravy"  # safe default for ad-hoc runs

_ALLOWED_TARGET_PROPS = {"charge_pH7", "aromaticity", "instability_index", "gravy", "iso_point"}
assert TARGET_PROP in _ALLOWED_TARGET_PROPS, \
       f"TARGET_PROP='{TARGET_PROP}' must be one of the 5 (excluding 'mol_weight')."

# Single-element list: the pipeline below reads PROPERTIES[0].
PROPERTIES = [TARGET_PROP]
print(f"[CONFIG] Running single property: {TARGET_PROP}")


In [None]:
# FT -> generate -> score TARGET_PROP only
prop = PROPERTIES[0]
print(f"\n=== PROPERTY (task-vector target): {prop} ===")

# Non-examples for this prop: the 20% of the dataset at the unwanted end of
# the score range; also saved to CSV for inspection.
non_examples = build_non_examples_for_property(df_all, prop, pct=0.20)
non_examples.to_csv(SAVE_DIR / f"non_examples_{prop}.csv", index=False)
train_seqs = non_examples["Sequence"].tolist()
print(f"Non-examples for {prop}: {len(train_seqs)}")

# Free the GPU before fine-tuning: move the base model to CPU and release
# cached CUDA memory.
base_model.to("cpu")
torch.cuda.empty_cache()

# Fine-tune. finetune_on_non_examples loads its own copy of the model, so no
# base model is passed in.
ft_path = SAVE_DIR / f"ft_{prop}"
ft_model = finetune_on_non_examples(
    base_model=None,
    sequences=train_seqs,
    epochs=EPOCHS, max_tokens=MAX_TOKENS,
    lr=LR, bsz=BSZ, grad_accum=GRAD_ACCUM,
    warmup_steps=WARMUP_STEPS, max_steps=MAX_STEPS,
    save_path=ft_path
)

# Reload a fresh base model for evaluation, with the same device placement
# rule as the first load (automatic only when a GPU is present).
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, trust_remote_code=True, torch_dtype=DTYPE,
    device_map="auto" if torch.cuda.is_available() else None,
)
base_model.eval()

# Build negated model (TA). Use the shared helper rather than a second inline
# copy of the same state-dict arithmetic; it loads the model, applies the
# negated weights, prints the key report and returns the model in eval mode.
neg_model = build_negated_model(base_model, ft_model)

def gen_and_score_single_property(model, state_tag, prop, n_samples=N_SAMPLES, seq_len=SEQ_LEN, temp=TEMP, top_k=TOP_K):
    """Generate sequences from ``model``, score ``prop`` and summarize the scores.

    Args:
        model: Model passed to ``generate_protein_sequence``.
        state_tag: Label for this model state, used in progress and log output.
        prop: Property name understood by ``score_property``.
        n_samples: Number of sequences to generate.
        seq_len: Length of each generated sequence.
        temp: Sampling temperature.
        top_k: Top-k cutoff used when sampling.

    Returns:
        ``(df, mean, std, ci, n)`` where ``df`` has columns ``sequence`` and
        ``prop`` (NaN where scoring raised), ``n`` is the number of non-NaN
        scores, ``std`` is the sample standard deviation (ddof=1) and ``ci`` is
        the half-width ``1.96 * std / sqrt(n)``. With a single valid score
        ``std`` and ``ci`` are 0.0; with none, all three statistics are 0.0.
    """
    seqs = [
        generate_protein_sequence(model, length=seq_len, temperature=temp, top_k=top_k, seed_prompt="M")
        for _ in tqdm(range(n_samples), desc=f"Generating ({prop}, state={state_tag})")
    ]

    scores = []
    for s in seqs:
        # Best effort: a sequence that cannot be scored is recorded as NaN.
        try:
            scores.append(score_property(s, prop))
        except Exception:
            scores.append(np.nan)

    df = pd.DataFrame({"sequence": seqs, prop: scores})
    vals = df[prop].dropna().to_numpy(dtype=np.float64)
    n = vals.size
    if n > 1:
        mean = float(vals.mean())
        std = float(vals.std(ddof=1))
        ci = 1.96 * std / math.sqrt(n)
    elif n == 1:
        mean, std, ci = float(vals[0]), 0.0, 0.0
    else:
        mean, std, ci = 0.0, 0.0, 0.0

    # The plus-minus sign was previously stored as mis-decoded bytes ("Â±").
    print(f"[SUMMARY] state={state_tag} | {prop}: mean={mean:.6f} | std={std:.6f} | 95% CI=±{ci:.6f} | n={n}")
    return df, mean, std, ci, n

# Scoring/evaluation: sample from each model state, score the target property,
# and write per-state sequence / metric CSVs plus one combined summary.
_states_to_eval = (("base", base_model), ("finetuned", ft_model), ("negated", neg_model))

summaries = []
for state_tag, mdl in _states_to_eval:
    df_metrics, mean, std, ci, n = gen_and_score_single_property(mdl, state_tag, prop)

    out_prefix = SAVE_DIR / f"{prop}__{state_tag}"
    df_metrics[["sequence"]].to_csv(f"{out_prefix}__sequences.csv", index=False)
    df_metrics.to_csv(f"{out_prefix}__metrics.csv", index=False)

    summaries.append(
        {"taskvec_target": prop, "state": state_tag, "mean": mean, "std": std, "ci95": ci, "n": n}
    )

pd.DataFrame(summaries).to_csv(SAVE_DIR / f"summary_{prop}_all_states.csv", index=False)
print(f"\nSaved summary: {SAVE_DIR / f'summary_{prop}_all_states.csv'}")
