In [None]:
!pip install -q biopython tqdm transformers tokenizers accelerate

import pandas as pd
import numpy as np
import torch
import random
from tqdm.auto import tqdm
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# Seed every random number generator used in this notebook so runs are repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Choose the compute device; when CUDA is present its generators are seeded as well.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    device = "cuda"
else:
    device = "cpu"

print("Device:", device)


In [None]:
# Names of the sequence properties that get scored for each generated protein.
traits = [
    "charge_pH7",
    "gravy",
    "aromaticity",
    "instability_index",
    "mol_weight",
    "iso_point",
]

# Subset of trait names flagged as "steer down".
steer_down_traits = {"instability_index", "aromaticity"}


def is_steer_down(prop: str) -> bool:
    """Report whether `prop` is a member of `steer_down_traits`."""
    flagged = prop in steer_down_traits
    return flagged

# Trait name -> function of a ProteinAnalysis object. Built once instead of on
# every get_score() call.
_TRAIT_FNS = {
    "charge_pH7":        lambda pa: pa.charge_at_pH(7.0),
    "gravy":             lambda pa: pa.gravy(),
    "aromaticity":       lambda pa: pa.aromaticity(),
    "instability_index": lambda pa: pa.instability_index(),
    "mol_weight":        lambda pa: pa.molecular_weight(),
    "iso_point":         lambda pa: pa.isoelectric_point(),
}


def get_score(seq: str, trait_name: str):
    """Compute a Biopython-based property for a protein sequence.

    Args:
        seq: Protein sequence string handed to ``ProteinAnalysis``.
        trait_name: One of the keys of ``_TRAIT_FNS``.

    Returns:
        The value produced by the matching ``ProteinAnalysis`` method, or
        ``np.nan`` when the trait name is unknown, the sequence is empty, or
        Biopython raises while computing the property. This function never
        raises for those cases.
    """
    trait_fn = _TRAIT_FNS.get(trait_name) if isinstance(trait_name, str) else None
    if trait_fn is None:
        # An unknown trait is a caller mistake (e.g. a typo). Keep the NaN
        # contract, but make it visible instead of silently yielding all-NaN.
        import warnings

        warnings.warn(f"Unknown trait {trait_name!r}; returning NaN", stacklevel=2)
        return np.nan

    if not seq:
        # Zero residues have no meaningful property value. Biopython is not
        # guaranteed to raise for every property on an empty sequence
        # (TODO confirm which ones), so exclude such samples explicitly.
        return np.nan

    try:
        return trait_fn(ProteinAnalysis(seq))
    except Exception:
        # Deliberate best effort: a sequence Biopython cannot score becomes NaN
        # so one bad sample does not abort a long generation run.
        return np.nan


In [None]:
from transformers import AutoModelForCausalLM
from tokenizers import Tokenizer
import torch, torch.nn.functional as F

MODEL_NAME = "hugohrban/progen2-large"
REV = "main"  # optionally pin to a commit hash for reproducibility

device = "cuda" if torch.cuda.is_available() else "cpu"

# Half precision is only used on GPU: bfloat16 when the device supports it,
# float16 otherwise. On CPU fall back to float32, since many CPU kernels are
# slow or unavailable in float16.
if device == "cuda":
    dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
else:
    dtype = torch.float32

# NOTE(review): trust_remote_code=True executes Python shipped in the hub
# repository. With REV="main" that code can change between runs; pin REV to an
# audited commit hash before relying on this.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    torch_dtype=dtype,
    low_cpu_mem_usage=True,
    revision=REV,
).eval().to(device)

# Tokenizer (padding disabled; sequences are encoded one at a time)
tokenizer = Tokenizer.from_pretrained(MODEL_NAME, revision=REV)
tokenizer.no_padding()

# One-letter codes of the 20 standard amino acids; used to filter sampled tokens.
AA = set("ACDEFGHIKLMNPQRSTVWY")

print(f"Loaded: {MODEL_NAME} on {device} (dtype={dtype})")


In [None]:
import torch

def top_k_sample(logits: torch.Tensor, k: int = 0, temperature: float = 1.0) -> int:
    """Sample one token id from a 1-D logits vector.

    Args:
        logits: 1-D tensor of unnormalized scores, one per vocabulary entry.
        k: If positive, sample only among the ``k`` highest-scoring entries.
            Values larger than the vocabulary are clamped to its size.
            ``0`` (or a negative value) samples from the full distribution.
        temperature: Divisor applied to the logits before softmax. A value
            ``<= 0`` selects the arg-max deterministically.

    Returns:
        The sampled token id as a Python ``int``.
    """
    if temperature <= 0:
        return torch.argmax(logits, dim=-1).item()

    # Do the probability math in float32: half-precision logits lose resolution
    # in softmax, and not every backend implements multinomial for them.
    scaled = logits.float() / temperature

    if k and k > 0:
        # torch.topk raises if k exceeds the number of entries, so clamp it.
        k = min(int(k), scaled.size(-1))
        top = torch.topk(scaled, k)
        probs = torch.softmax(top.values, dim=-1)
        pick = torch.multinomial(probs, num_samples=1).item()
        # Map the position within the top-k back to the vocabulary id.
        return top.indices[pick].item()

    probs = torch.softmax(scaled, dim=-1)
    return torch.multinomial(probs, num_samples=1).item()

def generate_sequence_progen2(
    model,
    tokenizer,
    length: int = 500,
    temperature: float = 0.7,
    top_k: int = 3,
    max_steps: int = 5000,
) -> str:
    """Sample one amino-acid sequence from the model, one token at a time.

    Generation starts from the prompt ``"1"`` and stops when the token ``"2"``
    is sampled (treated as end-of-sequence), when ``length`` residues have been
    collected, or after ``max_steps`` sampling steps, whichever comes first.
    Only single-character tokens found in the module-level ``AA`` set are kept
    in the output; every other sampled token is still fed back to the model.

    Args:
        model: Causal LM called as ``model(input_ids)`` on a 1-D id tensor and
            returning an object with a ``.logits`` attribute.
        tokenizer: Object exposing ``encode(text).ids`` and ``id_to_token(id)``.
        length: Maximum number of residues to return. ``<= 0`` returns ``""``.
        temperature: Sampling temperature forwarded to ``top_k_sample``.
        top_k: Top-k cutoff forwarded to ``top_k_sample``.
        max_steps: Upper bound on sampling steps, so a model that keeps
            emitting non-residue tokens cannot loop forever.

    Returns:
        The generated residues joined into a string (possibly shorter than
        ``length``, possibly empty).
    """
    # Nothing requested: avoid running the model and returning a stray residue.
    if length <= 0:
        return ""

    model_device = next(model.parameters()).device

    prompt = "1"
    ids = tokenizer.encode(prompt).ids
    input_ids = torch.tensor(ids, dtype=torch.long, device=model_device)

    seq_chars = []

    with torch.no_grad():
        for _ in range(max_steps):
            # NOTE: the whole prefix is re-encoded on every step (no KV cache),
            # so cost grows with the square of the generated length.
            logits = model(input_ids).logits[-1, :]  # last-step logits

            # sample next token id
            next_id = top_k_sample(logits, k=top_k, temperature=temperature)

            # extend the context with the sampled token
            input_ids = torch.cat(
                [input_ids, torch.tensor([next_id], dtype=torch.long, device=model_device)],
                dim=0,
            )

            # id_to_token may return None for ids the tokenizer does not know
            # (e.g. when the model's output layer is wider than the vocabulary).
            tok = tokenizer.id_to_token(next_id)
            if tok is None:
                continue

            # Stop if EOS-like '2'
            if tok == "2":
                break

            if len(tok) == 1 and tok in AA:
                seq_chars.append(tok)
                if len(seq_chars) >= length:
                    break

    return "".join(seq_chars)


In [None]:
import itertools
import sys

n_trials        = 1
n_samples       = 500
sequence_length = 500
temperature     = 0.7
top_k           = 3
seed            = 42

# ---------------------------------------------------------------------------
# Generation
# Sampling never looks at the property being scored, so each trial's sequences
# are drawn once and reused for every trait, rather than regenerating the same
# identically seeded batch once per property.
# ---------------------------------------------------------------------------
trial_sequences = []
for trial_idx in range(n_trials):
    # Offset the seed per trial: reusing one seed would make every trial a
    # replica of the first and collapse the across-trial spread to zero.
    # Trial 0 still uses `seed` itself, so single-trial runs are unchanged.
    trial_seed = seed + trial_idx
    random.seed(trial_seed); np.random.seed(trial_seed); torch.manual_seed(trial_seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(trial_seed)

    sequences = []
    for _ in tqdm(
        range(n_samples),
        desc=f"Generating (trial {trial_idx + 1}/{n_trials})",
        dynamic_ncols=True,
        mininterval=0.1,
        leave=False,
        file=sys.stdout,
    ):
        sequences.append(
            generate_sequence_progen2(
                model=model,
                tokenizer=tokenizer,
                length=sequence_length,
                temperature=temperature,
                top_k=top_k,
            )
        )
    trial_sequences.append(sequences)

# ---------------------------------------------------------------------------
# Scoring: one summary line and two CSV files per property
# ---------------------------------------------------------------------------
for property_label in tqdm(
    traits, desc="Properties", dynamic_ncols=True, mininterval=0.2, leave=True, file=sys.stdout
):

    trial_means = []
    trial_stds  = []
    all_scores  = []

    for sequences in trial_sequences:
        # get_score yields NaN for sequences it cannot score; drop those
        # before computing statistics.
        scores_arr = np.asarray(
            [get_score(seq, property_label) for seq in sequences], dtype=float
        )
        scores_arr = scores_arr[~np.isnan(scores_arr)]

        # per-trial stats (sample std; 0.0 when fewer than two valid scores)
        trial_mean = float(np.mean(scores_arr)) if len(scores_arr) else float("nan")
        trial_std  = float(np.std(scores_arr, ddof=1)) if len(scores_arr) > 1 else 0.0

        trial_means.append(trial_mean)
        trial_stds.append(trial_std)
        all_scores.append(scores_arr.tolist())

    # 95% CI (normal approximation)
    if n_trials == 1 and len(all_scores[0]) > 1:
        # Single trial: interval over the individual samples.
        per_sample = np.array(all_scores[0], dtype=float)
        n   = len(per_sample)
        mean_ps = float(np.mean(per_sample))
        std_ps  = float(np.std(per_sample, ddof=1))
        ci95    = 1.96 * (std_ps / np.sqrt(n))
    else:
        # Several trials (or too few samples): interval over the trial means.
        overall_mean = float(np.mean(trial_means)) if trial_means else float("nan")
        overall_std  = float(np.std(trial_means, ddof=1)) if len(trial_means) > 1 else 0.0
        n = sum(len(x) for x in all_scores)
        mean_ps, std_ps = overall_mean, overall_std
        ci95 = 1.96 * (overall_std / np.sqrt(max(n_trials, 1)))

    # outputting results
    print(f"[SUMMARY] {property_label}: mean={mean_ps:.6f} | std={std_ps:.6f} | 95% CI=±{ci95:.6f} | n={n}", flush=True)

    trial_stats = pd.DataFrame({
        'trial': np.arange(1, n_trials+1),
        'mean_score': trial_means,
        'std_score': trial_stds
    })
    trial_stats.to_csv(f"progen2_trial_stats_{property_label}.csv", index=False)

    # every valid score across all trials, one row each
    flat_scores = list(itertools.chain.from_iterable(all_scores))
    pd.DataFrame({'score': flat_scores}).to_csv(f"progen2_generated_scores_{property_label}.csv", index=False)
