In [None]:
!pip install -q esm biopython tqdm

import pandas as pd
import numpy as np
import torch
import random
from tqdm import tqdm, trange
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# ESM3
from esm.models.esm3 import ESM3
from esm.sdk.api import ESMProtein, GenerationConfig

# Seed every RNG used by the notebook (Python, NumPy, PyTorch CPU and, when
# present, all CUDA devices).
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    device = "cuda"
else:
    device = "cpu"

print("Device:", device)

In [None]:
# Sequence properties scored in this notebook; every name here is a key
# understood by get_score().
traits = [
    "charge_pH7",
    "gravy",
    "aromaticity",
    "instability_index",
    "mol_weight",
    "iso_point",
]

# Subset of the traits above that are flagged as "steer down".
steer_down_traits = {
    "aromaticity",
    "instability_index",
}


def is_steer_down(prop: str) -> bool:
    """Return True when `prop` is one of the traits listed in `steer_down_traits`."""
    flagged = prop in steer_down_traits
    return flagged

def get_score(seq: str, trait_name: str):
    """Score one protein sequence on a single named Biopython property.

    Args:
        seq: Sequence string handed to ``ProteinAnalysis``.
        trait_name: One of "charge_pH7", "gravy", "aromaticity",
            "instability_index", "mol_weight" or "iso_point".

    Returns:
        The value returned by the matching ``ProteinAnalysis`` method
        (charge is evaluated at pH 7.0), or ``np.nan`` when the trait name is
        not recognised or anything raises while building the analysis or
        computing the property.
    """
    try:
        analysis = ProteinAnalysis(seq)
        if trait_name == "charge_pH7":
            return analysis.charge_at_pH(7.0)
        if trait_name == "gravy":
            return analysis.gravy()
        if trait_name == "aromaticity":
            return analysis.aromaticity()
        if trait_name == "instability_index":
            return analysis.instability_index()
        if trait_name == "mol_weight":
            return analysis.molecular_weight()
        if trait_name == "iso_point":
            return analysis.isoelectric_point()
        # Unknown trait: route it through the same catch-all as every other failure.
        raise KeyError(trait_name)
    except Exception:
        # Best-effort scoring: any failure is reported as a missing value.
        return np.nan

In [None]:
# Authenticate this session with the Hugging Face Hub. login() is called with
# no arguments, so no token is hard-coded in the notebook.
# NOTE(review): presumably needed because the ESM3 weights loaded in the next
# cell are gated on the Hub — confirm.
from huggingface_hub import login
login()


In [None]:
# Load the pretrained ESM3 checkpoint, move it to the selected device and
# switch it to eval mode before any generation is run.
esm3_model_id = "esm3_sm_open_v1"
model = ESM3.from_pretrained(esm3_model_id)
model = model.to(device)
model.eval()
print("Loaded ESM3:", esm3_model_id)

In [None]:

import sys, os, io, contextlib
import itertools
import pandas as pd
import numpy as np
import torch
import random
from tqdm.auto import tqdm

# experiment params
n_trials = 1           # repetitions of the full sampling run
n_samples = 500        # sequences generated per trial
sequence_length = 500  # number of masked positions handed to ESM3
temperature = 0.7      # sampling temperature passed to GenerationConfig
num_steps = 8          # ESM3 unmasking iterations
seed = 42              # RNG seed applied before generation

def generate_sequence_esm3(model, length: int, temperature: float = 0.7, num_steps: int = 8) -> str:
    """Generate one sequence from a fully masked prompt.

    Args:
        model: Object exposing ``generate(protein, config)``.
        length: Number of ``"_"`` characters in the masked prompt.
        temperature: Temperature forwarded to ``GenerationConfig``.
        num_steps: Number of steps forwarded to ``GenerationConfig``.

    Returns:
        The ``sequence`` attribute of whatever ``model.generate`` returns.
    """
    masked_prompt = ESMProtein(sequence="_" * length)
    # Only the sequence track is generated.
    config = GenerationConfig(track="sequence", num_steps=num_steps, temperature=temperature)
    generated = model.generate(masked_prompt, config)
    return generated.sequence

def generate_sequence_esm3_silent(model, length: int, temperature: float = 0.7, num_steps: int = 8) -> str:
    """Run generate_sequence_esm3() with stdout and stderr redirected to throwaway buffers.

    Takes the same arguments and returns the same value as
    generate_sequence_esm3(); anything written to ``sys.stdout`` or
    ``sys.stderr`` during the call is captured and discarded.
    """
    stdout_sink = io.StringIO()
    stderr_sink = io.StringIO()
    with contextlib.redirect_stdout(stdout_sink):
        with contextlib.redirect_stderr(stderr_sink):
            sequence = generate_sequence_esm3(
                model,
                length=length,
                temperature=temperature,
                num_steps=num_steps,
            )
    return sequence

#  main loop
# Stage 1 - generation.
# Generation never looks at the trait being scored, so each trial's sequences
# are generated once and then scored for every trait, rather than re-running
# the expensive generation once per trait from the same seed.
generated_by_trial = []
for trial_idx in range(n_trials):
    # Give every trial its own seed. Re-seeding each trial with the same value
    # would make the trials replicas of one another, so the between-trial
    # spread would measure nothing. Trial 0 keeps the base `seed`.
    trial_seed = seed + trial_idx
    random.seed(trial_seed); np.random.seed(trial_seed); torch.manual_seed(trial_seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(trial_seed)

    sample_bar = tqdm(
        range(n_samples),
        desc=f"Generating (trial {trial_idx + 1}/{n_trials})",
        dynamic_ncols=True, mininterval=0.1, leave=False, file=sys.stdout,
    )
    generated_by_trial.append([
        generate_sequence_esm3_silent(model, length=sequence_length, temperature=temperature, num_steps=num_steps)
        for _ in sample_bar
    ])

# Stage 2 - scoring, summary statistics and CSV export, one trait at a time.
for property_label in tqdm(traits, desc="Properties", dynamic_ncols=True, mininterval=0.2, leave=True, file=sys.stdout):
    trial_means, trial_stds, all_scores = [], [], []

    for sequences in generated_by_trial:
        # get_score() reports any scoring failure as NaN; those are dropped here.
        arr = np.asarray([get_score(seq, property_label) for seq in sequences], dtype=float)
        arr = arr[~np.isnan(arr)]

        # per-trial stats (sample std needs at least two valid scores)
        trial_mean = float(np.mean(arr)) if len(arr) else float("nan")
        trial_std  = float(np.std(arr, ddof=1)) if len(arr) > 1 else 0.0
        trial_means.append(trial_mean); trial_stds.append(trial_std); all_scores.append(arr.tolist())

    if n_trials == 1 and all_scores and len(all_scores[0]) > 1:
        # Single trial: the CI comes from the per-sample spread (normal approximation).
        per_sample = np.array(all_scores[0], dtype=float)
        n    = len(per_sample)
        mean_ps = float(np.mean(per_sample))
        std_ps  = float(np.std(per_sample, ddof=1))
        ci95    = 1.96 * (std_ps / np.sqrt(n))
    else:
        # Several trials (or too few valid samples): the CI comes from the spread of trial means.
        overall_mean = float(np.mean(trial_means)) if trial_means else float("nan")
        overall_std  = float(np.std(trial_means, ddof=1)) if len(trial_means) > 1 else 0.0
        n = sum(len(x) for x in all_scores) if all_scores else 0
        mean_ps, std_ps = overall_mean, overall_std
        ci95 = 1.96 * (overall_std / np.sqrt(max(n_trials, 1))) if n_trials > 0 else 0.0

    print(f"[SUMMARY] {property_label}: mean={mean_ps:.6f} | std={std_ps:.6f} | 95% CI=±{ci95:.6f} | n={n}", flush=True)

    # One CSV with per-trial statistics and one with every valid per-sample score.
    pd.DataFrame({'trial': np.arange(1, n_trials+1), 'mean_score': trial_means, 'std_score': trial_stds}).to_csv(f"esm3_trial_stats_{property_label}.csv", index=False)
    flat_scores = list(itertools.chain.from_iterable(all_scores))
    pd.DataFrame({'score': flat_scores}).to_csv(f"esm3_generated_scores_{property_label}.csv", index=False)


Built with ESM.
https://huggingface.co/EvolutionaryScale/esm3-sm-open-v1
https://github.com/evolutionaryscale/esm?tab=readme-ov-file#esm-3-