# A notebook showcasing use cases of ESM-IF (ESM inverse folding)

In [1]:
import esm
import torch
# Fetch the pretrained ESM-IF1 model together with its alphabet,
# then switch the model to evaluation mode for inference.
model, alphabet = esm.pretrained.esm_if1_gvp4_t16_142M_UR50()
model.eval();  # trailing ';' keeps the returned module from being echoed as cell output

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# download the example structure
# -nc (no-clobber): skip the download when data/7mmo.cif already exists, so
# re-running this cell does not leave duplicates such as data/7mmo.cif.1
!wget -nc https://files.rcsb.org/download/7mmo.cif -P data/

--2025-06-12 22:07:39--  https://files.rcsb.org/download/7mmo.cif
Connecting to 128.59.114.167:3128... connected.
Proxy request sent, awaiting response... 200 OK
Length: unspecified [application/octet-stream]
Saving to: ‘data/7mmo.cif’

7mmo.cif                [ <=>                ] 968.49K  --.-KB/s    in 0.02s   

2025-06-12 22:07:39 (52.9 MB/s) - ‘data/7mmo.cif’ saved [991730]



In [3]:
# Which structure file and which chain to design a sequence for
fpath = 'data/7mmo.cif' # .pdb format is also acceptable
chain_id = 'A'

# Load the selected chain, then extract its coordinates and native sequence.
structure = esm.inverse_folding.util.load_structure(fpath, chain_id)
coords, native_seq = esm.inverse_folding.util.extract_coords_from_structure(structure)

# Label on the first line, sequence on the next.
print('Native sequence:', native_seq, sep='\n')

Native sequence:
ITLKESGPTLVKPTQTLTLTCTFSGFSLSISGVGVGWLRQPPGKALEWLALIYWDDDKRYSPSLKSRLTISKDTSKNQVVLKMTNIDPVDTATYYCAHHSISTIFDHWGQGTLVTVSSASTKGPSVFPLAPCTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTKTYTCNVDHKPSNTKVDKRVHH


# Application 1. Sample a new sequence based on the backbone structure

In [4]:
import numpy as np

# Sampling at temperature=1 is stochastic, so the printed sequence and recovery
# change from run to run. Seed the PyTorch RNG (presumably the RNG that
# model.sample draws from, since esm is built on PyTorch) so that
# Restart & Run All reproduces the same output.
torch.manual_seed(0)

# Draw one new sequence conditioned on the full set of coordinates.
sampled_seq = model.sample(coords, temperature=1)
print('Sampled sequence:', sampled_seq)

# Sequence recovery: fraction of positions where the sampled residue matches
# the native one (zip stops at the shorter sequence if the lengths differ).
recovery = np.mean([(a==b) for a, b in zip(native_seq, sampled_seq)])
print('Sequence recovery:', recovery)

Sampled sequence: TSLVVSGSSLVKPTETLTLVCTYSGWSLTTSGRGVGWLWVAPGAARSALARILWDNLTEYNPDKKASLTISKNTVAHTVTLKKTNIVPTDTATYYCAFHDDTDTETNTSNGLLVTVSASTETGPTVTPIAGPFRTVGCSIDDFYPLTVTVSWNEGADTSGVTVEPSTLQPSGLYHRDANLRIPTWACLTEARECDVIHVPTRTHVVQTVTC
Sequence recovery: 0.4786729857819905


In [7]:
# we can also score and sample sequences conditioned on a partial structure
from copy import deepcopy

# Sampling below is stochastic; seed the PyTorch RNG (presumably the RNG that
# model.sample draws from) so that re-running this cell reproduces its output.
torch.manual_seed(0)

# Work on a copy so the original `coords` stay intact for other cells.
masked_coords = deepcopy(coords)
print('Masked coordinates shape:', masked_coords.shape)
masked_coords[:15] = np.inf # mask the first 15 residues

# Score the native sequence against the partially masked structure.
ll_fullseq, ll_withcoord = esm.inverse_folding.util.score_sequence(model, alphabet, masked_coords, native_seq)

print(f'average log-likelihood on entire sequence: {ll_fullseq:.2f} (perplexity {np.exp(-ll_fullseq):.2f})')
print(f'average log-likelihood excluding missing coordinates: {ll_withcoord:.2f} (perplexity {np.exp(-ll_withcoord):.2f})')

# Sample a new sequence from the partially masked structure.
sampled_seq = model.sample(masked_coords, temperature=1)
print('Sampled sequence with masked coordinates:', sampled_seq)

# Fraction of positions where the sampled residue matches the native one.
recovery = np.mean([(a==b) for a, b in zip(native_seq, sampled_seq)])
print('Sequence recovery:', recovery)

# we can also score sequences conditioned on a partial sequence
# NOTE(review): untested sketch - confirm that score_sequence accepts '-' as a
# mask token before enabling it.
# masked_seq = '-' * 15 + native_seq[15:] # mask the first 15 residues (str is immutable, so build a new string)
# ll_fullseq, ll_withseq = esm.inverse_folding.util.score_sequence(model, alphabet, coords, masked_seq)
# print(f'average log-likelihood on entire sequence: {ll_fullseq:.2f} (perplexity {np.exp(-ll_fullseq):.2f})')
# print(f'average log-likelihood excluding missing sequence: {ll_withseq:.2f} (perplexity {np.exp(-ll_withseq):.2f})')

Masked coordinates shape: (211, 3, 3)
average log-likelihood on entire sequence: -1.44 (perplexity 4.22)
average log-likelihood excluding missing coordinates: -1.33 (perplexity 3.77)
Sampled sequence with masked coordinates: SDKEATFYTVKPLGGMLKLLCKYEGFKLTKQGRGIGFFWQPPGTALAALARIEYNNRKYYEPSLASRLNISKNEELSEVTLTIRDVTPDDTATYYCAYVTSSTSYTFSSKGQLVVVSDAAYTGPKVEPLSPGTLTIGCRVANYFPLAVTVTWNRGTTTAGVKVDPAKLQSSGLYSRTATVTIPTDKCGKTSYTCHITHVPTDTIVDRTITC
Sequence recovery: 0.44075829383886256


# The same application for a multichain structure