### Imports and installs

In [1]:
!pip install esm
!pip install omegaconf
!pip install torch
!pip install huggingface-hub

Collecting esm
  Downloading esm-3.0.6-py3-none-any.whl.metadata (9.4 kB)
Collecting torch>=2.2.0 (from esm)
  Downloading torch-2.5.0-cp311-none-macosx_11_0_arm64.whl.metadata (28 kB)
Collecting torchvision (from esm)
  Downloading torchvision-0.20.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.1 kB)
Collecting torchtext (from esm)
  Downloading torchtext-0.18.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (7.9 kB)
Collecting transformers (from esm)
  Downloading transformers-4.45.2-py3-none-any.whl.metadata (44 kB)
Collecting einops (from esm)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting biotite==0.41.2 (from esm)
  Downloading biotite-0.41.2-cp311-cp311-macosx_11_0_arm64.whl.metadata (7.1 kB)
Collecting msgpack-numpy (from esm)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting biopython (from esm)
  Downloading biopython-1.84-cp311-cp311-macosx_11_0_arm64.whl.metadata (12 kB)
Collecting scikit-learn (from esm)
  Downloading

In [2]:
import torch
import torch.nn.functional as F
import torch.nn as nn

from model import DNADecoder

from omegaconf import OmegaConf
from torch.nn import TransformerDecoderLayer, TransformerDecoder
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


### Set up models

In [4]:
config = OmegaConf.load("config.yaml")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
choformer = DNADecoder(config).to(device).eval()
choformer.load_state_dict(torch.load("best_model.pth", map_location=device))

esm_model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D").to(device)
esm_tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

  choformer.load_state_dict(torch.load("best_model.pth", map_location=device))
Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# import protein sequences
import pandas as pd
proteins = "/Users/rishabjain/Desktop/Research/choformer/inference/cho_low_exp_prot.csv"

df = pd.read_csv(proteins)

# convert column protein to list of sequences
sequences = df["protein"].tolist()

### Inference

In [12]:
protein_sequences = sequences[:10]
longest_protein_length = max([len(sequence) for sequence in protein_sequences])

protein_tokens = esm_tokenizer(
    protein_sequences,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=longest_protein_length
).to(device)


with torch.no_grad():
    protein_embeddings = esm_model(**protein_tokens).last_hidden_state.squeeze(0)

In [13]:
outputs = choformer.generate(protein_embeddings)
print("GENERATED DNA SEQUENCES:", outputs['generated_sequences'])

GENERATED DNA SEQUENCES: ['ATGTCCTGGACCTCTGCCGACCCCGGGCAGAGCTCCCGCCCCCGCTGCCGCTTCCTGGAGGCCTCCCGCCTGGAGCGCCCCGTGCAGCTGGCCTGCTCCCGCCTGGAGCGCCGCTTCCGCGTGGAGCCCCGCGTCACCTTCCCGCCCGCCGAGCTCTGCTGCCGCCAGCGCCGCCGCCGCCGCGCGGCCTGCGGCCGCCGCGGCCTGGGCTGCGGCGCCCAGCTCAGCGCCAGCCCCGGCACCGGCGGCGTCTTCGGGGCCCCGAGCCCCGGGCGCCGCGCCGGCGCCGGGCCCGGCGGCCGCGCGGCCGGCGCCTGGCCGCCGTGCGCCGCGCTGCTGGTGGGCCGCGGCGAGGACGGCCCCGCCCGGGCCACCCTGGGCGCCGGCACCTCCCGCCCCCGGGACCCCGGGCCCTGGACCGCCGCCCGCCCCTGGCACGTGCGCCGCAGCCTGGAGGCCGCCTGGCCGCGCCTGGCCGGCCTGGCCCCGGCGGAGGCCGGCTGCGCCCGCCGCCGCGTCTGCCCGCGCGTCTCCCCCTCCCCCTTCTGCGGCAGCAACGCCTGCCTGCCCGGCGGCGCCCGGCAGCAGGTGCCCGGGGTCTGCGCCGCCCGGCAGCTCTCCTCCTCCAGCCAGCGCCGCAGCGAGGCCCTGGCCGGGGCCCCCCTGGACAATGCCCCCAAGGAGTACCCCCCCAAGATCCAGCAGCTGGTGCAGGACATTGCCAGCCTGACCCTGCTGGAGATCAGTGACCTGAATGAGCTGCTGAAGAAGACCCTGAAGATCCAGGATGTGGGCCTGATGCCCATGGGGGGCATGATGCCTGGGGCTGTGCCTGCCGCCGCCGCCGCCGCCCCCGAGGTGGCTGAGGGGGAGGACATCCCCAAGCAGAAGGAGCGCACCCACTTTACAGTGCGCCTGACAGAGGCCAAGCCTGTGGACAAGGTGAAGCTGATCAAGGAGATCAAGAATTATG