### Imports and installs

In [2]:
!pip install esm
!pip install omegaconf



In [6]:
import torch
import torch.nn.functional as F
import torch.nn as nn

from model import DNADecoder

from omegaconf import OmegaConf
from torch.nn import TransformerDecoderLayer, TransformerDecoder
from transformers import AutoModel, AutoTokenizer



### Set up models

In [7]:
config = OmegaConf.load("/content/config.yaml")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
choformer = DNADecoder(config).to(device).eval()
choformer.load_state_dict(torch.load("/content/best_model.pth", map_location=device))

esm_model = AutoModel.from_pretrained("facebook/esm2_t6_8M_UR50D").to(device)
esm_tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")

  choformer.load_state_dict(torch.load("/content/best_model.pth", map_location=device))
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/31.4M [00:00<?, ?B/s]

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t6_8M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/95.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/93.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



### Inference

In [9]:
protein_sequences = ["MTEYKLV", "QEWR"]
longest_protein_length = max([len(sequence) for sequence in protein_sequences])

protein_tokens = esm_tokenizer(
    protein_sequences,
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=longest_protein_length
).to(device)


with torch.no_grad():
    protein_embeddings = esm_model(**protein_tokens).last_hidden_state.squeeze(0)

In [92]:
# dna_sequences = [len(seq)*"AGC" for seq in protein_sequences]
# longest_dna_sequence = max([len(sequence) for sequence in dna_sequences])

# dna_tokens = [
#     tokenizer.encode([seq], max_length=longest_protein_length).to(device)
#     for seq in dna_sequences
# ]
# dna_tokens = torch.stack(dna_tokens).squeeze(1)

In [10]:
outputs = choformer.generate(protein_embeddings)
outputs

{'logits': tensor([[[12.9560,  0.8540, -1.3136,  ..., -0.3375, -0.6853,  1.0808],
          [-1.8309, -0.0448, -2.4459,  ..., -1.9548, -1.3534,  1.4383],
          [ 0.5082,  0.6656, -3.9410,  ..., -2.0489, -2.6285,  1.5828],
          ...,
          [-0.3718,  0.3638, -2.4063,  ..., -2.2504, -2.8857,  1.6205],
          [-1.8982, -0.3922, -2.8095,  ..., -2.2006, -1.5473,  0.9988],
          [-4.1325,  0.0435, -5.8176,  ..., -4.3954, -3.9615,  1.9043]],
 
         [[10.8750,  0.7154, -2.6606,  ..., -1.3456, -1.7118,  2.1060],
          [-1.8617,  0.3270, -5.3284,  ..., -4.0618, -3.4859,  2.8907],
          [-1.3244,  0.0225, -5.7593,  ..., -3.9842, -4.0113,  2.9483],
          ...,
          [-1.4084,  0.1900, -5.2027,  ..., -3.8248, -3.5365,  2.7491],
          [-3.1041,  0.0399, -5.6730,  ..., -4.4360, -3.7767,  2.8985],
          [-3.8797, -0.3976, -6.1462,  ..., -4.5827, -4.2466,  2.3695]]],
        grad_fn=<ViewBackward0>),
 'generated_sequences': ['ATGACAGAATATAAAAAG', 'CAGGAGTGG

In [11]:
print("GENERATED DNA SEQUENCES:", outputs['generated_sequences'])
#print("PERPLEXITY:", torch.exp(outputs['loss']).item())

GENERATED DNA SEQUENCES: ['ATGACAGAATATAAAAAG', 'CAGGAGTGGAGGNNNGAG']
