# 1. Installs and Dependencies

In [None]:
# Before we begin, we need to install prodigal, which we will use later
# to predict protein-coding genes from our generated DNA sequences.
# We install prodigal through conda, which might take a few minutes...
!pip install -q condacolab # -> condacolab allows us to install with conda
import condacolab
condacolab.install()
!conda install -c bioconda prodigal

[0m✨🍰✨ Everything looks OK!
Channels:
 - bioconda
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): - \ | / - \ | / - done
Solving environment: | / - done


    current version: 23.11.0
    latest version: 24.1.2

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.



In [None]:
# Python dependencies used below: the Together API client, accelerate,
# biopython / biotite for sequence and structure handling, and py3Dmol for
# visualizing protein structures.
# NOTE(review): none of these are version-pinned, so a fresh run may pull
# different releases than the ones this notebook was last executed with.
!pip install together accelerate biopython biotite py3Dmol
# flash-attention is installed straight from the repository's default branch
# (the recorded run resolved commit 36587c01cb4390de0a590b2131e3fcc4859ba09c).
# NOTE(review): consider pinning that commit for reproducibility.
!pip install git+https://github.com/Dao-AILab/flash-attention.git

[0mCollecting git+https://github.com/Dao-AILab/flash-attention.git
  Cloning https://github.com/Dao-AILab/flash-attention.git to /tmp/pip-req-build-dg7rpztl
  Running command git clone --filter=blob:none --quiet https://github.com/Dao-AILab/flash-attention.git /tmp/pip-req-build-dg7rpztl
  Resolved https://github.com/Dao-AILab/flash-attention.git to commit 36587c01cb4390de0a590b2131e3fcc4859ba09c
  Running command git submodule update --init --recursive -q
  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import torch
#import py3Dmol # -> used to visualize protein structures
import together # -> to call the API

# ↓ Tools to process DNA and protein data
import biotite.structure.io as bsio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# ↓ To load ESMFold from HuggingFace, which we use to predict protein foldings
from transformers import (
  AutoTokenizer,
  EsmForProteinFolding,
  set_seed
)
from torch.cuda.amp import autocast, GradScaler

from model import StripedHyena_embedding

# Let's also enable TensorFloat32 computation for some speedups:
# this flag allows CUDA matrix multiplications to use TF32, which (per the
# PyTorch CUDA notes) trades some float32 precision for throughput on GPUs
# that support it.
torch.backends.cuda.matmul.allow_tf32 = True

In [None]:
# NOTE(review): biopython is already listed in the earlier
# `!pip install together accelerate biopython ...` cell, so this install looks
# redundant — confirm before removing it.
pip install biopython

[0m

In [None]:
# Setup GPU
# Prefer CUDA when PyTorch can see a GPU, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device.type == 'cuda')  # Should print True if a GPU is available

True


# 2. Set up model

## Download model

In [None]:
# The model was downloaded manually from Hugging Face; it is loaded locally
# through Google Drive.
from google.colab import drive
import json
from configuration_hyena import StripedHyenaConfig

# Locations of the checkpoint and its configuration inside the mounted Drive.
model_path = '/content/drive/My Drive/Evo/pytorch_model.pt'
config_path = '/content/drive/My Drive/Evo/config.json'

# Mount Drive so the paths above resolve, then build the model configuration
# from the config file.
drive.mount('/content/drive')
config = StripedHyenaConfig.from_original_config(config_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
None


In [None]:
# Load embedding model
# Instantiate the embedding variant of the model from the config loaded above.
Evo_embedding_model = StripedHyena_embedding(config)

# Read the original checkpoint onto the CPU first; the model is moved to the
# target device only after its weights are in place.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source (consider weights_only=True if the file holds only tensors).
original_state_dict = torch.load(model_path, map_location='cpu')

# Drop the checkpoint entries that the embedding model does not use.
keys_to_remove = ["unembed.weight"]
for key in keys_to_remove:
    original_state_dict.pop(key, None)

# strict=False tolerates key mismatches between checkpoint and model, so
# surface them explicitly instead of letting a partial load pass silently.
load_result = Evo_embedding_model.load_state_dict(original_state_dict, strict=False)
if load_result.missing_keys:
    print(f"Missing keys (left at their initial values): {load_result.missing_keys}")
if load_result.unexpected_keys:
    print(f"Unexpected keys (ignored): {load_result.unexpected_keys}")

Evo_embedding_model.eval()  # Set the model to evaluation mode
Evo_embedding_model.to(device) # Send model to GPU


# 3. Get embeddings - short sequence

## Tokenize

In [None]:
# NOTE(review): ByteTokenizer is not imported in any of the cells above —
# confirm which module provides it and import it before this cell, otherwise
# a fresh "Restart & Run All" fails here with a NameError.
tokenizer = ByteTokenizer()

# Example DNA sequence
dna_sequence = "atgattgcacgcatcaacgactggtttgaggaagtgaaagctaagcgcggcaagcgcccgacagccttccagttcctgcaagaaatcaagccggaagccgtagcgtacatcaccattaagaccactctggcttgcctaaccagtgctgacaatacaaccgttcaggctgtagcaa"

# Tokenize the DNA sequence, asking for PyTorch tensors ("pt")
encoded = tokenizer.encode_plus(dna_sequence, return_tensors="pt")

# Keep only the token IDs and move them to the selected device
z = encoded["input_ids"].to(device)

# Sanity check: shape and dtype of the token-ID tensor
print(z.shape)
print(z.dtype)

torch.Size([1, 175])
torch.int64


## Get embedding

In [None]:
# Get embedding
# Run the forward pass under torch.no_grad(): eval() alone does not switch off
# autograd, and tracking gradients for a pure inference call only keeps extra
# activations alive on the GPU.
# The first element of the model's return value is kept and copied to the CPU.
with torch.no_grad():
    embeddings = Evo_embedding_model(z)[0].to('cpu')

print(embeddings)
print(embeddings.shape)

# 4. Get embeddings - FASTA

In [None]:
# Load fasta file
path_fasta = '/content/drive/My Drive/Evo/test_seq.fa'

# Only the first record of the FASTA file is read; the handle is closed as
# soon as that record has been parsed.
with open(path_fasta, 'r') as fasta:
    sequence_record = next(SeqIO.parse(fasta, 'fasta'))

# Convert the record's sequence to a plain string and show it.
dna_sequence = str(sequence_record.seq)
print(dna_sequence)

ttgagatcctttttttctgcgcgtaatctgctgcttgcaaacaaaaaaaccaccgctaccagcggtggtttgtttgccggatcaagagctaccaactctttttccgaaggtaactggcttcagcagagcgcagataccaaatactgttcttctagtgtagccgtagttaggccaccacttcaagaactctgtagcaccgcctacatacctcgctctgctaatcctgttaccagtggctgctgccagtggcgataagtcgtgtcttaccgggttggactcaagacgatagttaccggataaggcgcagcggtcgggctgaacggggggttcgtgcacacagcccagcttggagcgaacgacctacaccgaactgagatacctacagcgtgagctatgagaaagcgccacgcttcccgaagggagaaaggcggacaggtatccggtaagcggcagggtcggaacaggagagcgcacgagggagcttccagggggaaacgcctggtatctttatagtcctgtcgggtttcgccacctctgacttgagcgtcgatttttgtgatgctcgtcaggggggcggagcctatggaaaaacgccagcaacgcggcctttttacggttcctggccttttgctggccttttgctcacatgttctttcctgcgttatcccctgattctgtggataaccgtattaccgcctttgagtgagctgataccgctcgccgcagccgaacgaccgagcgcagcgagtcagtgagcgaggaagcggaagagcgcccaatacgcatgcttaagttattggtatgactggttttaagcgcaaaaaaagttgctttttcgtacctattaatgtatcgttagaaaaccgactgtaaaaagtacagtcggcattatctcatattataaaagccagtcattaggcctatctgacaattcctgaatagagttcataaacaatcctgcatgataaccatcacaaacagaatgatgtacctgtaaaga

In [None]:
# Tokenize the sequence read from the FASTA file and move the token IDs to the
# selected device.
tokenizer = ByteTokenizer()
encoded = tokenizer.encode_plus(dna_sequence, return_tensors="pt")
z = encoded["input_ids"].to(device)
print(z.shape)

# Forward pass under torch.no_grad(): eval() alone does not switch off
# autograd, and for a long sequence the tracked activations cost a lot of GPU
# memory that inference never needs.
# The first element of the model's return value is kept and copied to the CPU.
with torch.no_grad():
    embeddings = Evo_embedding_model(z)[0].to('cpu')

print(embeddings)
print(embeddings.shape)


# Clear RAM

In [None]:
import gc

# Drop the references to the tokenizer output and the input tensor so the
# memory behind them can be reclaimed.
del encoded
del z

# Run a garbage-collection pass, then clear PyTorch's CUDA cache.
gc.collect()
torch.cuda.empty_cache()

# Report the CUDA memory state after the cleanup.
print(torch.cuda.memory_summary())


|                  PyTorch CUDA memory summary, device ID 0                 |
|---------------------------------------------------------------------------|
|            CUDA OOMs: 1            |        cudaMalloc retries: 1         |
|        Metric         | Cur Usage  | Peak Usage | Tot Alloc  | Tot Freed  |
|---------------------------------------------------------------------------|
| Allocated memory      |  34075 MiB |  36348 MiB |  65788 MiB |  31713 MiB |
|       from large pool |  34051 MiB |  36324 MiB |  65762 MiB |  31710 MiB |
|       from small pool |     23 MiB |     24 MiB |     26 MiB |      2 MiB |
|---------------------------------------------------------------------------|
| Active memory         |  34075 MiB |  36348 MiB |  65788 MiB |  31713 MiB |
|       from large pool |  34051 MiB |  36324 MiB |  65762 MiB |  31710 MiB |
|       from small pool |     23 MiB |     24 MiB |     26 MiB |      2 MiB |
|---------------------------------------------------------------