In [50]:
import torch
import numpy as np
import psutil
import logging
import time
import random
from helical.models.hyena_dna import HyenaDNA, HyenaDNAConfig
from tqdm import tqdm

## Download the HyenaDNA model

In [3]:
# Describe the `hyenadna-tiny-1k-seqlen-d256` checkpoint, then build the model from that config.
# Constructing HyenaDNA loads the pretrained weights (see the log lines emitted by this cell).
hyena_config = HyenaDNAConfig(model_name="hyenadna-tiny-1k-seqlen-d256")
model = HyenaDNA(configurer=hyena_config)

2026-01-13 21:50:48,649 - INFO:helical.models.hyena_dna.pretrained_model:Loaded pretrained weights ok!
2026-01-13 21:50:48,652 - INFO:helical.models.hyena_dna.model:Model finished initializing.
2026-01-13 21:50:48,652 - INFO:helical.models.hyena_dna.model:'hyenadna-tiny-1k-seqlen-d256' model is in 'eval' mode, on device 'cpu'.


### Set up the device string based on GPU availability

In [20]:
# Prefer CUDA when PyTorch reports an available GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
DEVICE

'cpu'

### Move model to use GPU if possible

In [21]:

# The model is initialised on the CPU (see the load log above); only move the
# wrapped torch module when a GPU was selected.
if DEVICE == "cuda":
    model.model.to(device=DEVICE)

## Download the dataset

In [None]:
from datasets import load_dataset

# Name of the downstream task to keep; compared against the dataset's `task` column.
label = "promoter_tata"

# NOTE(security): trust_remote_code=True lets the dataset repository's loading
# script run locally. Only keep it for repositories you trust.
dataset = load_dataset(
    "InstaDeepAI/nucleotide_transformer_downstream_tasks",
    trust_remote_code=True,
).filter(lambda x: x["task"] == label)  # use `label` instead of repeating the literal

### Understanding the dataset

In [25]:
# Display the filtered DatasetDict to inspect its splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sequence', 'name', 'label', 'task'],
        num_rows: 5509
    })
    test: Dataset({
        features: ['sequence', 'name', 'label', 'task'],
        num_rows: 621
    })
})

In [26]:
# Keep a handle on each split of the filtered DatasetDict.
train_dataset, test_dataset = dataset["train"], dataset["test"]

In [27]:
# Show the column names and dtypes of the training split.
train_dataset.features

{'sequence': Value(dtype='string', id=None),
 'name': Value(dtype='string', id=None),
 'label': Value(dtype='int32', id=None),
 'task': Value(dtype='string', id=None)}

In [28]:
# Pull the raw nucleotide strings out of the training split, then preview the
# first sequence, its length, and the number of sequences.
sequences = train_dataset["sequence"]
(sequences[0], len(sequences[0]), len(sequences))

('CGCTCCCCCAGGAGTGTACTCCTGGTCAAAAGAGCGACATCACACGACGTAGGCCCGCCCGGCTTATCGAAGTCGAGCTGGGATTTGGGGGGGAACCTGACAGTATAGGTTGGGGGCCAGGACATTTATAGAACAACGGGAAAGACCTGCGCCAGCAGCTGAGAAGGAGGCCCCGTGATCAGCTCCAGCCATTTGCCAGCAACCGAAGCCCAGGAGCTTACATAATTTGCCAGGGCAGCACTGAGAGGTGACAGTTAGAGTTAAGTCGCTCTCGGAGCTCCGGGCTACCAGCGATTCTCT',
 300,
 5509)

### Sampling the data

In [45]:
# Number of training sequences used for the benchmarks below.
SAMPLE_SIZE = 1_000
# Take the leading SAMPLE_SIZE sequences (a deterministic prefix, not a random draw).
sample_sequences = sequences[0:SAMPLE_SIZE]
(len(sample_sequences), sample_sequences[0])

(1000,
 'CGCTCCCCCAGGAGTGTACTCCTGGTCAAAAGAGCGACATCACACGACGTAGGCCCGCCCGGCTTATCGAAGTCGAGCTGGGATTTGGGGGGGAACCTGACAGTATAGGTTGGGGGCCAGGACATTTATAGAACAACGGGAAAGACCTGCGCCAGCAGCTGAGAAGGAGGCCCCGTGATCAGCTCCAGCCATTTGCCAGCAACCGAAGCCCAGGAGCTTACATAATTTGCCAGGGCAGCACTGAGAGGTGACAGTTAGAGTTAAGTCGCTCTCGGAGCTCCGGGCTACCAGCGATTCTCT')

In [46]:
# Raise the HyenaDNA model logger to WARNING so INFO messages are not printed
# on every loop iteration.
logging.getLogger(name="helical.models.hyena_dna.model").setLevel(level=logging.WARNING)

## Task 1: Perturbation-based inference (naive)

In [47]:
def add_pertubations(sequence_string: str, num_of_pertubations: int):
  """Return a copy of a nucleotide sequence with random point substitutions.

  Each perturbation picks a position uniformly at random (any index, including
  the last one) and replaces the nucleotide there with a different one drawn
  from A/G/T/C. Positions are drawn independently, so the same index can be hit
  more than once and the result may differ from the input in fewer than
  `num_of_pertubations` places.

  The position comes from `np.random` and the replacement from the stdlib
  `random` module, so both generators must be seeded for reproducible output.

  Args:
    sequence_string: The nucleotide sequence to perturb.
    num_of_pertubations: How many substitution steps to apply.

  Returns:
    The perturbed sequence as a new string of the same length.

  Raises:
    ValueError: If perturbations are requested on an empty sequence.
  """
  nucleotides = ["A", "G", "T", "C"]
  length = len(sequence_string)
  seq_list = list(sequence_string)

  if length == 0 and num_of_pertubations > 0:
    raise ValueError("cannot perturb an empty sequence")

  for _ in range(num_of_pertubations):
    # randomly choose index to perturb on; randint's upper bound is exclusive,
    # so `length` (not `length - 1`) is needed to make the last position reachable
    random_idx = np.random.randint(0, length)

    original_nucleotide = seq_list[random_idx]

    # find the possible perturbations (everything except the current nucleotide)
    possible_pertubations = [n for n in nucleotides if n != original_nucleotide]

    # choose perturbation randomly out of possible_perturbations
    new_nucleotide = random.choice(possible_pertubations)

    # apply the pertubation to mutate
    seq_list[random_idx] = new_nucleotide

  # return perturbed sequence
  return "".join(seq_list)


In [48]:
# Apply a single random substitution to every sampled sequence.
preturbed_sequences = [
    add_pertubations(original, num_of_pertubations=1)
    for original in sample_sequences
]

### Run inference on perturbed sequences

In [51]:
# Naive baseline: embed the perturbed sequences one at a time and record
# per-batch latency plus the total wall-clock time.
perturbed_embeddings = []
latencies = []
BATCH_SIZE = 1
on_cuda = DEVICE == "cuda"
if on_cuda:
  # max_memory_allocated() reports the peak since program start unless reset,
  # so reset it here to make the reported peak specific to this run.
  torch.cuda.reset_peak_memory_stats()
# Resident set size in MB before the run; the profiling cell reports the delta.
start_rss = psutil.Process().memory_info().rss / (1024 * 1024)
overall_start = time.time()
# Iterate over what is actually available rather than SAMPLE_SIZE, so a shorter
# list never yields empty batches.
for i in tqdm(range(0, len(preturbed_sequences), BATCH_SIZE)):
  t_loop_in = time.time()
  # presumably tokenises the batch into a structure exposing "input_ids" — confirm against helical docs
  raw_tokens = model.process_data(preturbed_sequences[i:i + BATCH_SIZE])
  input_ids_tensor = torch.tensor(raw_tokens["input_ids"]).to(DEVICE)

  with torch.no_grad():
    outputs = model.model(input_ids=input_ids_tensor)
    embeddings = outputs
  if on_cuda:
    # CUDA kernels run asynchronously; wait for them so the latency covers the
    # real compute time and not just the kernel launch.
    torch.cuda.synchronize()
  t_loop_out = time.time()
  latencies.append(t_loop_out - t_loop_in)
  # Only tensor outputs are collected; anything else is skipped.
  if isinstance(embeddings, torch.Tensor):
    perturbed_embeddings.append(embeddings)
overall_end = time.time()

100%|██████████| 1000/1000 [00:43<00:00, 22.87it/s]


In [52]:
torch.cat(perturbed_embeddings, dim=0).shape

torch.Size([1000, 302, 256])

### Profiling inference on perturbed sequences (naive)

In [53]:
# Profiling (without Batching)
# Summarise the run above: wall-clock time, mean per-batch latency, throughput,
# growth in process RSS, and peak CUDA memory (0 when no GPU is available).
overall_time = overall_end - overall_start
avg_latency = np.mean(latencies)
throughput = SAMPLE_SIZE/overall_time
end_rss = psutil.Process().memory_info().rss / (1024 * 1024)
peak_gpu = torch.cuda.max_memory_allocated() / (1024 * 1024) if torch.cuda.is_available() else 0

# The RAM line uses `.2f`; the previous `2f` spec set a minimum width of 2 and
# left the precision at its default of six decimals.
print(f"""
------------ Profile : BATCH_SIZE = 1 (NAIVE) ------------
Number of Samples processed: {len(preturbed_sequences)}
Overall time taken to inference: {overall_time:.2f}
Avg Latency per Batch: {1000*avg_latency:.2f} ms
Throughput Processed: {throughput:.2f} samples/s
CPU RAM Usage: {end_rss - start_rss:.2f} MB
GPU Memory Peak: {peak_gpu:.2f} MB
---------------------------------------------------
""")


------------ Profile : BATCH_SIZE = 1 (NAIVE) ------------
Number of Samples processed: 1000
Overall time taken to inference: 43.73
Avg Latency per Batch: 43.61 ms
Throughput Processed: 22.87 samples/s
CPU RAM Usage: 34.640625 MB
GPU Memory Peak: 0.00 MB
---------------------------------------------------



# Task 2: Scaling in-silico perturbation (ISP)

### Optimization 1 : Batching (Batch Size = 32)

In [55]:
# Optimization 1: same pipeline as the naive run, but 32 sequences per forward pass.
# Draw a fresh set of single-substitution perturbations for this run.
preturbed_sequences = [
    add_pertubations(sequence, num_of_pertubations=1)
    for sequence in sample_sequences
]

perturbed_embeddings = []
latencies = []
BATCH_SIZE = 32
on_cuda = DEVICE == "cuda"
if on_cuda:
  # max_memory_allocated() reports the peak since program start unless reset;
  # without this the reported peak would include earlier runs.
  torch.cuda.reset_peak_memory_stats()
# Resident set size in MB before the run; the profiling cell reports the delta.
start_rss = psutil.Process().memory_info().rss / (1024 * 1024)
overall_start = time.time()
# Iterate over what is actually available rather than SAMPLE_SIZE, so a shorter
# list never yields empty batches. The final batch may be smaller than BATCH_SIZE.
for i in tqdm(range(0, len(preturbed_sequences), BATCH_SIZE)):
  t_loop_in = time.time()
  # presumably tokenises the batch into a structure exposing "input_ids" — confirm against helical docs
  raw_tokens = model.process_data(preturbed_sequences[i:i + BATCH_SIZE])
  input_ids_tensor = torch.tensor(raw_tokens["input_ids"]).to(DEVICE)

  with torch.no_grad():
    outputs = model.model(input_ids=input_ids_tensor)
    embeddings = outputs
  if on_cuda:
    # CUDA kernels run asynchronously; wait for them so the latency covers the
    # real compute time and not just the kernel launch.
    torch.cuda.synchronize()
  t_loop_out = time.time()
  latencies.append(t_loop_out - t_loop_in)
  # Only tensor outputs are collected; anything else is skipped.
  if isinstance(embeddings, torch.Tensor):
    perturbed_embeddings.append(embeddings)
overall_end = time.time()

100%|██████████| 32/32 [00:21<00:00,  1.47it/s]


In [56]:
# Profiling (with batching, BATCH_SIZE = 32)
# Summarise the run above: wall-clock time, mean per-batch latency, throughput,
# growth in process RSS, and peak CUDA memory (0 when no GPU is available).
overall_time = overall_end - overall_start
avg_latency = np.mean(latencies)
throughput = SAMPLE_SIZE/overall_time
end_rss = psutil.Process().memory_info().rss / (1024 * 1024)
peak_gpu = torch.cuda.max_memory_allocated() / (1024 * 1024) if torch.cuda.is_available() else 0

# The RAM line uses `.2f`; the previous `2f` spec set a minimum width of 2 and
# left the precision at its default of six decimals.
print(f"""
------------ Profile : BATCH_SIZE = 32 ------------
Number of Samples processed: {len(preturbed_sequences)}
Overall time taken to inference: {overall_time:.2f}
Avg Latency per Batch: {1000*avg_latency:.2f} ms
Throughput Processed: {throughput:.2f} samples/s
CPU RAM Usage: {end_rss - start_rss:.2f} MB
GPU Memory Peak: {peak_gpu:.2f} MB
---------------------------------------------------
""")


------------ Profile : BATCH_SIZE = 32 ------------
Number of Samples processed: 1000
Overall time taken to inference: 21.78
Avg Latency per Batch: 680.29 ms
Throughput Processed: 45.91 samples/s
CPU RAM Usage: 377.171875 MB
GPU Memory Peak: 0.00 MB
---------------------------------------------------



### Optimization 2 : Mixed Precision

In [60]:
from torch.amp import autocast

In [65]:
# Optimization 2: batched inference with the forward pass wrapped in autocast.
use_amp = True
on_cuda = DEVICE == "cuda"

# Draw a fresh set of single-substitution perturbations for this run.
preturbed_sequences = [
    add_pertubations(sequence, num_of_pertubations=1)
    for sequence in sample_sequences
]

perturbed_embeddings = []
latencies = []
BATCH_SIZE = 32
if on_cuda:
    # max_memory_allocated() reports the peak since program start unless reset;
    # without this the reported peak would include earlier runs.
    torch.cuda.reset_peak_memory_stats()
# Resident set size in MB before the run; the profiling cell reports the delta.
start_rss = psutil.Process().memory_info().rss / (1024 * 1024)
overall_start = time.time()
# Iterate over what is actually available rather than SAMPLE_SIZE, so a shorter
# list never yields empty batches. The final batch may be smaller than BATCH_SIZE.
for i in tqdm(range(0, len(preturbed_sequences), BATCH_SIZE)):
    t_loop_in = time.time()
    # presumably tokenises the batch into a structure exposing "input_ids" — confirm against helical docs
    raw_tokens = model.process_data(preturbed_sequences[i:i + BATCH_SIZE])
    input_ids_tensor = torch.tensor(raw_tokens["input_ids"]).to(DEVICE)

    # NOTE(review): float16 autocast on a CPU device may be unsupported (and
    # silently disabled) depending on the PyTorch version; bfloat16 is the
    # documented CPU default — confirm before trusting CPU speed-ups.
    with torch.no_grad(), autocast(DEVICE, enabled=use_amp, dtype=torch.float16):
        outputs = model.model(input_ids=input_ids_tensor)
        embeddings = outputs
    if on_cuda:
        # CUDA kernels run asynchronously; wait for them so the latency covers
        # the real compute time and not just the kernel launch.
        torch.cuda.synchronize()
    t_loop_out = time.time()
    latencies.append(t_loop_out - t_loop_in)
    # Only tensor outputs are collected; anything else is skipped.
    if isinstance(embeddings, torch.Tensor):
        perturbed_embeddings.append(embeddings)
overall_end = time.time()

100%|██████████| 32/32 [00:18<00:00,  1.73it/s]


In [66]:
# Profiling (with batching and mixed precision, BATCH_SIZE = 32)
# Summarise the run above: wall-clock time, mean per-batch latency, throughput,
# growth in process RSS, and peak CUDA memory (0 when no GPU is available).
overall_time = overall_end - overall_start
avg_latency = np.mean(latencies)
throughput = SAMPLE_SIZE/overall_time
end_rss = psutil.Process().memory_info().rss / (1024 * 1024)
peak_gpu = torch.cuda.max_memory_allocated() / (1024 * 1024) if torch.cuda.is_available() else 0

# The RAM line uses `.2f`; the previous `2f` spec set a minimum width of 2 and
# left the precision at its default of six decimals.
print(f"""
------------ Profile : BATCH_SIZE = 32, Mixed Precision ------------
Number of Samples processed: {len(preturbed_sequences)}
Overall time taken to inference: {overall_time:.2f}
Avg Latency per Batch: {1000*avg_latency:.2f} ms
Throughput Processed: {throughput:.2f} samples/s
CPU RAM Usage: {end_rss - start_rss:.2f} MB
GPU Memory Peak: {peak_gpu:.2f} MB
---------------------------------------------------
""")


------------ Profile : BATCH_SIZE = 32, Mixed Precision ------------
Number of Samples processed: 1000
Overall time taken to inference: 18.50
Avg Latency per Batch: 577.73 ms
Throughput Processed: 54.05 samples/s
CPU RAM Usage: 466.703125 MB
GPU Memory Peak: 0.00 MB
---------------------------------------------------

