In [40]:
import torch
from torch.amp import autocast
import numpy as np
import pandas as pd
import psutil
import logging
import time
import random
from helical.models.hyena_dna import HyenaDNA, HyenaDNAConfig
from datasets import load_dataset
from tqdm import tqdm

## Task 1 : Profiling on Naive Inferencing

In [65]:
import torch
import numpy as np
import psutil
import logging
import time
import random
from helical.models.hyena_dna import HyenaDNA, HyenaDNAConfig
from tqdm import tqdm

### Configurations to set before inferencing (for naive inferencing)

In [66]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # for dynamic switching between CPU and GPU based on availability 

MODEL_CONFIG = {
    "model_name": "hyenadna-tiny-1k-seqlen-d256", # Model name to use
    # "model_name": "hyenadna-tiny-1k-seqlen", # Alternate Model name to use
    "batch_size": 1, # Number of sequences to process in a batch
    "use_amp": False, # Use auto mixed precision for optimization
    "amp_dtype": torch.float16 # will use amp_dtype only when use_amp is set to True
}

DATA_CONFIG = {
    "sample_size": 50, # Change if we want a different sample size
    "number_of_perturbations": 1 # Change if we want more perturbations
}

In [83]:
# for benchmarking all the experiments
all_profiles = []

def load_hyena_model():
    """
    Loads the HyenaDNA model based on settings in config.py,
    moves it to the correct device, and returns the model object.
    """
    hyena_config = HyenaDNAConfig(
        model_name = MODEL_CONFIG["model_name"], # HyenaDNA models can be used here
    )
    model = HyenaDNA(configurer=hyena_config)

    # move model to use GPU if possible
    if DEVICE == "cuda":
        model.model.to(DEVICE)
        
    print("Model loaded successfully!")
    
    return model
  
def compare_embeddings(embedding1, embedding2, name1="Embedding 1", name2="Embedding 2"):
    """
    Compares two embedding tensors by calculating Mean Absolute Error and Cosine Similarity.

    Args:
        embedding1 (torch.Tensor): The first embedding tensor.
        embedding2 (torch.Tensor): The second embedding tensor.
        name1 (str): Name for the first embedding for printing.
        name2 (str): Name for the second embedding for printing.
    """
    # Ensure tensors are on the same device (CPU) for comparison and are float32
    emb1_cpu = embedding1.cpu().to(torch.float32)
    emb2_cpu = embedding2.cpu().to(torch.float32)

    # 1. Calculate Mean Absolute Error (MAE)
    mae = torch.mean(torch.abs(emb1_cpu - emb2_cpu))

    # 2. Calculate Cosine Similarity
    # We compute it for each pair of vectors (row-wise) and then average
    cos_sim = torch.nn.functional.cosine_similarity(emb1_cpu, emb2_cpu, dim=1)
    avg_cos_sim = torch.mean(cos_sim)

    print(f"--- Embedding Comparison: {name1} vs. {name2} ---")
    print(f"Mean Absolute Error:     {mae.item():.10f}")
    print(f"Average Cosine Similarity: {avg_cos_sim.item():.10f}")

  
def get_sequences(sample_size: int):
    """Download the promoter_tata dataset and returns a sample of sequences

    Args:
        sample_size (int): Number of samples we want to use
    """
    print("Downloading dataset ...")
    dataset = load_dataset(
        "InstaDeepAI/nucleotide_transformer_downstream_tasks",
        trust_remote_code=True
    ).filter(lambda x: x["task"] == "promoter_tata")
    
    sequences = dataset["train"]["sequence"]
    print(f"Dataset is loaded, we will be taking a sample of {sample_size}")
    return sequences[:sample_size]
  

def add_pertubations(sequence_string: str, num_of_pertubations: int):
  """adds pertubations to a sequence of nucleotides"""
  nucleotides = ["A", "G", "T", "C"]
  length = len(sequence_string)
  seq_list = list(sequence_string)

  for _ in range(num_of_pertubations):
    random_idx = np.random.randint(0, length - 1)

    original_nucleotide = seq_list[random_idx]
    possible_pertubations = [n for n in nucleotides if n != original_nucleotide]
    new_nucleotide = random.choice(possible_pertubations)

    # apply the pertubation to mutate
    seq_list[random_idx] = new_nucleotide

  # return perturbed sequence
  return "".join(seq_list)

def log_inference_profile(
  total_time: float,
  latencies: list,
  num_samples: int,
  start_rss_mb: float
):
  """
  Calculates and logs inference related metrics of the run
  """
  avg_latency = np.mean(latencies) * 1000 # to convert in ms
  throughput = DATA_CONFIG["sample_size"]/total_time
  end_rss_mb = psutil.Process().memory_info().rss / (1024 * 1024)
  
  # get GPU memory if running on GPU
  peak_gpu_mb = 0
  if DEVICE == "cuda" and torch.cuda.is_available():
    peak_gpu_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
    
  log_line = f"""
------------ Inference Profile ------------
Device:                 {DEVICE.upper()}
Model name:             {MODEL_CONFIG["model_name"]}
amp enabled:            {MODEL_CONFIG["use_amp"]}
amp dtype:              {MODEL_CONFIG["amp_dtype"] if MODEL_CONFIG["use_amp"] else "torch.float32 (default)"}
Total Samples:          {num_samples}
Batch Size:             {MODEL_CONFIG["batch_size"]}
---
Total Time:             {total_time:.2f} s
Throughput:             {throughput:.2f} samples/s
Avg. Latency / Batch:   {avg_latency:.2f} ms
---
CPU RAM Usage:          {end_rss_mb - start_rss_mb:.2f} MB
Peak GPU Memory:        {peak_gpu_mb:.2f} MB
---------------------------------------------------
"""
  # Create a dictionary of the results
  profile_results = {
      "Model name":  MODEL_CONFIG["model_name"],
      "Total samples": num_samples,
      "Total time": f"{total_time:.2f} s",
      "Throughput (samples/s)": f"{throughput:.2f}",
      "Avg. Latency (ms/batch)": f"{avg_latency:.2f}",
      "CPU RAM Usage (MB)": f"{end_rss_mb - start_rss_mb:.2f}",
      "Peak GPU Memory (MB)": f"{peak_gpu_mb:.2f}"
  }
  
  # Reset peak memory stats for the next run if needed
  if DEVICE == "cuda" and torch.cuda.is_available():
      torch.cuda.reset_peak_memory_stats()
      
  return log_line, profile_results


def run_hyena_inferencing(model, sequences_to_process: list):
    """runs inferencing on sequences"""
    pertubation_embeddings = []
    latencies = []

    # Inference on pertubations
    overall_start = time.time()
    start_rss = psutil.Process().memory_info().rss / (1024 * 1024)
    BATCH_SIZE = MODEL_CONFIG["batch_size"]
    overall_start = time.time()

    for i in range(0, DATA_CONFIG["sample_size"], BATCH_SIZE):
        t_loop_in = time.time()
        raw_tokens = model.process_data(sequences_to_process[i:i + BATCH_SIZE])
        input_ids_tensor = torch.tensor(raw_tokens["input_ids"]).to(DEVICE)

        with torch.no_grad():
            with autocast(DEVICE, enabled=MODEL_CONFIG["use_amp"], dtype=MODEL_CONFIG["amp_dtype"]):
                outputs = model.model(input_ids=input_ids_tensor)
                embeddings = outputs

        t_loop_out = time.time()
        latencies.append(t_loop_out - t_loop_in)
        
        if isinstance(embeddings, torch.Tensor):
            pertubation_embeddings.append(embeddings)

    total_time = time.time() - overall_start

    # Call the logging function from utils
    run_profile, run_profile_dict = log_inference_profile(
        total_time=total_time,
        latencies=latencies,
        num_samples=len(sequences_to_process),
        start_rss_mb=start_rss
    )
    
    print(run_profile)

    return torch.cat(pertubation_embeddings, dim=0), run_profile_dict

### Profiling the inferencing on perturbated sequences (NAIVE)

In [84]:
time_signature = time.strftime('%Y%m%d-%H%M%S')

sequences = get_sequences(DATA_CONFIG["sample_size"])

perturbed_sequences = []

for sequence in sequences:
    perturbed_sequences.append(add_pertubations(sequence, num_of_pertubations=1))

perturbed_sequences = [add_pertubations(seq, num_of_pertubations=DATA_CONFIG["number_of_perturbations"]) for seq in sequences]

print("Loading Hyena model...")
model = load_hyena_model()

print("Starting inference run on original sequences...")
naive_original_embeddings, naive_original_profile = run_hyena_inferencing(model, sequences)
naive_original_profile["Sequences"] = "Original"
naive_original_profile["Experiment"] = "Naive (BS=1, FP32)"
all_profiles.append(naive_original_profile)

print("Starting inference run on perturbed sequences...")
naive_perturbed_embeddings, naive_perturbed_profile = run_hyena_inferencing(model, perturbed_sequences)
naive_perturbed_profile["Sequences"] = "Perturbed"
naive_perturbed_profile["Experiment"] = "Naive (BS=1, FP32)"
all_profiles.append(naive_perturbed_profile)


print(f"Successfully generated original embeddings of shape: {naive_original_embeddings.shape}")
print(f"Successfully generated perturbed embeddings of shape: {perturbed_embeddings.shape}")

original_embedding_array = np.stack(naive_original_embeddings)
perturbed_embedding_array = np.stack(naive_perturbed_embeddings)

np.save(f"original_embedding_{time_signature}.npy", original_embedding_array)
np.save(f"perturbed_embedding_{time_signature}.npy", original_embedding_array)

Downloading dataset ...


2026-01-14 13:26:27,107 - INFO:helical.models.hyena_dna.pretrained_model:Loaded pretrained weights ok!
2026-01-14 13:26:27,110 - INFO:helical.models.hyena_dna.model:Model finished initializing.
2026-01-14 13:26:27,111 - INFO:helical.models.hyena_dna.model:'hyenadna-tiny-1k-seqlen-d256' model is in 'eval' mode, on device 'cpu'.
2026-01-14 13:26:27,112 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:27,123 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.


Dataset is loaded, we will be taking a sample of 50
Loading Hyena model...
Model loaded successfully!
Starting inference run on original sequences...


2026-01-14 13:26:27,885 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:27,891 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.
2026-01-14 13:26:28,433 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:28,442 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.



------------ Inference Profile ------------
Device:                 CPU
Model name:             hyenadna-tiny-1k-seqlen-d256
amp enabled:            False
amp dtype:              torch.float32 (default)
Total Samples:          50
Batch Size:             32
---
Total Time:             1.32 s
Throughput:             37.93 samples/s
Avg. Latency / Batch:   659.09 ms
---
CPU RAM Usage:          310.94 MB
Peak GPU Memory:        0.00 MB
---------------------------------------------------

Starting inference run on perturbed sequences...


2026-01-14 13:26:29,132 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:29,138 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.



------------ Inference Profile ------------
Device:                 CPU
Model name:             hyenadna-tiny-1k-seqlen-d256
amp enabled:            False
amp dtype:              torch.float32 (default)
Total Samples:          50
Batch Size:             32
---
Total Time:             1.24 s
Throughput:             40.37 samples/s
Avg. Latency / Batch:   619.26 ms
---
CPU RAM Usage:          39.50 MB
Peak GPU Memory:        0.00 MB
---------------------------------------------------

Successfully generated original embeddings of shape: torch.Size([50, 302, 256])
Successfully generated perturbed embeddings of shape: torch.Size([50, 302, 256])


# Task 2: Scale ISP and Optimizations

### Optimization 1 : Batching (Batch Size = 32)

### Configurations to set before inferencing (for batching optimization)

In [85]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # for dynamic switching between CPU and GPU based on availability 

MODEL_CONFIG = {
    "model_name": "hyenadna-tiny-1k-seqlen-d256", # Model name to use
    # "model_name": "hyenadna-tiny-1k-seqlen", # Alternate Model name to use
    "batch_size": 32, # Number of sequences to process in a batch
    "use_amp": False, # Use auto mixed precision for optimization
    "amp_dtype": torch.float16 # will use amp_dtype only when use_amp is set to True
}

DATA_CONFIG = {
    "sample_size": 50, # Change if we want a different sample size
    "number_of_perturbations": 1 # Change if we want more perturbations
}

In [86]:
time_signature = time.strftime('%Y%m%d-%H%M%S')

sequences = get_sequences(DATA_CONFIG["sample_size"])

perturbed_sequences = []

for sequence in sequences:
    perturbed_sequences.append(add_pertubations(sequence, num_of_pertubations=1))

perturbed_sequences = [add_pertubations(seq, num_of_pertubations=DATA_CONFIG["number_of_perturbations"]) for seq in sequences]

print("Loading Hyena model...")
model = load_hyena_model()

print("Starting inference run on original sequences...")
batch_only_original_embeddings, optimization_one_original_profile = run_hyena_inferencing(model, sequences)
optimization_one_original_profile["Sequences"] = "Original"
optimization_one_original_profile["Experiment"] = "Batching Only (BS=32, FP32)"
all_profiles.append(optimization_one_original_profile)

print("Starting inference run on perturbed sequences...")
batch_only_perturbed_embeddings, optimization_one_perturbed_profile = run_hyena_inferencing(model, perturbed_sequences)
optimization_one_perturbed_profile["Sequences"] = "Perturbed"
optimization_one_perturbed_profile["Experiment"] = "Batching Only (BS=32, FP32)"
all_profiles.append(optimization_one_perturbed_profile)

print(f"Successfully generated original embeddings of shape: {batch_only_original_embeddings.shape}")
print(f"Successfully generated perturbed embeddings of shape: {perturbed_embeddings.shape}")

original_embedding_array = np.stack(batch_only_original_embeddings)
perturbed_embedding_array = np.stack(batch_only_perturbed_embeddings)

np.save(f"original_embedding_{time_signature}.npy", original_embedding_array)
np.save(f"perturbed_embedding_{time_signature}.npy", original_embedding_array)

Downloading dataset ...


2026-01-14 13:26:30,564 - INFO:helical.models.hyena_dna.pretrained_model:Loaded pretrained weights ok!
2026-01-14 13:26:30,565 - INFO:helical.models.hyena_dna.model:Model finished initializing.
2026-01-14 13:26:30,565 - INFO:helical.models.hyena_dna.model:'hyenadna-tiny-1k-seqlen-d256' model is in 'eval' mode, on device 'cpu'.
2026-01-14 13:26:30,567 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:30,576 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.


Dataset is loaded, we will be taking a sample of 50
Loading Hyena model...
Model loaded successfully!
Starting inference run on original sequences...


2026-01-14 13:26:31,269 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:31,275 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.
2026-01-14 13:26:31,807 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:31,816 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.



------------ Inference Profile ------------
Device:                 CPU
Model name:             hyenadna-tiny-1k-seqlen-d256
amp enabled:            False
amp dtype:              torch.float32 (default)
Total Samples:          50
Batch Size:             32
---
Total Time:             1.24 s
Throughput:             40.40 samples/s
Avg. Latency / Batch:   618.84 ms
---
CPU RAM Usage:          54.94 MB
Peak GPU Memory:        0.00 MB
---------------------------------------------------

Starting inference run on perturbed sequences...


2026-01-14 13:26:32,490 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:32,496 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.



------------ Inference Profile ------------
Device:                 CPU
Model name:             hyenadna-tiny-1k-seqlen-d256
amp enabled:            False
amp dtype:              torch.float32 (default)
Total Samples:          50
Batch Size:             32
---
Total Time:             1.22 s
Throughput:             41.05 samples/s
Avg. Latency / Batch:   608.98 ms
---
CPU RAM Usage:          48.95 MB
Peak GPU Memory:        0.00 MB
---------------------------------------------------

Successfully generated original embeddings of shape: torch.Size([50, 302, 256])
Successfully generated perturbed embeddings of shape: torch.Size([50, 302, 256])


### Optimization 2 : Mixed Precision (Batch Size = 32)

### Configurations to set before inferencing (for batching optimization)

In [87]:
DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # for dynamic switching between CPU and GPU based on availability 

MODEL_CONFIG = {
    "model_name": "hyenadna-tiny-1k-seqlen-d256", # Model name to use
    # "model_name": "hyenadna-tiny-1k-seqlen", # Alternate Model name to use
    "batch_size": 32, # Number of sequences to process in a batch
    "use_amp": True, # Use auto mixed precision for optimization
    "amp_dtype": torch.float16 # will use amp_dtype only when use_amp is set to True
}

DATA_CONFIG = {
    "sample_size": 50, # Change if we want a different sample size
    "number_of_perturbations": 1 # Change if we want more perturbations
}

In [88]:
time_signature = time.strftime('%Y%m%d-%H%M%S')

sequences = get_sequences(DATA_CONFIG["sample_size"])

perturbed_sequences = []

for sequence in sequences:
    perturbed_sequences.append(add_pertubations(sequence, num_of_pertubations=1))

perturbed_sequences = [add_pertubations(seq, num_of_pertubations=DATA_CONFIG["number_of_perturbations"]) for seq in sequences]

print("Loading Hyena model...")
model = load_hyena_model()

print("Starting inference run on original sequences...")
batch_and_amp_original_embeddings, optimization_two_original_profile = run_hyena_inferencing(model, sequences)
optimization_two_original_profile["Sequences"] = "Original"
optimization_two_original_profile["Experiment"] = "Batching with AMP (BS=32, FP16)"
all_profiles.append(optimization_two_original_profile)

print("Starting inference run on perturbed sequences...")
batch_and_amp_perturbed_embeddings, optimization_two_perturbed_profile = run_hyena_inferencing(model, perturbed_sequences)
optimization_two_perturbed_profile["Sequences"] = "Perturbed"
optimization_two_perturbed_profile["Experiment"] = "Batching with AMP (BS=32, FP16)"
all_profiles.append(optimization_two_perturbed_profile)

print(f"Successfully generated original embeddings of shape: {batch_and_amp_original_embeddings.shape}")
print(f"Successfully generated perturbed embeddings of shape: {batch_and_amp_perturbed_embeddings.shape}")

original_embedding_array = np.stack(batch_and_amp_original_embeddings)
perturbed_embedding_array = np.stack(batch_and_amp_perturbed_embeddings)

np.save(f"original_embedding_{time_signature}.npy", original_embedding_array)
np.save(f"perturbed_embedding_{time_signature}.npy", original_embedding_array)

Downloading dataset ...


2026-01-14 13:26:34,043 - INFO:helical.models.hyena_dna.pretrained_model:Loaded pretrained weights ok!
2026-01-14 13:26:34,045 - INFO:helical.models.hyena_dna.model:Model finished initializing.
2026-01-14 13:26:34,045 - INFO:helical.models.hyena_dna.model:'hyenadna-tiny-1k-seqlen-d256' model is in 'eval' mode, on device 'cpu'.
2026-01-14 13:26:34,048 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:34,057 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.


Dataset is loaded, we will be taking a sample of 50
Loading Hyena model...
Model loaded successfully!
Starting inference run on original sequences...


2026-01-14 13:26:34,651 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:34,658 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.
2026-01-14 13:26:35,002 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:35,011 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.



------------ Inference Profile ------------
Device:                 CPU
Model name:             hyenadna-tiny-1k-seqlen-d256
amp enabled:            True
amp dtype:              torch.float16
Total Samples:          50
Batch Size:             32
---
Total Time:             0.95 s
Throughput:             52.46 samples/s
Avg. Latency / Batch:   476.59 ms
---
CPU RAM Usage:          -104.30 MB
Peak GPU Memory:        0.00 MB
---------------------------------------------------

Starting inference run on perturbed sequences...


2026-01-14 13:26:35,589 - INFO:helical.models.hyena_dna.model:Processing data for HyenaDNA.
2026-01-14 13:26:35,595 - INFO:helical.models.hyena_dna.model:Succesfully prepared the HyenaDNA Dataset.



------------ Inference Profile ------------
Device:                 CPU
Model name:             hyenadna-tiny-1k-seqlen-d256
amp enabled:            True
amp dtype:              torch.float16
Total Samples:          50
Batch Size:             32
---
Total Time:             0.93 s
Throughput:             53.62 samples/s
Avg. Latency / Batch:   466.24 ms
---
CPU RAM Usage:          57.64 MB
Peak GPU Memory:        0.00 MB
---------------------------------------------------

Successfully generated original embeddings of shape: torch.Size([50, 302, 256])
Successfully generated perturbed embeddings of shape: torch.Size([50, 302, 256])


# Summarizing Results

In [89]:
df = pd.DataFrame(all_profiles)
df

Unnamed: 0,Model name,Total samples,Total time,Throughput (samples/s),Avg. Latency (ms/batch),CPU RAM Usage (MB),Peak GPU Memory (MB),Sequences,Experiment
0,hyenadna-tiny-1k-seqlen-d256,50,1.32 s,37.93,659.09,310.94,0.0,Original,"Naive (BS=1, FP32)"
1,hyenadna-tiny-1k-seqlen-d256,50,1.24 s,40.37,619.26,39.5,0.0,Perturbed,"Naive (BS=1, FP32)"
2,hyenadna-tiny-1k-seqlen-d256,50,1.24 s,40.4,618.84,54.94,0.0,Original,"Batching Only (BS=32, FP32)"
3,hyenadna-tiny-1k-seqlen-d256,50,1.22 s,41.05,608.98,48.95,0.0,Perturbed,"Batching Only (BS=32, FP32)"
4,hyenadna-tiny-1k-seqlen-d256,50,0.95 s,52.46,476.59,-104.3,0.0,Original,"Batching with AMP (BS=32, FP16)"
5,hyenadna-tiny-1k-seqlen-d256,50,0.93 s,53.62,466.24,57.64,0.0,Perturbed,"Batching with AMP (BS=32, FP16)"


We can see that BATCHING optimization has reduced the inference time by 1.8x on both original and perturbed data when compared with Naive Inferencing

Also, further applying auto mixed precision (amp) to use FP16 futher reduced the inference time by 1.3x 

Both the optimization combined gives inference time reduction of 2.3x

### Validation of generated embeddings

Comparing original embeddings generated using Naive inferencing with Batch only optimized inferencing

In [90]:
compare_embeddings(naive_original_embeddings, batch_only_original_embeddings, 
                   name1="Naive (BS=1)", name2="Batched (BS=32)")

--- Embedding Comparison: Naive (BS=1) vs. Batched (BS=32) ---
Mean Absolute Error:     0.0000000000
Average Cosine Similarity: 1.0000000000


Comparing original embeddings generated using Naive inferencing with Batch + amp optimized inferencing

In [91]:
compare_embeddings(naive_original_embeddings, batch_and_amp_original_embeddings, 
                   name1="Naive (BS=1)", name2="Batching with AMP (BS=32, FP16)")

--- Embedding Comparison: Naive (BS=1) vs. Batching with AMP (BS=32, FP16) ---
Mean Absolute Error:     0.0004380128
Average Cosine Similarity: 0.9999994040


Perturbed Embedding should be have more differences

In [92]:
compare_embeddings(naive_original_embeddings, naive_perturbed_embeddings, 
                   name1="Naive (BS=1)", name2="Batched (BS=32)")

--- Embedding Comparison: Naive (BS=1) vs. Batched (BS=32) ---
Mean Absolute Error:     0.0186630860
Average Cosine Similarity: 0.9931645989


This suggests that we are able to optimize inference time by multifold factor without close to no compromise on embedding generations