In [1]:
import torch
import torch.nn as nn
import numpy as np
import random
from typing import Dict, List, Tuple, Optional, Any
from dataclasses import dataclass, asdict
from collections import defaultdict
import json
import time
import math
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging

In [2]:
#Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
@dataclass
class SearchSpace:
  # Defines the neural architecture search space for transformers
  embed_dims: List[int] = None
  num_heads: List[int] = None
  num_layers: List[int] = None
  ffn_ratios: List[float] = None
  dropout_rates: List[float] = None
  activation_functions: List[str] = None
  attention_types: List[str] = None
  layer_types: List[str] = None

  def __post_init__(self):
    if self.embed_dims is None:
      self.embed_dims = [256, 384, 512, 768, 1024]
    if self.num_heads is None:
      self.num_heads = [4, 6, 8, 12, 16]
    if self.num_layers is None:
      self.num_layers = [6, 8, 12, 16, 24]
    if self.ffn_ratios is None:
      self.ffn_ratios = [2.0, 3.0, 4.0, 5.33]
    if self.dropout_rates is None:
      self.dropout_rates = [0.0, 0.1, 0.2, 0.3]
    if self.activation_functions is None:
      self.activation_functions = ['relu', 'gelu', 'silu', 'mish']
    if self.attention_types is None:
      self.attention_types = ['standard', 'linear', 'performer', 'linformer']
    if self.layer_types is None:
      self.layer_types = ['transformer', 'mamba', 'hybrid']

In [4]:
@dataclass
class Architecture:
  #Represents a specific transformer architecture
  embed_dim: int
  num_heads: int
  num_layers: int
  ffn_ratio: float
  dropout_rate: float
  activation_function: str
  attention_type: str
  layer_type: str

  def __hash__(self) -> int:
    return hash(tuple(asdict(self).values()))

  def to_dict(self):
    return asdict(self)

  @classmethod
  def from_dict(cls, d):
    return cls(**d)

In [5]:
class HardwareProfiler:
  #Hardware aware optimization profiler

  def __init__(self):
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.memory_limit = self._get_memory_limit()
    self.compute_capability = self._get_compute_capability()

  def _get_memory_limit(self) -> int:
    #Get GPU memory limit in MB
    if torch.cuda.is_available():
      return int(torch.cuda.get_device_properties(0).total_memory / 1024**2)
    return 8192 # Default 8GB for CPU

  def _get_compute_capability(self) -> float:
    #Estimate compute capability
    if torch.cuda.is_available():
      props = torch.cuda.get_device_properties(0)
      return props.major * props.minor * 0.1
    return 3.5 #Default capability

  def estimate_memory_usage(self, arch: Architecture, seq_len: int = 512, batch_size: int = 8) -> float:
    #Estimate memory usage in MB for given architecture
    #Simplified memory estimation
    embed_params = arch.embed_dim * 50000 #Vocab size approximation
    attention_params = arch.num_layers * arch.num_heads * arch.embed_dim * arch.embed_dim
    ffn_params = arch.num_layers * arch.embed_dim * int(arch.embed_dim * arch.ffn_ratio) * 2

    total_params = embed_params + attention_params + ffn_params
    param_memory = total_params * 4 / 1024*2 #4 bytes per float 32

    #Activation memory
    activation_memory = batch_size * seq_len * arch.embed_dim * arch.num_layers * 8 / 1024**2

    return param_memory + activation_memory

  def estimate_latency(self, arch: Architecture, seq_len: int = 512) -> float:
    #Estimate inference latency in milliseconds
    #simplified latency model
    base_latency = 10.0 # Base overhead
    layer_latency = arch.num_layers * 2.0
    attention_latency = arch.num_heads * seq_len * 0.001
    ffn_latency = arch.ffn_ratio * 1.5

    total_latency = base_latency + layer_latency + attention_latency + ffn_latency

    #Hardware specific scaling
    if self.compute_capability >= 7.0:
      total_latency *= 0.8 #Newer GPUs are potentially faster
    return total_latency

In [6]:
class AttentionModule(nn.Module):
  #Configurable attention module supporting diff attention types
  def __init__(self, embed_dim: int, num_heads: int, attention_type: str = 'standard'):
    super(AttentionModule, self).__init__()
    self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.attention_type = attention_type
    self.head_dim = embed_dim // num_heads

    self.qkv = nn.Linear(embed_dim, embed_dim * 3, bias=False)
    self.proj = nn.Linear(embed_dim, embed_dim)

    if attention_type == 'Linear':
      self.feature_map = nn.ReLU()
    elif attention_type == 'Linformer':
      self.k_proj = nn.Linear(embed_dim, 256) #Reduce sequence dimension
      self.v_proj = nn.Linear(embed_dim, 256)

  def forward(self, x):
    B, N, C = x.shape
    qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim ).permute(2, 0, 3, 1, 4)
    q, k, v = qkv.unbind(0)

    if self.attention_type == 'standard':
      attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5)
      attn = attn.softmax(dim=1)
      x = (attn @ v).transpose(1, 2).reshape(B, N, C)
    elif self.attention_type == 'Linear':
      q, k = self.feature_map(q), self.feature_map(k)
      kv = k.transpose(-2, -1) @ v
      x = (q @ kv).transpose(1, 2).reshape(B, N, C)
    elif self.attention_type == 'Linformer':
      k = self.k_proj(k.transpose(-2, -1)).transpose(-2, -1)
      v = self.v_proj(v.transpose(-2, -1)).transpose(-2, -1)
      attn = (q @ k.transpose(-2, -1)) * (self.head_dim**-0.5)
      attn = attn.softmax(dim=1)
      x = (attn @ v).transpose(1, 2).reshape(B, N, C)
    else:
       #performer approximation
       q, k = torch.nn.functional.normalize(q, dim=-1), torch.nn.functional.normalize(k, dim=-1)
       kv = k.transpose(-2, -1) @ v
       x = (q @ kv).transpose(1, 2).reshape(B, N, C)


    return self.proj(x)


In [7]:
class TransformerBlock(nn.Module):
  #Transformer block with configurable attention

  def __init__(self, embed_dim: int, num_heads: int, ffn_ratio: float, dropout_rate: float, activation: str, attention_type: str = 'standard'):
    super().__init__()
    self.normal = nn.LayerNorm(embed_dim)
    self.attn = AttentionModule(embed_dim, num_heads, attention_type)
    self.norm2 = nn.LayerNorm(embed_dim)

    ffn_dim = int(embed_dim * ffn_ratio)
    if activation == 'relu':
      act_fn = nn.ReLU()
    elif activation == 'gelu':
      act_fn = nn.GELU()
    elif activation == 'swish':
      act_fn = nn.SiLU()
    elif activation == 'mish':
      act_fn = nn.Mish()

    self.mlp = nn.Sequential(
      nn.Linear(embed_dim, ffn_dim),
      act_fn,
      nn.Dropout(dropout_rate),
      nn.Linear(ffn_dim, embed_dim),
      nn.Dropout(dropout_rate)
    )

  def forward(self, x):
    x = x + self.attn(self.normal(x))
    x = x + self.mlp(self.norm2(x))
    return x



In [8]:
class ConfigurableTransformer(nn.Module):
# Configurable transformer model

  def __init__(self, arch: Architecture, vocab_size: int = 50000, max_seq_len: int = 512):
    super().__init__()
    self.arch = arch
    self.vocab_size = vocab_size

    self.embed = nn.Embedding(vocab_size, arch.embed_dim)
    self.pos_embed = nn.Parameter(torch.randn(1, max_seq_len, arch.embed_dim))
    self.dropout = nn.Dropout(arch.dropout_rate)

    self.blocks = nn.ModuleList([
        TransformerBlock(
            arch.embed_dim, arch.num_heads, arch.ffn_ratio,
            arch.dropout_rate, arch.activation_function, arch.attention_type)
         for _ in range(arch.num_layers)
     ])


    self.norm = nn.LayerNorm(arch.embed_dim)
    self.head = nn.Linear(arch.embed_dim, vocab_size)

    self.apply(self._init_weights)


  def _init_weights(self, m):
    if isinstance(m, nn.Linear):
      torch.nn.init.normal_(m.weight, std=0.02)
      if m.bias is not None:
        torch.nn.init.zeros_(m.bias)
    elif isinstance(m, nn.Embedding):
      torch.nn.init.normal_(m.weight, std=0.02)

  def forward(self, x):
    seq_len = x.size(1)
    x = self.embed(x) + self.pos_embed[:, :seq_len]
    x = self.dropout(x)

    for block in self.blocks:
      x = block(x)

    x = self.norm(x)
    return self.head(x)

In [9]:
class EvolutionarySearcher:
  #Evolutionary algorithm for neural architecture search

  def __init__(self, search_space: SearchSpace, population_size: int = 50, mutation_rate: float = 0.3, crossover_rate: float = 0.6):
    self.search_space = search_space
    self.population_size = population_size
    self.mutation_rate = mutation_rate
    self.crossover_rate = crossover_rate
    self.population = []
    self.fitness_history = []

  def _random_architecture(self) -> Architecture:
    #Generate a random architecture from search space
    return Architecture(
      embed_dim=random.choice(self.search_space.embed_dims),
      num_heads=random.choice(self.search_space.num_heads),
      num_layers=random.choice(self.search_space.num_layers),
      ffn_ratio=random.choice(self.search_space.ffn_ratios),
      dropout_rate=random.choice(self.search_space.dropout_rates),
      activation_function=random.choice(self.search_space.activation_functions),
      attention_type=random.choice(self.search_space.attention_types),
      layer_type=random.choice(self.search_space.layer_types)
    )

  def _mutate(self, arch: Architecture) -> Architecture:
    #Mutate an Architecture
    arch_dict = arch.to_dict()

    #Randomly select a parameter to mutate
    param = random.choice(list(arch_dict.keys()))

    if param == 'embed_dim':
      arch_dict[param] = random.choice(self.search_space.embed_dims)
    elif param == 'num_heads':
      arch_dict[param] = random.choice(self.search_space.num_heads)
    elif param == 'num_layers':
      arch_dict[param] = random.choice(self.search_space.num_layers)
    elif param == 'ffn_ratio':
      arch_dict[param] = random.choice(self.search_space.ffn_ratios)
    elif param == 'dropout_rate':
      arch_dict[param] = random.choice(self.search_space.dropout_rates)
    elif param == 'activation_function':
      arch_dict[param] = random.choice(self.search_space.activation_functions)
    elif param == 'attention_type':
      arch_dict[param] = random.choice(self.search_space.attention_types)
    elif param == 'layer_type':
      arch_dict[param] = random.choice(self.search_space.layer_types)

    #Ensure num_heads divides embed_dim
    while arch_dict['embed_dim'] % arch_dict['num_heads'] != 0:
      arch_dict['num_heads'] = random.choice(self.search_space.num_heads)

    return Architecture.from_dict(arch_dict)

  def _crossover(self, parent1: Architecture, parent2: Architecture) -> Tuple[Architecture,  Architecture]:
    #Perform crossover between two architectures
    p1_dict = parent1.to_dict()
    p2_dict = parent2.to_dict()

    child1_dict = {}
    child2_dict = {}

    for key in p1_dict.keys():
      if random.random() < self.crossover_rate:
        child1_dict[key] = p1_dict[key]
        child2_dict[key] = p2_dict[key]
      else:
        child1_dict[key] = p2_dict[key]
        child2_dict[key] = p1_dict[key]


    #Fix head compatibility
    for child_dict in [child1_dict, child2_dict]:
      while child_dict['embed_dim'] % child_dict['num_heads'] != 0:
        child_dict['num_heads'] = random.choice(self.search_space.num_heads)

    return Architecture.from_dict(child1_dict), Architecture.from_dict(child2_dict)

  def initialize_population(self):
    #Initialize random population
    self.population = []
    for _ in range(self.population_size):
      arch = self._random_architecture()

      #Ensure compatibility
      while arch.embed_dim % arch.num_heads != 0:
        arch = self._random_architecture()
      self.population.append(arch)

  def evolve_generation(self, fitness_scores: List[float]) -> List[Architecture]:
    #Evolve population for one generation
    #Selection (tournament selection)
    selected = []
    for _ in range(self.population_size):
      tournament_size = min(3, len(self.population))
      tournament_indices = random.sample(range(len(self.population)), tournament_size)
      best_idx = max(tournament_indices, key=lambda i: fitness_scores[i])
      selected.append(self.population[best_idx])

    #create new generation
    new_population = []

    #Elitism: keep best individuals
    elite_count = max(1, self.population_size // 10)
    elite_indices = sorted(range(len(fitness_scores)), key=lambda i: fitness_scores[i], reverse=True)[:elite_count]
    for idx in elite_indices:
      new_population.append(self.population[idx])

    #Generate Offspring
    while len(new_population) < self.population_size:
      if random.random() < self.crossover_rate and len(selected) >= 2:
        parent1, parent2 = random.sample(selected, 2)
        child1, child2 = self._crossover(parent1, parent2)
        new_population.extend([child1, child2][:self.population_size - len(new_population)])
      else:
        parent = random.choice(selected)
        if random.random() < self.mutation_rate:
          child = self._mutate(parent)
        else:
          child = parent
        new_population.append(child)

    self.population = new_population[:self.population_size]

    return self.population

In [10]:
class ProgressiveSearchStrategy:
  #Progressive search strategy with supernet training

  def __init__(self, search_space: SearchSpace, stages: int=3):
    self.search_space = search_space
    self.stages = stages
    self.current_stage = 0
    self.stage_spaces = self._create_stage_spaces()

  def _create_stage_spaces(self) -> List[SearchSpace]:
    #Create progressively larger search spaces
    stages = []
    for i in range(self.stages):
      #Progressively expand search space
      expansion_factor = (i + 1) / self.stages

      embed_dims = self.search_space.embed_dims[:max(1, int(len(self.search_space.embed_dims) * expansion_factor))]
      num_heads = self.search_space.num_heads[:max(1, int(len(self.search_space.num_heads) * expansion_factor))]
      num_layers = self.search_space.num_layers[:max(1, int(len(self.search_space.num_layers) * expansion_factor))]

      stage_space = SearchSpace(
        embed_dims=embed_dims,
        num_heads=num_heads,
        num_layers=num_layers,
        ffn_ratios=self.search_space.ffn_ratios,
        dropout_rates=self.search_space.dropout_rates,
        activation_functions=self.search_space.activation_functions,
        attention_types=self.search_space.attention_types,
        layer_types=self.search_space.layer_types
      )
      stages.append(stage_space)

    return stages

  def advance_stage(self):
    #Advance to next stage
    self.current_stage = min(self.current_stage + 1, len(self.stage_spaces) - 1)

  def get_current_search_space(self) -> SearchSpace:
      # Get the search space for the current stage
      return self.stage_spaces[self.current_stage]

In [11]:
class PerformancePredictor:
#Predict Architecture Performance without full training

  def __init__(self):
    self.performance_cache = {}
    self.training_data = []

  def predict_performance(self, arch: Architecture, profiler: HardwareProfiler) -> float:
    #Predict architecture performance score
    arch_hash = hash(arch)
    if arch_hash in self.performance_cache:
      return self.performance_cache[arch_hash]

    #Multi-Objective Scoring
    efficiency_score = self._compute_efficiency_score(arch, profiler)
    capacity_score = self._compute_capacity_score(arch)
    diversity_score = self._compute_diversity_score(arch)

    score = 0.5 * efficiency_score + 0.3 * capacity_score + 0.2 * diversity_score
    self.performance_cache[arch_hash] = score
    return score

    #weighted combination
    total_score = 0.4 * efficiency_score + 0.4 * capacity_score + 0.2 * diversity_score

    self.performance_cache[arch_hash] = total_score
    return total_score

  def _compute_efficiency_score(self, arch: Architecture, profiler: HardwareProfiler) -> float:
    #Compute efficiency score based on hardware constraints
    memory_usage = profiler.estimate_memory_usage(arch)
    latency = profiler.estimate_latency(arch)

    #Normalize scores (lower is better for memory and latency)
    memory_score = max(0,1 - memory_usage / profiler.memory_limit)
    latency_score = max(0, 1 - latency / 1000) #Normalize by 1s

    return (memory_score + latency_score) / 2

  def _compute_capacity_score(self, arch: Architecture) -> float:
    #Compute model capacity score
    #Higher Capacity score generally means better performance
    layer_score = min(1.0, arch.num_layers/ 24)
    dim_score = min(1.0, arch.embed_dim / 1024)
    ffn_score = min(1.0, arch.ffn_ratio / 8)

    return (layer_score + dim_score + ffn_score) / 3

  def _compute_diversity_score(self, arch: Architecture) -> float:
    #reward architectural diversity
    diversity_bonus = 0.0

    #Bonus for non-standard configuration
    if arch.activation != 'relu':
      diversity_bonus += 0.1
    if arch.attention_type != 'standard':
      diversity_bonus += 0.2
    if arch.ffn_ratio not in [2.0, 4.0]:
      diversity_bonus += 0.1

    return min(1.0, diversity_bonus)



In [12]:
class NeuralArchitectureSearch:
  #Main NAS system coordinating all components

  def __init__(self, searchspace: SearchSpace = None, population_size: int = 50, generations: int = 20, use_progressive: bool = True):
    self.searchspace = searchspace or SearchSpace()
    self.population_size = population_size
    self.generations = generations
    self.use_progressive = use_progressive
    #initialize components
    self.evolutionary_searcher = EvolutionarySearcher(self.searchspace, self.population_size)
    self.progressive_strategy = ProgressiveSearchStrategy(self.searchspace) if use_progressive else None
    self.hardware_profiler = HardwareProfiler()
    self.performance_predictor = PerformancePredictor()

    #Results Tracking
    self.search_history = []
    self.best_architectures = []

  def search(self) -> List[Architecture]:
    #Execute neural architecture search
    logger.info(f"Starting NAS with {self.generations} generations, population{self.population_size}")

    #Initialize population
    current_search_space = self.progressive_strategy.get_current_search_space() if self.use_progressive else self.search_space
    self.evolutionary_searcher.search_space = current_search_space
    self.evolutionary_searcher.initialize_population()

    for generation in range(self.generations):
      logger.info(f"Generation {generation + 1}/{self.generations}")

      #Evaluate population
      fitness_scores = self._evaluate_population(self.evolutionary_searcher.population)

      #Track best architecture
      best_idx = np.argmax(fitness_scores)
      best_arch = self.evolutionary_searcher.population[best_idx]
      best_score = fitness_scores[best_idx]

      self.best_architectures.append((best_arch, best_score))
      self.search_history.append({
          'generation': generation,
          'best_score': best_score,
          'avg_score': np.mean(fitness_scores),
          'best_architecture': best_arch.to_dict()
      })

      logger.info(f"Best score: {best_score: .4f}, Avg score: {np.mean(fitness_scores): .4f}")

      #Progressive search space advancement
      if self.use_progressive and (generation + 1) % (self.generations // 3) == 0:
        self.progressive_strategy.advance_stage()
        current_search_space = self.progressive_strategy.get_current_search_space()
        self.evolutionary_searcher.search_space = current_search_space
        logger.info(f"Advancing to next stage: {self.progressive_strategy.current_stage}")

      #Evolve population
      if generation < self.generations - 1:
        self.evolutionary_searcher.evolve_generation(fitness_scores)

    #Return top architectures
    self.best_architectures.sort(key=lambda x: x[1], reverse=True)
    return [arch for arch, score in self.best_architectures[:10]]

  def _evaluate_population(self, population: List[Architecture]) -> List[float]:
    #Evaluate population fitness scores
    scores = []

    for arch in population:
      try:
        score = self.performance_predictor.predict_performance(arch, self.hardware_profiler)
        scores.append(score)
      except Exception as e:
        logger.error(f"Failed to evaluate architecture{arch}: {e}")
        scores.append(0.0)

    return scores

  def create_model(self, arch: Architecture, vocab_size: int = 50000) -> nn.Module:
    #Create model from architecture
    return ConfigurableTransformer(arch, vocab_size)

  def export_results(self, filename: str = "nas_results.json"):
    #Export search results to JSON
    results = {
        'search_history': self.search_history,
        'best_architectures': [
            {'architecture': arch.to_dict(), 'score': score} for arch, score in self.best_architectures[:10]
        ],
        'search_space': asdict(self.searchspace),
        'hardware_info':{
        'memory_limit': self.hardware_profiler.memory_limit,
        'compute_capability': self.hardware_profiler.compute_capability
        }
    }

    with open(filename, 'w') as f:
      json.dump(results, f, indent=2)

    logger.info(f"Search results exported to {filename}")

#Demo usage
if __name__ == "__main__":
  #Configure Search Space
  search_space = SearchSpace(
      embed_dims=[256, 384, 512, 768],
      num_heads=[4, 6, 8, 12],
      num_layers=[6, 8, 12, 16],
      ffn_ratios=[2.0, 3.0, 4.0],
      dropout_rates=[0.0, 0.1, 0.2],
      activation_functions=['relu', 'gelu', 'swish'],
      attention_types=['standard', 'linear', 'performer'],
      layer_types=['transformer', 'hybrid']
  )

  #Initialize NAS system
  nas = NeuralArchitectureSearch(
      searchspace=search_space,
      population_size=30,
      generations=15,
      use_progressive=True
  )

  #Run Search
  logger.info("Initializing Neural Architecture Search...")
  best_architectures = nas.search()

  # Print Results
  print("\n" + "="*60)
  print("NEURAL ARCHITECTURE SEARCH RESULTS")
  print("="*60)

  for i, arch in enumerate(best_architectures[:5]):
    print(f"\nRank {i+1} Architecture:")
    print(f"Embed Dim: {arch.embed_dim}")
    print(f"Num Heads: {arch.num_heads}")
    print(f"Num Layers: {arch.num_layers}")
    print(f"FFN Ratio: {arch.ffn_ratio}")
    print(f"Dropout Rate: {arch.dropout_rate}")
    print(f"Activation: {arch.activation_function}")
    print(f"Attention: {arch.attention_type}")
    print(f"Layer Type: {arch.layer_type}")

    #Create and Analyze model
    model = nas.create_model(arch)
    total_params = sum(p.numel() for p in model.parameters())
    memory_usage = nas.hardware_profiler.estimate_memory_usage(arch)
    latency = nas.hardware_profiler.estimate_latency(arch)

    print(f" Parameters: {total_params:,}")
    print(f" Memory Usage: {memory_usage:.1f} MB")
    print(f" Latency: {latency: .1f} ms ")

    #Export results
    nas.export_results("transformer_nas_results.json")

    print(f"\n Search Completed. Best Architecture Found with {best_architectures[0].embed_dim}D embeddings")
    print(f"Full results exported to transformer_nas_results.json")

ERROR:__main__:Failed to evaluate architectureArchitecture(embed_dim=256, num_heads=4, num_layers=6, ffn_ratio=3.0, dropout_rate=0.1, activation_function='relu', attention_type='performer', layer_type='transformer'): 'Architecture' object has no attribute 'activation'
ERROR:__main__:Failed to evaluate architectureArchitecture(embed_dim=256, num_heads=4, num_layers=6, ffn_ratio=3.0, dropout_rate=0.0, activation_function='swish', attention_type='performer', layer_type='hybrid'): 'Architecture' object has no attribute 'activation'
ERROR:__main__:Failed to evaluate architectureArchitecture(embed_dim=256, num_heads=4, num_layers=6, ffn_ratio=3.0, dropout_rate=0.0, activation_function='gelu', attention_type='standard', layer_type='transformer'): 'Architecture' object has no attribute 'activation'
ERROR:__main__:Failed to evaluate architectureArchitecture(embed_dim=256, num_heads=4, num_layers=6, ffn_ratio=4.0, dropout_rate=0.2, activation_function='swish', attention_type='linear', layer_type


NEURAL ARCHITECTURE SEARCH RESULTS

Rank 1 Architecture:
Embed Dim: 256
Num Heads: 4
Num Layers: 6
FFN Ratio: 3.0
Dropout Rate: 0.1
Activation: relu
Attention: performer
Layer Type: transformer
 Parameters: 29,727,568
 Memory Usage: 130768.0 MB
 Latency:  28.5 ms 

 Search Completed. Best Architecture Found with 256D embeddings
Full results exported to transformer_nas_results.json

Rank 2 Architecture:
Embed Dim: 256
Num Heads: 4
Num Layers: 6
FFN Ratio: 3.0
Dropout Rate: 0.1
Activation: relu
Attention: performer
Layer Type: transformer
 Parameters: 29,727,568
 Memory Usage: 130768.0 MB
 Latency:  28.5 ms 

 Search Completed. Best Architecture Found with 256D embeddings
Full results exported to transformer_nas_results.json

Rank 3 Architecture:
Embed Dim: 256
Num Heads: 4
Num Layers: 6
FFN Ratio: 3.0
Dropout Rate: 0.1
Activation: relu
Attention: performer
Layer Type: transformer
 Parameters: 29,727,568
 Memory Usage: 130768.0 MB
 Latency:  28.5 ms 

 Search Completed. Best Architectur