# Ch16-3 Genome Design

DNABERT-powered genome generation that works on M-series Macs without CUDA/Triton

## Updates & Notes
- Mac Compatible: No CUDA/Triton required
- Defaults to CPU mode for maximum compatibility
- Optional MPS (Apple Silicon GPU) support
- plotly version: 6.3.1
- tqdm version: 4.67.1

## Install Required Packages

In [None]:
# Install main packages
!pip install transformers torch numpy biopython pandas matplotlib seaborn plotly tqdm

## Import Libraries

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
from typing import List, Dict, Tuple
import re
import random
import warnings
import logging
from tqdm import tqdm
from IPython.display import display, HTML, Markdown, clear_output
from io import StringIO
import sys
import time

## Jupyter Configuration

In [None]:
# Notebook-wide plotting defaults: figure size, base font size and seaborn grid theme
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})
sns.set_style("whitegrid")

# Keep cell output readable: only errors from transformers, and no Python warnings at all
logging.getLogger("transformers").setLevel(logging.ERROR)
warnings.filterwarnings("ignore")

## DNABERT Genome Generator Class

In [None]:
class DNABERTGenomeGenerator:
    """Load a DNABERT model and hold the state used to generate and score DNA.

    Only the constructor, device selection and the model-output probe live in
    the class body. The constructor also calls
    ``self._generate_reference_embeddings()``, which is attached to the class
    by a later notebook cell, so those cells must run before an instance is
    created.
    """

    def __init__(self, model_name: str = "zhihan1996/DNABERT-2-117M", device: str = "cpu"):
        """
        Initialize DNABERT model for genome generation and analysis.
        Mac-compatible version with CPU/GPU/MPS device selection.

        Args:
            model_name: HuggingFace model identifier
            device: Device to use - "cpu", "cuda", "mps", or "auto"
                   Defaults to "cpu" for maximum compatibility
        """
        print("üß¨ Initializing DNABERT Genome Generator (Mac Compatible)...")
        print("=" * 50)

        # Resolve the requested device to one that is actually usable here.
        self.device = self._setup_device(device)
        print(f"üñ•Ô∏è  Using device: {self.device}")

        print(f"üì• Loading DNABERT model: {model_name}")

        # Silence warnings and transformers logging while loading; the logger
        # level is restored afterwards even if loading fails.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            transformers_logger = logging.getLogger("transformers")
            original_level = transformers_logger.level
            transformers_logger.setLevel(logging.ERROR)

            try:
                print("  üì§ Loading tokenizer...")
                self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

                print("  ü§ñ Loading model...")
                self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

                # Move the model to the selected device and switch to inference mode.
                self.model = self.model.to(self.device)
                self.model.eval()

            finally:
                transformers_logger.setLevel(original_level)

        # Maximum length passed to the tokenizer (max_length=...) on every call.
        self.max_length = 512

        print("  üß™ Testing model output format...")
        self._test_model_output()

        print("  üìö Generating reference embeddings...")
        self.reference_embeddings = self._generate_reference_embeddings()

        display(HTML(f"""
        <div style="border: 3px solid #4CAF50; padding: 15px; border-radius: 10px; background-color: #e8f5e8; margin: 10px 0;">
            <h3 style="color: #2E7D32; margin: 0;">‚úÖ DNABERT Model Ready!</h3>
            <p style="margin: 5px 0; color: #424242;">Device: {self.device} | Ready to generate AI-optimized genomes</p>
        </div>
        """))

        # Default length / motif / GC settings per functional element type.
        self.genome_elements = {
            'promoter': {'length': 50, 'consensus': 'TATAAA', 'gc_content': 0.4},
            'coding_sequence': {'length': 600, 'start_codon': 'ATG', 'stop_codons': ['TAA', 'TAG', 'TGA'], 'gc_content': 0.5},
            'terminator': {'length': 30, 'gc_content': 0.6},
            'intergenic': {'length': 100, 'gc_content': 0.45}
        }

    def _setup_device(self, device: str) -> str:
        """Resolve a requested device name to "cpu", "cuda" or "mps".

        The request is compared case-insensitively and ignoring surrounding
        whitespace. "auto" picks CUDA when available and otherwise CPU (MPS is
        deliberately not auto-selected). A requested accelerator that is not
        available falls back to CPU with a message, and so does an
        unrecognised name.

        Args:
            device: "cpu", "cuda", "mps" or "auto".

        Returns:
            The device string to use for the model and its inputs.
        """
        requested = str(device).strip().lower()

        def mps_available() -> bool:
            # Some torch builds have no torch.backends.mps attribute at all;
            # treat that as "MPS unavailable" instead of raising.
            mps_backend = getattr(torch.backends, "mps", None)
            return mps_backend is not None and mps_backend.is_available()

        if requested == "auto":
            if torch.cuda.is_available():
                return "cuda"
            if mps_available():
                print("  ‚ÑπÔ∏è  MPS (Apple Silicon GPU) detected but using CPU for stability")
            return "cpu"

        if requested == "mps":
            if mps_available():
                print("  ‚ö†Ô∏è  Warning: MPS may have compatibility issues. Switch to CPU if errors occur.")
                return "mps"
            print("  ‚ö†Ô∏è  MPS not available, falling back to CPU")
            return "cpu"

        if requested == "cuda":
            if torch.cuda.is_available():
                return "cuda"
            print("  ‚ö†Ô∏è  CUDA not available, falling back to CPU")
            return "cpu"

        if requested != "cpu":
            # Previously a typo silently ran on CPU; say so explicitly.
            print(f"  Warning: unknown device {device!r}, falling back to CPU")
        return "cpu"

    def _test_model_output(self):
        """Run one tiny forward pass and record the shape of the model's return value.

        Sets ``self.output_format`` to 'tuple' (plain tuple), 'standard'
        (object exposing ``last_hidden_state``) or 'unknown'.
        """
        test_seq = "ATGC"
        inputs = self.tokenizer(test_seq, return_tensors="pt", max_length=self.max_length, truncation=True)

        # Inputs must live on the same device as the model.
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)

        if isinstance(outputs, tuple):
            self.output_format = 'tuple'
        elif hasattr(outputs, 'last_hidden_state'):
            self.output_format = 'standard'
        else:
            self.output_format = 'unknown'

In [None]:
def _generate_reference_embeddings(self) -> Dict[str, np.ndarray]:
    """Build one averaged reference embedding per genome element type.

    Promoter, coding-sequence and terminator references are the mean
    embedding of the hard-coded example sequences below (labelled as strong
    literature examples by the original author). The intergenic reference is
    the mean embedding of five freshly generated random 100 nt sequences with
    a 0.45 GC target, so it differs from run to run.

    Returns:
        Mapping with keys 'promoter', 'coding_sequence', 'terminator' and
        'intergenic', each holding the element-wise mean embedding.
    """
    def mean_embedding(seqs):
        # Embed every sequence, then average across sequences.
        return np.mean([self.get_sequence_embedding(s) for s in seqs], axis=0)

    curated_examples = [
        ('promoter', "    üìù Computing promoter embeddings...", [
            "TTGACAATTAATCATCGGCTCGTATAATGTGTGGAATTGTGAGCGGATAACAATTTCACACAGGAAACAG",
            "AATTGTGAGCGCTCACAATTCCACACAACATACGAGCCGGAAGCATAAAGTGTAAAGCCTGGGGTGCCTAAT",
            "TTTACACTTTTATGCTTCCGGCTCGTATGTTGTGTGGAATTGTGAGCGCTCACAATTCCACACAACATACGA",
        ]),
        ('coding_sequence', "    üß¨ Computing coding sequence embeddings...", [
            "ATGAAACAACGCATCGTAGCGGCTCTGATCCTCGAGCGTCTGACCCAGTACGAGGCCATGACCAACGAGTAA",
            "ATGGTCAACAAACGCCTGGCGATCTACGACCGTATCAACGAGCTCAACAAACACCTGGAACAGGACAAATAA",
            "ATGCTGGAACAGAAACGTATCCAGGCGATCAACGAGTACCTCAACGAGCGCATCCAGAAACGCCTCAAATAG",
        ]),
        ('terminator', "    üìö Computing terminator embeddings...", [
            "AAAAGCCCGAAAGGAAGCTGAGTTGGCTGCTGCCACCGCTGAGCAATAACTAGCATAACCCCTTGGGGCCTCTAAACGGGTCTT",
            "GGCGGAATTCGGGGGCGAGCGAACGCGTAAGGATTACCCCGGGCGCCGAAACGTAGCGCGACGCCGAAACGACGGCCT",
        ]),
    ]

    references = {}
    for element_type, progress_message, example_seqs in curated_examples:
        print(progress_message)
        references[element_type] = mean_embedding(example_seqs)

    print("    üåç Computing intergenic reference...")
    background_seqs = [self.generate_random_sequence(100, 0.45) for _ in range(5)]
    references['intergenic'] = mean_embedding(background_seqs)

    print("    ‚úÖ Reference embeddings complete!")
    return references

# Attach the cell-level function to the class so instances can call it as a method.
DNABERTGenomeGenerator._generate_reference_embeddings = _generate_reference_embeddings

In [None]:
def get_sequence_embedding(self, sequence: str) -> np.ndarray:
    """Return a mean-pooled model embedding for a DNA sequence.

    The sequence is cleaned with ``preprocess_sequence``, tokenized, run
    through the model without gradients, and the hidden states are averaged
    over dimension 1 (presumably the token axis of a (batch, tokens, hidden)
    tensor - confirm for the loaded model).

    Args:
        sequence: Raw DNA string; cleaning is delegated to ``preprocess_sequence``.

    Returns:
        NumPy array holding the pooled embedding, moved to the CPU.
    """
    clean_seq = self.preprocess_sequence(sequence)

    # Length limiting is left entirely to the tokenizer (truncation=True,
    # max_length=...). max_length is the tokenizer's budget, not a nucleotide
    # count, so slicing the string to max_length characters beforehand could
    # discard sequence the tokenizer would have accepted.
    inputs = self.tokenizer(clean_seq, return_tensors="pt", padding=True,
                            truncation=True, max_length=self.max_length)

    # Inputs must live on the same device as the model.
    inputs = {k: v.to(self.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = self.model(**inputs)

    # Pick the hidden states according to the format probed at start-up.
    if self.output_format == 'tuple':
        hidden_states = outputs[0]
    elif hasattr(outputs, 'last_hidden_state'):
        hidden_states = outputs.last_hidden_state
    else:
        hidden_states = outputs['last_hidden_state']

    # Pool, drop singleton dimensions, and bring back to CPU for NumPy.
    return hidden_states.mean(dim=1).squeeze().cpu().numpy()
    
def preprocess_sequence(self, sequence: str) -> str:
    """Upper-case the input and keep only the characters A, T, G and C."""
    allowed_bases = "ATGC"
    return ''.join(base for base in sequence.upper() if base in allowed_bases)

# Attach the embedding and cleaning helpers to the class as methods.
DNABERTGenomeGenerator.get_sequence_embedding = get_sequence_embedding
DNABERTGenomeGenerator.preprocess_sequence = preprocess_sequence

In [None]:
def generate_random_sequence(self, length: int, gc_content: float = 0.5) -> str:
    """Generate a shuffled DNA sequence with a given length and GC fraction.

    The result always has exactly ``length`` characters and exactly
    ``int(length * gc_content)`` G/C bases, with ``gc_content`` clamped to
    [0, 1]. G vs C and A vs T are split evenly; an odd remainder is drawn at
    random within the same pair so the GC count stays on target.

    Args:
        length: Number of nucleotides; values <= 0 give an empty string.
        gc_content: Target fraction of G+C bases.

    Returns:
        Random sequence over the alphabet A/T/G/C.
    """
    if length <= 0:
        return ''

    # Out-of-range fractions used to produce sequences longer than requested.
    gc_fraction = min(1.0, max(0.0, gc_content))
    gc_count = int(length * gc_fraction)
    at_count = length - gc_count

    nucleotides = ['G'] * (gc_count // 2) + ['C'] * (gc_count // 2)
    if gc_count % 2:
        nucleotides.append(random.choice('GC'))
    nucleotides += ['A'] * (at_count // 2) + ['T'] * (at_count // 2)
    if at_count % 2:
        nucleotides.append(random.choice('AT'))

    random.shuffle(nucleotides)
    return ''.join(nucleotides)
    
def generate_promoter_sequence(self, length: int = 50) -> str:
    """Generate a random 40 % GC sequence with a TATAAA box written into it.

    The box starts 30 nt before the end of the sequence. For sequences
    shorter than 30 nt it starts at position 0 instead, and is cut off if it
    does not fit.

    Args:
        length: Requested promoter length in nucleotides.

    Returns:
        The promoter sequence, as long as the random background returned by
        ``generate_random_sequence``.
    """
    sequence = list(self.generate_random_sequence(length, 0.4))
    tata_box = "TATAAA"
    # Clamp at 0: a negative start would index from the end of the list and
    # wrap the box around (or raise IndexError for very short sequences).
    tata_pos = max(0, length - 30)

    for offset, nucleotide in enumerate(tata_box):
        index = tata_pos + offset
        if index >= len(sequence):
            break
        sequence[index] = nucleotide

    return ''.join(sequence)
    
def generate_coding_sequence(self, length: int = 600) -> str:
    """Generate an ATG-initiated reading frame of exactly ``length`` nucleotides.

    Layout: ``ATG``, then random non-stop codons, then one random stop codon
    as the last complete codon. If ``length`` is not a multiple of three, the
    remaining 1-2 nucleotides are random filler placed after the stop codon.
    Lengths of 3-5 give ``ATG`` plus filler with no stop codon; lengths below
    3 give the matching prefix of ``ATG`` so the length contract still holds.

    Args:
        length: Requested length in nucleotides.

    Returns:
        The generated sequence; its length equals ``max(0, length)``.
    """
    stop_codons = ('TAA', 'TAG', 'TGA')

    if length < 3:
        # Too short for a start codon; previously this returned 3 nt anyway.
        return "ATG"[:max(0, length)]

    codons = ["ATG"]
    for _ in range((length - 3) // 3):
        codon = self.generate_random_sequence(3, 0.5)
        # Re-draw until the codon is not a stop, keeping the frame open.
        while codon in stop_codons:
            codon = self.generate_random_sequence(3, 0.5)
        codons.append(codon)

    if len(codons) > 1:
        # Terminate the frame by replacing the last generated codon.
        codons[-1] = random.choice(stop_codons)

    coding = ''.join(codons)
    if len(coding) < length:
        coding += self.generate_random_sequence(length - len(coding), 0.5)

    return coding
    
def generate_terminator_sequence(self, length: int = 30) -> str:
    """Return a random sequence with a 0.6 GC target to serve as a terminator.

    No hairpin or other structural motif is constructed here; the sequence
    comes straight from ``generate_random_sequence``.
    """
    terminator_gc_fraction = 0.6
    terminator = self.generate_random_sequence(length, terminator_gc_fraction)
    return terminator

# Attach the sequence generators to the class as methods.
DNABERTGenomeGenerator.generate_random_sequence = generate_random_sequence
DNABERTGenomeGenerator.generate_promoter_sequence = generate_promoter_sequence
DNABERTGenomeGenerator.generate_coding_sequence = generate_coding_sequence
DNABERTGenomeGenerator.generate_terminator_sequence = generate_terminator_sequence

In [None]:
def optimize_sequence_with_model(self, initial_sequence: str, element_type: str = 'intergenic',
                                 max_iterations: int = 25, target_score: float = 0.8) -> str:
    """Hill-climb a sequence towards the reference embedding of its element type.

    Each iteration mutates the current best sequence, scores the candidate's
    embedding against the reference, and keeps it only if the score strictly
    improves. The search stops early once an accepted candidate scores above
    ``target_score``.

    Args:
        initial_sequence: Starting sequence.
        element_type: Key into ``self.reference_embeddings``; unknown types
            are handed to ``_optimize_sequence_basic`` instead.
        max_iterations: Maximum number of candidates to evaluate (default 25).
        target_score: Score above which the search stops early (default 0.8).

    Returns:
        The best-scoring sequence found (the input if nothing improved).
    """
    if element_type not in self.reference_embeddings:
        return self._optimize_sequence_basic(initial_sequence)

    best_sequence = initial_sequence
    best_score = self._score_sequence_with_model(
        self.get_sequence_embedding(initial_sequence), element_type
    )

    print(f"    üîß Optimizing {element_type} (initial score: {best_score:.3f})")

    improvements = 0

    for _ in range(max_iterations):
        candidate = self._mutate_sequence_guided(best_sequence, element_type)
        candidate_score = self._score_sequence_with_model(
            self.get_sequence_embedding(candidate), element_type
        )

        if candidate_score <= best_score:
            continue

        best_sequence = candidate
        best_score = candidate_score
        improvements += 1

        if improvements % 5 == 0:
            print(f"      ‚ÜóÔ∏è  {improvements} improvements, score: {best_score:.3f}")

        # The threshold is only checked right after an accepted improvement.
        if best_score > target_score:
            break

    if improvements > 0:
        print(f"    ‚úÖ Optimization complete! Final score: {best_score:.3f} (+{improvements} improvements)")
    else:
        print(f"    ‚û°Ô∏è  No improvements found (final score: {best_score:.3f})")

    return best_sequence
    
def _score_sequence_with_model(self, embedding: np.ndarray, element_type: str) -> float:
    """Score an embedding by cosine similarity to the element type's reference.

    Cosine similarity in [-1, 1] is mapped linearly to [0, 1] via
    ``(similarity + 1) / 2`` and clipped to that range.

    Args:
        embedding: Embedding vector of the candidate sequence.
        element_type: Key into ``self.reference_embeddings``.

    Returns:
        A Python float in [0, 1]. Returns 0.0 for an unknown element type,
        or when either vector has zero or non-finite norm (where the cosine
        is undefined).
    """
    if element_type not in self.reference_embeddings:
        return 0.0

    reference_embedding = self.reference_embeddings[element_type]
    norm_product = np.linalg.norm(embedding) * np.linalg.norm(reference_embedding)

    # Guard the division explicitly instead of relying on NaN propagation.
    if not np.isfinite(norm_product) or norm_product == 0:
        return 0.0

    similarity = float(np.dot(embedding, reference_embedding) / norm_product)
    if np.isnan(similarity):
        return 0.0

    # Clip both ends: rounding can push the cosine marginally outside [-1, 1].
    return min(1.0, max(0.0, (similarity + 1) / 2))
    
def _mutate_sequence_guided(self, sequence: str, element_type: str) -> str:
    """Return a copy of ``sequence`` with 1-3 random point substitutions.

    The number of substitutions is ``len(sequence) // 50`` limited to the
    range 1..3. Positions are distinct, and each chosen position receives a
    different nucleotide, so the result always differs from the input in
    exactly that many places. Despite the name, the mutations are uniformly
    random; ``element_type`` is accepted for interface compatibility but not
    used.

    Args:
        sequence: Sequence to mutate; an empty sequence is returned unchanged.
        element_type: Unused.

    Returns:
        The mutated sequence, same length as the input.
    """
    if not sequence:
        # random.randint(0, -1) would raise on an empty sequence.
        return sequence

    candidate = list(sequence)
    num_mutations = max(1, min(3, len(candidate) // 50))

    # Distinct positions: hitting the same position twice could revert the
    # change and produce a candidate identical to the input.
    for pos in random.sample(range(len(candidate)), num_mutations):
        alternatives = [n for n in ['A', 'T', 'G', 'C'] if n != candidate[pos]]
        candidate[pos] = random.choice(alternatives)

    return ''.join(candidate)
    
def _optimize_sequence_basic(self, initial_sequence: str) -> str:
    """Fallback optimizer: performs no optimization and hands the input back as-is."""
    unchanged_sequence = initial_sequence
    return unchanged_sequence

# Attach the optimization helpers to the class as methods.
DNABERTGenomeGenerator.optimize_sequence_with_model = optimize_sequence_with_model
DNABERTGenomeGenerator._score_sequence_with_model = _score_sequence_with_model
DNABERTGenomeGenerator._mutate_sequence_guided = _mutate_sequence_guided
DNABERTGenomeGenerator._optimize_sequence_basic = _optimize_sequence_basic

## Genome Generation Methods

In [None]:
def generate_2kb_genome(self) -> Dict[str, object]:
    """Generate a 2000 nt genome from a fixed plan of functional elements.

    Each planned element is generated, passed through
    ``optimize_sequence_with_model`` and appended. The plan adds up to more
    than 2000 nt, so the element that crosses the limit is shortened and any
    later ones are skipped. If the elements come up short, the remainder is
    filled with random sequence recorded as a 'padding' element.

    Returns:
        Dict with:
            'sequence':  the genome string (at most 2000 nt),
            'structure': list of dicts with 'type', 'start', 'end' (end
                         exclusive) and 'sequence', in genome order,
            'length':    length of 'sequence'.
    """
    display(HTML("""
        <div style="border: 2px solid #2196F3; padding: 15px; border-radius: 10px; background-color: #e3f2fd; margin: 10px 0;">
            <h3 style="color: #1976D2; margin: 0;">üß¨ Generating 2kb Genome with DNABERT Optimization</h3>
        </div>
        """))

    target_length = 2000

    elements = [
        ('promoter', 80), ('coding_sequence', 600), ('intergenic', 120),
        ('promoter', 70), ('coding_sequence', 500), ('terminator', 50),
        ('intergenic', 150), ('promoter', 60), ('coding_sequence', 400),
        ('terminator', 40)
    ]

    # Element types with a dedicated generator; anything else is random background.
    generators = {
        'promoter': self.generate_promoter_sequence,
        'coding_sequence': self.generate_coding_sequence,
        'terminator': self.generate_terminator_sequence,
    }

    genome_structure = []
    pieces = []
    total_length = 0

    print(f"üìã Planned elements: {len(elements)} functional regions")
    print("=" * 60)

    for i, (element_type, length) in enumerate(elements, 1):
        # Never generate past the target; stop once nothing is left.
        length = min(length, target_length - total_length)
        if length <= 0:
            break

        print(f"üîß [{i}/{len(elements)}] Generating {element_type} ({length} bp)")

        if element_type in generators:
            sequence = generators[element_type](length)
        else:
            sequence = self.generate_random_sequence(length, 0.45)

        sequence = self.optimize_sequence_with_model(sequence, element_type)
        # Coordinates follow the sequence actually produced, so 'start'/'end'
        # always match the genome string.
        sequence = sequence[:length]

        genome_structure.append({
            'type': element_type,
            'start': total_length,
            'end': total_length + len(sequence),
            'sequence': sequence
        })

        pieces.append(sequence)
        total_length += len(sequence)

        progress_percent = (total_length / target_length) * 100
        print(f"    üìä Progress: {progress_percent:.1f}% ({total_length}/{target_length} bp)\n")

    if total_length < target_length:
        # Record the filler so the structure covers the whole genome; the
        # plotting functions already define a colour for 'padding'.
        padding = self.generate_random_sequence(target_length - total_length)
        genome_structure.append({
            'type': 'padding',
            'start': total_length,
            'end': total_length + len(padding),
            'sequence': padding
        })
        pieces.append(padding)

    final_genome = ''.join(pieces)[:target_length]
    functional_count = sum(1 for element in genome_structure if element['type'] != 'padding')

    display(HTML(f"""
        <div style="border: 2px solid #4CAF50; padding: 15px; border-radius: 10px; background-color: #e8f5e8; margin: 10px 0;">
            <h3 style="color: #2E7D32; margin: 0;">‚úÖ Genome Generation Complete!</h3>
            <p style="margin: 5px 0; color: #424242;">Generated {len(final_genome)} bp genome with {functional_count} functional elements</p>
        </div>
        """))

    return {
        'sequence': final_genome,
        'structure': genome_structure,
        'length': len(final_genome)
    }

# Attach the genome builder to the class as a method.
DNABERTGenomeGenerator.generate_2kb_genome = generate_2kb_genome

## Genome Analysis Methods

In [None]:
def analyze_genome(self, genome: Dict) -> Dict:
    """Compute composition, GC profile, ORFs, embedding and element scores.

    Args:
        genome: Dict with 'sequence' (genome string) and 'structure' (list of
            element dicts), as returned by ``generate_2kb_genome``.

    Returns:
        Dict with:
            'length':        sequence length,
            'gc_content':    overall G+C fraction (0.0 for an empty sequence),
            'composition':   fraction of each of A, T, G, C (in that order),
            'gc_windows':    list of {'position', 'gc_content'} for 50 nt
                             windows taken every 10 nt; 'position' is the
                             window centre,
            'orfs':          result of ``find_orfs``,
            'embedding':     result of ``get_sequence_embedding``,
            'structure':     ``genome['structure']`` passed through,
            'element_quality_scores': result of ``_analyze_element_quality``.
    """
    sequence = genome['sequence']

    display(HTML("""
        <div style="border: 2px solid #9C27B0; padding: 15px; border-radius: 10px; background-color: #f3e5f5; margin: 10px 0;">
            <h3 style="color: #7B1FA2; margin: 0;">üî¨ Analyzing Genome with DNABERT</h3>
        </div>
        """))

    length = len(sequence)
    # Count each base once and reuse the counts for GC content and composition.
    base_counts = {base: sequence.count(base) for base in 'ATGC'}
    # An empty sequence reports all fractions as 0.0 instead of dividing by zero.
    denominator = length or 1
    gc_content = (base_counts['G'] + base_counts['C']) / denominator
    composition = {base: count / denominator for base, count in base_counts.items()}

    print("üìä Computing basic statistics...")
    print(f"   Length: {length:,} bp")
    print(f"   GC Content: {gc_content:.1%}")

    print("üìà Computing GC content windows...")
    window_size = 50
    gc_windows = []
    for pos in range(0, length - window_size + 1, 10):
        window = sequence[pos:pos + window_size]
        gc = (window.count('G') + window.count('C')) / window_size
        gc_windows.append({'position': pos + window_size // 2, 'gc_content': gc})

    print("üîç Finding Open Reading Frames...")
    orfs = self.find_orfs(sequence)
    print(f"   Found {len(orfs)} ORFs")

    print("üß† Computing DNABERT embedding...")
    embedding = self.get_sequence_embedding(sequence)

    print("‚≠ê Analyzing element quality...")
    element_quality_scores = self._analyze_element_quality(genome['structure'])

    print("‚úÖ Analysis complete!")

    return {
        'length': length,
        'gc_content': gc_content,
        'composition': composition,
        'gc_windows': gc_windows,
        'orfs': orfs,
        'embedding': embedding,
        'structure': genome['structure'],
        'element_quality_scores': element_quality_scores
    }
    
def _analyze_element_quality(self, structure: List[Dict]) -> Dict:
    """Score every structural element against its reference embedding.

    Elements whose type has no entry in ``self.reference_embeddings`` are
    skipped.

    Args:
        structure: Element dicts with 'type', 'start', 'end' and 'sequence'.

    Returns:
        Dict with:
            'individual_scores': per-element record keyed "<type>_<start>-<end>",
            'average_by_type':   per-type 'average', 'std', 'count', 'min', 'max',
            'overall_average':   mean of all individual scores (0.0 if none).
    """
    per_element = {}
    scores_by_type = {}

    for item in structure:
        kind = item['type']
        item_sequence = item['sequence']

        if kind not in self.reference_embeddings:
            continue

        score = self._score_sequence_with_model(
            self.get_sequence_embedding(item_sequence), kind
        )

        per_element[f"{kind}_{item['start']}-{item['end']}"] = {
            'type': kind,
            'score': score,
            'start': item['start'],
            'end': item['end'],
            'length': item['end'] - item['start']
        }
        scores_by_type.setdefault(kind, []).append(score)

    summary_by_type = {
        kind: {
            'average': np.mean(values),
            'std': np.std(values),
            'count': len(values),
            'min': np.min(values),
            'max': np.max(values)
        }
        for kind, values in scores_by_type.items()
    }

    if per_element:
        overall = np.mean([record['score'] for record in per_element.values()])
    else:
        overall = 0.0

    return {
        'individual_scores': per_element,
        'average_by_type': summary_by_type,
        'overall_average': overall
    }
    
def find_orfs(self, sequence: str) -> List[Dict]:
    """Find all open reading frames on the given strand in the three frames.

    An ORF runs from an ``ATG`` to the first in-frame stop codon
    (TAA/TAG/TGA). Scanning resumes after each stop codon, so every
    non-overlapping ORF in a frame is reported. An ``ATG`` with no in-frame
    stop before the end of the sequence is not reported. The reverse strand
    is not searched.

    Args:
        sequence: DNA string to scan.

    Returns:
        List of dicts with 'start', 'end' (exclusive, includes the stop
        codon), 'length', 'frame' (0-2) and 'sequence', ordered by frame and
        then by position.
    """
    start_codon = 'ATG'
    stop_codons = ('TAA', 'TAG', 'TGA')
    last_codon_start = len(sequence) - 3
    orfs = []

    for frame in range(3):
        i = frame
        while i <= last_codon_start:
            if sequence[i:i + 3] != start_codon:
                i += 3
                continue

            start_pos = i
            i += 3

            # Walk codon by codon until the first in-frame stop.
            while i <= last_codon_start:
                if sequence[i:i + 3] in stop_codons:
                    end_pos = i + 3
                    orfs.append({
                        'start': start_pos,
                        'end': end_pos,
                        'length': end_pos - start_pos,
                        'frame': frame,
                        'sequence': sequence[start_pos:end_pos]
                    })
                    # Resume after the stop codon rather than abandoning the
                    # frame, which hid every later ORF.
                    i = end_pos
                    break
                i += 3

    return orfs

# Attach the analysis helpers to the class as methods.
DNABERTGenomeGenerator.analyze_genome = analyze_genome
DNABERTGenomeGenerator._analyze_element_quality = _analyze_element_quality
DNABERTGenomeGenerator.find_orfs = find_orfs

## Display Summary Method

In [None]:
def display_genome_summary(self, analysis: Dict):
    """Render an HTML summary card for an analysis result.

    Shows the genome length, GC content, number of structural elements,
    number of ORFs and the overall quality score.

    Args:
        analysis: Dict with 'element_quality_scores' (containing
            'overall_average'), 'length', 'gc_content', 'structure' and 'orfs'.
    """
    # Values are read in the order the card lists them.
    quality_scores = analysis['element_quality_scores']
    genome_length = analysis['length']
    gc_fraction = analysis['gc_content']
    element_count = len(analysis['structure'])
    orf_count = len(analysis['orfs'])
    overall_quality = quality_scores['overall_average']

    summary_markup = f"""
        <div style="border: 2px solid #4CAF50; padding: 20px; border-radius: 10px; background-color: #f9f9f9;">
            <h2 style="color: #4CAF50; margin-top: 0;">üß¨ DNABERT-Optimized Genome Summary</h2>
            <h3>üìä Basic Statistics</h3>
            <ul>
                <li><strong>Length:</strong> {genome_length:,} bp</li>
                <li><strong>GC Content:</strong> {gc_fraction:.1%}</li>
                <li><strong>Elements:</strong> {element_count}</li>
                <li><strong>ORFs Found:</strong> {orf_count}</li>
                <li><strong>Overall Quality:</strong> {overall_quality:.3f}</li>
            </ul>
        </div>
        """

    display(HTML(summary_markup))

# Attach the summary renderer to the class as a method.
DNABERTGenomeGenerator.display_genome_summary = display_genome_summary

## Visualization Functions

In [None]:
def visualize_genome_notebook(genome_data: Dict, analysis: Dict):
    """Draw a 2x2 matplotlib dashboard for an analysed genome and save it as PNG.

    Panels: genome structure map, windowed GC content, nucleotide composition
    pie, and average quality score per element type.

    Args:
        genome_data: Accepted for interface symmetry; not used by this function.
        analysis: Dict with 'structure', 'gc_windows', 'composition' and
            optionally 'element_quality_scores'.

    Returns:
        The file name the figure was saved under.
    """
    display(HTML("<h3>üìä Genome Visualization Dashboard</h3>"))

    plt.style.use('seaborn-v0_8')
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('DNABERT-Optimized 2kb Genome Analysis', fontsize=16, fontweight='bold')
    map_ax, gc_ax = axes[0, 0], axes[0, 1]
    pie_ax, score_ax = axes[1, 0], axes[1, 1]

    palette = {
        'promoter': '#FF6B6B', 'coding_sequence': '#4ECDC4',
        'terminator': '#45B7D1', 'intergenic': '#96CEB4', 'padding': '#FECA57'
    }
    fallback_colour = '#95A5A6'

    # Panel 1: one horizontal bar per element, all on a single track.
    labelled_types = set()
    for region in analysis['structure']:
        region_type = region['type']
        # Only the first bar of each type carries a legend label.
        legend_label = "" if region_type in labelled_types else region_type
        labelled_types.add(region_type)
        map_ax.barh(0, region['end'] - region['start'],
                    left=region['start'], height=0.5,
                    color=palette.get(region_type, fallback_colour),
                    label=legend_label)
    map_ax.set_xlabel('Position (bp)')
    map_ax.set_title('Genome Structure Map', fontweight='bold')
    map_ax.legend()

    # Panel 2: GC fraction along the genome, with a 0.5 guide line.
    windows = analysis['gc_windows']
    if windows:
        window_centres = [w['position'] for w in windows]
        window_gc = [w['gc_content'] for w in windows]
        gc_ax.plot(window_centres, window_gc, color='#E74C3C', linewidth=2)
        gc_ax.axhline(y=0.5, color='gray', linestyle='--', alpha=0.7)
    gc_ax.set_xlabel('Position (bp)')
    gc_ax.set_ylabel('GC Content')
    gc_ax.set_title('GC Content Distribution', fontweight='bold')

    # Panel 3: base composition as percentages.
    composition = analysis['composition']
    bases = list(composition.keys())
    shares = [composition[base] * 100 for base in bases]
    pie_ax.pie(shares, labels=bases, autopct='%1.1f%%',
               colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
    pie_ax.set_title('Nucleotide Composition', fontweight='bold')

    # Panel 4: average quality score per element type, if available.
    if 'element_quality_scores' in analysis:
        by_type = analysis['element_quality_scores']['average_by_type']
        if by_type:
            type_names = list(by_type.keys())
            type_means = [by_type[name]['average'] for name in type_names]
            score_ax.bar(type_names, type_means,
                         color=[palette.get(name, fallback_colour) for name in type_names])
    score_ax.set_ylabel('Quality Score')
    score_ax.set_title('DNABERT Quality Scores', fontweight='bold')
    plt.setp(score_ax.xaxis.get_majorticklabels(), rotation=45, ha='right')

    plt.tight_layout()

    static_filename = 'Ch16-3-dnabert_genome_analysis_static.png'
    plt.savefig(static_filename, dpi=300, bbox_inches='tight')
    print(f"üíæ Static plots saved to: {static_filename}")

    plt.show()
    return static_filename

In [None]:
def create_interactive_genome_viewer_notebook(genome_data: Dict, analysis: Dict):
    """Build a 2x2 Plotly dashboard for an analysed genome and save it as HTML.

    Panels: genome structure (filled rectangles), windowed GC content,
    nucleotide composition pie, and average quality score per element type.

    Args:
        genome_data: Accepted for interface symmetry; not used by this function.
        analysis: Dict with 'structure', 'gc_windows', 'composition' and
            optionally 'element_quality_scores'.

    Returns:
        Tuple ``(fig, html_filename)``: the Plotly figure and the name of the
        HTML file it was written to.
    """
    display(HTML("<h3>üñ±Ô∏è Interactive Genome Dashboard</h3>"))

    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=('Genome Structure', 'GC Content',
                        'Nucleotide Composition', 'Quality Scores'),
        # The pie chart needs a "domain" cell; the others are ordinary x/y cells.
        specs=[[{}, {}], [{"type": "domain"}, {}]],
        vertical_spacing=0.15,
        horizontal_spacing=0.1
    )

    palette = {
        'promoter': '#FF6B6B', 'coding_sequence': '#4ECDC4',
        'terminator': '#45B7D1', 'intergenic': '#96CEB4', 'padding': '#FECA57'
    }
    fallback_colour = '#95A5A6'

    # Panel 1: each element is a closed rectangle spanning start..end.
    types_in_legend = set()
    for region in analysis['structure']:
        region_type = region['type']
        begin, finish = region['start'], region['end']
        first_of_type = region_type not in types_in_legend
        types_in_legend.add(region_type)

        rectangle = go.Scatter(
            x=[begin, finish, finish, begin, begin],
            y=[0, 0, 1, 1, 0],
            fill='toself',
            fillcolor=palette.get(region_type, fallback_colour),
            line=dict(color='black', width=1),
            name=region_type.replace('_', ' ').title(),
            showlegend=first_of_type,
            text=f"Type: {region_type}<br>Start: {begin}<br>End: {finish}",
            hovertemplate='%{text}<extra></extra>',
            mode='lines'
        )
        fig.add_trace(rectangle, row=1, col=1)

    # Panel 2: GC fraction along the genome.
    windows = analysis['gc_windows']
    if windows:
        gc_line = go.Scatter(
            x=[w['position'] for w in windows],
            y=[w['gc_content'] for w in windows],
            mode='lines',
            name='GC Content',
            line=dict(color='#E74C3C', width=2),
            showlegend=False
        )
        fig.add_trace(gc_line, row=1, col=2)

    # Panel 3: base composition as percentages.
    composition = analysis['composition']
    bases = list(composition.keys())
    composition_pie = go.Pie(
        labels=bases,
        values=[composition[base] * 100 for base in bases],
        marker=dict(colors=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']),
        showlegend=False
    )
    fig.add_trace(composition_pie, row=2, col=1)

    # Panel 4: average quality score per element type, if available.
    if 'element_quality_scores' in analysis:
        by_type = analysis['element_quality_scores']['average_by_type']
        if by_type:
            raw_types = list(by_type.keys())
            pretty_types = [name.replace('_', ' ').title() for name in raw_types]
            type_means = [by_type[name]['average'] for name in raw_types]
            bar_fill = [palette.get(name.lower().replace(' ', '_'), fallback_colour)
                        for name in raw_types]

            score_bars = go.Bar(
                x=pretty_types,
                y=type_means,
                marker=dict(color=bar_fill),
                text=[f'{mean:.3f}' for mean in type_means],
                textposition='outside',
                showlegend=False
            )
            fig.add_trace(score_bars, row=2, col=2)

    # Axis titles for the three x/y panels.
    fig.update_xaxes(title_text="Position (bp)", row=1, col=1)
    fig.update_xaxes(title_text="Position (bp)", row=1, col=2)
    fig.update_xaxes(title_text="Element Type", row=2, col=2)

    fig.update_yaxes(title_text="GC Content", row=1, col=2)
    fig.update_yaxes(title_text="Quality Score", row=2, col=2)

    fig.update_layout(
        height=800,
        title_text="üß¨ Interactive DNABERT Genome Dashboard",
        title_x=0.5,
        showlegend=True
    )

    html_filename = 'Ch16-3-dnabert_genome_analysis_interactive.html'
    fig.write_html(html_filename)
    print(f"üíæ Interactive HTML saved to: {html_filename}")

    fig.show()
    return fig, html_filename

## Complete Workflow Function

In [None]:
def generate_and_analyze_genome(device="cpu"):
    """
    Complete workflow for Mac-compatible genome generation.

    Creates the generator, builds and analyses a genome, shows the summary,
    writes the genome as FASTA (80 characters per line) and produces the
    static and interactive dashboards.

    Args:
        device: "cpu" (default), "cuda", "mps", or "auto"

    Returns:
        ``(generator, genome, analysis, saved_files)`` on success. On any
        exception the traceback is printed, an error banner is displayed and
        ``(None, None, None, [])`` is returned.
    """
    import html
    import traceback

    # str(...) keeps the banner from failing when device is not a plain string.
    display(HTML(f"""
    <div style="border: 3px solid #2196F3; padding: 20px; border-radius: 15px; background: linear-gradient(45deg, #e3f2fd, #f3e5f5);">
        <h1 style="color: #1976D2; text-align: center;">üß¨ DNABERT Genome Generator ü§ñ</h1>
        <p style="text-align: center; color: #424242;">Mac-Compatible | Device: {str(device).upper()}</p>
    </div>
    """))

    saved_files = []

    try:
        generator = DNABERTGenomeGenerator(device=device)
        genome = generator.generate_2kb_genome()
        analysis = generator.analyze_genome(genome)
        generator.display_genome_summary(analysis)

        # Save FASTA: one header line, then the sequence wrapped at 80 columns.
        fasta_filename = 'Ch16-3-dnabert_generated_genome.fasta'
        sequence = genome['sequence']
        with open(fasta_filename, 'w', encoding='utf-8') as f:
            f.write(">DNABERT_Generated_2kb_Genome\n")
            f.writelines(sequence[i:i+80] + '\n' for i in range(0, len(sequence), 80))
        saved_files.append(fasta_filename)

        # Create visualizations
        static_file = visualize_genome_notebook(genome, analysis)
        if static_file:
            saved_files.append(static_file)

        fig, html_file = create_interactive_genome_viewer_notebook(genome, analysis)
        if html_file:
            saved_files.append(html_file)

        display(HTML(f"""
        <div style="border: 2px solid #4CAF50; padding: 15px; border-radius: 10px; background-color: #e8f5e8;">
            <h3>üìÅ Files Saved: {len(saved_files)}</h3>
            <ul>{''.join([f'<li>{f}</li>' for f in saved_files])}</ul>
        </div>
        """))

        return generator, genome, analysis, saved_files

    except Exception as e:
        # Top-level notebook boundary: report instead of raising, but keep the
        # traceback visible so the failure can be diagnosed.
        traceback.print_exc()
        # Escape the message: text such as "<class 'X'>" would otherwise be
        # parsed as an HTML tag and vanish from the banner.
        display(HTML(f"<h3 style='color: red;'>‚ùå Error: {html.escape(str(e))}</h3>"))
        return None, None, None, []

## Usage Instructions

In [None]:
def display_usage_instructions():
    """Render a static HTML panel explaining how to call the workflow on a Mac."""
    usage_markup = """
    <div style="border: 2px solid #4CAF50; padding: 20px; border-radius: 10px; background-color: #f1f8e9;">
        <h2>üöÄ Mac-Compatible Usage</h2>
        
        <h3>Quick Start (CPU Mode - Recommended):</h3>
        <code>generator, genome, analysis, files = generate_and_analyze_genome(device="cpu")</code>
        
        <h3>Other Options:</h3>
        <ul>
            <li><strong>Auto-detect:</strong> <code>generate_and_analyze_genome(device="auto")</code></li>
            <li><strong>Apple GPU (experimental):</strong> <code>generate_and_analyze_genome(device="mps")</code></li>
        </ul>
        
        <h3>Features:</h3>
        <ul>
            <li>‚úÖ No CUDA/Triton required</li>
            <li>‚úÖ Works on all Macs (M1/M2/M3/Intel)</li>
            <li>‚úÖ Same functionality as GPU version</li>
            <li>‚úÖ Automatic file saving</li>
        </ul>
        
        <p><strong>Note:</strong> CPU mode takes 2-5 minutes but is 100% reliable on M-series Macs.</p>
    </div>
    """
    panel = HTML(usage_markup)
    display(panel)

# Render the usage panel as soon as this cell runs.
display_usage_instructions()

## Run Genome Generation

Execute the cells below to generate your genome!

In [None]:
# Run the complete workflow on the CPU (the default device, chosen for Mac compatibility).
# Yields (generator, genome, analysis, saved_files); if the workflow hits an exception it
# displays the error and yields (None, None, None, []) instead.
generator, genome, analysis, saved_files = generate_and_analyze_genome(device="cpu")

## End of Notebook