In [None]:
# Ch12-2 - siRNA design

In [None]:
! pip install biopython pandas matplotlib seaborn ViennaRNA requests

In [None]:
# Design siRNAs using BioPython

In [None]:
import re
from Bio import SeqIO
from Bio.Seq import Seq
import random
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

class SiRNADesigner:
    """
    Design and rank siRNA candidates for gene silencing experiments.

    Every window of the target mRNA becomes a sense/antisense pair that is
    scored with simple sequence heuristics (overall GC content, seed-region
    GC content, unwanted motifs, A/U richness at the antisense 5' end and
    seed complexity).  Each component score lies in the range 0-100 and the
    components are combined with the weights in ``self.weights``.

    Strands are stored in the RNA alphabet (A/C/G/U) and written 5'->3'.
    """

    def __init__(self):
        # Scoring parameters based on established siRNA design rules
        self.gc_content_range = (30, 60)  # Optimal GC content range (%)
        self.seed_region_gc_max = 60      # Maximum GC content in seed region (%)
        # Motifs are listed in the DNA alphabet; check_patterns() converts
        # both the motif and the query to RNA before comparing them.
        self.avoid_patterns = [
            'AAAA', 'CCCC', 'GGGG', 'TTTT',  # Homopolymer runs
            'TAAAA', 'AAAAA'                 # Termination signals
        ]
        # Scoring weights (they sum to 1.0, so the total stays within 0-100)
        self.weights = {
            'gc_content': 0.3,
            'seed_gc': 0.2,
            'avoid_patterns': 0.2,
            'off_target': 0.2,
            'thermodynamic': 0.1
        }

    @staticmethod
    def _clamp_score(value):
        """Restrict a component score to the closed interval [0, 100]."""
        return max(0, min(100, value))

    def extract_mrna_sequence(self, fasta_file, gene_id=None):
        """
        Extract mRNA sequence from a FASTA file.

        Args:
            fasta_file (str): Path to FASTA file
            gene_id (str): Optional gene identifier; the first record whose
                id contains this string is returned.  When omitted, the
                first record in the file is returned.

        Returns:
            str: mRNA sequence

        Raises:
            ValueError: If the file holds no matching record.
        """
        for record in SeqIO.parse(fasta_file, "fasta"):
            if gene_id is None or gene_id in record.id:
                return str(record.seq)
        if gene_id is None:
            raise ValueError("No sequence records found in FASTA file")
        raise ValueError(f"Gene ID {gene_id} not found in FASTA file")

    def generate_candidates(self, mrna_sequence, length=21):
        """
        Generate all possible siRNA candidates of specified length.

        Args:
            mrna_sequence (str): Target mRNA sequence (DNA or RNA alphabet,
                any letter case)
            length (int): siRNA length (default: 21 nt)

        Returns:
            list: One dict per candidate with the keys ``position``
                (1-based start in the mRNA), ``sense``, ``antisense`` (both
                RNA, 5'->3') and ``target_region`` (the site plus up to 10 nt
                of flanking sequence on each side).

        Raises:
            ValueError: If ``length`` is smaller than 1.
        """
        if length < 1:
            raise ValueError("length must be a positive integer")

        # Upper-case once so soft-masked (lower-case) input is not discarded
        mrna_sequence = mrna_sequence.upper()
        total = len(mrna_sequence)
        candidates = []

        for i in range(total - length + 1):
            # Convert to RNA (T -> U)
            sense = mrna_sequence[i:i + length].replace('T', 'U')

            # Skip candidates with non-standard nucleotides
            if not set(sense) <= set('ACGU'):
                continue

            # Reverse-complement in the DNA alphabet, then convert back to
            # RNA, so the result does not depend on how Biopython treats 'U'
            # in complement().
            dna_window = sense.replace('U', 'T')
            antisense = str(Seq(dna_window).reverse_complement()).replace('T', 'U')

            candidates.append({
                'position': i + 1,
                'sense': sense,
                'antisense': antisense,
                'target_region': mrna_sequence[max(0, i - 10):min(i + length + 10, total)]
            })

        return candidates

    def calculate_gc_content(self, sequence):
        """Calculate GC content as a percentage (0.0 for an empty sequence)"""
        if not sequence:
            return 0.0
        return (sequence.count('G') + sequence.count('C')) / len(sequence) * 100

    def calculate_seed_region_gc(self, antisense):
        """Calculate GC content in seed region (positions 2-8 from the 5' end)"""
        seed_region = antisense[1:8]
        return self.calculate_gc_content(seed_region)

    def check_patterns(self, sequence):
        """
        Check for undesirable sequence patterns.

        Both the query and the motifs are compared in the RNA alphabet, so a
        motif written as 'TTTT' also matches a 'UUUU' run in an RNA strand.

        Returns:
            bool: True when none of ``self.avoid_patterns`` occurs.
        """
        rna_sequence = sequence.replace('T', 'U')
        return not any(
            pattern.replace('T', 'U') in rna_sequence
            for pattern in self.avoid_patterns
        )

    def estimate_thermodynamic_stability(self, antisense):
        """
        Estimate thermodynamic stability based on base composition
        This is a simplified model - real analysis would use nearest-neighbor thermodynamics

        Returns:
            float: Percentage (0-100) of A/U among the first five antisense
                nucleotides; higher means a less stable antisense 5' end.
        """
        # Prefer A/U at 5' end of antisense strand
        five_prime_end = antisense[:5]
        five_prime_au = five_prime_end.count('A') + five_prime_end.count('U')
        return (five_prime_au / 5) * 100

    def check_off_target_potential(self, antisense, transcriptome=None):
        """
        Simplified off-target potential check
        In real applications, you would BLAST against the transcriptome

        Args:
            antisense (str): Antisense strand, 5'->3'
            transcriptome: Reserved for a future alignment-based check.

        Returns:
            float: Seed complexity in percent (distinct bases in the seed
                divided by seed length); higher is better.

        Raises:
            NotImplementedError: If ``transcriptome`` is given, because no
                alignment-based search is implemented here.
        """
        if transcriptome is not None:
            # Returning None here would only fail later inside the scoring
            # arithmetic, so fail early with an explicit message instead.
            raise NotImplementedError(
                "Transcriptome-based off-target search is not implemented"
            )

        # No transcriptome: use seed region complexity as a proxy
        seed = antisense[1:8]
        if not seed:
            return 0.0
        return len(set(seed)) / len(seed) * 100

    def score_candidates(self, candidates):
        """
        Score siRNA candidates based on design rules

        Args:
            candidates (list): List of candidate dictionaries

        Returns:
            list: Copies of the candidates extended with the component
                scores and ``total_score``, sorted by ``total_score`` in
                descending order.
        """
        scored_candidates = []

        for candidate in candidates:
            antisense = candidate['antisense']

            gc_content = self.calculate_gc_content(antisense)
            seed_gc_content = self.calculate_seed_region_gc(antisense)

            # Individual scores, each clamped to 0-100 so that a very poor
            # component cannot turn negative and distort the weighted sum.
            gc_score = self._clamp_score(100 - abs((gc_content - 45) * 2))
            seed_gc_score = self._clamp_score(
                100 - (seed_gc_content * 100 / self.seed_region_gc_max)
            )
            pattern_score = 100 if self.check_patterns(antisense) else 0
            thermo_score = self._clamp_score(
                self.estimate_thermodynamic_stability(antisense)
            )
            off_target_score = self._clamp_score(
                self.check_off_target_potential(antisense)
            )

            # Calculate weighted score
            total_score = (
                gc_score * self.weights['gc_content'] +
                seed_gc_score * self.weights['seed_gc'] +
                pattern_score * self.weights['avoid_patterns'] +
                thermo_score * self.weights['thermodynamic'] +
                off_target_score * self.weights['off_target']
            )

            # Add scores to a copy so the caller's dictionaries stay untouched
            candidate_with_score = candidate.copy()
            candidate_with_score.update({
                'gc_content': gc_content,
                'seed_gc_content': seed_gc_content,
                'gc_score': gc_score,
                'seed_gc_score': seed_gc_score,
                'pattern_score': pattern_score,
                'thermodynamic_score': thermo_score,
                'off_target_score': off_target_score,
                'total_score': total_score
            })

            scored_candidates.append(candidate_with_score)

        # Sort by total score (descending)
        return sorted(scored_candidates, key=lambda x: x['total_score'], reverse=True)

    def get_top_candidates(self, scored_candidates, top_n=5):
        """Return the top N candidates (input is expected to be sorted already)"""
        return scored_candidates[:top_n]

    def visualize_candidates(self, top_candidates):
        """
        Create visualizations for the top candidates

        Args:
            top_candidates (list): List of top siRNA candidates

        Returns:
            None: Displays plots
        """
        # Convert to DataFrame for easier plotting
        df = pd.DataFrame(top_candidates)

        # Plot positions along the mRNA
        plt.figure(figsize=(10, 6))
        sns.barplot(x='position', y='total_score', data=df)
        plt.title('siRNA Candidates by Position')
        plt.xlabel('Position in mRNA')
        plt.ylabel('Score')
        plt.show()

        # Plot score components
        score_components = ['gc_score', 'seed_gc_score', 'pattern_score',
                            'thermodynamic_score', 'off_target_score']

        plt.figure(figsize=(12, 8))
        # Long format: one row per (candidate, score component)
        df_melt = pd.melt(df,
                          id_vars=['position', 'sense'],
                          value_vars=score_components,
                          var_name='Score Component',
                          value_name='Value')

        sns.barplot(x='position', y='Value', hue='Score Component', data=df_melt)
        plt.title('Score Components by siRNA Candidate')
        plt.xlabel('Position in mRNA')
        plt.ylabel('Score Value')
        plt.legend(title='Score Component')
        plt.show()

    def format_output(self, top_candidates):
        """
        Print a human-readable report for the top candidates.

        The antisense strand is stored 5'->3'; it is printed reversed
        (3'->5') so that each base lines up with its partner on the sense
        strand printed above it.
        """
        print("\n=== Top siRNA Candidates ===\n")

        for i, candidate in enumerate(top_candidates, 1):
            print(f"Candidate #{i} (Score: {candidate['total_score']:.2f}, Position: {candidate['position']})")
            print(f"Sense:     5'-{candidate['sense']}-3'")
            print(f"Antisense: 3'-{candidate['antisense'][::-1]}-5'")
            print(f"GC Content: {candidate['gc_content']:.1f}%")
            print(f"Seed Region GC: {candidate['seed_gc_content']:.1f}%")
            print(f"Target Region: ...{candidate['target_region']}...")
            print("-" * 50)

In [None]:
# Example usage of the siRNA designer
if __name__ == "__main__":
    # Demo target: a partial GAPDH sequence.  The literal is indented for
    # readability, so every whitespace character is stripped right away.
    example_sequence = re.sub(
        r'\s+',
        '',
        """
    ATGGGGAAGGTGAAGGTCGGAGTCAACGGATTTGGTCGTATTGGGCGCCTGGTCACCAGGGCTGC
    TTTTAACTCTGGTAAAGTGGATATTGTTGCCATCAATGACCCCTTCATTGACCTCAACTACATGG
    TTTACATGTTCCAATATGATTCCACCCATGGCAAATTCCATGGCACCGTCAAGGCTGAGAACGGG
    AAGCTTGTCATCAATGGAAATCCCATCACCATCTTCCAGGAGCGAGATCCCTCCAAAATCAAGTG
    GGGCGATGCTGGCGCTGAGTACGTCGTGGAGTCCACTGGCGTCTTCACCACCATGGAGAAGGCTG
    """,
    )

    designer = SiRNADesigner()

    # Step 1: enumerate every window of the target as a candidate
    candidates = designer.generate_candidates(example_sequence)
    print(f"Generated {len(candidates)} siRNA candidates")

    # Step 2: rank the candidates and keep the best ones for the report
    scored_candidates = designer.score_candidates(candidates)
    top_candidates = designer.get_top_candidates(scored_candidates)
    designer.format_output(top_candidates)

    # Optional plots (uncomment to use)
    # designer.visualize_candidates(top_candidates)

    # Step 3: write the complete ranked table to disk
    df = pd.DataFrame(scored_candidates)
    df.to_csv('sirna_candidates.csv', index=False)
    print("\nAll candidates exported to 'sirna_candidates.csv'")

In [None]:
# Example 2 - Use ViennaRNA folding and published design rules to score siRNAs

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from Bio import SeqIO
from Bio.Seq import Seq
import numpy as np
import requests
import tempfile
import re
from io import StringIO
import RNA  # ViennaRNA package for RNA secondary structure prediction
import subprocess
import sys
import warnings
warnings.filterwarnings('ignore')

class SiRNADesigner:
    """
    A class for designing and evaluating siRNA candidates for gene silencing experiments
    using established RNAi design rules and publicly available tools.

    Candidates are kept in the DNA alphabet (A/C/G/T) and converted to RNA
    only where a method needs it.  Both strands are stored 5'->3'.  All
    component scores produced by ``score_candidates`` lie in the range 0-100.
    """

    # Flanking nucleotides kept on each side of a candidate site in
    # ``target_region`` (used by generate_candidates and to relocate the
    # site inside that region when scoring accessibility).
    _FLANK = 10

    def __init__(self):
        # Default parameters based on established siRNA design criteria.
        # NOTE: within this class only 'seed_max_gc' (scoring) and 'length'
        # (fallback site length for accessibility) are actually read.
        self.parameters = {
            'length': 21,              # siRNA length
            'min_gc': 30,              # Minimum GC content (%)
            'max_gc': 60,              # Maximum GC content (%)
            'seed_max_gc': 60,         # Maximum GC content in seed region (%)
            'check_off_targets': True  # Whether to check for off-targets
        }

        # Scoring weights for different criteria (they sum to 1.0)
        self.weights = {
            'gc_content': 0.20,          # Weight for GC content
            'seed_gc': 0.15,             # Weight for seed region GC content
            'thermo_asymmetry': 0.25,    # Weight for thermodynamic asymmetry
            'secondary_structure': 0.20,  # Weight for target accessibility
            'motif_penalty': 0.20        # Weight for avoiding immune motifs and homopolymers
        }

        # Patterns to avoid (mixed DNA/RNA spelling; normalised to RNA by
        # check_forbidden_patterns before matching)
        self.avoid_patterns = [
            'AAAA', 'CCCC', 'GGGG', 'TTTT',  # Homopolymer runs
            'GUCCUUCAA', 'UGUGU',            # Immune stimulatory motifs
            'TAAAA', 'AAAAA'                 # Termination signals
        ]

        # Check if BLAST is installed
        self.blast_available = self._check_blast_installed()

    def _check_blast_installed(self):
        """Return True when ``blastn -version`` can be run successfully."""
        try:
            result = subprocess.run(['blastn', '-version'],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
        except OSError:
            # Covers a missing binary (FileNotFoundError) as well as one that
            # exists but cannot be executed (e.g. PermissionError).
            return False
        return result.returncode == 0

    def extract_mrna_sequence(self, input_source, gene_id=None):
        """
        Extract mRNA sequence from a FASTA file or directly from an input string.

        Args:
            input_source (str): Path to FASTA file or sequence string
            gene_id (str): Optional gene identifier when using FASTA

        Returns:
            str: mRNA sequence in the DNA alphabet when a raw string is
                given; the record sequence as stored when read from a file.

        Raises:
            ValueError: If a FASTA file is given but holds no matching record.
        """
        # Check if input_source is a file that exists
        if os.path.exists(input_source):
            for record in SeqIO.parse(input_source, "fasta"):
                if gene_id is None or gene_id in record.id:
                    return str(record.seq)
            if gene_id is None:
                raise ValueError("No sequence records found in FASTA file")
            raise ValueError(f"Gene ID {gene_id} not found in FASTA file")

        # Otherwise treat it as a raw sequence string.  Map U to T first so
        # that RNA input keeps its uracils instead of losing them to the
        # clean-up below, then drop whitespace and any other character.
        normalised = input_source.upper().replace('U', 'T')
        return re.sub(r'[^ACGT]', '', normalised)

    def fetch_sequence_from_ncbi(self, accession):
        """
        Fetch a sequence from NCBI using the Entrez API.

        Args:
            accession (str): The GenBank/RefSeq accession number

        Returns:
            str: The nucleotide sequence of the first FASTA record returned

        Raises:
            RuntimeError: If NCBI answers with a non-200 HTTP status.
            ValueError: If the response contains no FASTA record.
        """
        # Let requests build and escape the query string; the timeout keeps
        # an unresponsive server from hanging the notebook indefinitely.
        response = requests.get(
            "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
            params={
                "db": "nucleotide",
                "id": accession,
                "rettype": "fasta",
                "retmode": "text",
            },
            timeout=30,
        )

        if response.status_code != 200:
            raise RuntimeError(f"Failed to fetch sequence from NCBI: {response.status_code}")

        # Parse the FASTA response and take the first record
        record = next(SeqIO.parse(StringIO(response.text), "fasta"), None)
        if record is None:
            raise ValueError(f"No FASTA record returned by NCBI for accession {accession}")
        return str(record.seq)

    def calculate_gc_content(self, sequence):
        """Calculate GC content as a percentage (0.0 for an empty sequence)"""
        if not sequence:
            return 0.0
        gc_count = sequence.count('G') + sequence.count('C')
        return (gc_count / len(sequence)) * 100

    def calculate_seed_region_gc(self, antisense):
        """Calculate GC content in seed region (positions 2-8 from the 5' end)"""
        seed_region = antisense[1:8]
        return self.calculate_gc_content(seed_region)

    def check_forbidden_patterns(self, sequence):
        """
        Check for undesirable sequence patterns and motifs.

        Returns:
            bool: True when none of ``self.avoid_patterns`` occurs in the
                sequence (comparison is done in the RNA alphabet).
        """
        rna_seq = sequence.replace('T', 'U')
        return not any(
            pattern.replace('T', 'U') in rna_seq
            for pattern in self.avoid_patterns
        )

    def calculate_thermodynamic_asymmetry(self, sense, antisense):
        """
        Calculate thermodynamic asymmetry between the two duplex ends.
        This is critical for proper RISC loading.

        The four terminal base pairs at each end of the duplex are evaluated
        as a two-strand duplex with ViennaRNA.  The score rewards duplexes
        whose end containing the antisense 5' terminus is less stable (higher
        free energy) than the opposite end.

        Args:
            sense (str): Sense strand sequence, 5'->3'
            antisense (str): Antisense strand sequence, 5'->3'

        Returns:
            float: Score from 0-100 for thermodynamic asymmetry.  0 when the
                antisense 5' end is not less stable than its 3' end, 100 when
                it is less stable by 3 kcal/mol or more, linear in between.
        """
        # Convert to RNA
        sense_rna = sense.replace('T', 'U')
        antisense_rna = antisense.replace('T', 'U')

        window = 4  # number of terminal base pairs evaluated per end

        # Duplex end holding the antisense 5' terminus: it pairs with the
        # sense 3' terminus.  duplexfold takes both strands 5'->3'.
        dg_antisense_5p = RNA.duplexfold(
            sense_rna[-window:], antisense_rna[:window]
        ).energy

        # Opposite end: antisense 3' terminus paired with the sense 5' terminus
        dg_antisense_3p = RNA.duplexfold(
            sense_rna[:window], antisense_rna[-window:]
        ).energy

        # Positive when the antisense 5' end is the weaker (less negative) end
        asymmetry = dg_antisense_5p - dg_antisense_3p

        # Normalize to a 0-100 scale (higher is better), saturating at
        # 3 kcal/mol
        if asymmetry <= 0:
            return 0.0
        if asymmetry >= 3:
            return 100.0
        return asymmetry / 3 * 100

    def analyze_target_accessibility(self, target_region, site_start=None, site_length=None):
        """
        Analyze accessibility of the target region using RNA folding.

        Args:
            target_region (str): The target region sequence
            site_start (int): Optional 0-based offset of the siRNA site
                inside ``target_region``.  When omitted, the site is assumed
                to be the central 21 nt of the region.
            site_length (int): Optional site length used together with
                ``site_start`` (defaults to ``self.parameters['length']``).

        Returns:
            float: Score from 0-100: percentage of unpaired nucleotides
                within the site in the predicted minimum-free-energy
                structure.
        """
        if not target_region:
            return 0.0

        # Convert to RNA
        target_rna = target_region.replace('T', 'U')

        # Use ViennaRNA to predict secondary structure (dot-bracket string)
        structure, _mfe = RNA.fold(target_rna)

        if site_start is None:
            # Assume the site sits in the middle of the region
            midpoint = len(structure) // 2
            begin = max(0, midpoint - 10)
            end = min(len(structure), midpoint + 11)
        else:
            if site_length is None:
                site_length = self.parameters['length']
            begin = max(0, site_start)
            end = min(len(structure), begin + site_length)

        site_structure = structure[begin:end]
        if not site_structure:
            return 0.0

        # '.' marks an unpaired position in dot-bracket notation
        return (site_structure.count('.') / len(site_structure)) * 100

    def calculate_ui_value(self, sequence):
        """
        Calculate a simplified position-preference score attributed in the
        original notebook to Ui-Tei et al. (2004).

        One third of the score is awarded for each of:
        position 1 is A or U, position 10 is A, position 19 is A.
        T and U are treated as the same base, so DNA- and RNA-alphabet input
        score identically.

        Args:
            sequence (str): siRNA sequence

        Returns:
            float: UI value score (0-100)
        """
        rna = sequence.upper().replace('T', 'U')

        # One-character slices yield '' instead of raising when the sequence
        # is too short, so short or empty input simply fails the criterion.
        criteria_met = sum((
            rna[:1] in ('A', 'U'),   # position 1
            rna[9:10] == 'A',        # position 10
            rna[18:19] == 'A',       # position 19
        ))
        return criteria_met * 100 / 3

    def generate_candidates(self, mrna_sequence, length=21):
        """
        Generate all possible siRNA candidates of specified length.

        Args:
            mrna_sequence (str): Target mRNA sequence (DNA or RNA alphabet,
                any letter case)
            length (int): siRNA length (default: 21 nt)

        Returns:
            list: One dict per candidate with the keys ``position``
                (1-based start), ``sense``, ``antisense`` (both DNA
                alphabet, 5'->3') and ``target_region`` (site plus up to
                10 nt of flanking sequence per side).

        Raises:
            ValueError: If ``length`` is smaller than 1.
        """
        if length < 1:
            raise ValueError("length must be a positive integer")

        # Normalise once so lower-case or RNA-alphabet input is not discarded
        mrna_sequence = mrna_sequence.upper().replace('U', 'T')
        total = len(mrna_sequence)
        candidates = []

        for i in range(total - length + 1):
            sense = mrna_sequence[i:i + length]

            # Skip candidates with non-standard nucleotides
            if not set(sense) <= set('ACGT'):
                continue

            # Site plus flanking context, truncated at the transcript ends
            target_start = max(0, i - self._FLANK)
            target_end = min(total, i + length + self._FLANK)

            candidates.append({
                'position': i + 1,
                'sense': sense,
                # Antisense strand, written 5'->3'
                'antisense': str(Seq(sense).reverse_complement()),
                'target_region': mrna_sequence[target_start:target_end]
            })

        return candidates

    def compute_reynolds_score(self, sequence):
        """
        Compute a design score adapted from the Reynolds et al. (2004) criteria.

        One point is awarded for each satisfied criterion listed in the
        body.  T and U are treated as the same base.

        Args:
            sequence (str): siRNA sequence

        Returns:
            float: Reynolds score (0-10); 0 for an empty sequence
        """
        # Work in the DNA alphabet so RNA input is scored the same way
        sequence = sequence.upper().replace('U', 'T')
        if not sequence:
            return 0

        score = 0

        # GC content between 30% and 52%
        if 30 <= self.calculate_gc_content(sequence) <= 52:
            score += 1

        # At least 3 A/Us at positions 15-19
        if sum(base in 'AT' for base in sequence[14:19]) >= 3:
            score += 1

        # A at position 3
        if sequence[2:3] == 'A':
            score += 1

        # A at position 10
        if sequence[9:10] == 'A':
            score += 1

        # U at position 13
        if sequence[12:13] == 'T':
            score += 1

        # No G at position 13
        if len(sequence) > 12 and sequence[12] != 'G':
            score += 1

        # No G at position 19
        if len(sequence) > 18 and sequence[18] != 'G':
            score += 1

        # No internal repeats: no 7-mer may occur more than once
        has_repeat = any(
            sequence.count(sequence[i:i + 7]) > 1
            for i in range(len(sequence) - 7)
        )
        if not has_repeat:
            score += 1

        # A/U at position 1
        if sequence[0] in 'AT':
            score += 1

        # A/U at position 19 (if length is at least 19)
        if len(sequence) > 18 and sequence[18] in 'AT':
            score += 1

        return score

    def _locate_site(self, candidate):
        """Return the 0-based offset of the sense strand inside the candidate's target_region, or None if it cannot be found."""
        sense = candidate['sense']
        region = candidate['target_region']

        # Regions built by generate_candidates start _FLANK nt upstream of
        # the site, or at the transcript start when fewer are available.
        expected = min(candidate.get('position', 1) - 1, self._FLANK)
        if region[expected:expected + len(sense)] == sense:
            return expected

        # Fallback for candidate dictionaries assembled elsewhere
        found = region.find(sense)
        return found if found >= 0 else None

    def score_candidates(self, candidates):
        """
        Score siRNA candidates based on design rules

        Args:
            candidates (list): List of candidate dictionaries

        Returns:
            list: Copies of the candidates extended with the component
                scores and ``total_score``, sorted by ``total_score`` in
                descending order.
        """
        scored_candidates = []

        for candidate in candidates:
            sense = candidate['sense']
            antisense = candidate['antisense']
            target_region = candidate['target_region']

            # Calculate individual criteria scores
            gc_content = self.calculate_gc_content(sense)
            seed_gc = self.calculate_seed_region_gc(antisense)

            # GC content score (closer to 45% is better)
            gc_score = 100 - abs((gc_content - 45) * 2)
            gc_score = max(0, min(100, gc_score))

            # Seed region GC score (lower is better)
            seed_gc_score = 100 - ((seed_gc / self.parameters['seed_max_gc']) * 100)
            seed_gc_score = max(0, min(100, seed_gc_score))

            # Check for immune stimulatory motifs and homopolymers
            motif_score = 100 if self.check_forbidden_patterns(sense) else 0

            # Thermodynamic asymmetry score
            thermo_score = self.calculate_thermodynamic_asymmetry(sense, antisense)

            # Target accessibility score, measured over the actual site.
            # Near the transcript ends the flanks are truncated, so the site
            # is not in the middle of target_region.
            access_score = self.analyze_target_accessibility(
                target_region, self._locate_site(candidate), len(sense)
            )

            # Additional scoring methods
            reynolds_score = (self.compute_reynolds_score(sense) / 10) * 100
            ui_score = self.calculate_ui_value(sense)

            # Calculate weighted composite score
            total_score = (
                gc_score * self.weights['gc_content'] +
                seed_gc_score * self.weights['seed_gc'] +
                thermo_score * self.weights['thermo_asymmetry'] +
                access_score * self.weights['secondary_structure'] +
                motif_score * self.weights['motif_penalty']
            )

            # Add extra weight from traditional scoring systems
            total_score = 0.7 * total_score + 0.15 * reynolds_score + 0.15 * ui_score

            # Add scores to a copy so the caller's dictionaries stay untouched
            candidate_with_score = candidate.copy()
            candidate_with_score.update({
                'gc_content': gc_content,
                'seed_gc': seed_gc,
                'gc_score': gc_score,
                'seed_gc_score': seed_gc_score,
                'motif_score': motif_score,
                'thermo_score': thermo_score,
                'access_score': access_score,
                'reynolds_score': reynolds_score,
                'ui_score': ui_score,
                'total_score': total_score
            })

            scored_candidates.append(candidate_with_score)

        # Sort by total score (descending)
        return sorted(scored_candidates, key=lambda x: x['total_score'], reverse=True)

    def check_off_targets_blast(self, sequence, db_path):
        """
        Check for potential off-targets using BLAST

        Counts reported hits whose percent identity exceeds 80.  The count
        is limited by ``-max_target_seqs 10`` and includes a hit against the
        intended target if that transcript is part of the database.

        Args:
            sequence (str): siRNA sequence
            db_path (str): Path to BLAST database

        Returns:
            int: Number of potential off-targets (0 when BLAST is not
                installed, in which case a notice is printed)

        Raises:
            RuntimeError: If blastn exits with a non-zero status, so that a
                failed search is not mistaken for "no off-targets".
        """
        if not self.blast_available:
            print("BLAST not installed. Skipping off-target analysis.")
            return 0

        # Write sequence to temporary file (delete=False so blastn can open
        # it by name after the handle is closed)
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.fa') as temp:
            temp.write(f">query\n{sequence}\n")
            temp_filename = temp.name

        try:
            # Run BLASTN with parameters optimized for short sequences
            cmd = [
                'blastn',
                '-query', temp_filename,
                '-db', db_path,
                '-task', 'blastn-short',
                '-outfmt', '6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore',
                '-word_size', '7',
                '-evalue', '1000',
                '-max_target_seqs', '10'
            ]

            result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            if result.returncode != 0:
                raise RuntimeError(
                    f"blastn failed (exit code {result.returncode}): {result.stderr.strip()}"
                )

            # Count hits with high identity (>80%); pident is the third
            # tab-separated column of the requested output format
            off_targets = 0
            for line in result.stdout.splitlines():
                fields = line.split('\t')
                if len(fields) > 2 and float(fields[2]) > 80:
                    off_targets += 1

            return off_targets

        finally:
            # Clean up
            os.unlink(temp_filename)

    def predict_efficacy_with_thermocomposition(self, sequence):
        """
        Predict siRNA efficacy based on thermodynamic and composition features.

        Args:
            sequence (str): siRNA sequence

        Returns:
            float: Predicted efficacy score (0-1); 0.0 for an empty sequence
        """
        # Convert to RNA
        rna_seq = sequence.replace('T', 'U')
        if not rna_seq:
            return 0.0

        # Calculate dinucleotide frequencies (overlapping pairs)
        dinucleotides = {}
        for i in range(len(rna_seq) - 1):
            dinuc = rna_seq[i:i+2]
            dinucleotides[dinuc] = dinucleotides.get(dinuc, 0) + 1

        # Normalize by the number of dinucleotide positions
        for dinuc in dinucleotides:
            dinucleotides[dinuc] = dinucleotides[dinuc] / (len(rna_seq) - 1)

        # Calculate position-dependent nucleotide preferences
        # Based on features found to be important in literature
        position_score = 0

        # Preference for A/U at position 1
        if rna_seq[0] in 'AU':
            position_score += 0.2

        # Preference for A at position 10
        if len(rna_seq) > 9 and rna_seq[9] == 'A':
            position_score += 0.2

        # Preference for U at position 13
        if len(rna_seq) > 12 and rna_seq[12] == 'U':
            position_score += 0.2

        # Preference for A/U at positions 15-19 (saturates at three)
        au_count = sum(1 for base in rna_seq[14:19] if base in 'AU')
        position_score += 0.1 * min(au_count, 3) / 3

        # Calculate thermodynamic stability of seed region
        if len(rna_seq) >= 8:
            seed_region = rna_seq[1:8]
            # NOTE(review): a 7-nt strand is probably too short to fold into
            # a stable hairpin, so this MFE is likely 0 for most seeds and
            # the term then contributes a constant 1.0 - confirm.
            (_, seed_mfe) = RNA.fold(seed_region)

            # Normalize MFE to a 0-1 scale (higher values are better)
            # Typical range is -4 to 0 kcal/mol for a 7-mer
            seed_stability = max(0, min(1, (seed_mfe + 4) / 4))
        else:
            seed_stability = 0.5  # Default if sequence is too short

        # Combined score - weigh components differently
        weighted_score = (
            0.3 * position_score +
            0.3 * seed_stability +
            0.4 * (dinucleotides.get('AU', 0) + dinucleotides.get('UA', 0))  # High AU content is good
        )

        return weighted_score

    def get_top_candidates(self, scored_candidates, top_n=5):
        """Return the top N candidates (input is expected to be sorted already)"""
        return scored_candidates[:top_n]

    def visualize_candidates(self, candidates, top_n=10):
        """
        Create visualizations for siRNA candidates.

        Args:
            candidates (list): List of scored candidate dictionaries
            top_n (int): Number of top candidates to visualize

        Returns:
            None: Displays plots
        """
        # Convert to DataFrame for easier plotting
        df = pd.DataFrame(candidates[:top_n])

        # 1. Overall scores
        plt.figure(figsize=(12, 6))
        ax = sns.barplot(x=df.index, y='total_score', data=df)
        plt.title('Top siRNA Candidates by Score')
        plt.xlabel('Candidate Index')
        plt.ylabel('Total Score')

        # Add position labels above the bars; zip stops at the shorter
        # input, so an unexpected extra patch cannot cause an index error
        for patch, position in zip(ax.patches, df['position']):
            ax.annotate(f"Pos: {position}",
                        (patch.get_x() + patch.get_width() / 2., patch.get_height()),
                        ha='center', va='center',
                        xytext=(0, 10),
                        textcoords='offset points')

        plt.tight_layout()
        plt.show()

        # 2. Score components
        score_components = ['gc_score', 'seed_gc_score', 'thermo_score',
                            'access_score', 'motif_score']

        plt.figure(figsize=(14, 8))
        # Long format: one row per (candidate, score component)
        df_melt = pd.melt(
            df,
            id_vars=['position'],
            value_vars=score_components,
            var_name='Score Component',
            value_name='Value'
        )

        sns.barplot(x='position', y='Value', hue='Score Component', data=df_melt)
        plt.title('Score Components by siRNA Candidate')
        plt.xlabel('Position in Target mRNA')
        plt.ylabel('Score Value')
        plt.legend(title='Score Component')
        plt.tight_layout()
        plt.show()

        # 3. GC content distribution; dashed lines mark the 30% and 60% levels
        plt.figure(figsize=(10, 6))
        sns.scatterplot(x='position', y='gc_content', size='total_score',
                        hue='total_score', sizes=(50, 200), data=df)
        plt.axhline(y=30, color='r', linestyle='--', alpha=0.5)
        plt.axhline(y=60, color='r', linestyle='--', alpha=0.5)
        plt.title('GC Content Distribution of Top siRNA Candidates')
        plt.xlabel('Position in Target mRNA')
        plt.ylabel('GC Content (%)')
        plt.tight_layout()
        plt.show()

    def format_output(self, candidates, top_n=5):
        """
        Print a human-readable report for the first ``top_n`` candidates.

        The antisense strand is stored 5'->3'; it is printed reversed
        (3'->5') so that each base lines up with its partner on the sense
        strand printed above it.
        """
        print("\n=== Top siRNA Candidates ===\n")

        for i, candidate in enumerate(candidates[:top_n], 1):
            print(f"Candidate #{i} (Score: {candidate['total_score']:.2f}, Position: {candidate['position']})")
            print(f"Sense:     5'-{candidate['sense']}-3'")
            print(f"Antisense: 3'-{candidate['antisense'][::-1]}-5'")
            print(f"GC Content: {candidate['gc_content']:.1f}%")
            print(f"Seed Region GC: {candidate['seed_gc']:.1f}%")

            # Print additional scores
            print(f"Thermodynamic Asymmetry: {candidate['thermo_score']:.1f}")
            print(f"Target Accessibility: {candidate['access_score']:.1f}")
            print(f"Reynolds Score: {candidate['reynolds_score']:.1f}")

            # Target region
            print(f"Target Region: ...{candidate['target_region']}...")
            print("-" * 50)

    def export_candidates(self, candidates, file_path):
        """Export candidates to CSV file (one row per candidate, no index column)"""
        df = pd.DataFrame(candidates)
        df.to_csv(file_path, index=False)
        print(f"\nAll candidates exported to '{file_path}'")


# Example usage
if __name__ == "__main__":
    # Bail out early with installation hints when the ViennaRNA bindings
    # cannot be imported.
    try:
        import RNA
    except ImportError:
        for message in (
            "ERROR: ViennaRNA Python package not found.",
            "Please install it with: pip install ViennaRNA",
            "For installation instructions, visit: https://www.tbi.univie.ac.at/RNA/",
        ):
            print(message)
        sys.exit(1)

    designer = SiRNADesigner()

    # Option 1: demo target, a partial GAPDH sequence.  The literal is
    # indented for readability, so every whitespace character is stripped.
    example_sequence = re.sub(
        r'\s+',
        '',
        """
    ATGGGGAAGGTGAAGGTCGGAGTCAACGGATTTGGTCGTATTGGGCGCCTGGTCACCAGGGCTGC
    TTTTAACTCTGGTAAAGTGGATATTGTTGCCATCAATGACCCCTTCATTGACCTCAACTACATGG
    TTTACATGTTCCAATATGATTCCACCCATGGCAAATTCCATGGCACCGTCAAGGCTGAGAACGGG
    AAGCTTGTCATCAATGGAAATCCCATCACCATCTTCCAGGAGCGAGATCCCTCCAAAATCAAGTG
    GGGCGATGCTGGCGCTGAGTACGTCGTGGAGTCCACTGGCGTCTTCACCACCATGGAGAAGGCTG
    """,
    )

    print("Designing siRNAs for target sequence...")

    # Option 2: Fetch sequence from NCBI using accession number
    # Example:
    # accession = "NM_002046"  # GAPDH
    # example_sequence = designer.fetch_sequence_from_ncbi(accession)

    # Step 1: enumerate every window of the target as a candidate
    candidates = designer.generate_candidates(example_sequence)
    print(f"Generated {len(candidates)} siRNA candidates")

    # Step 2: rank the candidates and report the best ones
    print("Scoring candidates based on design rules...")
    scored_candidates = designer.score_candidates(candidates)
    top_candidates = designer.get_top_candidates(scored_candidates)
    designer.format_output(top_candidates)

    # Step 3: write the complete ranked table to disk
    designer.export_candidates(scored_candidates, 'sirna_candidates.csv')

    # Optional plots (uncomment to use)
    # designer.visualize_candidates(scored_candidates)