In [None]:
# Ch13-1-grna-design

In [None]:
# Download the BRCA1 reference sequence (NCBI accession NG_005905.2) in FASTA format
! wget -O brca1_sequence.fasta "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&id=NG_005905.2&rettype=fasta&retmode=text"

In [None]:
## Overview and imports - CRISPR guide RNA design, using the BRCA1 gene as an example ##
"""
CRISPR Guide RNA Design for BRCA Gene Mutation Knock-in
This script identifies optimal guide RNAs to introduce a specific mutation in the BRCA1/2 gene.
"""
# Import Libraries
from Bio import SeqIO
import matplotlib.pyplot as plt
import os
import pandas as pd

In [None]:
## Function to retrieve the BRCA1 gene sequence from a file ##
def load_brca_sequence_from_file(filename, gene="BRCA1"):
    """Read a FASTA file with ``SeqIO.read`` and return its sequence upper-cased.

    Args:
        filename: Path of the FASTA file to read.
        gene: Label used only in the progress message.

    Returns:
        A ``(sequence, record_id)`` tuple: the upper-cased sequence string and
        the ``id`` attribute of the parsed record.

    Raises:
        FileNotFoundError: If ``filename`` does not exist.
        Exception: If the file exists but reading it fails for any reason.
    """
    print(f"Loading {gene} sequence from file: {filename}")

    # Fail early with a specific error when the path is missing.
    file_is_present = os.path.exists(filename)
    if not file_is_present:
        raise FileNotFoundError(f"File not found: {filename}")

    try:
        fasta_record = SeqIO.read(filename, "fasta")
        bases = str(fasta_record.seq).upper()
    except Exception as read_error:
        # Any failure while reading is re-raised as a generic Exception
        # carrying the original message.
        raise Exception(f"Error reading FASTA file: {read_error!s}")

    print(f"Sequence length: {len(bases)} bp")
    return bases, fasta_record.id

In [None]:
## Function to design guides for the mutation position ##
def design_mutation_guides(sequence, mutation_pos, mutation_from, mutation_to, window=100):
    """Enumerate candidate SpCas9 (NGG PAM) guides around a mutation site.

    All coordinates are 0-based indices into ``sequence`` (the "+" strand).
    Every index ``i`` within ``window`` bases of ``mutation_pos`` is examined:

    * "+" strand: ``i`` is the N of an ``NGG`` PAM (``GG`` at ``i+1, i+2``);
      the 20-nt protospacer is ``sequence[i-20:i]``.
    * "-" strand: ``i`` is the second C of a ``CC`` pair (``CC`` at
      ``i-1, i``), i.e. the reverse complement of ``GG``. The PAM's N base
      is at ``i+1``, so the 20-nt protospacer is ``sequence[i+2:i+22]``.

    Args:
        sequence: Upper-case nucleotide string.
        mutation_pos: Index of the base to be mutated.
        mutation_from: Base expected at ``mutation_pos``.
        mutation_to: Replacement base (not used when searching for guides;
            kept for a uniform signature with the other design helpers).
        window: Number of bases on each side of ``mutation_pos`` to scan.

    Returns:
        A list of dicts with keys ``sequence``, ``pam``, ``strand``,
        ``start``, ``end``, ``mutation_distance`` and ``cut_site``.
        ``start``/``end`` are the half-open slice bounds of the protospacer,
        ``cut_site`` is the index of the first base on the 3' side of the
        predicted cut (3 bp from the PAM), and ``mutation_distance`` is the
        absolute distance between ``mutation_pos`` and ``cut_site``. For "-"
        strand guides ``sequence`` holds the "+" strand target text; it is
        not reverse-complemented. ``pam`` is always the literal ``"NGG"``.

    Raises:
        ValueError: If ``mutation_pos`` lies outside ``sequence`` or the base
            found there differs from ``mutation_from``.
    """
    seq_len = len(sequence)

    # Reject out-of-range positions explicitly: a negative index would
    # silently wrap around and a too-large one would raise IndexError.
    if not 0 <= mutation_pos < seq_len:
        raise ValueError(
            f"Mutation position {mutation_pos} is outside the sequence (length {seq_len})"
        )

    # Verify that the base at the mutation position matches the expected base
    if sequence[mutation_pos] != mutation_from:
        raise ValueError(f"Base at position {mutation_pos} is {sequence[mutation_pos]}, not {mutation_from}")

    guides = []
    scan_start = max(0, mutation_pos - window)
    scan_end = min(seq_len, mutation_pos + window)

    for i in range(scan_start, scan_end):
        # Forward strand: NGG PAM at i..i+2 with a full 20-nt protospacer
        # available upstream. A slice equal to "GG" already implies that
        # the PAM lies inside the sequence.
        if i >= 20 and sequence[i + 1:i + 3] == "GG":
            # Cut falls 3 bp upstream of the PAM
            cut_site = i - 3
            guides.append({
                "sequence": sequence[i - 20:i],
                "pam": "NGG",
                "strand": "+",
                "start": i - 20,
                "end": i,
                "mutation_distance": abs(mutation_pos - cut_site),
                "cut_site": cut_site
            })

        # Reverse strand: CC at i-1..i is the reverse complement of GG, so
        # the PAM occupies i-1..i+1 and the protospacer starts at i+2.
        if i >= 1 and sequence[i - 1:i + 1] == "CC" and i + 22 <= seq_len:
            protospacer_start = i + 2
            protospacer_end = i + 22
            # 3 bp from the PAM-proximal end of the protospacer, expressed
            # with the same "first base 3' of the cut" convention as above.
            cut_site = protospacer_start + 3
            guides.append({
                # Stored as the "+" strand target text (not reverse-complemented)
                "sequence": sequence[protospacer_start:protospacer_end],
                "pam": "NGG",
                "strand": "-",
                "start": protospacer_start,
                "end": protospacer_end,
                "mutation_distance": abs(mutation_pos - cut_site),
                "cut_site": cut_site
            })
    return guides

In [None]:
## Function to calculate GC content ##
def calculate_gc_content(sequence):
    """Return the GC content of ``sequence`` as a percentage (0-100).

    Counting is case-insensitive, so soft-masked (lower-case) bases are
    included. An empty sequence yields ``0.0`` instead of raising
    ``ZeroDivisionError``.

    Args:
        sequence: Nucleotide string.

    Returns:
        Percentage of characters that are ``G`` or ``C``, as a float.
    """
    if not sequence:
        return 0.0
    normalized = sequence.upper()
    gc_count = normalized.count("G") + normalized.count("C")
    return (gc_count / len(sequence)) * 100

In [None]:
## Design the repair template ##
def design_repair_template(sequence, mutation_pos, mutation_from, mutation_to, 
                           homology_arm_length=50):
    """Build an HDR repair template: left arm + new base(s) + right arm.

    Args:
        sequence: Nucleotide string containing the target site.
        mutation_pos: 0-based index of the base being replaced.
        mutation_from: Reference base (accepted for a uniform signature; it
            is not checked against ``sequence`` here).
        mutation_to: Text inserted in place of the base at ``mutation_pos``.
        homology_arm_length: Desired length of each homology arm. Arms are
            shorter when the site is closer than this to either end of
            ``sequence``.

    Returns:
        Dict with keys ``template``, ``left_arm``, ``right_arm``,
        ``mutation`` and ``length`` (length of ``template``).
    """
    # Clamp at 0: a negative slice start would wrap around to the end of
    # the sequence and produce an empty or wrong left arm.
    left_arm_start = max(0, mutation_pos - homology_arm_length)
    left_arm = sequence[left_arm_start:mutation_pos]

    # Slicing clips the right arm automatically at the end of the sequence.
    right_arm_end = mutation_pos + 1 + homology_arm_length
    right_arm = sequence[mutation_pos + 1:right_arm_end]

    # Create repair template with mutation
    repair_template = left_arm + mutation_to + right_arm

    return {
        "template": repair_template,
        "left_arm": left_arm,
        "right_arm": right_arm,
        "mutation": mutation_to,
        "length": len(repair_template)
    }

In [None]:
def score_guides(guides):
    """Attach heuristic scores to each guide and rank them best-first.

    For every guide dict (which must provide ``sequence`` and
    ``mutation_distance``) a copy is produced with these extra keys:

    * ``gc_content`` / ``gc_score`` - GC percentage and a score that peaks
      at 50% and falls off linearly.
    * ``distance_score`` - 1.0 up to 10 bp from the cut site, then a linear
      decay that bottoms out at 0.
    * ``has_homopolymer`` - True when any base repeats 4+ times in a row.
    * ``has_self_complementarity`` - placeholder, always False.
    * ``overall_score`` - 0.3 * gc_score + 0.7 * distance_score, halved
      when a homopolymer (or self-complementarity) is flagged.

    The input dicts are not modified. Returns a new list sorted by
    ``overall_score`` in descending order.
    """
    homopolymer_runs = [nucleotide * 4 for nucleotide in "ATGC"]
    results = []

    for candidate in guides:
        protospacer = candidate["sequence"]

        # GC term: linear penalty for deviating from 50%
        gc_pct = calculate_gc_content(protospacer)
        gc_term = 1.0 - abs(gc_pct - 50) / 50

        # Distance term: flat within 10 bp, then decays, floored at 0
        dist = candidate["mutation_distance"]
        dist_term = 1.0 if 0 <= dist <= 10 else max(0, 1.0 - (dist - 10) / 40)

        # Sequence-quality flags
        homopolymer_found = any(run in protospacer for run in homopolymer_runs)
        self_comp_found = False  # simplified: no structure prediction is done

        # Weighted combination, penalised when a quality flag is raised
        total = (0.3 * gc_term) + (0.7 * dist_term)
        if homopolymer_found or self_comp_found:
            total *= 0.5

        results.append({
            **candidate,
            "gc_content": gc_pct,
            "gc_score": gc_term,
            "distance_score": dist_term,
            "has_homopolymer": homopolymer_found,
            "has_self_complementarity": self_comp_found,
            "overall_score": total,
        })

    # Highest overall score first
    results.sort(key=lambda entry: entry["overall_score"], reverse=True)
    return results

In [None]:
def visualize_guides(sequence, guides, mutation_pos, mutation_from, mutation_to, top_n=5):
    """Plot the top-ranked guides around the mutation and save the figure.

    Each of the first ``top_n`` guides is drawn on its own row as a
    horizontal bar from ``start`` to ``end`` (blue for "+" strand, purple
    otherwise) with a green dot at its ``cut_site``; the mutation position
    is a dashed red vertical line. The x-axis covers 100 bases on each side
    of ``mutation_pos``, clipped to the sequence bounds.

    Args:
        sequence: Full nucleotide string (only its length is used).
        guides: Guide dicts, best first, each with ``strand``, ``start``,
            ``end``, ``cut_site`` and ``overall_score``.
        mutation_pos: Index of the mutation in ``sequence``.
        mutation_from: Reference base, used in labels.
        mutation_to: Replacement base, used in labels.
        top_n: Maximum number of guides to draw.

    Side effects:
        Writes ``guide_visualization.png`` to the current directory.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        # Visible sequence region
        region_start = max(0, mutation_pos - 100)
        region_end = min(len(sequence), mutation_pos + 100)

        # Mutation position
        ax.axvline(x=mutation_pos, color='red', linestyle='--',
                   label=f'Mutation: {mutation_from} → {mutation_to}')

        # Top guides, one row each
        top_guides = guides[:top_n]
        for row, guide in enumerate(top_guides, start=1):
            bar_color = 'blue' if guide["strand"] == "+" else 'purple'
            ax.hlines(y=row, xmin=guide["start"], xmax=guide["end"], color=bar_color, alpha=0.7)
            ax.plot(guide["cut_site"], row, 'o', color='green')

        # Legend and labels
        ax.legend()
        ax.set_xlim(region_start, region_end)
        ax.set_ylim(0, top_n + 1)
        ax.set_xlabel('Genomic Position')

        # One tick per guide actually drawn: with fewer than top_n guides,
        # top_n tick positions would not match the number of labels.
        ax.set_yticks(list(range(1, len(top_guides) + 1)))
        ax.set_yticklabels(
            [f"Guide {i+1}: Score={g['overall_score']:.2f}" for i, g in enumerate(top_guides)]
        )
        ax.set_title(f'Top {top_n} Guide RNAs for {mutation_from}{mutation_pos}{mutation_to} Mutation')

        fig.tight_layout()
        fig.savefig('guide_visualization.png')
    finally:
        # Release the figure even if plotting or saving fails
        plt.close(fig)

In [None]:
def main():
    """Run the example workflow end to end for a BRCA1 point change.

    Steps: load the FASTA sequence, enumerate and score guides around the
    example site, build a repair template, print a report, save the plot
    and write the scored guides to CSV. Loading or design errors are
    reported on stdout and end the run early.
    """
    # Example inputs. The original example labels this as BRCA1 5382insC,
    # but the values below describe a single-base T -> C change at an
    # illustrative coordinate; a real design needs the exact mapped position.
    target_gene = "BRCA1"
    site = 42500  # index into the loaded sequence
    ref_base, alt_base = "T", "C"

    # Local FASTA file holding the BRCA1 sequence
    fasta_path = "brca1_sequence.fasta"

    # --- Load -------------------------------------------------------------
    try:
        sequence, _record_id = load_brca_sequence_from_file(fasta_path, target_gene)
    except Exception as load_error:
        print(f"Error loading sequence: {load_error!s}")
        print("Please ensure your FASTA file exists and is properly formatted.")
        print("You can download the BRCA1 sequence from NCBI and save it as brca1_sequence.fasta")
        return

    print(f"Designing guides for {target_gene} mutation {ref_base}{site}{alt_base}")

    # --- Enumerate candidate guides ---------------------------------------
    try:
        candidates = design_mutation_guides(sequence, site, ref_base, alt_base)
        print(f"Found {len(candidates)} potential guides")
    except ValueError as design_error:
        print(f"Error: {design_error!s}")
        print("Check that the mutation position and reference base are correct.")
        return

    # --- Rank guides and build the repair template ------------------------
    ranked = score_guides(candidates)
    template_info = design_repair_template(sequence, site, ref_base, alt_base)

    # --- Report the best guides -------------------------------------------
    print("\nTop 5 Guide RNAs:")
    for rank, entry in enumerate(ranked[:5], start=1):
        print(f"Guide {rank}: {entry['sequence']} (PAM: {entry['pam']}, Strand: {entry['strand']})")
        print(f"  Position: {entry['start']}-{entry['end']}, Cut site: {entry['cut_site']}")
        print(f"  Distance to mutation: {entry['mutation_distance']} bp")
        print(f"  GC content: {entry['gc_content']:.1f}%, Score: {entry['overall_score']:.2f}")

    # --- Figure -----------------------------------------------------------
    visualize_guides(sequence, ranked, site, ref_base, alt_base)

    # --- Report the repair template ---------------------------------------
    print("\nRepair Template Design:")
    print(f"Length: {template_info['length']} bp")
    print(f"Left homology arm: {template_info['left_arm']}")
    print(f"Mutation: {ref_base} → {template_info['mutation']}")
    print(f"Right homology arm: {template_info['right_arm']}")

    # --- Persist the scored guides ----------------------------------------
    csv_name = f"{target_gene}_mutation_guides.csv"
    pd.DataFrame(ranked).to_csv(csv_name, index=False)

    print(f"\nResults saved to {csv_name} and guide_visualization.png")


# Run the workflow when this cell/script is executed directly
if __name__ == "__main__":
    main()

In [None]:
## End of Notebook ##