In [1]:
import os
from typing import Dict, List, Optional, Tuple

import pandas as pd
from Bio.Seq import Seq

# Working directory. Every data path used later ("data/simulated/...") is relative to it.
# The root can be overridden with the RPA_CRISPR_ROOT environment variable so the notebook
# is not tied to one machine's drive layout; when the variable is unset the original
# location is used unchanged.
# NOTE(review): the default folder name contains an en dash ("–") between RPA and CRISPR,
# not an ASCII hyphen - confirm this matches the directory on disk.
PROJECT_ROOT = os.environ.get("RPA_CRISPR_ROOT", r"E:\RPA–CRISPR-model")
os.chdir(PROJECT_ROOT)

# CRISPR-Cas12a Parameters
PAM_MOTIF = "TTT"  # LbCas12a canonical PAM is TTTV, but we scan for TTT backbone
SPACER_LENGTH = 20  # Standard spacer length (20-24nt)
SEED_REGION_LENGTH = 10  # PAM-proximal bases where mismatches are fatal to binding

# Explicit SNP definitions, keyed by gene:
#   {gene: {"pos_offset": <index in amplicon>, "wt": <WT base>, "mut": <mutant base>}}
#
# Mutations this notebook is meant to simulate (from the literature):
#   rpoB S531L : TCG (Ser) -> TTG (Leu) on the coding strand
#   katG S315T : AGC (Ser) -> ACC (Thr)
#   inhA C-15T : C -> T promoter mutation
#
# Only rpoB has an entry. The original note says rpoB lies on the minus strand, so the
# base seen in an extracted amplicon may be the complement of the coding-strand change;
# the WT base is therefore read from the amplicon at the SNP offset at run time.
# NOTE(review): this dict is not referenced by the cells shown here - confirm whether it
# is still needed, and verify the rpoB strand/base before relying on it.
SNP_DEFINITIONS = {
    "rpoB": {"pos_offset": 70, "wt": "C", "mut": "T"},
}

In [2]:
# 2. HELPER FUNCTIONS

def get_reverse_complement(seq_str: str) -> str:
    """Return the reverse complement of ``seq_str`` as a plain Python string."""
    forward = Seq(seq_str)
    reverse = forward.reverse_complement()
    return str(reverse)

def apply_mutation(amplicon_seq: str, snp_offset: int, gene: str) -> Tuple[str, str, str]:
    """
    Generate the Mutant sequence from the WT amplicon and known mutation logic.

    Args:
        amplicon_seq: Wild-type amplicon sequence.
        snp_offset: 0-based index of the SNP within ``amplicon_seq``.
        gene: Gene label selecting the substitution rule ("rpoB", "katG" or
            "inhA_promoter"); any other label is treated as unknown.

    Returns:
        (Mutant_Seq, WT_Base, Mut_Base). ``Mut_Base`` is "N" when the gene is
        unknown or when the WT base found at the offset is not one the gene's
        rule covers, so the mutant sequence never silently equals the WT.

    Raises:
        IndexError: if ``snp_offset`` is outside ``amplicon_seq``. Negative
            offsets are rejected rather than wrapping around to the 3' end.
    """
    if not 0 <= snp_offset < len(amplicon_seq):
        raise IndexError(
            f"snp_offset {snp_offset} is outside the amplicon (length {len(amplicon_seq)})"
        )

    wt_base = amplicon_seq[snp_offset]

    # Substitution per gene, given for both strand views of the same change
    # (e.g. C->T on one strand reads as G->A on the other).
    # Note: This is a simplification. In a full pipeline, we'd parse the 'mut_codon' column.
    substitutions = {
        "rpoB": {"C": "T", "G": "A"},           # S531L
        "katG": {"G": "C", "C": "G"},           # S315T (AGC->ACC)
        "inhA_promoter": {"C": "T", "G": "A"},  # C-15T
    }

    # Anything without a defined substitution is marked "N" (unknown) instead of
    # receiving an arbitrary base or, worse, the WT base itself.
    mut_base = substitutions.get(gene, {}).get(wt_base, "N")

    mut_seq = amplicon_seq[:snp_offset] + mut_base + amplicon_seq[snp_offset + 1:]
    return mut_seq, wt_base, mut_base

def find_cas12a_guides(
    sequence: str,
    strand_name: str,
    snp_pos_in_seq: int,
    spacer_length: Optional[int] = None,
    seed_length: Optional[int] = None,
) -> List[Dict]:
    """
    Scan one sequence orientation for TTTN PAMs and return the guides whose
    spacer covers the SNP.

    A candidate is a 4-nt window starting with "TTT" followed immediately by a
    spacer of ``spacer_length`` bases. All TTTN windows are reported; the
    ``is_canonical_pam`` flag marks the TTTV subset (4th base A, C or G).

    Args:
        sequence: Sequence to scan, 5'->3', in the orientation being analysed.
        strand_name: Label copied into each result (e.g. "+" or "-").
        snp_pos_in_seq: 0-based index of the SNP in THIS orientation.
        spacer_length: Spacer length in nt; defaults to ``SPACER_LENGTH``.
        seed_length: Number of PAM-proximal spacer positions counted as the
            seed region; defaults to ``SEED_REGION_LENGTH``.

    Returns:
        One dict per guide whose spacer contains the SNP, with keys ``pam``,
        ``spacer_seq``, ``strand``, ``pam_start``, ``is_canonical_pam``,
        ``snp_position_in_guide`` (1-based, counted from the PAM-proximal end)
        and ``in_seed_region``. Empty when the sequence is too short to hold a
        PAM plus a full spacer.
    """
    if spacer_length is None:
        spacer_length = SPACER_LENGTH
    if seed_length is None:
        seed_length = SEED_REGION_LENGTH

    pam_length = 4
    guides = []

    # Last PAM start that still leaves room for a complete spacer. The range is
    # inclusive of it, so a protospacer ending exactly at the end of the sequence
    # is also considered.
    last_pam_start = len(sequence) - pam_length - spacer_length

    for pam_start in range(last_pam_start + 1):
        pam_candidate = sequence[pam_start : pam_start + pam_length]

        # Require the TTT backbone; the 4th base is only classified, not filtered.
        if not pam_candidate.startswith("TTT"):
            continue

        # V = A/C/G. TTTT is kept in the results but flagged as non-canonical.
        is_canonical = pam_candidate[3] in ("A", "C", "G")

        # Spacer starts immediately after the PAM
        spacer_start = pam_start + pam_length
        spacer_end = spacer_start + spacer_length

        # Only guides whose spacer overlaps the SNP are useful for discrimination
        if not (spacer_start <= snp_pos_in_seq < spacer_end):
            continue

        # Position in spacer (1-based, from the 5' PAM-proximal end)
        pos_in_guide = snp_pos_in_seq - spacer_start + 1

        guides.append({
            "pam": pam_candidate,
            "spacer_seq": sequence[spacer_start:spacer_end],
            "strand": strand_name,
            "pam_start": pam_start,
            "is_canonical_pam": is_canonical,
            "snp_position_in_guide": pos_in_guide,
            "in_seed_region": 1 <= pos_in_guide <= seed_length,
        })

    return guides

In [3]:
# 3. MAIN DESIGN LOOP
# Read the designed RPA amplicons; the loop below reads the columns gene, drug,
# amplicon_sequence and snp_offset_in_amplicon_bp from each row.
amplicons_df = pd.read_csv("data/simulated/rpa_amplicons_designed.csv")

all_guides = []

print(f"Loaded {len(amplicons_df)} amplicons. Starting guide design...\n")

for _, amplicon_row in amplicons_df.iterrows():
    gene = amplicon_row["gene"]
    wt_amp_seq = amplicon_row["amplicon_sequence"]
    snp_offset = amplicon_row["snp_offset_in_amplicon_bp"]

    # Guides must recognise the MUTANT allele, so build the mutant amplicon first.
    mut_amp_seq, wt_base, mut_base = apply_mutation(wt_amp_seq, snp_offset, gene)

    print(f"Processing {gene} | SNP: {wt_base}->{mut_base} at offset {snp_offset}")

    # Sense strand (+): scan the mutant amplicon as given.
    guides_sense = find_cas12a_guides(mut_amp_seq, "+", snp_offset)

    # Antisense strand (-): scan the reverse complement. Reversal mirrors the
    # SNP index around the sequence: new index = length - 1 - old index.
    mut_amp_rc = get_reverse_complement(mut_amp_seq)
    snp_offset_rc = len(mut_amp_seq) - 1 - snp_offset
    guides_antisense = find_cas12a_guides(mut_amp_rc, "-", snp_offset_rc)

    # Rank every candidate from both strands and collect a flat record for it.
    for guide in [*guides_sense, *guides_antisense]:
        if not guide["in_seed_region"]:
            rank = "Low (Distal SNP)"
        elif guide["is_canonical_pam"]:
            # Best case: SNP in the seed region AND a canonical PAM.
            rank = "High (Seed + PAM)"
        else:
            rank = "Medium (Seed only)"

        all_guides.append(dict(
            gene=gene,
            drug=amplicon_row["drug"],
            target_strand=guide["strand"],
            pam=guide["pam"],
            spacer_sequence=guide["spacer_seq"],
            snp_position_in_spacer=guide["snp_position_in_guide"],
            in_seed=guide["in_seed_region"],
            rank=rank,
            amplicon_context="Mutant",  # Designed against Mutant
        ))

Loaded 3 amplicons. Starting guide design...

Processing rpoB | SNP: C->T at offset 70
Processing katG | SNP: G->C at offset 70
Processing inhA_promoter | SNP: C->T at offset 74


In [4]:
# 4. EXPORT & SUMMARY

# Columns of one guide record. Used only to build a well-formed (empty) frame when the
# design loop found no guide at all; otherwise the frame is built from the records as-is.
GUIDE_COLUMNS = [
    "gene", "drug", "target_strand", "pam", "spacer_sequence",
    "snp_position_in_spacer", "in_seed", "rank", "amplicon_context",
]
# How many of the best-ranked guides to keep for each gene.
TOP_GUIDES_PER_GENE = 3

# Without the fallback, an empty `all_guides` produces a frame with no columns and the
# "rank" lookup below raises KeyError.
guides_df = pd.DataFrame(all_guides) if all_guides else pd.DataFrame(columns=GUIDE_COLUMNS)
if guides_df.empty:
    print("No candidate guides were found for any amplicon; outputs will be empty.")

# Sort by Rank validity (1 = best) within each gene
rank_order = {"High (Seed + PAM)": 1, "Medium (Seed only)": 2, "Low (Distal SNP)": 3}
guides_df["sort_key"] = guides_df["rank"].map(rank_order)
guides_df = guides_df.sort_values(["gene", "sort_key"])

# Filter for the best guides (top TOP_GUIDES_PER_GENE per gene)
best_guides = (
    guides_df.groupby("gene").head(TOP_GUIDES_PER_GENE).drop(columns=["sort_key"])
)

print("\n" + "="*80)
print("BEST CRISPR-Cas12a GUIDES DESIGNED")
print("="*80)
print(best_guides[["gene", "pam", "spacer_sequence", "snp_position_in_spacer", "rank"]].to_string(index=False))

# Save
output_path = "data/simulated/crispr_guides_designed.csv"
best_guides.to_csv(output_path, index=False)
print(f"\n✓ Saved best guides to: {output_path}")

# Generate ordering list (Fasta)
# Each record is the LbCas12a scaffold followed by the spacer.
# Scaffold (RNA): UAA UUU CUA CUA AGU GUA GAU - written here in DNA letters (T for U),
# the same alphabet as the spacer sequences.
scaffold = "TAATTTCTACTAAGTGTAGAT"
fasta_out = "data/simulated/crrna_order_list.fasta"
with open(fasta_out, "w") as f:
    for _, guide_row in best_guides.iterrows():
        full_crrna = scaffold + guide_row["spacer_sequence"]
        f.write(f">crRNA_{guide_row['gene']}_{guide_row['drug']}_SNPpos{guide_row['snp_position_in_spacer']}\n")
        f.write(f"{full_crrna}\n")

print(f"✓ Saved ordering FASTA (Scaffold+Spacer) to: {fasta_out}")


BEST CRISPR-Cas12a GUIDES DESIGNED
gene  pam      spacer_sequence  snp_position_in_spacer              rank
katG TTTC CGTTGAGCGACACGGTCGTT                       2 High (Seed + PAM)

✓ Saved best guides to: data/simulated/crispr_guides_designed.csv
✓ Saved ordering FASTA (Scaffold+Spacer) to: data/simulated/crrna_order_list.fasta
