# Batch Protein Extraction - All Species

**Goal:** Extract all four biosynthetic gene cluster (BGC) proteins (PsiD, PsiK, PsiM, PsiH) from all 71 *Psilocybe* species

**Input:** 
- 71 genome scaffold files from `03_Sequences_Paper/Assembly_scaffolds/`
- 4 reference proteins (PsiD, PsiK, PsiM, PsiH) from P. cubensis

**Output:**
- ~284 protein sequences (71 species × 4 genes)
- Combined FASTA files for each gene (for phylogenetic analysis)

---

## Setup

In [None]:
import os
import subprocess
from pathlib import Path
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm

# Input locations: the project root is the parent of the notebook's working
# directory; reference proteins live next to the notebook itself.
PROJECT_DIR = Path.cwd().parent
SCAFFOLD_DIR = PROJECT_DIR.joinpath("03_Sequences_Paper", "Assembly_scaffolds")
REF_PROTEIN_DIR = Path.cwd().joinpath("reference_proteins")
INPUT_DIR = PROJECT_DIR.joinpath("01_Colab_codes_InputFiles", "Files_paper_P_baeocystis")

# Output location: one results root with a subfolder per gene.
# exist_ok=True makes re-running this cell harmless.
BATCH_RESULTS_DIR = Path.cwd().joinpath("batch_results")
BATCH_RESULTS_DIR.mkdir(exist_ok=True)
for gene in ('PsiD', 'PsiK', 'PsiM', 'PsiH'):
    BATCH_RESULTS_DIR.joinpath(gene).mkdir(exist_ok=True)

# Echo the resolved paths so a wrong working directory is obvious early
print(
    f"Scaffold directory: {SCAFFOLD_DIR}",
    f"Results directory: {BATCH_RESULTS_DIR}",
    f"Reference proteins: {REF_PROTEIN_DIR}",
    sep="\n",
)


## Define BGC Genes and Reference Proteins

In [None]:
# The four BGC genes. Each entry carries a display name, the path to its
# P. cubensis reference protein FASTA, and a one-line functional summary.
# The PsiD reference is read from INPUT_DIR; the other three come from
# REF_PROTEIN_DIR.
# NOTE(review): the 'function' strings are free-text labels only; the PsiM
# substrate wording looks imprecise - confirm against the source paper.
BGC_GENES = {
    'PsiD': dict(
        name='Tryptophan decarboxylase',
        reference=INPUT_DIR / "PsiD_Psilocybe_cubensis_reference.faa",
        function='Converts tryptophan to tryptamine',
    ),
    'PsiK': dict(
        name='Kinase',
        reference=REF_PROTEIN_DIR / "PsiK_Psilocybe_cubensis_reference.faa",
        function='Phosphorylates intermediate',
    ),
    'PsiM': dict(
        name='Methyltransferase',
        reference=REF_PROTEIN_DIR / "PsiM_Psilocybe_cubensis_reference.faa",
        function='Methylates tryptamine',
    ),
    'PsiH': dict(
        name='P450 monooxygenase',
        reference=REF_PROTEIN_DIR / "PsiH_Psilocybe_cubensis_reference.faa",
        function='Hydroxylates at C-4 position (critical for activity)',
    ),
}

# Report (but do not enforce) the presence of each reference file
print("Checking reference protein files:")
for gene, info in BGC_GENES.items():
    reference_path = info['reference']
    symbol = "✅" if reference_path.exists() else "❌"
    print(f"  {symbol} {gene}: {reference_path.name}")


## List All Species Scaffolds

In [None]:
# Collect every "*.scaffolds.fasta" file in SCAFFOLD_DIR, sorted by path so
# the processing order is reproducible between runs.
scaffold_files = sorted(SCAFFOLD_DIR.glob("*.scaffolds.fasta"))

print(f"Found {len(scaffold_files)} species scaffold files\n")
if not scaffold_files:
    # An empty glob usually means the notebook was started from the wrong
    # directory; say so here instead of letting later cells process nothing.
    print(f"⚠️ No '*.scaffolds.fasta' files found in {SCAFFOLD_DIR}")

# Preview at most the first 10 species names
print("First 10 species:")
for i, scaffold_file in enumerate(scaffold_files[:10], 1):
    # Path.stem drops ".fasta"; the remaining ".scaffolds" tag is removed too
    species_name = scaffold_file.stem.replace('.scaffolds', '')
    print(f"  {i}. {species_name}")

# Only mention the remainder when there is one (avoids "... and -7 more")
remaining = len(scaffold_files) - 10
if remaining > 0:
    print(f"\n... and {remaining} more")

## Define Extraction Function

In [None]:
def extract_protein(species_name, scaffold_file, gene, reference_protein, output_dir, verbose=False):
    """
    Extract a single protein from a genome scaffold.

    Pipeline:
      1. exonerate (protein2genome model, best hit only) aligns the reference
         protein to the scaffold; the coding sequence of the hit is written
         to ``<output_dir>/<species_name>_<gene>.cds.fa``.
      2. transeq translates that CDS in frame 1 into
         ``<output_dir>/<species_name>_<gene>.prot.fa``.
      3. The first translated record is checked for internal stop codons
         and for a suspiciously short length (< 50 residues).

    Both tools are started from argument lists without a shell, so paths
    containing spaces, quotes or other shell metacharacters are passed
    through verbatim instead of being interpreted.

    Args:
        species_name: Label used in the output file names and the result.
        scaffold_file: Path to the genome scaffold FASTA (exonerate target).
        gene: Gene label; used in file names and in the CDS FASTA header.
        reference_protein: Path to the reference protein FASTA (query).
        output_dir: Existing directory that receives the two output files.
        verbose: Currently unused; accepted for call compatibility.

    Returns:
        dict: Keys 'species', 'gene', 'status', 'protein_length',
        'stop_codons' and 'warnings'. 'status' is one of 'success',
        'failed_exonerate', 'no_match', 'failed_translate', 'failed_qc',
        'timeout' or 'error'. 'protein_length' counts residues only (a
        terminal '*' is not counted). A result with internal stop codons
        is still reported as 'success', with a warning attached.
    """
    # Define output files
    output_cds = output_dir / f"{species_name}_{gene}.cds.fa"
    output_protein = output_dir / f"{species_name}_{gene}.prot.fa"

    results = {
        'species': species_name,
        'gene': gene,
        'status': 'pending',
        'protein_length': 0,
        'stop_codons': 0,
        'warnings': []
    }

    # The --ryo template carries literal backslash-n sequences, exactly as
    # the previous shell command line delivered them to exonerate.
    cmd_cds = [
        "exonerate", "--model", "protein2genome",
        str(reference_protein),
        str(scaffold_file),
        "--bestn", "1",
        "--showalignment", "no", "--showvulgar", "no", "--verbose", "0",
        "--ryo", f">{gene}|%ti:%tab-%tae(%tS)\\n%tcs\\n",
    ]
    cmd_translate = [
        "transeq",
        "-sequence", str(output_cds),
        "-outseq", str(output_protein),
        "-frame", "1",
    ]

    try:
        # Step 1: Extract CDS with exonerate (stdout goes straight to the CDS file)
        try:
            with open(output_cds, "w") as cds_handle:
                result = subprocess.run(
                    cmd_cds,
                    stdout=cds_handle,
                    stderr=subprocess.PIPE,
                    text=True,
                    timeout=300,
                )
        except OSError as e:
            # Missing executable or unwritable output file: same outcome as a
            # non-zero exit status would have produced.
            results['status'] = 'failed_exonerate'
            results['warnings'].append(f"Exonerate failed: {str(e)[:100]}")
            return results

        if result.returncode != 0:
            results['status'] = 'failed_exonerate'
            results['warnings'].append(f"Exonerate failed: {result.stderr[:100]}")
            return results

        # Check if CDS was found (an empty file means exonerate reported no hit)
        if not output_cds.exists() or output_cds.stat().st_size == 0:
            results['status'] = 'no_match'
            results['warnings'].append("No match found in genome")
            return results

        # Step 2: Translate to protein
        try:
            result = subprocess.run(cmd_translate, capture_output=True, text=True, timeout=60)
        except OSError as e:
            results['status'] = 'failed_translate'
            results['warnings'].append(f"Translation failed: {str(e)[:100]}")
            return results

        if result.returncode != 0:
            results['status'] = 'failed_translate'
            results['warnings'].append(f"Translation failed: {result.stderr[:100]}")
            return results

        # Step 3: Quality control
        if not output_protein.exists():
            results['status'] = 'failed_qc'
            results['warnings'].append("Protein file not created")
            return results

        # Only the first record is inspected; the handle is closed explicitly
        # because the parser is not run to exhaustion.
        with open(output_protein) as protein_handle:
            record = next(SeqIO.parse(protein_handle, "fasta"), None)

        if record is None:
            results['status'] = 'failed_qc'
            results['warnings'].append("Protein file contains no sequences")
            return results

        sequence = str(record.seq)

        # A trailing '*' is the normal terminal stop; only '*' before it counts
        # as an internal stop codon.
        internal_seq = sequence[:-1] if sequence.endswith('*') else sequence
        stop_count = internal_seq.count('*')
        residue_count = len(internal_seq)

        results['protein_length'] = residue_count
        results['stop_codons'] = stop_count

        if stop_count > 0:
            results['warnings'].append(f"{stop_count} internal stop codon(s)")

        if residue_count < 50:
            results['warnings'].append(f"Protein very short: {residue_count} aa")

        results['status'] = 'success'

    except subprocess.TimeoutExpired:
        results['status'] = 'timeout'
        results['warnings'].append("Command timeout")
    except Exception as e:
        # Last-resort boundary: one bad species must not abort the whole batch
        results['status'] = 'error'
        results['warnings'].append(f"Exception: {str(e)[:100]}")

    return results

print("✓ Extraction function defined")

## Test on a Few Species First

In [None]:
# Dry run on the first 3 species (fewer if fewer scaffold files exist) to
# catch path or tool problems before committing to the full batch.
test_species = scaffold_files[:3]

print("Testing extraction on 3 species...\n")
test_results = []

for scaffold_file in test_species:
    species_name = scaffold_file.stem.replace('.scaffolds', '')
    print(f"Processing: {species_name}")

    for gene, info in BGC_GENES.items():
        output_dir = BATCH_RESULTS_DIR / gene
        result = extract_protein(
            species_name=species_name,
            scaffold_file=scaffold_file,
            gene=gene,
            reference_protein=info['reference'],
            output_dir=output_dir
        )
        test_results.append(result)

        status_symbol = "✅" if result['status'] == 'success' else "⚠️"
        print(f"  {status_symbol} {gene}: {result['status']} ({result['protein_length']} aa)")

    print()

# Show summary.
# The columns are named explicitly (same names and order as the result
# dicts) so the selection below still works when test_results is empty;
# an empty list would otherwise yield a column-less frame and a KeyError.
df_test = pd.DataFrame(
    test_results,
    columns=['species', 'gene', 'status', 'protein_length', 'stop_codons', 'warnings'],
)
print("\nTest Summary:")
print(df_test[['species', 'gene', 'status', 'protein_length']].to_string(index=False))

## Batch Process All Species

**Note:** This will take some time (71 species × 4 genes = 284 extractions), and each exonerate search may run for up to 5 minutes before it is timed out.

In [None]:
# Run the extraction for every species/gene combination and collect one
# result dict per combination in all_results.
all_results = []

print(f"Processing {len(scaffold_files)} species × 4 genes = {len(scaffold_files) * 4} extractions\n")

# The per-gene reference and output folder do not depend on the species,
# so resolve them once up front.
gene_jobs = [
    (gene_label, gene_info['reference'], BATCH_RESULTS_DIR / gene_label)
    for gene_label, gene_info in BGC_GENES.items()
]

for scaffold_file in tqdm(scaffold_files, desc="Species"):
    species_name = scaffold_file.stem.replace('.scaffolds', '')

    for gene, reference, output_dir in gene_jobs:
        all_results.append(
            extract_protein(
                species_name,
                scaffold_file,
                gene,
                reference,
                output_dir,
                verbose=False,
            )
        )

print("\n✓ Batch extraction complete!")

## Summary Statistics

In [None]:
# Create summary DataFrame.
# Columns are named explicitly (same names and order as the result dicts)
# so every lookup below also works when all_results is empty; an empty list
# would otherwise produce a column-less frame and a KeyError on 'status'.
df_results = pd.DataFrame(
    all_results,
    columns=['species', 'gene', 'status', 'protein_length', 'stop_codons', 'warnings'],
)

# Split once into successful and failed extractions
is_success = df_results['status'] == 'success'
success_df = df_results[is_success]
failed_df = df_results[~is_success]

print("="*80)
print("BATCH EXTRACTION SUMMARY")
print("="*80)

# Overall stats
print(f"\nTotal extractions: {len(df_results)}")
print(f"Successful: {len(success_df)}")
print(f"Failed: {len(failed_df)}")

# Stats by gene
print("\nSuccess rate by gene:")
for gene in ['PsiD', 'PsiK', 'PsiM', 'PsiH']:
    total = int((df_results['gene'] == gene).sum())
    success_count = int((success_df['gene'] == gene).sum())
    # Guard against division by zero when a gene has no rows at all
    pct = (success_count / total * 100) if total > 0 else 0
    print(f"  {gene}: {success_count}/{total} ({pct:.1f}%)")

# Protein length statistics
print("\nProtein length statistics (successful extractions):")
for gene in ['PsiD', 'PsiK', 'PsiM', 'PsiH']:
    gene_data = success_df[success_df['gene'] == gene]['protein_length']
    if len(gene_data) > 0:
        print(f"  {gene}: mean={gene_data.mean():.0f} aa, min={gene_data.min()}, max={gene_data.max()}")

# Show failures (first 10 only, to keep the cell output readable)
if len(failed_df) > 0:
    print(f"\nFailed extractions ({len(failed_df)}):")
    print(failed_df[['species', 'gene', 'status']].head(10).to_string(index=False))
    if len(failed_df) > 10:
        print(f"... and {len(failed_df) - 10} more")

print("\n" + "="*80)

# Save results to CSV (one row per species/gene combination)
results_csv = BATCH_RESULTS_DIR / "extraction_summary.csv"
df_results.to_csv(results_csv, index=False)
print(f"\nResults saved to: {results_csv}")

## Combine Proteins by Gene

Create one combined FASTA file per gene, containing that gene's protein from every species.

In [None]:
print("Combining proteins by gene...\n")

for gene in ['PsiD', 'PsiK', 'PsiM', 'PsiH']:
    gene_dir = BATCH_RESULTS_DIR / gene
    combined_file = BATCH_RESULTS_DIR / f"{gene}_all_species.faa"

    # extract_protein writes "<species>_<gene>.prot.fa", so match that exact
    # suffix (the previous "*_prot.fa" pattern never matched any file).
    # Every protein file present in the folder is picked up, including ones
    # that were flagged with QC warnings.
    suffix = f"_{gene}.prot.fa"
    protein_files = sorted(gene_dir.glob(f"*{suffix}"))

    combined_records = []
    skipped_files = []
    for prot_file in protein_files:
        try:
            # Only the first record of each file is used
            with open(prot_file) as handle:
                record = next(SeqIO.parse(handle, "fasta"), None)
        except (OSError, ValueError) as exc:
            # Unreadable or undecodable file: skip it, but say so
            skipped_files.append(f"{prot_file.name} ({exc})")
            continue

        if record is None:
            skipped_files.append(f"{prot_file.name} (no sequences)")
            continue

        # Update ID to include species name (file name minus the gene suffix)
        species_name = prot_file.name[:-len(suffix)]
        record.id = f"{species_name}|{gene}"
        record.description = f"{gene} [{species_name}]"
        combined_records.append(record)

    # Write combined file
    SeqIO.write(combined_records, combined_file, "fasta")
    print(f"✓ {gene}: {len(combined_records)} sequences → {combined_file.name}")
    for skipped in skipped_files:
        print(f"    ⚠️ skipped {skipped}")

print("\n" + "="*80)
print("BATCH PROCESSING COMPLETE!")
print("="*80)
print(f"\nResults location: {BATCH_RESULTS_DIR}")
print("\nNext steps:")
print("  1. Review extraction_summary.csv for success rates")
print("  2. Use *_all_species.faa files for multiple sequence alignment (MAFFT)")
print("  3. Build phylogenetic trees (IQ-TREE)")
print("  4. Perform ancestral sequence reconstruction (ASR)")