In [1]:
# main_script_debug.py
import os
import pandas as pd

# Import necessary modules
# Make sure these point to the updated files with debug prints
from data_classes import parse_maf_file, ReferenceGenome, GeneAnnotation
from dnds_analysis import DnDsAnalysis

# --- Configuration ---
# Directory holding the three input files. Defaults to the original project
# folder; set the DNDS_DATA_DIR environment variable to run the script on
# another machine without editing it (an empty value counts as unset).
DATA_DIR = (
    os.environ.get("DNDS_DATA_DIR")
    or "/Users/xbh0403/Desktop/25SP/BENG285/project_2"
)
MAF_FILE = os.path.join(DATA_DIR, "TCGA.LUAD.mutations.txt")  # input to parse_maf_file
FASTA_FILE = os.path.join(DATA_DIR, "hg19.fa")  # input to ReferenceGenome
GTF_FILE = os.path.join(DATA_DIR, "gencode.v19.annotation.gtf")  # input to GeneAnnotation
OUTPUT_DIR = "results_directory_debug_final"  # created by the analysis step if missing
HYPERMUTATOR_THRESHOLD = 500  # passed as `threshold` to filter_hypermutators
FDR_THRESHOLD = 0.1  # passed as `fdr_threshold` to run_analysis
DEBUG_LIMIT = 5 # How many items to print in loops for debugging

# --- File Checks ---
# Verify every input up front and report ALL missing paths in one run,
# instead of stopping at the first one.
print("--- Debug: Checking File Paths ---")
missing_paths = []
for f_path in [MAF_FILE, FASTA_FILE, GTF_FILE]:
    # isfile() rather than exists(): a directory at this path would pass an
    # existence check but cannot be opened as an input file.
    if os.path.isfile(f_path):
        print(f"Debug: Found file: {f_path}")
    else:
        print(f"Debug: ERROR - File not found: {f_path}")
        missing_paths.append(f_path)
if missing_paths:
    # Stop if essential files are missing. A bare exit() would report
    # status 0 (success); raise SystemExit with a non-zero code instead.
    raise SystemExit(1)
print("--- Debug: File Paths OK ---")

# --- Step 1: Load Mutations ---
# Parse the MAF file into `mutation_data` and echo the first few records so
# the parsed fields can be checked by eye.
print("\n--- Debug: Loading MAF File ---")
try:
    mutation_data = parse_maf_file(MAF_FILE)
    print(f"Debug: Initial MAF load successful. Found {len(mutation_data.mutations)} total mutations.")
    # Inspect first few mutations
    print(f"Debug: Inspecting first {DEBUG_LIMIT} mutations:")
    for i, mut in enumerate(mutation_data.mutations[:DEBUG_LIMIT]):
        print(
            f"  - Mutation {i}: {mut} | Type: {mut.variant_type}"
            f" | Class: {mut.variant_classification}"
            f" | Ref: {mut.ref_allele} | Alt: {mut.alt_allele}"
        )
except Exception as e:
    print(f"Debug: ERROR loading MAF file: {e}")
    # Non-zero status so callers see the failure; a bare exit() reports 0
    # and is an interactive-shell helper rather than a program API.
    raise SystemExit(1) from e
print("--- Debug: MAF Loading OK ---")

# --- Step 2: Filter Hypermutators ---
# Drop hypermutator samples using HYPERMUTATOR_THRESHOLD; the call returns
# the filtered dataset together with the removed samples.
print("\n--- Debug: Filtering Hypermutators ---")
try:
    filtered_data, hypermutators = mutation_data.filter_hypermutators(threshold=HYPERMUTATOR_THRESHOLD)
    print(f"Debug: Filtering complete. Removed {len(hypermutators)} hypermutator samples.")
    print(f"Debug: Remaining mutations: {len(filtered_data.mutations)} across {len(filtered_data.samples)} samples.")
    # Check if any mutations remain
    if not filtered_data.mutations:
        print("Debug: WARNING - No mutations left after filtering hypermutators!")
    else:
        print(f"Debug: Inspecting first {DEBUG_LIMIT} mutations after filtering:")
        for i, mut in enumerate(filtered_data.mutations[:DEBUG_LIMIT]):
            print(f"  - Mutation {i}: {mut} | Patient: {mut.patient_id} | Ref: {mut.ref_allele} | Alt: {mut.alt_allele}")
except Exception as e:
    print(f"Debug: ERROR filtering hypermutators: {e}")
    # Non-zero status so callers see the failure; a bare exit() reports 0.
    raise SystemExit(1) from e
print("--- Debug: Filtering OK ---")


# --- Step 3: Initialize Reference Genome ---
# Two separate phases: (1) construct the ReferenceGenome, (2) smoke-test a
# fetch. They are kept in separate try blocks so a constructor failure is
# never handled by code that needs the (non-existent) object.
print("\n--- Debug: Initializing Reference Genome ---")

# Test coordinates (adjust if needed for hg19). Bound before any try block
# so the fetch error handler can always refer to them.
test_chrom = 'chr1' # Or just '1' depending on your FASTA
test_start = 10000
test_end = 10010

try:
    ref_genome = ReferenceGenome(FASTA_FILE)
    print(f"Debug: Reference Genome initialized using {ref_genome.method}.")
except ImportError as e:
    print(f"Debug: ERROR - Missing dependency for ReferenceGenome: {e}")
    raise SystemExit(1) from e
except Exception as e:
    # No object was created, so there is nothing to retry a fetch on.
    print(f"Debug: ERROR initializing ReferenceGenome: {e}")
    raise SystemExit(1) from e

try:
    print(f"Debug: Testing fetch: {test_chrom}:{test_start}-{test_end}")
    seq = ref_genome.fetch(test_chrom, test_start, test_end)
    if seq:
        print(f"Debug: Fetched sequence: {seq}")
    else:
        print(f"Debug: WARNING - Fetch returned None. Check chromosome name format ('chr1' vs '1') and FASTA integrity.")
except Exception as e:
    print(f"Debug: ERROR fetching test sequence from ReferenceGenome: {e}")
    # Try fetching without 'chr' prefix if the first attempt failed; this is
    # purely diagnostic, the script still stops afterwards.
    if test_chrom.startswith('chr'):
        try:
            print(f"Debug: Retrying fetch without 'chr' prefix...")
            seq_no_chr = ref_genome.fetch(test_chrom[3:], test_start, test_end)
            if seq_no_chr:
                print(f"Debug: Fetched sequence without 'chr': {seq_no_chr}")
            else:
                print(f"Debug: WARNING - Fetch without 'chr' also failed.")
        except Exception as e2:
            print(f"Debug: ERROR during retry fetch: {e2}")
    # Non-zero status so callers see the failure; a bare exit() reports 0.
    raise SystemExit(1) from e
print("--- Debug: Reference Genome OK ---")


# --- Step 4: Initialize Gene Annotation ---
# Load the GTF annotation and look up one well-known gene as a sanity check.
print("\n--- Debug: Initializing Gene Annotation ---")
try:
    gene_annotation = GeneAnnotation(GTF_FILE)
    print(f"Debug: Gene Annotation initialized. Found info for {len(gene_annotation.genes)} genes.")
    # Test getting info for a known gene (e.g., TP53)
    test_gene = 'TP53'
    print(f"Debug: Testing annotation for gene: {test_gene}")
    gene_info = gene_annotation.get_gene_info(test_gene)
    if gene_info:
        # gene_info is read through .get(), so absent keys print as None
        # (or an empty exon list) instead of raising.
        print(f"Debug: Found info for {test_gene}: Chrom={gene_info.get('chromosome')}, Strand={gene_info.get('strand')}, CodingLen={gene_info.get('coding_length')}")
        print(f"Debug: Exons (first {DEBUG_LIMIT}): {gene_info.get('exons', [])[:DEBUG_LIMIT]}")
    else:
        print(f"Debug: WARNING - Could not find annotation info for {test_gene}. Check GTF content and gene name format.")
except Exception as e:
    print(f"Debug: ERROR initializing GeneAnnotation: {e}")
    # Non-zero status so callers see the failure; a bare exit() reports 0.
    raise SystemExit(1) from e
print("--- Debug: Gene Annotation OK ---")


# --- Step 5: Initialize DnDsAnalysis ---
# Wire the filtered mutations, reference genome and gene annotation into the
# analysis object that the run step uses.
print("\n--- Debug: Initializing DnDsAnalysis ---")
try:
    dnds_analysis = DnDsAnalysis(
        mutation_dataset=filtered_data,
        reference_genome=ref_genome,
        gene_annotation=gene_annotation,
        covariates_df=None # Keep as None if not using covariates
    )
    print("Debug: DnDsAnalysis initialized successfully.")
except Exception as e:
    print(f"Debug: ERROR initializing DnDsAnalysis: {e}")
    # Non-zero status so callers see the failure; a bare exit() reports 0.
    raise SystemExit(1) from e
print("--- Debug: DnDsAnalysis Initialization OK ---")


--- Debug: Checking File Paths ---
Debug: Found file: /Users/xbh0403/Desktop/25SP/BENG285/project_2/TCGA.LUAD.mutations.txt
Debug: Found file: /Users/xbh0403/Desktop/25SP/BENG285/project_2/hg19.fa
Debug: Found file: /Users/xbh0403/Desktop/25SP/BENG285/project_2/gencode.v19.annotation.gtf
--- Debug: File Paths OK ---

--- Debug: Loading MAF File ---
Debug (ParseMAF): Creating MutationDataset and calling load_from_maf for /Users/xbh0403/Desktop/25SP/BENG285/project_2/TCGA.LUAD.mutations.txt
Debug (MAF Load): Reading MAF file: /Users/xbh0403/Desktop/25SP/BENG285/project_2/TCGA.LUAD.mutations.txt
Debug (MAF Load): Read 224373 rows from MAF.
Debug (MAF Load): Processing 224373 rows...
Debug (MAF Load): Finished processing. Added 224373 mutations.
Debug (ParseMAF): load_from_maf finished. Dataset summary: MutationDataset: 224373 mutations in 18718 genes from 512 samples
Debug: Initial MAF load successful. Found 224373 total mutations.
Debug: Inspecting first 5 mutations:
  - Mutation 0: CPN1

In [2]:
# --- Step 6: Run Analysis ---
# Run the dN/dS analysis, summarise the returned DataFrame, and always
# release the reference genome afterwards, whether the run succeeded or not.
print("\n--- Debug: Starting dN/dS Analysis Run ---")
try:
    # Create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Debug: Results will be saved to: {OUTPUT_DIR}")

    results = dnds_analysis.run_analysis(
        output_dir=OUTPUT_DIR,
        hypermutator_threshold=HYPERMUTATOR_THRESHOLD, # Note: Filtering already done, but param is kept
        fdr_threshold=FDR_THRESHOLD
    )
    print("\n--- Debug: dN/dS Analysis Run Finished ---")
    if results is not None and not results.empty:
        print(f"Debug: Analysis completed. Results DataFrame shape: {results.shape}")
        print(f"Debug: Results columns: {results.columns.tolist()}")
        print(f"Debug: First {DEBUG_LIMIT} rows of results:")
        print(results.head(DEBUG_LIMIT))
    else:
        print("Debug: WARNING - Analysis finished, but results DataFrame is empty or None.")
except Exception as e:
    print(f"\nDebug: ERROR during dnds_analysis.run_analysis: {e}")
    import traceback
    traceback.print_exc() # Print detailed traceback
    # Non-zero status so callers see the failure; a bare exit() reports 0.
    raise SystemExit(1) from e
else:
    print("\n--- Debug: Script Execution Complete ---")
finally:
    # Close the reference genome file if necessary. Done in `finally` so the
    # handle is released on the error path too, not only after a clean run.
    if ref_genome is not None and hasattr(ref_genome, 'close'):
        ref_genome.close()
        print("Debug: Closed reference genome file.")


--- Debug: Starting dN/dS Analysis Run ---
Debug: Results will be saved to: results_directory_debug_final

--- Debug (Analysis.run): Starting dN/dS analysis run ---
Debug (Analysis.run): Using provided dataset with 71477 mutations, 354 samples.

Debug (Analysis.run): Step 2 - Fitting trinucleotide substitution model...
Debug (Analysis.run): Using 71477 mutations from dataset to fit model.
Debug (TrinucModel.fit): Starting model fitting...
Debug (TrinucModel.fit): Processing 71477 mutations...
  Debug (RefGenome.Fetch): Original chrom '10' not found, using 'chr10' instead.
  --- Processed mutation 0/71477 (CPN1:10:101814119:G>C): Context='TGG', NormRef='C', NormAlt='G', NormCtx='CCA', Strand=1, SubKey='C>G'(idx=1), CtxKey='CNA'(idx=4)
  Debug (RefGenome.Fetch): Original chrom '10' not found, using 'chr10' instead.
  --- Processed mutation 1/71477 (MKI67:10:129902901:G>A): Context='TGT', NormRef='C', NormAlt='T', NormCtx='ACA', Strand=1, SubKey='C>T'(idx=2), CtxKey='ANA'(idx=0)
  Debug 