In [1]:
# main_script_debug.py
import os
import pandas as pd

# Import necessary modules
# Make sure these point to the updated files with debug prints
from data_classes import parse_maf_file, ReferenceGenome, GeneAnnotation
from dnds_analysis import DnDsAnalysis

# --- Configuration ---
# All input files live in a single data directory. It defaults to the folder
# this notebook was originally developed against, so existing runs behave
# exactly as before; set the DNDS_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
DEFAULT_DATA_DIR = "/Users/xbh0403/Desktop/25SP/BENG285/project_2"
# `or` (rather than a .get default) also covers an empty DNDS_DATA_DIR value.
DATA_DIR = os.environ.get("DNDS_DATA_DIR") or DEFAULT_DATA_DIR

MAF_FILE = os.path.join(DATA_DIR, "TCGA.LUAD.mutations.txt")     # passed to parse_maf_file
FASTA_FILE = os.path.join(DATA_DIR, "hg19.fa")                   # passed to ReferenceGenome
GTF_FILE = os.path.join(DATA_DIR, "gencode.v19.annotation.gtf")  # passed to GeneAnnotation

# Directory (relative to the working directory) that receives the results.
OUTPUT_DIR = "results_directory_debug_final_3"
# Passed as `threshold` to filter_hypermutators; per its debug output, samples
# with more than this many mutations are treated as hypermutators.
HYPERMUTATOR_THRESHOLD = 500
# Passed as `fdr_threshold` to run_analysis — presumably the false-discovery-rate
# cutoff for calling significant genes; confirm against dnds_analysis.
FDR_THRESHOLD = 0.1

# --- Load inputs and assemble the analysis object ---
# Every name bound here is a notebook-level global; the run cell later reads
# `dnds_analysis` and `ref_genome`, so this cell must be executed first.

# Parse the MAF file into a dataset object that exposes filter_hypermutators().
mutation_data = parse_maf_file(MAF_FILE)
# Returns a pair: the dataset with hypermutator samples excluded, and the
# hypermutators themselves (the latter is not used in the cells shown here).
filtered_data, hypermutators = mutation_data.filter_hypermutators(threshold=HYPERMUTATOR_THRESHOLD)
# Reference genome handle; the run cell closes it once the analysis is done.
ref_genome = ReferenceGenome(FASTA_FILE)
# Gene annotation built from the GTF file.
gene_annotation = GeneAnnotation(GTF_FILE)
# Wire the filtered mutations, genome and annotation into the analysis.
# Filtering has already happened above, so the analysis receives filtered_data
# rather than the raw mutation_data.
dnds_analysis = DnDsAnalysis(
    mutation_dataset=filtered_data,
    reference_genome=ref_genome,
    gene_annotation=gene_annotation,
    covariates_df=None # Keep as None if not using covariates
)

Debug (ParseMAF): Creating MutationDataset and calling load_from_maf for /Users/xbh0403/Desktop/25SP/BENG285/project_2/TCGA.LUAD.mutations.txt
Debug (MAF Load): Reading MAF file: /Users/xbh0403/Desktop/25SP/BENG285/project_2/TCGA.LUAD.mutations.txt
Debug (MAF Load): Read 224373 rows from MAF.
Debug (MAF Load): Processing 224373 rows...
Debug (MAF Load): Finished processing. Added 224373 mutations.
Debug (ParseMAF): load_from_maf finished. Dataset summary: MutationDataset: 224373 mutations in 18718 genes from 512 samples
Debug (FilterHyper): Checking for hypermutators (threshold > 500 mutations/sample)...
Debug (FilterHyper): Found 158 hypermutator samples: ['TCGA-05-4382', 'TCGA-05-4390', 'TCGA-05-4396', 'TCGA-05-4397', 'TCGA-05-4398']...
Debug (FilterHyper): Creating new dataset excluding hypermutators...
Debug (FilterHyper): New dataset created with 71477 mutations from 354 samples.
Debug (RefGenome): pysam loaded successfully. Found 93 contigs.

Debug (GeneAnnotation): Finished read

In [2]:
# --- Step 6: Run Analysis ---
# Runs the dN/dS analysis on the objects built in the setup cell and stores the
# return value in `results`. On success the reference genome handle is closed,
# so re-running this cell requires re-running the setup cell first.
print("\n--- Debug: Starting dN/dS Analysis Run ---")
try:
    # Create output directory if it doesn't exist
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    print(f"Debug: Results will be saved to: {OUTPUT_DIR}")

    results = dnds_analysis.run_analysis(
        output_dir=OUTPUT_DIR,
        hypermutator_threshold=HYPERMUTATOR_THRESHOLD, # Note: Filtering already done, but param is kept
        fdr_threshold=FDR_THRESHOLD
    )

except Exception as e:
    print(f"\nDebug: ERROR during dnds_analysis.run_analysis: {e}")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() shuts
    # the kernel down and throws away the loaded dataset/genome/annotation, and
    # as a plain script it would terminate with a success status. Re-raising
    # halts execution here, shows the full traceback, and keeps the session
    # (including the still-open reference genome) available for debugging.
    raise

print("\n--- Debug: Script Execution Complete ---")

# Close the reference genome file if necessary
if ref_genome and hasattr(ref_genome, 'close'):
    ref_genome.close()
    print("Debug: Closed reference genome file.")


--- Debug: Starting dN/dS Analysis Run ---
Debug: Results will be saved to: results_directory_debug_final_2

--- Debug (Analysis.run): Starting dN/dS analysis run ---
Debug (Analysis.run): Using provided dataset with 71477 mutations, 354 samples.

Debug (Analysis.run): Step 2 - Fitting trinucleotide substitution model...
Debug (Analysis.run): Using 71477 mutations from dataset to fit model.
Debug (TrinucModel.fit): Starting model fitting...
Debug (TrinucModel.fit): Processing 71477 mutations...
  Debug (RefGenome.Fetch): Original chrom '10' not found, using 'chr10' instead.
  --- Processed mutation 0/71477 (CPN1:10:101814119:G>C): Context='TGG', NormRef='C', NormAlt='G', NormCtx='CCA', Strand=1, SubKey='C>G'(idx=1), CtxKey='CNA'(idx=4)
  Debug (RefGenome.Fetch): Original chrom '10' not found, using 'chr10' instead.
  --- Processed mutation 1/71477 (MKI67:10:129902901:G>A): Context='TGT', NormRef='C', NormAlt='T', NormCtx='ACA', Strand=1, SubKey='C>T'(idx=2), CtxKey='ANA'(idx=0)
  Debu