In [1]:
# Imports and Configuration
import pandas as pd
import numpy as np
from pathlib import Path

# Project Imports
from notebooks.consts import *
from notebooks.consts import NOTEBOOK_PATH
from notebooks.preprocessing import preprocess_aso_data, get_unique_genes
from notebooks.notebook_utils import log_correction
from tauso.genome.read_human_genome import get_locus_to_data_dict
from tauso.new_model.data_handling import get_populated_df_with_structure_features

# New Refactored Modules
from tauso.genome.TranscriptMapper import GeneCoordinateMapper, build_gene_sequence_registry
from tauso.algorithms.genomic_context_windows import add_external_mrna_and_context_columns

# Display configuration for wide frames: no wrapping of the frame repr,
# a 2000-character line width, and no cap on the number of columns shown.
pd.options.display.expand_frame_repr = False
pd.options.display.width = 2000
pd.options.display.max_columns = None

In [2]:
# Build the working ASO table from the raw CSV (SMILES columns are not requested).
# Per the original notebook note, preprocess_aso_data covers: human filtering,
# NaN removal, log correction and pre-mRNA mapping.
all_data = preprocess_aso_data(
    UPDATED_CSV,
    include_smiles=False,
)

print(f"Data ready. Shape: {all_data.shape}")

Preprocessing complete. Final valid rows: 29987
Data ready. Shape: (29987, 29)


In [3]:
# Path Constants
# Locate the annotation DB under the current user's home directory instead of a
# hard-coded "/home/<user>" prefix, so the notebook is not tied to one machine.
# str(...) keeps DB_PATH a plain string, the same type the constant had before.
DB_PATH = str(Path.home() / ".local" / "share" / "tauso" / "GRCh38.db")

# 1. Identify unique genes in our clean dataset
genes_u = get_unique_genes(all_data)

# 2. Load Gene Locus Objects (Contains raw genomic sequence/introns)
gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)

# 3. Initialize the Coordinate Mapper (Connects to GFF DB)
mapper = GeneCoordinateMapper(db_path=DB_PATH)

# 4. Build Sequence Registry (Pre-calculates Spliced CDS sequences)
# This creates a fast lookup dict: {gene: {'pre_mrna': "...", 'cds': "..."}}
registry = build_gene_sequence_registry(genes_u, gene_to_data, mapper)

print(f"Registry built for {len(registry)} genes.")

Elapsed DB:  0.0015513896942138672
Elapsed Fasta:  0.0015513896942138672
Length:  3267117988
Registry built for 15 genes.


In [4]:
# Enrich the working table with structure-derived columns (the original note gives
# secondary-structure energy as an example). The result rebinds ``all_data``.
all_data = get_populated_df_with_structure_features(
    all_data,
    genes_u,
    gene_to_data,
)

In [5]:
# Window sizes 20, 30, ..., 70 for both the pre-mRNA flanks and the CDS windows.
FLANK_SIZES_PREMRNA = list(range(20, 71, 10))
CDS_WINDOWS = list(range(20, 71, 10))

# Attach the external-mRNA / genomic-context columns for every window size.
df_with_context = add_external_mrna_and_context_columns(
    df=all_data, mapper=mapper, gene_registry=registry,
    flank_sizes_premrna=FLANK_SIZES_PREMRNA, flank_sizes_cds=CDS_WINDOWS,
)

print("Genomic context added.")

Genomic context added.


In [6]:
import random

def _collect_gene_intervals(mapper, gene):
    """Gather inclusive (start, end) spans of every CDS and exon over all transcripts of ``gene``."""
    cds_spans, exon_spans = [], []
    for transcript in mapper.db.children(gene, featuretype='transcript'):
        cds_spans.extend(
            (cds.start, cds.end)
            for cds in mapper.db.children(transcript, featuretype='CDS')
        )
        exon_spans.extend(
            (exon.start, exon.end)
            for exon in mapper.db.children(transcript, featuretype='exon')
        )
    return cds_spans, exon_spans


def _position_in_any(position, spans):
    """Return True when ``position`` lies inside at least one inclusive (start, end) span."""
    return any(start <= position <= end for start, end in spans)


def diagnose_mapping_loss(df, mapper, sample_size=100, seed=None):
    """
    Audits ASOs that failed CDS mapping to see if they are:
    1. RECOVERABLE: Map to the CDS of a *different* isoform.
    2. UTR: Map to an Exon, but not CDS (unscoreable for CAI).
    3. INTRONIC: Map to no exon at all (unscoreable for CAI).

    Only the ASO start coordinate is tested against the annotated features.

    Parameters
    ----------
    df : DataFrame with the ``CANONICAL_GENE`` column plus 'sense_start' and
        'in_coding_region'. Rows with ``sense_start != -1`` and
        ``in_coding_region == False`` are treated as failures.
    mapper : object exposing ``gene_name_map`` (gene name -> gene id) and ``db``
        (indexable by gene id, with ``children(feature, featuretype=...)``).
    sample_size : maximum number of failure rows to audit.
    seed : optional seed for reproducible sampling. ``None`` (default) draws from
        the module-level ``random`` state, as before.

    Returns
    -------
    None. The report is printed to stdout.
    """
    # 1. Filter for failures
    failures = df[
        (df['sense_start'] != -1) &
        (df['in_coding_region'] == False)
    ]

    if failures.empty:
        print("No failures to diagnose!")
        return

    n_sample = min(len(failures), sample_size)
    print(f"Diagnosing a sample of {n_sample} out of {len(failures)} unmapped ASOs...")
    print("-" * 60)

    stats = {
        'Recoverable (Isoform Mismatch)': 0,
        'UTR (Exonic but non-Coding)': 0,
        'True Intronic': 0
    }
    skipped = 0  # rows whose gene or genomic position could not be resolved

    # Sample row *positions* (not index labels) so a non-unique index cannot
    # inflate the sample when the rows are selected.
    sampler = random if seed is None else random.Random(seed)
    sample_positions = sampler.sample(range(len(failures)), n_sample)
    sample_df = failures.iloc[sample_positions]

    gene_cache = {}  # gene id -> gene feature
    span_cache = {}  # gene id -> (cds spans, exon spans)

    for _, row in sample_df.iterrows():
        gene_name = row[CANONICAL_GENE]
        # Calculate Genomic Position
        # Note: This assumes sense_start is relative to the Gene start
        try:
            gene_id = mapper.gene_name_map.get(gene_name)
            if not gene_id:
                skipped += 1
                continue
            if gene_id not in gene_cache:
                gene_cache[gene_id] = mapper.db[gene_id]
            gene = gene_cache[gene_id]

            aso_start_idx = int(row['sense_start'])
            if gene.strand == '+':
                genomic_pos = gene.start + aso_start_idx
            else:
                genomic_pos = gene.end - aso_start_idx
        except Exception:
            # Best-effort audit: an unresolvable row is counted and reported
            # below instead of aborting the whole diagnosis.
            skipped += 1
            continue

        # The feature intervals depend only on the gene, so query them once per
        # gene rather than once per sampled row.
        if gene_id not in span_cache:
            span_cache[gene_id] = _collect_gene_intervals(mapper, gene)
        cds_spans, exon_spans = span_cache[gene_id]

        # Categorize: a CDS hit in ANY transcript wins over an exon-only hit.
        if _position_in_any(genomic_pos, cds_spans):
            stats['Recoverable (Isoform Mismatch)'] += 1
        elif _position_in_any(genomic_pos, exon_spans):
            stats['UTR (Exonic but non-Coding)'] += 1
        else:
            stats['True Intronic'] += 1

    # Print Report
    total = sum(stats.values())
    print(f"{'CATEGORY':<35} | {'COUNT':<5} | {'PCT':<5}")
    print("-" * 60)
    for cat, count in stats.items():
        pct = (count / total * 100) if total > 0 else 0
        print(f"{cat:<35} | {count:<5} | {pct:.1f}%")
    print("-" * 60)

    if skipped:
        print(f"Skipped {skipped} sampled ASOs (gene lookup or position calculation failed).")

    if total == 0:
        # Nothing was classified, so neither verdict below would be meaningful.
        print("\nNo sampled ASO could be classified; no verdict available.")
        return

    if stats['Recoverable (Isoform Mismatch)'] > (total * 0.2):
        print("\n⚠️  RECOMMENDATION: Your 'Longest CDS' selection is losing significant data.")
        print("    Consider updating GeneCoordinateMapper to select the 'Principal' isoform")
        print("    or the isoform that maximizes coverage of your specific ASO list.")
    else:
        print("\n✅ VERDICT: Most dropouts are genuine Introns/UTRs.")
        print("    These regions do not have codons, so they cannot have a CAI score.")

# Run the diagnosis over every failed row.
# diagnose_mapping_loss caps sample_size at the number of failures, so passing the
# full frame length audits all of them without hard-coding the current failure count.
diagnose_mapping_loss(df_with_context, mapper, sample_size=len(df_with_context))

Diagnosing a sample of 26088 out of 26088 unmapped ASOs...
------------------------------------------------------------
CATEGORY                            | COUNT | PCT  
------------------------------------------------------------
Recoverable (Isoform Mismatch)      | 37    | 0.1%
UTR (Exonic but non-Coding)         | 11697 | 44.8%
True Intronic                       | 14354 | 55.0%
------------------------------------------------------------

✅ VERDICT: Most dropouts are genuine Introns/UTRs.
    These regions do not have codons, so they cannot have a CAI score.


In [7]:
# Filter for rows where mapping to CDS failed.
# We check: Valid start index on Pre-mRNA (sense_start != -1) BUT Not in Coding Region
# NOTE: the "intronic" name is historical. The diagnosis output recorded earlier in
# this notebook attributes a large share of these rows to exonic UTR positions.
has_premrna_start = df_with_context['sense_start'] != -1
outside_cds = df_with_context['in_coding_region'] == False
intronic_asos = df_with_context[has_premrna_start & outside_cds].copy()

# Count CDS hits directly instead of by subtraction, so rows that never mapped to
# the pre-mRNA (sense_start == -1) are not reported as "Exonic (Mapped to CDS)".
n_total = len(df_with_context)
n_cds = int((df_with_context['in_coding_region'] == True).sum())
n_other = n_total - n_cds - len(intronic_asos)

print(f"Total Rows: {n_total}")
print(f"Exonic (Mapped to CDS): {n_cds}")
print(f"Intronic (No CDS map):  {len(intronic_asos)}")
if n_other:
    # Rows in neither bucket, e.g. sense_start == -1 with in_coding_region False.
    print(f"Unmapped / unclassified: {n_other}")

if not intronic_asos.empty:
    print("\nSample Intronic ASOs:")
    cols_to_show = [CANONICAL_GENE, 'sense_start', 'in_coding_region']
    print(intronic_asos[cols_to_show].head(10))

Total Rows: 29987
Exonic (Mapped to CDS): 3899
Intronic (No CDS map):  26088

Sample Intronic ASOs:
   Canonical Gene Name  sense_start  in_coding_region
0                 KRAS        41212             False
2                 KRAS        43363             False
4                 KRAS        41168             False
5                 KRAS        41347             False
6                 KRAS        41408             False
7                 KRAS        41693             False
8                 KRAS        41695             False
9                 KRAS        41825             False
10                KRAS        41831             False
11                KRAS        41349             False
