In [1]:
import pandas as pd

from notebooks.consts import *
from notebooks.preprocessing import preprocess_aso_data

In [2]:
# Build the working dataset by running the project's preprocessing over UPDATED_CSV.
# NOTE(review): UPDATED_CSV is presumably supplied by the star import from
# notebooks.consts — confirm, since the star import hides where it comes from.
all_data = preprocess_aso_data(UPDATED_CSV)

Preprocessing complete. Final valid rows: 29987


In [3]:
from notebooks.preprocessing import get_unique_genes

# Derive the gene collection from the preprocessed dataset. The result is passed
# to both the cached gene-data loader and the structure-feature step further down.
target_genes = get_unique_genes(all_data)

In [4]:
from notebooks.notebook_utils import read_cached_gene_to_data

# Look up per-gene data for the target genes. Judging by the helper's name this
# reads from a cache rather than recomputing — TODO confirm where the cache lives.
gene_to_data = read_cached_gene_to_data(target_genes)

In [5]:
from tauso.new_model.data_handling import get_populated_df_with_structure_features

# Rebinds `all_data` to the helper's return value, so re-running this cell feeds the
# already-populated frame back into the helper.
# NOTE(review): which columns are added is not visible from this notebook — confirm in the helper.
all_data = get_populated_df_with_structure_features(all_data, target_genes, gene_to_data)

In [6]:
from tauso.features.context.mrna_halflife import populate_mrna_halflife_features, load_halflife_mapping, HalfLifeProvider

print("--- Initializing mRNA Half-Life Oracle ---")

# Load the half-life mapping at notebook top level (not inside a function) and wrap it
# in a provider. Both `mapping` and `provider` remain in the kernel namespace afterwards.
mapping = load_halflife_mapping()
provider = HalfLifeProvider(mapping)

# Rebinds `all_data` to the frame returned by the populate step.
# NOTE(review): the feature-saving cell reads 'mRNA_HalfLife' and 'Mapped_Cell_Proxy',
# which are presumably the columns added here — confirm against the helper.
all_data = populate_mrna_halflife_features(all_data, provider)

--- Initializing mRNA Half-Life Oracle ---
Loading half-life data from /home/michael/.local/share/tauso/mrna_half_life.csv.gz...
Successfully loaded 327615 specific (Gene+Cell) stability profiles.
Oracle ready. Global Geometric Mean: 2.04h (N=327615)
Calculating stability features for 29987 rows...
Features populated successfully.


In [7]:
from notebooks.features.feature_extraction import save_feature

# Persist each half-life-related column through the shared save_feature() helper,
# in the same order as before (half-life first, then the cell-line proxy).
for feature_name in ("mRNA_HalfLife", "Mapped_Cell_Proxy"):
    save_feature(all_data, feature_name)

Merge complete. New columns added: 'mRNA_HalfLife', 'Mapped_Cell_Proxy'
File exists for 'mRNA_HalfLife' but values are identical (within tolerance). No action taken.
File exists for 'Mapped_Cell_Proxy' but values are identical (within tolerance). No action taken.
SUCCESS: Features saved via save_feature().


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr

# This cell consumes `df_halflife`, the per-(gene, cell line) lookup table built by the
# oracle-query cell of this notebook. Fail early with a clear message if it has not run.
if "df_halflife" not in globals():
    raise NameError(
        "df_halflife is not defined - run the cell that builds it from the half-life oracle first."
    )

# --- Step 1: Prepare the Data ---
# `all_data` holds the experimental rows (gene, cell line, inhibition).
# `df_halflife` holds: 'Target_Gene', 'Original_Cell_Line', 'Half_Life', 'Mapped_TTDB_Proxy'.
# The lookup must come from `df_halflife` (not `all_data`, which lacks these columns).
# Keys are de-duplicated so the inner merge can never multiply experimental rows.
df_halflife_merge = (
    df_halflife
    .rename(columns={'Original_Cell_Line': 'Cell_Line_ID'})
    [['Target_Gene', 'Cell_Line_ID', 'Half_Life', 'Mapped_TTDB_Proxy']]
    .drop_duplicates(subset=['Target_Gene', 'Cell_Line_ID'])
)

# Attach the half-life of each (gene, cell line) pair to every matching experimental row.
merged_df = pd.merge(
    all_data,
    df_halflife_merge,
    left_on=[CANONICAL_GENE, CELL_LINE],
    right_on=['Target_Gene', 'Cell_Line_ID'],
    how='inner',
)

# --- Step 2: Visualization ---
# Global geometric mean as reported by the oracle when it loaded its data
# ("Oracle ready. Global Geometric Mean: 2.04h"). Update if the source data changes.
global_avg = 2.04

fig, (ax_dist, ax_corr) = plt.subplots(1, 2, figsize=(16, 6))

# GRAPH A: Information Gain (Specific vs Global)
# Shows whether the cell-specific mapping yields values that differ from the global mean.
sns.histplot(merged_df['Half_Life'], kde=True, color='teal', label='Cell-Specific HL', ax=ax_dist)
ax_dist.axvline(global_avg, color='red', linestyle='--', label=f'Global Average ({global_avg:.1f}h)')
ax_dist.set_title("Feature Variance: Specific vs Global Average")
ax_dist.set_xlabel("mRNA Half-Life (Hours)")
ax_dist.legend()
ax_dist.grid(True, alpha=0.3)

# GRAPH B: Utility (Correlation with Inhibition)
# Shows whether transcript stability relates to measured ASO efficacy.
sns.scatterplot(
    data=merged_df,
    x='Half_Life',
    y=INHIBITION,
    hue='Mapped_TTDB_Proxy',
    alpha=0.7,
    palette='viridis',
    ax=ax_corr,
)

# Rank correlation on complete pairs only; a single NaN would otherwise turn the result into NaN.
corr_input = merged_df[['Half_Life', INHIBITION]].dropna()
corr, p_val = spearmanr(corr_input['Half_Life'], corr_input[INHIBITION])
ax_corr.set_title(f"Utility Check: HL vs Efficacy\nSpearman R: {corr:.3f} (p={p_val:.2e})")
ax_corr.set_xlabel("mRNA Half-Life (Hours)")
ax_corr.set_ylabel(INHIBITION)
ax_corr.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

In [None]:
from tauso.features.context.mrna_halflife import (
    HalfLifeProvider,
    cell_line_mapping,
    load_halflife_mapping,
)

# Upper bound applied to every returned half-life to clip artifacts
# (hours, matching the plot axis labels used elsewhere in this notebook).
MAX_HALF_LIFE_HOURS = 48.0

# --- 1. Load the half-life data and build the oracle ---
mapping = load_halflife_mapping()
oracle = HalfLifeProvider(mapping)

# --- 2. Create the cell line -> genes map dynamically ---
# Group by CELL_LINE and collect the distinct CANONICAL_GENE values per cell line.
# Series.unique() keeps first-appearance order, so the map (and the row order of
# df_halflife) is reproducible across kernel restarts, unlike list(set(...)).
cell_line_to_gene = (
    all_data.groupby(CELL_LINE)[CANONICAL_GENE]
    .apply(lambda genes: genes.unique().tolist())
    .to_dict()
)

# Print only the first few entries to verify, instead of dumping the whole map.
preview = dict(list(cell_line_to_gene.items())[:5])
print(f"Generated map for {len(cell_line_to_gene)} cell lines; first entries: {preview}")

# --- 3. Query the oracle for every (cell line, gene) pair ---
data_rows = []

for original_cell, genes in cell_line_to_gene.items():
    # Step A: Find the best TTDB proxy.
    # If not in the map, keep the original name (which will likely trigger the oracle fallback).
    proxy_cell = cell_line_mapping.get(original_cell, original_cell)

    for gene in genes:
        # Step B: Query the oracle using the PROXY name.
        hl_val, source, n, std = oracle.get_halflife(gene, proxy_cell)

        # Clip artifacts
        hl_final = min(hl_val, MAX_HALF_LIFE_HOURS)

        data_rows.append({
            "Target_Gene": gene,
            "Original_Cell_Line": original_cell,  # Label as it appears in all_data
            "Mapped_TTDB_Proxy": proxy_cell,  # Name actually used for the lookup
            "Half_Life": hl_final,
            "HL_Source": source,
            "HL_N_Support": n,
            "HL_StdDev": std
        })

# Build the frame once from the collected records (no row-by-row concatenation).
df_halflife = pd.DataFrame(data_rows)

# Display results to verify the mapping worked
print(df_halflife[['Target_Gene', 'Original_Cell_Line', 'Mapped_TTDB_Proxy', 'Half_Life', 'HL_Source']].head(50))