In [2]:
import pandas as pd
from notebooks.consts import *

In [3]:
# Location of the ASO dataset, relative to the notebook root.
csv_path = NOTEBOOK_PATH.joinpath('data', 'data_asoptimizer_updated.csv')
# low_memory=False parses the file in one pass rather than in chunks.
all_data = pd.read_csv(csv_path, low_memory=False)

In [4]:
from notebooks.notebook_utils import log_correction, get_unique_human_genes

# Keep only the rows that have an INHIBITION value; the copy detaches the result
# from `all_data` so it can be modified without touching the raw frame.
all_data_no_nan = all_data.loc[all_data[INHIBITION].notna()].copy()
# Called for its effect on the frame (the return value is not used here).
# Per the original notes: adds the inhibition values on a negative log scale,
# with a correction to avoid log(0).
log_correction(all_data_no_nan)

In [5]:
# Gene list derived from the NaN-free data. Later cells use it to filter rows by
# CANONICAL_GENE and to load the per-gene data.
# NOTE(review): going by the helper's name these are the unique human genes — confirm in notebook_utils.
genes_u = get_unique_human_genes(all_data_no_nan)

In [6]:
from notebooks.notebook_utils import read_cached_gene_to_data

# Per-gene data for the genes in `genes_u`; passed to
# get_populated_df_with_structure_features in a later cell.
# NOTE(review): the helper's name suggests it reads from a cache — confirm what populates that cache.
gene_to_data = read_cached_gene_to_data(genes_u)

In [7]:
from tauso.new_model.data_handling import get_populated_df_with_structure_features

# Step 1: rows measured in a human cell line.
all_data_no_nan_human = all_data_no_nan.loc[
    all_data_no_nan[CELL_LINE_ORGANISM].eq('human')
]

# Step 2: of those, rows whose canonical gene is in `genes_u`
# (copied so the frame can be modified independently).
all_data_human_gene = all_data_no_nan_human.loc[
    all_data_no_nan_human[CANONICAL_GENE].isin(genes_u)
].copy()

# Step 3: hand the frame, the gene list and the per-gene data to the structure-feature helper.
all_data_human_gene = get_populated_df_with_structure_features(
    all_data_human_gene,
    genes_u,
    gene_to_data,
)

In [8]:
# A cell only renders the value of its LAST expression, so a bare `.columns` line
# above `.head()` is evaluated and silently discarded. Print the column names
# explicitly (as a plain list, so none are elided) and let head() be the cell output.
print(all_data_human_gene.columns.tolist())
all_data_human_gene.head()

Unnamed: 0,index,ISIS,Target_gene,Cell_line,Density(cells/well),Transfection,ASO_volume(nM),Treatment_Period(hours),Primer_probe_set,Sequence,...,mod_scan,cell_line_uniform,log_inhibition,sense_start,sense_start_from_end,sense_length,sense_exon,sense_intron,sense_utr,sense_type
0,0,540733,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,GCTAAAACAAATGCTA,...,0,A431,-4.204842,41212,4472,16,0,1,0,intron
1,1,540747,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,TATAATGGTGAATATC,...,0,A431,-4.532707,23686,21998,16,0,1,0,intron
2,2,540806,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,GCATGAAGATTTCTGG,...,1,A431,-3.637849,43363,2321,16,0,1,0,intron
3,3,651479,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,GGTGAATATCTTCAAA,...,0,A431,-4.276805,23680,22004,16,0,1,0,intron
4,4,651490,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,CACTTGTACTAGTATG,...,0,A431,-4.159039,41168,4516,16,0,1,0,intron


In [23]:
from tauso.hybridization.md_weights import get_2moe_md_diff
from tauso.features.seq_features import get_gc_content, at_skew
from tauso.hybridization.hybridization_features import get_exp_psrna_hybridization, get_exp_dna_rna_hybridization, \
    calculate_lna, calculate_dna, calculate_cet, calc_methylcytosines



# Row-wise feature builders keyed by the column they produce. Each callable takes
# one row of `all_data_human_gene` and returns that row's value. The dict order is
# the order in which the columns are added to the frame.
row_feature_builders = {
    'MOE_DIFF_GB': lambda row: get_2moe_md_diff(
        row[SEQUENCE], row[CHEMICAL_PATTERN], simul_type='gb'
    ),
    'MOE_DIFF_PB': lambda row: get_2moe_md_diff(
        row[SEQUENCE], row[CHEMICAL_PATTERN], simul_type='pb'
    ),
    'METHYL_CYTOSINES': lambda row: calc_methylcytosines(
        row[SEQUENCE], row[CHEMICAL_PATTERN], row[MODIFICATION]
    ),
    'LNA_DIFF_37_HYBR': lambda row: calculate_lna(row[SEQUENCE], row[CHEMICAL_PATTERN]),
    'at_skew': lambda row: at_skew(row[SEQUENCE]),
    'CET_DIFF_37_HYBR': lambda row: calculate_cet(row[SEQUENCE], row[CHEMICAL_PATTERN]),
    # The helper's result is divided by 1000 before being stored.
    'TOTAL_PSDNA_HYBR': lambda row: get_exp_psrna_hybridization(row[SEQUENCE]) / 1000,
    'gc_content': lambda row: get_gc_content(row[SEQUENCE]),
    'TOTAL_DNA_HYBR': lambda row: calculate_dna(row[SEQUENCE]),
    'TOTAL_DNA_RNA_HYBR': lambda row: get_exp_dna_rna_hybridization(row[SEQUENCE]),
}

# Apply every builder across the rows and store the result under its column name.
for column_name, build_value in row_feature_builders.items():
    all_data_human_gene[column_name] = all_data_human_gene.apply(build_value, axis=1)


In [24]:
from notebooks.features.feature_extraction import save_feature

# Feature columns handed to save_feature, one call per column.
# NOTE(review): 'METHYL_CYTOSINES' is computed in the feature cell but is not in
# this list — confirm whether that is intentional.
features = [
    'TOTAL_DNA_RNA_HYBR',
    'TOTAL_DNA_HYBR',
    'gc_content',
    'TOTAL_PSDNA_HYBR',
    'CET_DIFF_37_HYBR',
    'at_skew',
    'LNA_DIFF_37_HYBR',
    'MOE_DIFF_PB',
    'MOE_DIFF_GB',
]

for feature_name in features:
    save_feature(all_data_human_gene, feature_name)

In [15]:
# Modifications to keep; rows with any other MODIFICATION value are excluded.
# Alternative list that was tried:
# target_modifications = ['MOE/(S)-cEt/5-methylcytosines/deoxy']
target_modifications = ['MOE/5-methylcytosines/deoxy']

# Build the membership mask first, then select and copy the matching rows.
is_target_modification = all_data_human_gene[MODIFICATION].isin(target_modifications)
all_data_human_gene_filtered = all_data_human_gene.loc[is_target_modification].copy()
# Optional extra filters (currently disabled):
# all_data_human_gene_filtered = all_data_human_gene[all_data_human_gene[CHEMICAL_PATTERN] != 'CCCddddddddddCCC']
# all_data_human_gene_filtered = all_data_human_gene_filtered[all_data_human_gene_filtered[VOLUME] == 5]

In [16]:
# all_data_human_gene_filtered = all_data_human_gene[all_data_human_gene[MODIFICATION] == 'LNA/deoxy']
# # all_data_human_gene_filtered = all_data_human_gene[all_data_human_gene[CHEMICAL_PATTERN] != '']
# all_data_human_gene_filtered = all_data_human_gene_filtered[all_data_human_gene_filtered[VOLUME] == 7500]


In [17]:
from notebooks.utils.print import calc_mutual_information, print_correlations

# Mutual information between each candidate feature and log_inhibition, computed on
# the modification-filtered frame. Each line names its feature so the scores can be
# told apart (previously every line printed the same "New MI Score" label).
# NOTE(review): 'TOTAL_PSDNA' is not created by the feature cell shown in this notebook
# (that cell writes 'TOTAL_PSDNA_HYBR') — confirm the column exists after a kernel restart.
mi_features = [
    'METHYL_CYTOSINES',
    'MOE_DIFF_PB',
    'MOE_DIFF_GB',
    'TOTAL_PSDNA',
    'gc_content',
    'at_skew',
]

for mi_feature in mi_features:
    mi_result = calc_mutual_information(all_data_human_gene_filtered, mi_feature, 'log_inhibition')
    print(f"MI score for {mi_feature}: {mi_result:.4f}")

New MI Score: 0.0105
New MI Score: 0.0695
New MI Score: 0.0974
New MI Score: 0.1078
New MI Score: 0.0910
New MI Score: 0.0586


In [21]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.inspection import permutation_importance

# --- 1. Helper functions ---
# Assumes `all_data_human_gene` has already been built and populated with feature
# columns by the cells above (calculate_lna, calculate_cet, etc. are imported there).

def prepare_features(df):
    """
    Return a copy of `df` with the derived LNA_BOOST column added.

    LNA_BOOST is computed per row as -LNA_DIFF_37 + TOTAL_PSDNA, so both of
    those columns must be present in `df`. The input frame is not modified.
    """
    # Derived feature: negated LNA_DIFF_37 plus TOTAL_PSDNA.
    lna_boost = -df['LNA_DIFF_37'] + df['TOTAL_PSDNA']

    # assign() returns a new frame, leaving the caller's frame untouched.
    return df.assign(LNA_BOOST=lna_boost)

def analyze_model_robustness(df, feature_cols, target_col, n_splits=5):
    """
    Cross-validate a shallow random forest and report permutation feature importance.

    Rows with a NaN in any feature or in the target are dropped, and feature columns
    that are constant over the remaining rows are removed. The model is scored with
    shuffled K-fold cross-validation (R^2), then refit on all remaining rows to
    compute permutation importance. All results are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the target column.
    feature_cols : sequence of str
        Candidate feature column names.
    target_col : str
        Name of the regression target column.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    tuple or None
        ``(rf, valid_features)``: the model fitted on all rows and the list of feature
        names actually used. ``None`` (after printing an error) when the analysis
        cannot run: no rows left after dropping NaNs, no non-constant feature, or
        fewer rows than folds.
    """
    feature_cols = list(feature_cols)

    # 1. Clean Data
    # Drop rows with NaNs in features or target
    data = df[feature_cols + [target_col]].dropna()

    # Without this guard, X.iloc[0] below raises IndexError on an empty frame.
    if data.empty:
        print("ERROR: No samples left after dropping rows with missing values.")
        return

    # 2. Drop Constant Columns (Safety check)
    # If Volume is always 7500, it provides 0 info. Drop it to stop noise.
    X = data[feature_cols]
    # A column is kept only if at least one value differs from its first row.
    X = X.loc[:, (X != X.iloc[0]).any()]
    valid_features = X.columns.tolist()

    if not valid_features:
        print("ERROR: No valid features left (all were constant or empty).")
        return

    # KFold cannot split fewer samples than folds; report it instead of failing
    # deep inside cross_val_score.
    if len(data) < n_splits:
        print(f"ERROR: Need at least {n_splits} samples for {n_splits}-fold CV, got {len(data)}.")
        return

    y = data[target_col]

    print(f"\n--- Analyzing Model with {len(data)} samples ---")
    print(f"Features used: {valid_features}")

    # 3. Initialize Model
    # Lower max_depth prevents memorizing noise.
    rf = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)

    # 4. K-Fold Cross Validation (The Robustness Check)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = cross_val_score(rf, X, y, cv=kf, scoring='r2')

    print(f"\n> Cross-Validation R^2 (Avg of {n_splits} runs): {np.mean(cv_scores):.3f}")
    print(f"> CV Score Std Dev: {np.std(cv_scores):.3f} (Lower is more stable)")

    # 5. Permutation Importance (The "Real" Drivers)
    # The model is fit once on all rows and importance is measured on those same
    # rows, i.e. this is an in-sample importance, not a held-out one.
    rf.fit(X, y)
    result = permutation_importance(rf, X, y, n_repeats=10, random_state=42, n_jobs=-1)

    importances = pd.DataFrame({
        'Feature': valid_features,
        'Importance_Mean': result.importances_mean,
        'Importance_Std': result.importances_std
    }).sort_values('Importance_Mean', ascending=False)

    print("\n> Permutation Feature Importance:")
    print(importances)

    return rf, valid_features

# --- Usage Example ---

# 1. Filter Data
# Keep one modification chemistry and exclude two cell lines.
# Alternative filters that were tried:
# filtered_df = all_data_human_gene[all_data_human_gene[MODIFICATION] == 'cEt/5-methylcytosines/deoxy'].copy()
# filtered_df = filtered_df[filtered_df[CHEMICAL_PATTERN] != 'CCCddddddddddCCC'].copy()

filtered_df = all_data_human_gene[all_data_human_gene[MODIFICATION] == 'MOE/5-methylcytosines/deoxy'].copy()
filtered_df = filtered_df[~filtered_df[CELL_LINE].isin(['A431', 'SH-SY5Y'])].copy()

# filtered_df = filtered_df[filtered_df[CHEMICAL_PATTERN] != 'MMMMMddddddddddMMMMM'].copy()

# Adds the derived LNA_BOOST column.
# NOTE(review): prepare_features reads 'LNA_DIFF_37' and 'TOTAL_PSDNA', which the feature
# cell shown in this notebook does not create (it writes '*_HYBR' names) — confirm they
# exist after a kernel restart.
filtered_df = prepare_features(filtered_df)

# 2. Define Feature Sets to Compare
feature_sets = {
    "Thermodynamics Only": [VOLUME,  'TOTAL_PSDNA', 'MOE_DIFF_GB', 'METHYL_CYTOSINES', 'gc_content', 'at_skew', 'true_length_of_seq'],
    # "Decoupled Physics":   [VOLUME, 'TOTAL_DNA', 'LNA_BOOST', 'gc_content', 'true_length_of_seq'],
    # "Structure + Content": [VOLUME, 'TOTAL_DNA', 'LNA_BOOST','METHYL_CYTOSINES', 'gc_content']
}

target = 'log_inhibition'

# 3. Run Comparison
for name, feats in feature_sets.items():
    print(f"\n{'='*10} Testing: {name} {'='*10}")
    # Only columns present in the frame can be used. Report the ones that are
    # skipped so a renamed or missing feature does not silently change the model.
    missing_feats = [f for f in feats if f not in filtered_df.columns]
    if missing_feats:
        print(f"WARNING: skipping columns not found in filtered_df: {missing_feats}")
    available_feats = [f for f in feats if f in filtered_df.columns]
    analyze_model_robustness(filtered_df, available_feats, target)



--- Analyzing Model with 4149 samples ---
Features used: ['ASO_volume(nM)', 'TOTAL_PSDNA', 'MOE_DIFF_GB', 'METHYL_CYTOSINES', 'gc_content', 'at_skew', 'true_length_of_seq']

> Cross-Validation R^2 (Avg of 5 runs): 0.452
> CV Score Std Dev: 0.025 (Lower is more stable)

> Permutation Feature Importance:
              Feature  Importance_Mean  Importance_Std
0      ASO_volume(nM)         0.836584        0.019120
4          gc_content         0.071059        0.004368
2         MOE_DIFF_GB         0.051664        0.001127
1         TOTAL_PSDNA         0.043739        0.002832
3    METHYL_CYTOSINES         0.019172        0.001767
5             at_skew         0.013387        0.000790
6  true_length_of_seq         0.000593        0.000109


  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)
  return _ForkingPickler.loads(res)


In [None]:
from notebooks.utils.print import print_correlations
# Presumably reports the correlation between the two named columns of `filtered_df`
# (going by the helper's name — confirm in notebooks.utils.print).
# NOTE(review): 'MOE_HYBRIDIZATION' is not created in any cell shown here, and this
# cell has no recorded execution — confirm the column exists before relying on it.
print_correlations(filtered_df, 'TOTAL_PSDNA', 'MOE_HYBRIDIZATION')