In [1]:
import pandas as pd
from notebooks.consts import *

In [2]:
from notebooks.preprocessing import preprocess_aso_data, get_unique_genes

# Load and preprocess the dataset.
# UPDATED_CSV is not defined in this notebook; it is presumably provided by the
# star import from notebooks.consts in the first cell — confirm there.
# The cell output reports how many valid rows remain after preprocessing.
all_data = preprocess_aso_data(UPDATED_CSV)

Preprocessing complete. Final valid rows: 29987


In [3]:
from tauso.hybridization.populate import populate_hybridization, HYBR_FEATURE_TO_CALCULATION

# Run the hybridization feature calculations over all_data and keep the returned
# frame under the same name. The log below lists each feature as it is calculated.
# n_cores=None presumably lets the library pick the worker count (the logged run
# used 32 workers) — TODO confirm against populate_hybridization.
all_data = populate_hybridization(all_data, n_cores=None)

INFO: Pandarallel will run on 32 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
Calculating feature: MOE_DIFF_37_MD_GB_HYBR...
Calculating feature: MOE_DIFF_37_MD_PB_HYBR...
Calculating feature: PSDNA_RNA_MD_37_GB_TOTAL_HYBR...
Calculating feature: PSDNA_RNA_MD_37_PB_TOTAL_HYBR...
Calculating feature: METHYL_CYTOSINES...
Calculating feature: LNA_DIFF_37_HYBR...
Calculating feature: CET_DIFF_37_HYBR...
Calculating feature: TOTAL_PSDNA_HYBR...
Calculating feature: PSDNA_DIFF_37_HYBR...
Calculating feature: TOTAL_DNA_HYBR...
Calculating feature: TOTAL_DNA_RNA_HYBR...
Calculating vectorized feature: DNA_HYBR_DIFF...


In [7]:
from notebooks.features.feature_extraction import save_feature

# Call save_feature once for every entry yielded by iterating
# HYBR_FEATURE_TO_CALCULATION (if that object is a mapping, iteration yields its
# keys — presumably the feature names; confirm against its definition).
features = HYBR_FEATURE_TO_CALCULATION

# Judging by the cell output, save_feature leaves an existing file untouched when
# the stored values match within tolerance.
for feature in features:
    save_feature(all_data, feature)

File exists for 'MOE_DIFF_37_MD_GB_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'MOE_DIFF_37_MD_PB_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'PSDNA_RNA_MD_37_GB_TOTAL_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'PSDNA_RNA_MD_37_PB_TOTAL_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'METHYL_CYTOSINES' but values are identical (within tolerance). No action taken.
File exists for 'LNA_DIFF_37_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'CET_DIFF_37_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'TOTAL_PSDNA_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'PSDNA_DIFF_37_HYBR' but values are identical (within tolerance). No action taken.
File exists for 'TOTAL_DNA_HYBR' but values are identical (within tolerance). No action

In [None]:
################################################
#### Analysis ##################################
################################################

In [None]:
# Chemistries (values of the MODIFICATION column) to keep for the analysis below.
# target_modifications = ['MOE/(S)-cEt/5-methylcytosines/deoxy']
target_modifications = ['LNA/deoxy']

# Keep only the rows whose MODIFICATION is one of the targets; .copy() detaches
# the result from all_data so later edits do not touch the source frame.
all_data_human_gene_filtered = all_data.loc[
    lambda frame: frame[MODIFICATION].isin(target_modifications)
].copy()
# all_data_human_gene_filtered = all_data_human_gene[all_data_human_gene[CHEMICAL_PATTERN] != 'CCCddddddddddCCC']
# all_data_human_gene_filtered = all_data_human_gene_filtered[all_data_human_gene_filtered[VOLUME] == 5]

In [None]:
# all_data_human_gene_filtered = all_data_human_gene[all_data_human_gene[MODIFICATION] == 'LNA/deoxy']
# # all_data_human_gene_filtered = all_data_human_gene[all_data_human_gene[CHEMICAL_PATTERN] != '']
# all_data_human_gene_filtered = all_data_human_gene_filtered[all_data_human_gene_filtered[VOLUME] == 7500]


In [None]:
from notebooks.utils.print import calc_mutual_information, print_correlations

# Features whose mutual information with the target is reported, in print order.
mi_features = [
    'METHYL_CYTOSINES',
    'LNA_DIFF_37_HYBR',
    'TOTAL_DNA_RNA_HYBR',
    'PSDNA_DIFF_37_HYBR',
    'gc_content',
    'at_skew',
]
mi_target = 'log_inhibition'

for mi_feature in mi_features:
    # 'gc_content' and 'at_skew' are requested from populate_features in a later
    # cell (on all_data_human_gene), so they may be absent from this frame on a
    # fresh top-to-bottom run. Report the gap instead of failing mid-cell.
    if mi_feature not in all_data_human_gene_filtered.columns:
        print(f"Skipping '{mi_feature}': column not found in all_data_human_gene_filtered")
        continue

    mi_result = calc_mutual_information(all_data_human_gene_filtered, mi_feature, mi_target)
    # Label each score with its feature; otherwise the six lines are indistinguishable.
    print(f"New MI Score [{mi_feature}]: {mi_result:.4f}")

In [None]:
# Debugging aid: show which columns are currently present on all_data.
print(all_data.columns)

In [None]:
from tauso.genome.read_human_genome import get_locus_to_data_dict
from tauso.new_model.data_handling import get_populate_fold

# Inputs for the fold computation: whatever get_unique_genes returns for all_data,
# and the dictionary returned by get_locus_to_data_dict().
genes_u = get_unique_genes(all_data)
gene_to_data = get_locus_to_data_dict()

# A single fold variant, (40, 15). The model cell further down uses feature names
# ending in "40_15", which presumably correspond to this pair — TODO confirm what
# the two numbers mean against get_populate_fold.
fold_variants = [(40, 15)]
all_data_human_gene = get_populate_fold(all_data, genes_u, gene_to_data, fold_variants=fold_variants)

In [None]:
from tauso.new_model.data_handling import populate_features

# Sequence-derived features (GC content, skews, etc.) requested from populate_features.
easy_to_populate = [
    'at_skew', 'gc_content', 'gc_content_3_prime_5', 'gc_skew', 'hairpin_score',
    'homooligo_count', 'internal_fold', 'nucleotide_diversity', 'self_energy',
    'stop_codon_count', 'at_rich_region_score', 'poly_pyrimidine_stretch'
]
# The return value is discarded, so this presumably adds the columns to
# all_data_human_gene in place — TODO confirm against populate_features.
populate_features(all_data_human_gene, easy_to_populate)

In [None]:
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.inspection import permutation_importance


# --- 1. Setup & Helper Functions ---
# Assumes 'all_data_human_gene' has already been built by the cells above.
# NOTE: no calculate_lna / calculate_cet style helpers are defined in this cell; only the robustness helper below.

def analyze_model_robustness(df, feature_cols, target_col, n_splits=5):
    """
    Cross-validate a random forest on the given features and report permutation importances.

    Steps:
      1. Keep only ``feature_cols`` + ``target_col`` and drop rows with a NaN in any of them.
      2. Drop feature columns that are constant across the remaining rows.
      3. Score a RandomForestRegressor with shuffled K-fold cross-validation (R^2).
      4. Refit on all remaining rows and compute permutation importance on those same rows.

    Args:
        df: DataFrame containing every column named in ``feature_cols`` and ``target_col``.
        feature_cols: Sequence (list, tuple, Index, ...) of feature column names.
        target_col: Name of the regression target column.
        n_splits: Number of K-fold splits used for cross-validation.

    Returns:
        ``(fitted_model, valid_features)`` on success. ``None`` (after printing an
        error) when no rows survive the NaN filter or no non-constant feature remains.
    """
    # Accept any sequence of names; `tuple + list` or `Index + list` would misbehave below.
    feature_cols = list(feature_cols)

    # 1. Clean Data
    # Drop rows with NaNs in features or target
    data = df[feature_cols + [target_col]].dropna()

    # Without this guard, `X.iloc[0]` below raises IndexError on an empty frame.
    if data.empty:
        print("ERROR: No rows left after dropping NaNs in features/target.")
        return None

    # 2. Drop Constant Columns (Safety check)
    # A column equal to its first row everywhere (e.g. Volume always 7500) carries
    # no information, so it is removed before modelling.
    X = data[feature_cols]
    X = X.loc[:, (X != X.iloc[0]).any()]
    valid_features = X.columns.tolist()

    if not valid_features:
        print("ERROR: No valid features left (all were constant or empty).")
        return None

    y = data[target_col]

    print(f"\n--- Analyzing Model with {len(data)} samples ---")
    print(f"Features used: {valid_features}")

    # 3. Initialize Model
    # max_depth=None leaves tree depth unrestricted; pass a finite max_depth here
    # if the trees should be regularized against memorizing noise.
    rf = RandomForestRegressor(n_estimators=100, max_depth=None, random_state=42, n_jobs=-1)
    # rf = LinearRegression()

    # 4. K-Fold Cross Validation (The Robustness Check)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = cross_val_score(rf, X, y, cv=kf, scoring='r2')

    print(f"\n> Cross-Validation R^2 (Avg of {n_splits} runs): {np.mean(cv_scores):.3f}")
    print(f"> CV Score Std Dev: {np.std(cv_scores):.3f} (Lower is more stable)")

    # 5. Permutation Importance (The "Real" Drivers)
    # The model is fit on all cleaned rows and the importances are computed on
    # those same rows, so they describe what the fitted model relies on rather
    # than held-out performance.
    rf.fit(X, y)
    result = permutation_importance(rf, X, y, n_repeats=10, random_state=42, n_jobs=-1)

    importances = pd.DataFrame({
        'Feature': valid_features,
        'Importance_Mean': result.importances_mean,
        'Importance_Std': result.importances_std
    }).sort_values('Importance_Mean', ascending=False)

    print("\n> Permutation Feature Importance:")
    print(importances)

    return rf, valid_features


# --- Usage Example ---

# 1. Filter Data
# Start from the full fold-annotated frame. To restrict the analysis, uncomment
# one of the alternatives below.
filtered_df = all_data_human_gene.copy()
# filtered_df = all_data_human_gene[all_data_human_gene[MODIFICATION].isin(['cEt/5-methylcytosines/deoxy', 'LNA/deoxy', 'MOE/5-methylcytosines/deoxy'])].copy()
# filtered_df = all_data_human_gene[all_data_human_gene[MODIFICATION].isin(['MOE/5-methylcytosines/deoxy'])].copy()
# filtered_df = filtered_df[filtered_df[CELL_LINE].isin(['A431'])].copy()

# filtered_df = all_data_human_gene[all_data_human_gene[MODIFICATION] == 'MOE/5-methylcytosines/deoxy'].copy()
# filtered_df = filtered_df[filtered_df[CHEMICAL_PATTERN] != 'CCCddddddddddCCC'].copy()

# filtered_df = all_data_human_gene[all_data_human_gene[MODIFICATION] == 'LNA/deoxy'].copy()
# filtered_df = filtered_df[filtered_df[CELL_LINE].isin(['CC-2580'])].copy()

# filtered_df = filtered_df[filtered_df[CHEMICAL_PATTERN] != 'MMMMMddddddddddMMMMM'].copy()


# fold_features is defined for convenience but is not part of any active feature set below.
fold_features = ['on_target_fold_openness_normalized40_15', 'on_target_fold_openness40_15']
general_features = ['sense_start', 'sense_start_from_end', 'sense_exon', 'sense_utr', 'sense_intron']

# 2. Define Feature Sets to Compare
# "Monolith4" adds the hybridization features on top of the "Dumb" baseline
# (experimental settings, length and positional features only).
feature_sets = {
    # "Monolith": [VOLUME, TREATMENT_PERIOD, 'PSDNA_DIFF_37_HYBR', 'LNA_DIFF_37_HYBR', 'METHYL_CYTOSINES', 'gc_content',
    #              'at_skew', 'true_length_of_seq'],
    # "Monolith2": [VOLUME, TREATMENT_PERIOD, 'LNA_DIFF_37_HYBR', 'METHYL_CYTOSINES', 'gc_content',
    #              'at_skew', 'true_length_of_seq'],
    # "Monolith3": [VOLUME, TREATMENT_PERIOD, 'PSDNA_DIFF_37_HYBR', 'METHYL_CYTOSINES', 'gc_content',
    #              'at_skew', 'true_length_of_seq'],
    "Monolith4": [VOLUME, TREATMENT_PERIOD, 'TOTAL_DNA_RNA_HYBR', 'DNA_HYBR_DIFF', 'PSDNA_DIFF_37_HYBR',
                  'LNA_DIFF_37_HYBR', 'CET_DIFF_37_HYBR', 'MOE_DIFF_37_MD_PB_HYBR',
                  # 'PSDNA_RNA_MD_37_PB_TOTAL_HYBR',
                  'METHYL_CYTOSINES', 'true_length_of_seq'] + general_features,
    "Dumb": [VOLUME, TREATMENT_PERIOD, 'true_length_of_seq'] + general_features,

    # "Decoupled Physics":   [VOLUME, 'TOTAL_DNA', 'LNA_BOOST', 'gc_content', 'true_length_of_seq'],
    # "Structure + Content": [VOLUME, 'TOTAL_DNA', 'LNA_BOOST','METHYL_CYTOSINES', 'gc_content']
}

target = 'log_inhibition'

# 3. Run Comparison
for name, feats in feature_sets.items():
    print(f"\n{'=' * 10} Testing: {name} {'=' * 10}")
    # Only columns that exist in filtered_df can be used. Report the ones that are
    # dropped so a typo or an un-run upstream cell does not silently change the
    # feature set being evaluated.
    available_feats = [f for f in feats if f in filtered_df.columns]
    missing_feats = [f for f in feats if f not in filtered_df.columns]
    if missing_feats:
        print(f"WARNING: features not found in filtered_df and skipped: {missing_feats}")
    analyze_model_robustness(filtered_df, available_feats, target)

In [None]:
from notebooks.utils.print import print_correlations

# NOTE(review): 'TOTAL_PSDNA' and 'MOE_HYBRIDIZATION' do not match any feature name
# used elsewhere in this notebook (the hybridization features logged above are
# e.g. 'TOTAL_PSDNA_HYBR' and 'MOE_DIFF_37_MD_PB_HYBR'). These may be stale column
# names — confirm they exist in filtered_df before relying on this cell.
print_correlations(filtered_df, 'TOTAL_PSDNA', 'MOE_HYBRIDIZATION')

In [None]:
# Pairwise correlation matrix (DataFrame.corr defaults) between GC content and the
# two hybridization difference features; the bare last line displays the result.
correlation = filtered_df.loc[
    :, ['gc_content', 'LNA_DIFF_37_HYBR', 'PSDNA_DIFF_37_HYBR']
].corr()
correlation