In [1]:
from xgboost import XGBRegressor

from scripts.data_genertion.consts import *
from scripts.features.feature_extraction import load_all_features

main_df = load_all_features()
main_df[SEQUENCE] = main_df[SEQUENCE].astype(str)

In [2]:
from asodesigner.read_human_genome import get_locus_to_data_dict
import pickle
from asodesigner.consts import CACHE_DIR

genes_u = ['HIF1A', 'APOL1', 'YAP1', 'SOD1', 'SNCA', 'IRF4', 'KRAS', 'KLKB1', 'SNHG14', 'DGAT2', 'IRF5', 'HTRA1',
           'MYH7', 'MALAT1', 'HSD17B13']
cache_path = CACHE_DIR / 'gene_to_data_simple_cache.pickle'
if not cache_path.exists():
    gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)
    with open(cache_path, 'wb') as f:
        pickle.dump(gene_to_data, f)
else:
    with open(cache_path, 'rb') as f:
        gene_to_data = pickle.load(f)

In [3]:
from scripts.data_genertion.data_handling import get_populated_df_with_structure_features

main_df = get_populated_df_with_structure_features(main_df, genes_u, gene_to_data)

In [4]:
main_df[[SENSE_TYPE, 'sense_intron', 'sense_exon', 'sense_utr', SENSE_START, CANONICAL_GENE]]

Unnamed: 0,sense_type,sense_intron,sense_exon,sense_utr,sense_start,Canonical Gene Name
0,intron,1,0,0,41212,KRAS
1,intron,1,0,0,23686,KRAS
2,intron,1,0,0,43363,KRAS
3,intron,1,0,0,23680,KRAS
4,intron,1,0,0,41168,KRAS
...,...,...,...,...,...,...
34760,intron,1,0,0,7827,APOL1
34761,intron,1,0,0,8250,APOL1
34762,intron,1,0,0,8335,APOL1
34763,utr,0,1,1,13848,APOL1


In [5]:
main_df.loc[:, 'mrna_length'] = [len(gene_to_data[gene].full_mrna) for gene in main_df[CANONICAL_GENE]]
main_df['normalized_sense_start_from_end'] = main_df['sense_start_from_end'] / main_df['mrna_length']

In [6]:
import numpy as np
import pandas as pd

first_filtered = main_df.copy()

first_filtered = first_filtered[first_filtered['Cell line organism'] == 'human']
# filtered = filtered[filtered[CANONICAL_GENE] == 'MALAT1']
first_filtered = first_filtered.dropna(subset=[INHIBITION]).copy()
# filtered = filtered.dropna(subset=[DENSITY_UPDATED]).copy()
log_correction = 1.01
first_filtered.loc[:, 'log_inhibition'] = -np.log(log_correction - first_filtered[INHIBITION] / 100)
first_filtered = first_filtered[~first_filtered[CELL_LINE].isin([
    'Hela',  # scanning modifications
    'Human Neuronal Cell',  # scanning modifications;
    'CC-2580',  # scanning modifications
    'SH-SY5Y'  # non pure PS based
])]
# first_filtered = first_filtered[~((first_filtered[CELL_LINE] == 'A431')
#                                   & (first_filtered[CANONICAL_GENE] == 'SOD1'))] # (Kind of) Scanning modifications

mean_metric = 'log_inhibition'
# mean_metric = INHIBITION

# Ensure you remove only the columns you don't want to group by
# cols_except_inhibition = [c for c in first_filtered.columns
#                           if c not in [INHIBITION, mean_metric, 'index']]

# Group by all remaining columns and average log_inhibition
collapsed = (
    first_filtered
    .groupby(['ISIS', VOLUME, TREATMENT_PERIOD, DENSITY_UPDATED, CHEMICAL_PATTERN], as_index=False)[mean_metric]
    .mean()
)

# first_filtered['mean_inhibition'] = first_filtered.groupby('ISIS')[mean_metric].transform('mean')
# first_filtered = first_filtered.drop_duplicates(subset='ISIS').assign(**{mean_metric: first_filtered['mean_inhibition']}).drop(
#     columns='mean_inhibition')

In [43]:
from sklearn.feature_selection import mutual_info_regression, f_regression

# filtered = filtered[filtered['Linkage'] == 'phosphorothioate']

moe_pattern = 'MMMMMddddddddddMMMMM'
# filtered = filtered[filtered['Chemical_Pattern'] == moe_pattern].copy()
# filtered = filtered[filtered[VOLUME] == 2500].copy()
filtered = first_filtered.copy()

filtered['log_volume'] = np.log(filtered[VOLUME])
filtered['log_treatment'] = np.log(filtered[TREATMENT_PERIOD])
li = filtered['log_inhibition']

# filtered['correct_log_inhibition'] = filtered['log_inhibition'] / (filtered['log_volume'])
filtered['correct_log_inhibition'] = li / (filtered[VOLUME] / (filtered[VOLUME] + 10))
filtered['correct_log_inhibition2'] = li / (filtered[VOLUME] * 0.0000601 + 0.537)
filtered['correct_log_inhibition3'] = filtered['correct_log_inhibition2'] / (-0.032*  filtered[TREATMENT_PERIOD] + 2.36)
# filtered['correct_log_inhibition'] = li



filtered_original = filtered

ALL_CELL_LINES = filtered_original[CELL_LINE].unique()
non_hepa_cancer = filtered_original[
    # (~((filtered_original[CELL_LINE] == 'HepG2') & (filtered_original[TRANSFECTION].str.contains('Lipo')))) &
    (filtered_original[CELL_LINE] != 'HepG2')  # experiment settings too different
    & (filtered_original[CELL_LINE] != 'HepaRG')  # not similar to cancer
    & (filtered_original[CELL_LINE] != 'A-431')
    & (filtered_original[CELL_LINE] != 'SNU-449')
    # & (filtered_original[CELL_LINE] != 'MM.1R')
    # & (filtered_original[CELL_LINE] != 'KARPAS-229')

    ]

metric = 'correct_log_inhibition2'

features = (
    # [TREATMENT_PERIOD] +
    [
        col
        for col in non_hepa_cancer.select_dtypes(include=["number"]).columns
        if col not in [INHIBITION, "log_inhibition", "correct_log_inhibition", "correct_log_inhibition2", "ISIS", 'log_volume', 'index', 'Transcript']
    ]
)

X = non_hepa_cancer[features]
y = non_hepa_cancer[metric]


In [8]:
cols = [c for c in X.columns if "off_target" in c]
X[cols] = X[cols].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = X[cols].fillna(0)


In [17]:
# Spearman-first version (target vs features + feature-feature Spearman)

import numpy as np
import pandas as pd

# --- prep (numeric + median impute; avoids dropping useful cols) ---
Xc = X.select_dtypes(include=[np.number]).copy()
Xc = Xc.fillna(Xc.median(numeric_only=True))
yc = pd.Series(y, index=Xc.index).astype(float)

# --- Spearman with target (like your mi_df) ---
rho = Xc.corrwith(yc, method="spearman")                       # feature ↔ target
spearman_df = (
    rho.to_frame("spearman")
       .assign(abs_spearman=lambda d: d["spearman"].abs())
       .sort_values("abs_spearman", ascending=False)
       .reset_index()
       .rename(columns={"index": "feature"})
)

# --- Spearman between features (matrix + sorted upper-triangle pairs) ---
rho_ff = Xc.corr(method="spearman").astype(float).abs()        # |feature ↔ feature|
pairs_ff = (
    rho_ff.where(np.triu(np.ones(rho_ff.shape, dtype=bool), 1))
          .stack()
          .rename("abs_spearman")
          .reset_index()
          .rename(columns={"level_0": "f1", "level_1": "f2"})
          .sort_values("abs_spearman", ascending=False)
    # .query("abs_spearman >= 0.85")  # uncomment to view highly correlated pairs
)

# spearman_df -> ranked features by |Spearman with target|
# rho_ff       -> full |Spearman| correlation matrix between features
# pairs_ff     -> tidy table of feature pairs with highest |Spearman|


  return spearmanr(a, b)[0]


In [20]:
def select_spearman_decorrelated(Xc, spearman_df, target_n=30, rho_cut=0.85):
    corr = Xc.corr(method="spearman").abs()
    selected = []
    for f in spearman_df["feature"]:
        if len(selected) >= target_n:
            break
        if not selected or corr.loc[f, selected].max() < rho_cut:
            selected.append(f)
    return selected

selected = select_spearman_decorrelated(Xc, spearman_df, target_n=50, rho_cut=0.85)

In [21]:
selected

['on_target_fold_openness40_15',
 'Modification_min_distance_to_3prime',
 'index',
 'Modification_skew_index',
 'CAI_score_global_CDS',
 'sense_avg_accessibility',
 'at_skew',
 'Modification_mean_gap',
 'Modification_3prime_run',
 'Modification_pos_std',
 'md_ps_hybr',
 'ENC_score_global_CDS',
 'premRNA_flexible_dinucleotide_fraction_20',
 'normalized_sense_start_from_end',
 'internal_fold',
 'Treatment_Period(hours)',
 'ASO_gc_block_length',
 'premRNA_gc_content_20',
 'gc_skew',
 'premRNA_at_rich_region_score_40',
 'premRNA_flexible_dinucleotide_fraction_50',
 'Modification_symmetry_score',
 'moe_hybr',
 'premRNA_gc_skew_20',
 'off_target.top200.cutoff600.premRNA_TPM',
 'premRNA_ggg_counts_40',
 'premRNA_at_skew_20',
 'premRNA_ggg_counts_70',
 'premRNA_gc_content_50',
 'ASO_hairpin_tm',
 'Modification_max_block_length',
 'mod_scan',
 'premRNA_ggg_counts_20',
 'premRNA_gc_skew_70',
 'premRNA_at_skew_40',
 'gc_content_3_prime_5',
 'ASO_at_rich_region_score',
 'premRNA_cg_dinucleotide_fr

In [33]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

def select_mi_spearman_decorrelated(Xc, yc, top_k=80, target_n=40, random_state=42):
    """
    Iteratively:
      1) Compute MI on remaining features
      2) Keep top_k by MI
      3) Pick the feature with the lowest max |Spearman| corr
         (vs already-selected; if none selected yet, vs others in top_k)
      4) Remove it and repeat until target_n are selected
    Returns (selected_list, selection_df)
    """
    assert target_n <= Xc.shape[1]
    remaining = Xc.columns.tolist()
    selected = []
    records = []

    for step in range(target_n):
        # MI on remaining
        mi = mutual_info_regression(Xc[remaining], yc, random_state=random_state)
        mi_s = pd.Series(mi, index=remaining)

        # top_k candidates by MI
        candidates = mi_s.nlargest(min(top_k, len(remaining))).index.tolist()

        # Spearman block over (candidates + selected)
        cols = candidates + selected
        corr = Xc[cols].corr(method="spearman").abs()

        if selected:
            # max corr of each candidate vs the selected set
            max_corr_to_sel = corr.loc[candidates, selected].max(axis=1).fillna(0.0)
        else:
            # no selected yet: use within-candidates redundancy
            cc = corr.loc[candidates, candidates].copy()
            np.fill_diagonal(cc.values, 0.0)
            max_corr_to_sel = cc.max(axis=1)

        # choose candidate with minimal max corr; tie-break by higher MI
        mc_min = max_corr_to_sel.min()
        tie = [c for c in candidates if max_corr_to_sel[c] == mc_min]
        best = pd.Series({c: mi_s[c] for c in tie}).idxmax()

        print("Selected feature: ", best)
        selected.append(best)
        remaining.remove(best)
        records.append({
            "rank": step+1,
            "feature": best,
            "max_spearman_to_selected": float(mc_min),
            "mi_at_pick": float(mi_s[best])
        })

    return selected, pd.DataFrame(records)

# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xc, yc, top_k=80, target_n=40)
print(selected_30)
print(summary)


ValueError: Input X contains NaN.

In [47]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import spearmanr

# --- slice & clean ---
mask_cell = (filtered_original[CELL_LINE] == 'A431') & (filtered_original[CANONICAL_GENE] == 'MALAT1')
Xmmr = filtered_original.loc[mask_cell, features].copy()
yMMR = pd.to_numeric(filtered_original.loc[mask_cell, 'correct_log_inhibition3'], errors='coerce')

# drop feature columns with any NaNs; then drop rows with NaN in y (if any)
Xmmr = Xmmr.loc[:, Xmmr.notna().all(axis=0)]
row_mask = yMMR.notna()
Xmmr = Xmmr.loc[row_mask]
y = yMMR.loc[row_mask].astype(float).values

# --- Mutual Information ---
mi_vals = mutual_info_regression(Xmmr.astype(float), y, random_state=42)

# --- Spearman rho & p-value ---
rhos, pvals = [], []
for col in Xmmr.columns:
    x = pd.to_numeric(Xmmr[col], errors='coerce').values
    rho, p = spearmanr(x, y)  # no NaNs expected after cleaning
    rhos.append(rho)
    pvals.append(p)

# --- Assemble result ---
mi_df = (pd.DataFrame({
            "feature": Xmmr.columns,
            "mi": mi_vals,
            "spearman_rho": rhos,
            "spearman_p": pvals
        })
        .sort_values("mi", ascending=False)
        .reset_index(drop=True))

pd.set_option("display.max_columns", None)     # show all columns
pd.set_option("display.width", 200)            # wider print
pd.set_option("display.max_colwidth", None)    # don’t truncate cell contents

print(mi_df.head(20))


                                      feature        mi  spearman_rho    spearman_p
0                     correct_log_inhibition3  4.611676      1.000000  0.000000e+00
1                      Location_div_by_length  0.181713      0.124697  4.704783e-12
2                                 sense_start  0.181711      0.124697  4.704783e-12
3                        Location_in_sequence  0.180527      0.124697  4.704783e-12
4                            normalized_start  0.180127      0.124697  4.704783e-12
5             normalized_sense_start_from_end  0.180126     -0.124697  4.704783e-12
6                        sense_start_from_end  0.179877     -0.124697  4.704783e-12
7                                     index.1  0.152745      0.227935  2.833219e-37
8                on_target_fold_openness40_15  0.139374      0.291598  6.606943e-61
9                               mfe_window_45  0.136296      0.303088  6.869733e-66
10    on_target_fold_openness_normalized40_15  0.132296      0.286547  8.679

  rho, p = spearmanr(x, y)  # no NaNs expected after cleaning


In [34]:
# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xmmr, yMMR, top_k=80, target_n=40)
print(selected_30)
print(summary)

Selected feature:  Treatment_Period(hours)
Selected feature:  normalized_sense_start_from_end
Selected feature:  ENC_score_global_CDS
Selected feature:  Modification_min_distance_to_3prime
Selected feature:  CAI_score_global_CDS
Selected feature:  ASO_hairpin_tm
Selected feature:  premRNA_cg_dinucleotide_fraction_70
Selected feature:  premRNA_ggg_counts_20
Selected feature:  ASO_at_rich_region_score
Selected feature:  Modification_mean_gap
Selected feature:  RNaseH1_Krel_score_R7_krel
Selected feature:  premRNA_gc_skew_70
Selected feature:  premRNA_at_skew_40
Selected feature:  ASO_gc_block_length
Selected feature:  premRNA_flexible_dinucleotide_fraction_20
Selected feature:  at_skew
Selected feature:  self_energy
Selected feature:  gc_content_3_prime_5
Selected feature:  premRNA_stop_codon_count_70
Selected feature:  sense_avg_accessibility
Selected feature:  ASO_flexible_dinucleotide_fraction
Selected feature:  gc_skew
Selected feature:  premRNA_gc_block_length_40
Selected feature:  

In [48]:
mask = (filtered_original[CELL_LINE] == 'A431') &(filtered_original[CANONICAL_GENE] == 'KRAS')
Xmmr = filtered_original[mask][features]
Xmmr = Xmmr.loc[:, Xmmr.notna().all(axis=0)]

yMMR = filtered_original[mask][INHIBITION]

mi = mutual_info_regression(Xmmr, yMMR, random_state=42)

mi_df = (pd.Series(mi, index=Xmmr.columns, name="score")
           .sort_values(ascending=False)
           .reset_index()
           .rename(columns={"index": "feature"}))

In [50]:
mi_df = mi_df.sort_values("score", ascending=False).reset_index(drop=True)
print(mi_df.head(20))

                                                     feature     score
0                                    correct_log_inhibition3  3.589216
1                                                    index.1  0.531335
2                                             ASO_volume(nM)  0.325728
3                                              log_treatment  0.182698
4                    off_target.top100.cutoff600.premRNA_TPM  0.166716
5                                    Treatment_Period(hours)  0.164405
6                                                exp_ps_hybr  0.149815
7                                           exp_ps_hybr_norm  0.147853
8                         premRNA_dispersed_repeats_score_30  0.145306
9     dsm_su95_rev_wGU_pos1384t37Trueon_target_energy_max600  0.143503
10   dsm_su95_rev_wGU_pos1382t37Falseon_target_energy_max600  0.139901
11   dsm_su95_rev_woGU_pos1384t37Trueon_target_energy_max600  0.139507
12  dsm_su95_rev_woGU_pos1382t37Falseon_target_energy_max600  0.139139
13   d

In [None]:
# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xmmr, yMMR, top_k=80, target_n=40)
print(selected_30)
print(summary)

In [None]:
Xmmr = filtered_original[filtered_original[CELL_LINE] == 'SK-MEL-28'][features]
Xmmr = Xmmr.loc[:, Xmmr.notna().all(axis=0)]

yMMR = filtered_original[filtered_original[CELL_LINE] == 'SK-MEL-28'][INHIBITION]

mi = mutual_info_regression(Xmmr, yMMR, random_state=42)

mi_df = (pd.Series(mi, index=Xmmr.columns, name="score")
           .sort_values(ascending=False)
           .reset_index()
           .rename(columns={"index": "feature"}))

In [None]:
# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xmmr, yMMR, top_k=80, target_n=40)
print(selected_30)
print(summary)

In [23]:
selected = ['on_target_fold_openness40_15',
 'Modification_min_distance_to_3prime',
 'Modification_skew_index',
 'CAI_score_global_CDS',
 'sense_avg_accessibility',
 'at_skew',
 'Modification_mean_gap',
 'Modification_3prime_run',
 'Modification_pos_std',
 'md_ps_hybr',
 'ENC_score_global_CDS',
 'premRNA_flexible_dinucleotide_fraction_20',
 'normalized_sense_start_from_end',
 'internal_fold',
 'Treatment_Period(hours)',
 'ASO_gc_block_length',
 'premRNA_gc_content_20',
 'gc_skew',
 'premRNA_at_rich_region_score_40',
 'premRNA_flexible_dinucleotide_fraction_50',
 'Modification_symmetry_score',
 'moe_hybr',
 'premRNA_gc_skew_20',
 'off_target.top200.cutoff600.premRNA_TPM',
 'premRNA_ggg_counts_40',
 'premRNA_at_skew_20',
 'premRNA_ggg_counts_70',
 'premRNA_gc_content_50',
 'ASO_hairpin_tm',
 'Modification_max_block_length',
 'premRNA_ggg_counts_20',
 'premRNA_gc_skew_70',
 'premRNA_at_skew_40',
 'gc_content_3_prime_5',
 'ASO_at_rich_region_score',
 'premRNA_cg_dinucleotide_fraction_30',
 'premRNA_stop_codon_count_30',
 'ASO_flexible_dinucleotide_fraction',
 'premRNA_gc_block_length_40',
 'premRNA_gc_block_length_70',
 'premRNA_cg_dinucleotide_fraction_70',
 'off_target_feature_50_log',
 'dsm_su95_rev_wGU_pos1384t37Falseon_target_energy_max600',
 'premRNA_stop_codon_count_70',
 'self_energy',
 'Modification_char_entropy',
 'RNaseH1_Krel_score_R7_krel',
 'sense_utr']