In [1]:
from scripts.data_genertion.consts import *
from scripts.features.feature_extraction import load_all_features

main_df = load_all_features()
main_df[SEQUENCE] = main_df[SEQUENCE].astype(str)

In [3]:
from asodesigner.read_human_genome import get_locus_to_data_dict
import pickle
from asodesigner.consts import CACHE_DIR

genes_u = ['HIF1A', 'APOL1', 'YAP1', 'SOD1', 'SNCA', 'IRF4', 'KRAS', 'KLKB1', 'SNHG14', 'DGAT2', 'IRF5', 'HTRA1',
           'MYH7', 'MALAT1', 'HSD17B13']
cache_path = CACHE_DIR / 'gene_to_data_simple_cache.pickle'
if not cache_path.exists():
    gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)
    with open(cache_path, 'wb') as f:
        pickle.dump(gene_to_data, f)
else:
    with open(cache_path, 'rb') as f:
        gene_to_data = pickle.load(f)

In [4]:
from scripts.data_genertion.data_handling import get_populated_df_with_structure_features

main_df = get_populated_df_with_structure_features(main_df, genes_u, gene_to_data)

In [5]:
main_df[[SENSE_TYPE, 'sense_intron', 'sense_exon', 'sense_utr', SENSE_START, CANONICAL_GENE]]

Unnamed: 0,sense_type,sense_intron,sense_exon,sense_utr,sense_start,Canonical Gene Name
0,intron,1,0,0,41212,KRAS
1,intron,1,0,0,23686,KRAS
2,intron,1,0,0,43363,KRAS
3,intron,1,0,0,23680,KRAS
4,intron,1,0,0,41168,KRAS
...,...,...,...,...,...,...
34760,intron,1,0,0,7827,APOL1
34761,intron,1,0,0,8250,APOL1
34762,intron,1,0,0,8335,APOL1
34763,utr,0,1,1,13848,APOL1


In [6]:
main_df.loc[:, 'mrna_length'] = [len(gene_to_data[gene].full_mrna) for gene in main_df[CANONICAL_GENE]]
main_df['normalized_sense_start_from_end'] = main_df['sense_start_from_end'] / main_df['mrna_length']

In [7]:
import numpy as np
import pandas as pd

first_filtered = main_df.copy()

first_filtered = first_filtered[first_filtered['Cell line organism'] == 'human']
# filtered = filtered[filtered[CANONICAL_GENE] == 'MALAT1']
first_filtered = first_filtered.dropna(subset=[INHIBITION]).copy()
# filtered = filtered.dropna(subset=[DENSITY_UPDATED]).copy()
log_correction = 1.01
first_filtered.loc[:, 'log_inhibition'] = -np.log(log_correction - first_filtered[INHIBITION] / 100)
first_filtered = first_filtered[~first_filtered[CELL_LINE].isin([
    'Hela',  # scanning modifications
    'Human Neuronal Cell',  # scanning modifications;
    'CC-2580',  # scanning modifications
    'SH-SY5Y'  # non pure PS based
])]
# first_filtered = first_filtered[~((first_filtered[CELL_LINE] == 'A431')
#                                   & (first_filtered[CANONICAL_GENE] == 'SOD1'))] # (Kind of) Scanning modifications

mean_metric = 'log_inhibition'
# mean_metric = INHIBITION

# Ensure you remove only the columns you don't want to group by
# cols_except_inhibition = [c for c in first_filtered.columns
#                           if c not in [INHIBITION, mean_metric, 'index']]

# Group by all remaining columns and average log_inhibition
collapsed = (
    first_filtered
    .groupby(['ISIS', VOLUME, TREATMENT_PERIOD, DENSITY_UPDATED, CHEMICAL_PATTERN], as_index=False)[mean_metric]
    .mean()
)

# first_filtered['mean_inhibition'] = first_filtered.groupby('ISIS')[mean_metric].transform('mean')
# first_filtered = first_filtered.drop_duplicates(subset='ISIS').assign(**{mean_metric: first_filtered['mean_inhibition']}).drop(
#     columns='mean_inhibition')

In [8]:
from sklearn.feature_selection import mutual_info_regression, f_regression

# filtered = filtered[filtered['Linkage'] == 'phosphorothioate']

moe_pattern = 'MMMMMddddddddddMMMMM'
# filtered = filtered[filtered['Chemical_Pattern'] == moe_pattern].copy()
# filtered = filtered[filtered[VOLUME] == 2500].copy()
filtered = first_filtered.copy()

filtered['log_volume'] = np.log(filtered[VOLUME])
filtered['log_treatment'] = np.log(filtered[TREATMENT_PERIOD])
li = filtered['log_inhibition']

# filtered['correct_log_inhibition'] = filtered['log_inhibition'] / (filtered['log_volume'])
filtered['correct_log_inhibition'] = li / (filtered[VOLUME] / (filtered[VOLUME] + 10))
filtered['correct_log_inhibition2'] = li / (filtered[VOLUME] * 0.0000601 + 0.537)
# filtered['correct_log_inhibition'] = li



filtered_original = filtered

ALL_CELL_LINES = filtered_original[CELL_LINE].unique()
non_hepa_cancer = filtered_original[
    # (~((filtered_original[CELL_LINE] == 'HepG2') & (filtered_original[TRANSFECTION].str.contains('Lipo')))) &
    (filtered_original[CELL_LINE] != 'HepG2')  # experiment settings too different
    & (filtered_original[CELL_LINE] != 'HepaRG')  # not similar to cancer
    & (filtered_original[CELL_LINE] != 'A-431')
    & (filtered_original[CELL_LINE] != 'SNU-449')
    ]

metric = 'correct_log_inhibition2'

features = (
    # [TREATMENT_PERIOD] +
    [
        col
        for col in non_hepa_cancer.select_dtypes(include=["number"]).columns
        if col not in [INHIBITION, "log_inhibition", "correct_log_inhibition", "correct_log_inhibition2", "ISIS", 'log_volume']
    ]
)

X = non_hepa_cancer[features]
y = non_hepa_cancer[metric]


In [16]:
cols = [c for c in X.columns if "off_target" in c]
X[cols] = X[cols].fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = X[cols].fillna(0)


In [17]:
# drop rows with NaNs to avoid MI errors
Xc = X.loc[:, X.notna().all(axis=0)]
yc = y

mi = mutual_info_regression(Xc, yc, random_state=42)

mi_df = (pd.Series(mi, index=Xc.columns, name="score")
           .sort_values(ascending=False)
           .reset_index()
           .rename(columns={"index": "feature"}))

In [52]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_regression

def select_mi_spearman_decorrelated(Xc, yc, top_k=80, target_n=40, random_state=42):
    """
    Iteratively:
      1) Compute MI on remaining features
      2) Keep top_k by MI
      3) Pick the feature with the lowest max |Spearman| corr
         (vs already-selected; if none selected yet, vs others in top_k)
      4) Remove it and repeat until target_n are selected
    Returns (selected_list, selection_df)
    """
    assert target_n <= Xc.shape[1]
    remaining = Xc.columns.tolist()
    selected = []
    records = []

    for step in range(target_n):
        # MI on remaining
        mi = mutual_info_regression(Xc[remaining], yc, random_state=random_state)
        mi_s = pd.Series(mi, index=remaining)

        # top_k candidates by MI
        candidates = mi_s.nlargest(min(top_k, len(remaining))).index.tolist()

        # Spearman block over (candidates + selected)
        cols = candidates + selected
        corr = Xc[cols].corr(method="spearman").abs()

        if selected:
            # max corr of each candidate vs the selected set
            max_corr_to_sel = corr.loc[candidates, selected].max(axis=1).fillna(0.0)
        else:
            # no selected yet: use within-candidates redundancy
            cc = corr.loc[candidates, candidates].copy()
            np.fill_diagonal(cc.values, 0.0)
            max_corr_to_sel = cc.max(axis=1)

        # choose candidate with minimal max corr; tie-break by higher MI
        mc_min = max_corr_to_sel.min()
        tie = [c for c in candidates if max_corr_to_sel[c] == mc_min]
        best = pd.Series({c: mi_s[c] for c in tie}).idxmax()

        print("Selected feature: ", best)
        selected.append(best)
        remaining.remove(best)
        records.append({
            "rank": step+1,
            "feature": best,
            "max_spearman_to_selected": float(mc_min),
            "mi_at_pick": float(mi_s[best])
        })

    return selected, pd.DataFrame(records)

# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xc, yc, top_k=80, target_n=40)
print(selected_30)
print(summary)


Selected feature:  self_energy
Selected feature:  Modification_skew_index
Selected feature:  premRNA_gc_skew_50
Selected feature:  premRNA_at_skew_50
Selected feature:  premRNA_homooligo_count_70
Selected feature:  Modification_evenness
Selected feature:  normalized_start
Selected feature:  dsm_su95_rev_woGU_pos1386t37Falseon_target_energy_max600
Selected feature:  RNaseH1_Krel_score_R7_krel
Selected feature:  Density(cells_per_well)
Selected feature:  ASO_volume(nM)
Selected feature:  premRNA_stop_codon_count_70
Selected feature:  Treatment_Period(hours)
Selected feature:  ENC_score_global_CDS
Selected feature:  premRNA_tandem_repeats_score_70
Selected feature:  RNaseH1_score_R4b
Selected feature:  premRNA_sequence_entropy_50
Selected feature:  Modification_adjacent_pair_count
Selected feature:  min_mfe_45
Selected feature:  mod_scan
Selected feature:  off_target.MS.top200.cutoff1000.premRNA
Selected feature:  Modification_symmetry_score
Selected feature:  Modification_in_core
Selecte

In [59]:
Xmmr = filtered_original[filtered_original[CELL_LINE] == 'MM.1R'][features]
Xmmr = Xmmr.loc[:, Xmmr.notna().all(axis=0)]

yMMR = filtered_original[filtered_original[CELL_LINE] == 'MM.1R'][INHIBITION]

mi = mutual_info_regression(Xmmr, yMMR, random_state=42)

mi_df = (pd.Series(mi, index=Xmmr.columns, name="score")
           .sort_values(ascending=False)
           .reset_index()
           .rename(columns={"index": "feature"}))

In [60]:
# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xmmr, yMMR, top_k=80, target_n=40)
print(selected_30)
print(summary)

Selected feature:  ASO_volume(nM)
Selected feature:  premRNA_gc_skew_70
Selected feature:  RNaseH1_Krel_score_R4a_krel
Selected feature:  RNaseH1_score_R4b
Selected feature:  premRNA_dispersed_repeats_score_40
Selected feature:  RNaseH1_Krel_dinucleotide_score_R7_krel_dinuc
Selected feature:  premRNA_flexible_dinucleotide_fraction_50
Selected feature:  premRNA_at_skew_40
Selected feature:  Modification_pos_std
Selected feature:  dsm_su95_rev_woGU_pos1386t37Trueon_target_energy_max600
Selected feature:  self_energy
Selected feature:  premRNA_homooligo_count_50
Selected feature:  ASO_ENC
Selected feature:  normalized_start
Selected feature:  index.1
Selected feature:  at_skew
Selected feature:  RNaseH1_Krel_score_R7_krel
Selected feature:  premRNA_sequence_entropy_60
Selected feature:  sense_avg_accessibility
Selected feature:  premRNA_stop_codon_count_60
Selected feature:  ASO_gc_skew
Selected feature:  premRNA_tandem_repeats_score_70
Selected feature:  premRNA_purine_content_30
Selecte

In [55]:
Xmmr = filtered_original[filtered_original[CELL_LINE] == 'A431'][features]
Xmmr = Xmmr.loc[:, Xmmr.notna().all(axis=0)]

yMMR = filtered_original[filtered_original[CELL_LINE] == 'A431'][INHIBITION]

mi = mutual_info_regression(Xmmr, yMMR, random_state=42)

mi_df = (pd.Series(mi, index=Xmmr.columns, name="score")
           .sort_values(ascending=False)
           .reset_index()
           .rename(columns={"index": "feature"}))

In [56]:
# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xmmr, yMMR, top_k=80, target_n=40)
print(selected_30)
print(summary)

Selected feature:  self_energy
Selected feature:  Modification_3prime_run
Selected feature:  Modification_symmetry_score
Selected feature:  premRNA_homooligo_count_70
Selected feature:  exp_ps_hybr_norm
Selected feature:  premRNA_at_skew_60
Selected feature:  RNaseH1_Krel_dinucleotide_score_R4b_krel_dinuc
Selected feature:  normalized_sense_start_from_end
Selected feature:  RNaseH1_score_R7
Selected feature:  premRNA_gc_skew_30
Selected feature:  premRNA_flexible_dinucleotide_fraction_70
Selected feature:  Modification_adjacent_pair_count
Selected feature:  premRNA_dinucleotide_entropy_20
Selected feature:  premRNA_tandem_repeats_score_60
Selected feature:  on_target_fold_openness40_15
Selected feature:  index.1
Selected feature:  off_target.top100.cutoff600.premRNA_TPM
Selected feature:  ASO_volume(nM)
Selected feature:  Modification_evenness
Selected feature:  premRNA_entropy_70
Selected feature:  premRNA_4 palindromic_70
Selected feature:  premRNA_at_skew_20
Selected feature:  off_t

In [57]:
Xmmr = filtered_original[filtered_original[CELL_LINE] == 'SK-MEL-28'][features]
Xmmr = Xmmr.loc[:, Xmmr.notna().all(axis=0)]

yMMR = filtered_original[filtered_original[CELL_LINE] == 'SK-MEL-28'][INHIBITION]

mi = mutual_info_regression(Xmmr, yMMR, random_state=42)

mi_df = (pd.Series(mi, index=Xmmr.columns, name="score")
           .sort_values(ascending=False)
           .reset_index()
           .rename(columns={"index": "feature"}))

In [58]:
# usage:
selected_30, summary = select_mi_spearman_decorrelated(Xmmr, yMMR, top_k=80, target_n=40)
print(selected_30)
print(summary)

Selected feature:  ASO_volume(nM)
Selected feature:  premRNA_6 palindromic_70
Selected feature:  ASO_dinucleotide_entropy
Selected feature:  premRNA_gc_content_3prime_end_30
Selected feature:  sense_exon
Selected feature:  premRNA_gc_skew_40
Selected feature:  premRNA_sequence_entropy_40
Selected feature:  internal_fold
Selected feature:  premRNA_at_skew_70
Selected feature:  Modification_max_block_length
Selected feature:  RNaseH1_score_R7
Selected feature:  premRNA_cg_dinucleotide_fraction_20
Selected feature:  log_treatment
Selected feature:  premRNA_homooligo_count_70
Selected feature:  sense_avg_accessibility
Selected feature:  premRNA_ggg_counts_70
Selected feature:  index.1
Selected feature:  ASO_gc_skew_ends
Selected feature:  ASO_toxic_motif_count
Selected feature:  RNaseH1_motif_presence_GCGC
Selected feature:  premRNA_tandem_repeats_score_60
Selected feature:  premRNA_dispersed_repeats_score_20
Selected feature:  premRNA_stop_codon_count_40
Selected feature:  at_skew
Selecte