In [1]:
import pandas as pd
from notebooks.consts import *
from tauso.file_utils import read_human_genome_fasta_dict
from tauso.consts import *
import numpy as np
from tauso.util import get_antisense
import pickle
from tauso.genome.read_human_genome import get_locus_to_data_dict
from tauso.file_utils import read_human_genome_fasta_dict
import RNA

In [2]:
# Location of the ASO experimental dataset, resolved relative to NOTEBOOK_PATH
csv_path = NOTEBOOK_PATH / 'data' / 'data_asoptimizer_updated.csv'
# low_memory=False: let pandas infer column dtypes from the whole file rather than per chunk
all_data = pd.read_csv(str(csv_path), low_memory=False)

Preprocessing ASO Experimental Dataset

In [3]:
from notebooks.notebook_utils import log_correction

# Keep only the measurements that actually report an inhibition value
has_inhibition = all_data[INHIBITION].notna()
all_data_no_nan = all_data.loc[has_inhibition].copy()
# The return value is not used, so log_correction presumably adds the log-scaled
# inhibition column to the frame in place (a 'log_inhibition' column is present afterwards).
log_correction(all_data_no_nan) # to avoid log 0

In [4]:
# Rows measured in a human cell line
human_only = all_data_no_nan[CELL_LINE_ORGANISM] == 'human'
all_data_no_nan_human = all_data_no_nan[human_only]
# NOTE(review): gene names are collected from every organism (all_data_no_nan),
# not from the human-only frame built on the line above.
genes = all_data_no_nan[CANONICAL_GENE].copy()
genes_u = list(set(genes))
# Drop labels that are not human genes (viral target / negative controls);
# list.remove raises ValueError if a label is missing from the data.
for label in ('HBV', 'negative_control'):
    genes_u.remove(label)

In [5]:
from notebooks.notebook_utils import read_cached_gene_to_data

# Per-gene genome data for every gene name in genes_u. Later cells test membership with
# `in`, index the result by gene name, and read `.full_mrna` from each entry.
gene_to_data = read_cached_gene_to_data(genes_u)

Elapsed DB:  0.0011098384857177734
Elapsed Fasta:  0.0011098384857177734
Length:  3099750718
Feature type:  transcript
Feature type:  transcript
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  start_codon
Feature type:  start_codon
Feature type:  transcript
Feature type:  CDS
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS


In [6]:
# Filter data to keep only rows with valid gene information
all_data_human_gene = all_data_no_nan_human[all_data_no_nan_human[CANONICAL_GENE].isin(genes_u)].copy()

# Define names for new columns
SENSE_SEQUENCE = 'sense_sequence'
PRE_MRNA_SEQUENCE = 'pre_mrna_sequence'
SENSE_START = 'sense_start'
SENSE_LENGTH = 'sense_length'

# Value of SENSE_START when the target site is unknown. It matches what str.find returns
# for "not found" and what the later cells test for (`sense_start != -1`). Rows whose gene
# is missing from gene_to_data previously kept a start of 0, which is indistinguishable
# from a genuine match at the first nucleotide.
SENSE_NOT_FOUND = -1

# Collect the new column values row by row and assign each column once at the end,
# instead of writing into the frame cell by cell inside the loop.
sense_sequences = []
pre_mrna_sequences = []
sense_starts = []
sense_lengths = []

for gene_name, antisense in zip(all_data_human_gene[CANONICAL_GENE], all_data_human_gene[SEQUENCE]):
    if gene_name not in gene_to_data:
        # Gene not found in the genome annotation: leave the sequences empty and mark
        # the start as "not found" so downstream validity filters exclude the row.
        sense_sequences.append("")
        pre_mrna_sequences.append("")
        sense_starts.append(SENSE_NOT_FOUND)
        sense_lengths.append(0)
        continue

    pre_mrna = gene_to_data[gene_name].full_mrna
    # The ASO is the antisense strand; get_antisense gives the sequence it pairs with
    sense = get_antisense(antisense)

    sense_sequences.append(sense)
    pre_mrna_sequences.append(pre_mrna)
    sense_starts.append(pre_mrna.find(sense))  # -1 when the site is absent from the pre-mRNA
    sense_lengths.append(len(antisense))

all_data_human_gene[SENSE_SEQUENCE] = sense_sequences
all_data_human_gene[PRE_MRNA_SEQUENCE] = pre_mrna_sequences
all_data_human_gene[SENSE_START] = sense_starts
all_data_human_gene[SENSE_LENGTH] = sense_lengths

In [7]:
# Inspect the column names, including the sense/pre-mRNA columns added in the previous cell
all_data_human_gene.columns

Index(['index', 'ISIS', 'Target_gene', 'Cell_line', 'Density(cells/well)',
       'Transfection', 'ASO_volume(nM)', 'Treatment_Period(hours)',
       'Primer_probe_set', 'Sequence', 'Modification', 'Location',
       'Chemical_Pattern', 'Linkage', 'Linkage_Location', 'Smiles',
       'Inhibition(%)', 'seq_length', 'Canonical Gene Name',
       'Cell line organism', 'Transcript', 'Location_in_sequence',
       'Location_div_by_length', 'true_length_of_seq', 'mod_scan',
       'cell_line_uniform', 'log_inhibition', 'sense_sequence',
       'pre_mrna_sequence', 'sense_start', 'sense_length'],
      dtype='object')

In [None]:
# Preview the first rows of the annotated human-gene frame
all_data_human_gene.head()

In [None]:
from tauso.features.rna_access.access_calculator import get_sense_with_flanks

FLANK_SIZE = 120  # Change this as needed
FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'


def _flanked_sense_for_row(row):
    """Return the sense site plus flanks for one row, or "" when the site was not found."""
    if row['sense_start'] == -1:
        return ""
    return get_sense_with_flanks(
        row['pre_mrna_sequence'],
        row['sense_start'],
        row['sense_length'],
        flank_size=FLANK_SIZE,
    )


# One flanked sequence per ASO, stored in a column named after the flank size
all_data_human_gene[FLANKED_SENSE_COL] = all_data_human_gene.apply(_flanked_sense_for_row, axis=1)

ViennaRNA energy calculation

In [None]:
from tqdm import tqdm
from scipy.stats import spearmanr
import ViennaRNA
from Bio.Seq import Seq

In [None]:
def calculate_avg_mfe_over_sense_region(sequence, sense_start, sense_length, flank_size=120, window_size=120, step=1):
    """Average per-nucleotide folding energy over the sense (ASO target) region.

    The sequence is folded in sliding windows of ``window_size`` nucleotides, advanced by
    ``step``. Each window's MFE (as reported by ViennaRNA) is divided by the window size
    and credited to every position the window covers. A position's energy is the mean
    over all windows covering it, and the result is the mean of those per-position
    energies inside the sense region.

    Positions that no window covers are ignored rather than counted as 0.0. This affects
    the tail of the sequence when ``step`` does not line up with its end, and the whole
    sequence when it is shorter than ``window_size``.

    Args:
        sequence: Sense site with its flanks; converted to upper-case RNA (T -> U).
        sense_start: Start of the sense site in the full transcript coordinates.
        sense_length: Length of the sense site in nucleotides.
        flank_size: Upstream flank length used when ``sequence`` was cut out; the site
            is assumed to start ``min(sense_start, flank_size)`` nucleotides into it.
        window_size: Length of each folded window.
        step: Offset between consecutive window starts.

    Returns:
        The mean per-nucleotide energy over the covered positions of the sense region,
        or NaN when the region does not fit inside ``sequence`` or none of its positions
        is covered by a window.
    """
    sequence = str(sequence).upper().replace('T', 'U')
    sequence_length = len(sequence)
    energy_sums = np.zeros(sequence_length)
    coverage = np.zeros(sequence_length)

    for window_start in range(0, sequence_length - window_size + 1, step):
        window_end = window_start + window_size
        fc = ViennaRNA.fold_compound(sequence[window_start:window_end])
        _, mfe = fc.mfe()
        # Spread the window energy evenly over the nucleotides it covers
        energy_sums[window_start:window_end] += mfe / window_size
        coverage[window_start:window_end] += 1

    # Translate the transcript coordinate into a coordinate inside the flanked sequence
    flank_start = max(0, sense_start - flank_size)
    region_start = sense_start - flank_start
    region_end = region_start + sense_length

    if not (0 <= region_start < sequence_length and region_end <= sequence_length):
        return np.nan

    region_coverage = coverage[region_start:region_end]
    covered = region_coverage > 0
    if not covered.any():
        # Nothing was folded over this region, so there is no energy to report
        return np.nan

    region_energy = energy_sums[region_start:region_end]
    return np.mean(region_energy[covered] / region_coverage[covered])


### Final best parameters
Parameters that were tested:

- **Step sizes:**
  `[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]`

- **Window sizes:**
  `[20, 30, 35, 40, 45, 50, 60, 70]`

The chosen parameters are: window size = 45, step = 7

In [None]:
from tqdm import tqdm
from scipy.stats import spearmanr
import numpy as np
import pandas as pd

# Flank length (nt) and the name of the column holding the flanked sense sequence
FLANK_SIZE = 120
FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

# Work on a copy so the source frame is left untouched
df_all = all_data_human_gene.copy()

# A row is usable when the sense site was located and its flanked sequence exists
# and is at least FLANK_SIZE characters long
flanked_sequences = df_all[FLANKED_SENSE_COL]
valid_mask = (
    (df_all['sense_start'] != -1) &
    (flanked_sequences.notna()) &
    (flanked_sequences.str.len() >= FLANK_SIZE)
)
# reset_index() renumbers the rows and keeps the old row labels as an extra column.
# NOTE(review): the frame already has an 'index' column, so pandas stores the old labels
# under another name ('level_0'); the 'index' exported below is the dataset's own column.
df_valid = df_all.loc[valid_mask].copy().reset_index()


def _avg_mfe_for_row(row):
    """Average sliding-window MFE over the sense region of one row (window 45, step 7)."""
    return calculate_avg_mfe_over_sense_region(
        sequence=row[FLANKED_SENSE_COL],
        sense_start=row['sense_start'],
        sense_length=row['sense_length'],
        flank_size=FLANK_SIZE,
        window_size=45,
        step=7,
    )


# Row-wise computation with a progress bar
tqdm.pandas(desc="Computing final MFE (window=45)")
df_valid['mfe_window_45'] = df_valid.progress_apply(_avg_mfe_for_row, axis=1)

# Keep the identifier, the feature and the target, restricted to complete pairs
df_mfe_vs_inhibition = (
    df_valid[['index', 'mfe_window_45', 'log_inhibition']]
    .dropna(subset=['mfe_window_45', 'log_inhibition'])
    .copy()
)

# Rank correlation between the feature and the log-scaled inhibition
corr, pval = spearmanr(df_mfe_vs_inhibition['mfe_window_45'], df_mfe_vs_inhibition['log_inhibition'])
print(f"Spearman correlation = {corr:.3f}, p-value = {pval:.3g}")

# Export only the identifier and the feature value
df_mfe_vs_inhibition[['index', 'mfe_window_45']].to_csv("avg_mfe_vienna_window45_flank120_step7.csv", index=False)

In [None]:
# Preview the identifier, average-MFE feature and log inhibition of the first rows
df_mfe_vs_inhibition.head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))

# Scatter of the feature against log inhibition with a fitted regression line
sns.regplot(
    data=df_mfe_vs_inhibition,
    x='mfe_window_45',
    y='log_inhibition',
    scatter_kws={'s': 20, 'alpha': 0.5},  # small, semi-transparent markers
    line_kws={'color': 'red'},            # regression line in red
    ci=None,                              # no confidence band
    ax=ax,
)

# Title carries the Spearman statistics computed in the previous cell
ax.set_title(f'log_inhibition vs. MFE (window=45, step=7)\nSpearman r={corr:.3f}, p={pval:.1g}')
ax.set_xlabel('Average MFE in Sense Region (window=45)')
ax.set_ylabel('log_inhibition')
ax.grid(True)
fig.tight_layout()
plt.show()

### Calculate the average MFE over the edges of the mRNA sense region

In [None]:
from notebooks.utils.fold import calculate_mfe_over_edges_sense_region
from tqdm import tqdm
import numpy as np

# Flank length (nt) and the name of the column holding the flanked sense sequence
FLANK_SIZE = 120
FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

# Work on a copy so the source frame is left untouched
df_all = all_data_human_gene.copy()

# A row is usable when the sense site was located and its flanked sequence exists
# and is at least FLANK_SIZE characters long
flanked_sequences = df_all[FLANKED_SENSE_COL]
valid_mask = (
    (df_all['sense_start'] != -1) &
    (flanked_sequences.notna()) &
    (flanked_sequences.str.len() >= FLANK_SIZE)
)
df_valid = df_all.loc[valid_mask].copy()


def _edge_mfe_for_row(row):
    """Edge-only MFE for one row (window 45, step 7).

    Per the original note this covers the first and last 4 nt of the sense region;
    confirm against calculate_mfe_over_edges_sense_region.
    """
    return calculate_mfe_over_edges_sense_region(
        sequence=row[FLANKED_SENSE_COL],
        sense_start=row['sense_start'],
        sense_length=row['sense_length'],
        flank_size=FLANK_SIZE,
        window_size=45,
        step=7,
    )


# Row-wise computation with a progress bar
tqdm.pandas(desc="Computing edge-only MFE (window=45)")
df_valid['mfe_edges_45'] = df_valid.progress_apply(_edge_mfe_for_row, axis=1)

# Export only the identifier and the feature value
df_mfe_edges = df_valid.loc[:, ['index', 'mfe_edges_45']].copy()
df_mfe_edges.to_csv("mfe_edges_vienna_window45_flank120_step7.csv", index=False)

In [None]:
# Identifier, edge-MFE feature and log inhibition side by side for the correlation below.
# No rows are dropped here, so missing feature values (if any) are still present.
df_mfe_edges_corr = df_valid[['index', 'mfe_edges_45','log_inhibition']].copy()
df_mfe_edges_corr.head()

In [None]:
# Compute Spearman correlation on complete pairs only.
# spearmanr propagates NaN by default, so a single missing edge-MFE value would turn the
# whole statistic into NaN. The average-MFE and min-MFE cells drop such rows before
# correlating, and this cell now does the same.
edges_complete = df_mfe_edges_corr.dropna(subset=['mfe_edges_45', 'log_inhibition'])
corr, pval = spearmanr(edges_complete['mfe_edges_45'], edges_complete['log_inhibition'])
print(f"Spearman correlation = {corr:.3f}, p-value = {pval:.3g}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(figsize=(8, 6))

# Scatter of the edge feature against log inhibition with a fitted regression line
sns.regplot(
    data=df_mfe_edges_corr,
    x='mfe_edges_45',
    y='log_inhibition',
    scatter_kws={'s': 20, 'alpha': 0.5},  # small, semi-transparent markers
    line_kws={'color': 'red'},            # regression line in red
    ci=None,                              # no confidence band
    ax=ax,
)

# Title carries the Spearman statistics computed in the previous cell
ax.set_title(f'log_inhibition vs. MFE in the edges (window=45, step=7)\nSpearman r={corr:.3f}, p={pval:.1g}')
ax.set_xlabel('Average MFE in the Edges of the Sense Region (4 noc each side)')
ax.set_ylabel('log_inhibition')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import sys
import os
# Put <current working directory>/../scripts/features on the module search path.
# NOTE(review): the resulting path depends on where the kernel was started; the import
# below goes through the `notebooks` package and does not appear to need this entry.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'scripts', 'features')))

from notebooks.utils.feature_extraction import save_feature

In [None]:
# Reload the average-MFE table written earlier in this notebook (columns: index, mfe_window_45)
df_avg_mfe_wind45_flank120_step7 = pd.read_csv("avg_mfe_vienna_window45_flank120_step7.csv")

In [None]:
# Check that the reloaded table looks as expected
df_avg_mfe_wind45_flank120_step7.head()

In [None]:
# Hand the table to the project's save_feature helper under the feature name 'mfe_window_45'
# (what is persisted, and where, is defined in notebooks.utils.feature_extraction)
save_feature(df_avg_mfe_wind45_flank120_step7, 'mfe_window_45')

In [None]:
# Reload the edge-MFE table written earlier in this notebook (columns: index, mfe_edges_45)
mfe_edges_wind45_flank120_step7 = pd.read_csv("mfe_edges_vienna_window45_flank120_step7.csv")

In [None]:
# Check that the reloaded table looks as expected
mfe_edges_wind45_flank120_step7.head()

In [None]:
# Hand the table to the project's save_feature helper under the feature name 'mfe_edges_45'
# (what is persisted, and where, is defined in notebooks.utils.feature_extraction)
save_feature(mfe_edges_wind45_flank120_step7, 'mfe_edges_45')

In [None]:
from notebooks.utils.fold import calculate_min_mfe_over_sense_region

In [None]:
# Flank length (nt) and the name of the column holding the flanked sense sequence
FLANK_SIZE = 120
FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

# Work on a copy so the source frame is left untouched
df_all = all_data_human_gene.copy()

# A row is usable when the sense site was located and its flanked sequence exists
# and is at least FLANK_SIZE characters long
flanked_sequences = df_all[FLANKED_SENSE_COL]
valid_mask = (
    (df_all['sense_start'] != -1) &
    (flanked_sequences.notna()) &
    (flanked_sequences.str.len() >= FLANK_SIZE)
)
df_valid = df_all.loc[valid_mask].copy()

In [None]:
from tqdm import tqdm


def _min_mfe_for_row(row):
    """Minimum-MFE feature over the sense region of one row (window 45, step 7)."""
    return calculate_min_mfe_over_sense_region(
        sequence=row[FLANKED_SENSE_COL],
        sense_start=row['sense_start'],
        sense_length=row['sense_length'],
        flank_size=FLANK_SIZE,
        window_size=45,
        step=7,
    )


# Row-wise computation of the new feature with a progress bar
tqdm.pandas(desc="Computing minimum MFE (window=45)")
df_valid['min_mfe_45'] = df_valid.progress_apply(_min_mfe_for_row, axis=1)

# Export only the identifier and the feature value
df_min_mfe = df_valid.loc[:, ['index', 'min_mfe_45']].copy()
df_min_mfe.to_csv("min_mfe_vienna_window45_flank120_step7.csv", index=False)

In [None]:
# Preview the identifier, min-MFE feature and log inhibition of the first rows
df_valid[['index', 'min_mfe_45','log_inhibition']].head()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import spearmanr  # Import spearmanr

# Correlate on complete pairs only
df_valid_new = df_valid.dropna(subset=['min_mfe_45', 'log_inhibition']).copy()
corr, pval = spearmanr(df_valid_new['min_mfe_45'], df_valid_new['log_inhibition'])

fig, ax = plt.subplots(figsize=(8, 6))

# Scatter of the feature against log inhibition with a fitted regression line
sns.regplot(
    data=df_valid_new,
    x='min_mfe_45',
    y='log_inhibition',
    scatter_kws={'s': 20, 'alpha': 0.5},        # small, semi-transparent markers
    line_kws={'color': 'red', 'linewidth': 2},  # regression line in red
    ci=None,                                    # no confidence band
    ax=ax,
)

# Title carries the Spearman statistics computed above
ax.set_title(f'log_inhibition vs. min MFE in the sense (window=45, step=7)\nSpearman r={corr:.3f}, p={pval:.1g}')
ax.set_xlabel('Avg-Min MFE of the Sense Region')
ax.set_ylabel('log_inhibition')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Import save_feature through the `notebooks` package, as the earlier import cell does.
# The bare `feature_extraction` module only resolves through the sys.path tweak, which
# depends on the directory the kernel was started from.
# NOTE(review): confirm that both module paths expose the same save_feature.
from notebooks.utils.feature_extraction import save_feature

# Hand the complete-pairs frame to save_feature under the feature name 'min_mfe_45'
save_feature(df_valid_new, 'min_mfe_45')