In [None]:
# conda activate aso_design
import sys
from pathlib import Path

# Locate the project root: the closest directory, starting at the current
# working directory and moving upwards, that contains a pyproject.toml.
# When no such directory exists the search ends at the filesystem root.
start_dir = Path.cwd()
search_path = [start_dir, *start_dir.parents]
p = next(
    (folder for folder in search_path if (folder / "pyproject.toml").exists()),
    search_path[-1],
)

# Put the project root first on the import path so project packages resolve.
sys.path.insert(0, str(p))
print("Project root:", p)



import pandas as pd
from notebooks.consts import *
from tauso.file_utils import read_human_genome_fasta_dict
from tauso.consts import *
import numpy as np
from tauso.util import get_antisense
import pickle
from tauso.genome.read_human_genome import get_locus_to_data_dict
from tauso.file_utils import read_human_genome_fasta_dict
import RNA

In [None]:
# Location of the ASO dataset CSV inside the notebook's 'data' directory.
csv_path = NOTEBOOK_PATH / 'data' / 'data_asoptimizer_updated.csv'
# low_memory=False makes pandas parse the file in a single pass instead of in
# chunks, so each column gets one consistently inferred dtype.
all_data = pd.read_csv(str(csv_path), low_memory=False)

In [None]:
from notebooks.notebook_utils import log_correction

# Keep only the rows that have a measured value in the INHIBITION column.
has_inhibition = all_data[INHIBITION].notna()
all_data_no_nan = all_data.loc[has_inhibition].copy()

# Apply the project's log correction to the filtered frame. The return value is
# not used, so the helper presumably adds its transformed column in place
# (original note: "to avoid log 0") -- confirm in notebooks.notebook_utils.
log_correction(all_data_no_nan)

In [None]:
# Filter the data to include only rows where the cell line organism is human
all_data_no_nan_human = all_data_no_nan[all_data_no_nan[CELL_LINE_ORGANISM] == 'human']

# Entries of the gene column that are not human genes (viral target / controls).
NON_HUMAN_OR_CONTROL_GENES = {'HBV', 'negative_control'}

# Collect the distinct gene names.
# NOTE(review): genes are taken from all organisms (all_data_no_nan), not only
# from the human subset -- confirm this is intended.
genes = all_data_no_nan[CANONICAL_GENE].copy()

# Build the unique gene list:
#   * dropna() keeps missing gene names out of the list,
#   * unique() preserves first-appearance order, so the list is identical on
#     every run (iterating a set of strings is not stable across sessions),
#   * filtering by membership does not raise when an excluded label is absent
#     from the data (list.remove would raise ValueError).
genes_u = [
    gene for gene in genes.dropna().unique()
    if gene not in NON_HUMAN_OR_CONTROL_GENES
]






In [None]:
from notebooks.notebook_utils import read_cached_gene_to_data

# Fetch the cached per-gene data for every gene in genes_u. The result is used
# below as a mapping from gene name to an object exposing `.full_mrna`.
gene_to_data = read_cached_gene_to_data(genes_u)

In [None]:
# Filter data to keep only rows with valid gene information
all_data_human_gene = all_data_no_nan_human[all_data_no_nan_human[CANONICAL_GENE].isin(genes_u)].copy()

# Define names for new columns
SENSE_SEQUENCE = 'sense_sequence'
PRE_MRNA_SEQUENCE = 'pre_mrna_sequence'
SENSE_START = 'sense_start'
SENSE_LENGTH = 'sense_length'

# Start index recorded when no position is available. It matches the value
# str.find returns for "not found", so rows whose gene has no annotation are
# not mistaken for a genuine match at position 0.
SENSE_NOT_FOUND = -1

# Collect the per-row values in plain lists and assign each column once at the
# end; this avoids one DataFrame write per cell inside the loop.
sense_sequences = []
pre_mrna_sequences = []
sense_starts = []
sense_lengths = []

for gene_name, antisense in zip(all_data_human_gene[CANONICAL_GENE],
                                all_data_human_gene[SEQUENCE]):
    if gene_name not in gene_to_data:
        # Gene not found in the cached annotation: leave the row unannotated.
        sense_sequences.append("")
        pre_mrna_sequences.append("")
        sense_starts.append(SENSE_NOT_FOUND)
        sense_lengths.append(0)
        continue

    locus_info = gene_to_data[gene_name]
    pre_mrna = locus_info.full_mrna
    # Convert the ASO sequence to the strand it is searched for in the pre-mRNA.
    sense = get_antisense(antisense)

    sense_sequences.append(sense)
    pre_mrna_sequences.append(pre_mrna)
    # str.find returns -1 when the sense sequence does not occur in the pre-mRNA.
    sense_starts.append(pre_mrna.find(sense))
    sense_lengths.append(len(antisense))

# Store computed values in the new columns (positional, one value per row).
all_data_human_gene[SENSE_SEQUENCE] = sense_sequences
all_data_human_gene[PRE_MRNA_SEQUENCE] = pre_mrna_sequences
all_data_human_gene[SENSE_START] = np.asarray(sense_starts, dtype=int)
all_data_human_gene[SENSE_LENGTH] = np.asarray(sense_lengths, dtype=int)

In [None]:
# Independent copy of the annotated ASO table; later cells read aso_df only.
aso_df = all_data_human_gene.copy()

Read the expression file that contains the RNA expression matrix for each cell line.

In [21]:
# Load the expression matrix; its first column holds the cell line identifier
# and is given the explicit name "DepMap_ID".
expr = pd.read_csv("OmicsExpressionProteinCodingGenesTPMLogp1.csv")
id_column = expr.columns[0]
expr = expr.rename(columns={id_column: "DepMap_ID"})
print(expr.head())

    DepMap_ID  TSPAN6 (7105)  TNMD (64102)  DPM1 (8813)  SCYL3 (57147)  \
0  ACH-001113       4.956577      0.000000     7.577648       3.179411   
1  ACH-001289       4.954992      0.617243     7.334747       2.783576   
2  ACH-001339       3.421952      0.000000     7.546069       2.615880   
3  ACH-001979       4.651643      0.000000     5.946408       2.454515   
4  ACH-002438       4.336705      0.000000     6.879387       2.262824   

   FIRRM (55732)  FGR (2268)  CFH (3075)  FUCA2 (2519)  GCLC (2729)  ...  \
0       4.765742    0.037483    1.510547      3.103039     6.697482  ...   
1       3.736280    0.000000    0.149060      3.855960     4.377061  ...   
2       4.476233    0.064571    1.397491      6.833915     3.980336  ...   
3       1.852111    0.000000    7.680796      6.156866     5.042574  ...   
4       3.256491    0.000000    2.253926      6.895436     3.470992  ...   

   H3C3 (8352)  CIMIP3 (114841037)  SMIM42 (117981789)  NPBWR1 (2831)  \
0     0.093516           

Load the metadata that maps ACH IDs to readable cell line names.

In [22]:
# Cell line metadata table; its "ModelID" and "StrippedCellLineName" columns
# are used in the merge below.
model = pd.read_csv("Model.csv")

Merge the two DataFrames so that each expression profile carries its cell line name.

In [23]:
# Attach the readable cell line name to every expression profile.
# validate="many_to_one" makes the merge fail loudly if Model.csv lists a
# ModelID more than once, instead of silently duplicating expression rows.
expr_named = expr.merge(
    model[["ModelID", "StrippedCellLineName"]],
    left_on="DepMap_ID", right_on="ModelID", how="left",
    validate="many_to_one",
)

# Drop BOTH join keys. Leaving "ModelID" behind would make it look like one
# more gene column, so a later melt would mix ACH id strings into the
# expression values.
expr_named = expr_named.drop(columns=["DepMap_ID", "ModelID"])
expr_named = expr_named.rename(columns={"StrippedCellLineName": "cell line"})

# Move "cell line" to the front; all remaining columns are gene columns.
# Because the merge is a left join, profiles without a match in Model.csv keep
# a missing cell line name.
cols = ["cell line"] + [c for c in expr_named.columns if c != "cell line"]
expr_named = expr_named[cols]
print(expr_named.head())



       cell line  TSPAN6 (7105)  TNMD (64102)  DPM1 (8813)  SCYL3 (57147)  \
0        LC1SQSF       4.956577      0.000000     7.577648       3.179411   
1       COGAR359       4.954992      0.617243     7.334747       2.783576   
2        COLO794       3.421952      0.000000     7.546069       2.615880   
3          NZM11       4.651643      0.000000     5.946408       2.454515   
4  CCLFPEDS0026T       4.336705      0.000000     6.879387       2.262824   

   FIRRM (55732)  FGR (2268)  CFH (3075)  FUCA2 (2519)  GCLC (2729)  ...  \
0       4.765742    0.037483    1.510547      3.103039     6.697482  ...   
1       3.736280    0.000000    0.149060      3.855960     4.377061  ...   
2       4.476233    0.064571    1.397491      6.833915     3.980336  ...   
3       1.852111    0.000000    7.680796      6.156866     5.042574  ...   
4       3.256491    0.000000    2.253926      6.895436     3.470992  ...   

   CIMIP3 (114841037)  SMIM42 (117981789)  NPBWR1 (2831)  ACTL10 (170487)  \
0  

Reading the catalog of RNA-binding proteins (RBPs) and their motifs

In [24]:
# RBP motif catalog; a later cell asserts that it has a "Gene_name" column.
motifs= pd.read_csv("RBS_motifs_Homo_sapiens.csv")

Merge the expression data with the RBP motif catalog on a canonical gene symbol.

In [25]:
import re
import pandas as pd

# --- 1) Helper: canonicalize gene symbols (strip parentheses, trim, uppercase) ---
# Patterns used by canonical_symbol, compiled once.
_PAREN_SUFFIX_RE = re.compile(r"\s*\(.*?\)\s*$")
_ANY_WHITESPACE_RE = re.compile(r"\s+")


def canonical_symbol(s: str) -> str:
    """Return a normalized gene symbol suitable for use as a join key.

    The value is converted with ``str()`` and stripped. If the text ends with
    a closing parenthesis, everything from the first opening parenthesis
    onwards is removed (e.g. ``"TSPAN6 (7105)"`` -> ``"TSPAN6"``). All
    remaining whitespace is then deleted and the result is upper-cased.

    Args:
        s: Raw symbol; non-string values are accepted and stringified.

    Returns:
        The upper-case symbol without whitespace or parenthesized suffix.
    """
    text = str(s).strip()
    without_suffix = _PAREN_SUFFIX_RE.sub("", text)
    compact = _ANY_WHITESPACE_RE.sub("", without_suffix)
    return compact.upper()

# --- 2) Canonicalize the gene symbols of the motif catalog ---
# make sure the column exists; if it's differently named, change here
assert "Gene_name" in motifs.columns, "Expected 'Gene_name' column in motifs file."

motifs = motifs.copy()
motifs["Gene_name"] = motifs["Gene_name"].apply(canonical_symbol)
motif_genes = set(motifs["Gene_name"])

# --- 3) Melt expression to long format, restricted to genes with motifs ---
# Only columns whose canonical symbol occurs in the motif catalog can survive
# the inner merge below, so only those are melted. This keeps the long table
# small and keeps identifier columns (e.g. a leftover "ModelID") out of the
# numeric "Expression" values.
non_gene_columns = {"cell line", "ModelID"}
rbp_gene_columns = [
    col for col in expr_named.columns
    if col not in non_gene_columns and canonical_symbol(col) in motif_genes
]

expr_long = expr_named.melt(
    id_vars=["cell line"],
    value_vars=rbp_gene_columns,
    var_name="Gene_raw",
    value_name="Expression"
)
expr_long["Gene_name"] = expr_long["Gene_raw"].apply(canonical_symbol)

# --- 4) Merge on the canonical symbol ---
# One output row per (cell line, gene, motif) combination.
merged = expr_long.merge(motifs, on="Gene_name", how="inner")

# sanity check
print("Rows after merge:", len(merged))
print(merged.head())


Rows after merge: 5478052
  cell line      Gene_raw Expression Gene_name          Gene_id Mutated  \
0   LC1SQSF  RBM5 (10181)   5.591937      RBM5  ENSG00000003756      no   
1   LC1SQSF  RBM5 (10181)   5.591937      RBM5  ENSG00000003756      no   
2   LC1SQSF  RBM5 (10181)   5.591937      RBM5  ENSG00000003756      no   
3   LC1SQSF  RBM5 (10181)   5.591937      RBM5  ENSG00000003756      no   
4   LC1SQSF  RBM5 (10181)   5.591937      RBM5  ENSG00000003756      no   

       Organism      Motif  Len  \
0  Homo_sapiens    AAAAAAA    7   
1  Homo_sapiens     AGGUAA    6   
2  Homo_sapiens    CCCCCCC    7   
3  Homo_sapiens  CUCUUCUCU    9   
4  Homo_sapiens    GAAGGAA    7   

                              Experiment_description Database    Pubmed  \
0  Homopolymer binding assay with recombinant pro...        S  11029660   
1                       Chemical shift mapping (NMR)        S  22162216   
2  Homopolymer binding assay with recombinant pro...        S  11029660   
3  EMSA and 

Filter merged table to only ASO-relevant cell lines (only for the cell lines in our ASO data)

In [26]:
import re

def _norm_cell(v: str) -> str:
    # Uppercase + remove spaces/hyphens/underscores/slashes
    s = str(v).upper()
    s = re.sub(r"[\s\-\_/]+", "", s)
    return s

# Normalized names of every cell line present in the ASO table ...
aso_cell_lines = set(aso_df["Cell_line"].dropna().map(_norm_cell))
# ... plus A549, which is always included.
aso_cell_lines.add(_norm_cell("A549"))

# Normalize the merged table's cell line names the same way and keep only the
# rows whose cell line belongs to the ASO set.
merged_norm = merged["cell line"].astype(str).map(_norm_cell)
is_aso_cell_line = merged_norm.isin(aso_cell_lines)
filtered = merged.loc[is_aso_cell_line].copy()

print("Original size:", len(merged))
print("Filtered size:", len(filtered))


Original size: 5478052
Filtered size: 29277


Save the filtered table as a CSV file.

In [27]:
# Write the filtered table to disk; index=False omits the DataFrame index.
filtered.to_csv("RBP_motifs_for_each_cell_line_with_mRNA_expression.csv",index=False )