In [6]:
import os
os.chdir('/home/espl/ASOdesign/scripts')

from data_genertion.consts import *
from scripts.features.feature_extraction import load_all_features

# Build the working table with the project's load_all_features() helper
# (imported above from scripts.features.feature_extraction).
main_df = load_all_features()
# Cast the SEQUENCE column to str. SEQUENCE is not defined in this notebook;
# presumably it comes from the wildcard import of data_genertion.consts — TODO confirm.
main_df[SEQUENCE] = main_df[SEQUENCE].astype(str)

Loading features from: ['ASO_volume(nM).csv', 'Canonical Gene Name.csv', 'Cell line organism.csv', 'Cell_line.csv', 'Chemical_Pattern.csv', 'Density(cells_per_well).csv', 'ISIS.csv', 'Inhibition(%).csv', 'Linkage.csv', 'Linkage_Location.csv', 'Location.csv', 'Location_div_by_length.csv', 'Location_in_sequence.csv', 'Modification.csv', 'Primer_probe_set.csv', 'Sequence.csv', 'Target_gene.csv', 'Transcript.csv', 'Transfection.csv', 'Treatment_Period(hours).csv', 'at_rich_region_score.csv', 'at_skew.csv', 'cell_line_uniform.csv', 'dsm_su95_rev_wGU_pos1382t37Falseon_target_energy_max600.csv', 'dsm_su95_rev_wGU_pos1382t37Trueon_target_energy_max600.csv', 'dsm_su95_rev_wGU_pos1384t37Falseon_target_energy_max600.csv', 'dsm_su95_rev_wGU_pos1384t37Trueon_target_energy_max600.csv', 'dsm_su95_rev_wGU_pos1386t37Falseon_target_energy_max600.csv', 'dsm_su95_rev_wGU_pos1386t37Trueon_target_energy_max600.csv', 'dsm_su95_rev_woGU_pos1382t37Falseon_target_energy_max600.csv', 'dsm_su95_rev_woGU_pos1382t3

In [7]:
# Half-life feature: one value per target gene (mapped by canonical gene name, not by cell line)
import gzip
import pandas as pd

# Ensembl gene IDs of the target genes, each paired with its gene symbol.
genes = dict(
    ENSG00000133703="KRAS",
    ENSG00000145335="SNCA",
    ENSG00000128604="IRF5",
    ENSG00000251562="MALAT1",
    ENSG00000137265="IRF4",
    ENSG00000137693="YAP1",
    ENSG00000166033="HTRA1",
    ENSG00000170509="HSD17B13",
    ENSG00000100644="HIF1A",
    ENSG00000224078="SNHG14",
    ENSG00000092054="MYH7",
    ENSG00000142168="SOD1",
    ENSG00000062282="DGAT2",
    ENSG00000100342="APOL1",
)

# Load the tab-separated half-life table from its gzip archive.
with gzip.open("all_HLs_human_featTable.txt.gz", mode="rt") as f:
    df = pd.read_csv(f, sep="\t")

# Keep only the rows whose ENSID is one of the target genes, then attach the symbol.
is_target_gene = df["ENSID"].isin(list(genes))
filtered = df.loc[is_target_gene].copy()
filtered["GENE"] = filtered["ENSID"].map(genes)
halflife_df = filtered[["GENE", "HALFLIFE"]]

# Gene symbol -> half-life lookup (if a symbol occurs more than once, the last row wins).
gene_to_halflife = {
    symbol: halflife
    for symbol, halflife in zip(halflife_df["GENE"], halflife_df["HALFLIFE"])
}

# Attach the half-life to every row of main_df through its canonical gene name;
# genes absent from the lookup are left as NaN.
canonical_gene_names = main_df[CANONICAL_GENE]
main_df["halflife"] = canonical_gene_names.map(gene_to_halflife)

from scipy.stats import pearsonr, spearmanr

# Restrict to rows that have both a half-life and a log-inhibition value.
valid = main_df.loc[:, ['halflife', 'log_inhibition']].dropna(how='any')
halflife_values = valid['halflife']
inhibition_values = valid['log_inhibition']

# Linear (Pearson) and rank (Spearman) association between the two columns.
pearson_corr, pearson_p = pearsonr(halflife_values, inhibition_values)
spearman_corr, spearman_p = spearmanr(halflife_values, inhibition_values)

print(
    "Pearson correlation between halflife and log_inhibition: "
    f"{pearson_corr:.3f} (p={pearson_p:.3g})"
)
print(
    "Spearman correlation between halflife and log_inhibition: "
    f"{spearman_corr:.3f} (p={spearman_p:.3g})"
)

Pearson correlation between halflife and log_inhibition: 0.192 (p=9.55e-151)
Spearman correlation between halflife and log_inhibition: 0.270 (p=1.86e-302)


In [None]:
# Feature: mRNA expression level of the on-target gene in the row's cell line
import pandas as pd

# ACH identifiers used by the expression table, each with the cell-line
# spellings that should resolve to it.
_ach_aliases = {
    'ACH-001328': ('A431', 'A-431'),
    'ACH-000463': ('NCI-H460',),
    'ACH-001188': ('SH_SY5Y', 'SH-SY5Y'),
    'ACH-001086': ('HeLa', 'Hela'),
    'ACH-000739': ('HepG2',),
    'ACH-000232': ('U-251MG', 'U251'),
}
# Flatten to the spelling -> ACH id lookup used by the feature function.
cellline_to_ach = {
    alias: ach_id
    for ach_id, aliases in _ach_aliases.items()
    for alias in aliases
}

# The file stores genes as columns; transpose so that genes become the row index.
mrna_df = pd.read_csv("OmicsExpressionProteinCodingGenesTPMLogp1.csv", index_col=0).T

# Map the bare gene name (the text before " (") to the full row label.
gene_name_mapping = {
    full_name.partition(' (')[0]: full_name
    for full_name in mrna_df.index
}

# Function to get mRNA expression for a gene in a cell line
def get_mrna_expression(gene_name, cell_line, expression=None, gene_lookup=None, ach_lookup=None):
    """Return the expression value of ``gene_name`` in ``cell_line``, or None.

    Parameters
    ----------
    gene_name : bare gene name, translated to a full table label via ``gene_lookup``.
    cell_line : cell-line spelling, translated to an ACH id via ``ach_lookup``.
    expression : optional DataFrame to read from; defaults to the notebook-level
        ``mrna_df``.
    gene_lookup : optional dict of bare gene name -> full label; defaults to the
        notebook-level ``gene_name_mapping``.
    ach_lookup : optional dict of cell-line spelling -> ACH id; defaults to the
        notebook-level ``cellline_to_ach``.

    Returns
    -------
    The table entry for the (gene, cell line) pair, or None when the cell line
    or gene is unknown or absent from the table.

    Notes
    -----
    ``mrna_df`` is bound in this notebook both transposed (genes as rows) and
    untransposed (cell lines as rows), depending on which cell ran last. The
    lookup therefore accepts either orientation instead of silently returning
    None for every row when the table is in the other layout.
    """
    # Fall back to the notebook-level tables so existing two-argument calls keep working.
    if expression is None:
        expression = mrna_df
    if gene_lookup is None:
        gene_lookup = gene_name_mapping
    if ach_lookup is None:
        ach_lookup = cellline_to_ach

    ach_id = ach_lookup.get(cell_line)
    full_gene_name = gene_lookup.get(gene_name)
    if not ach_id or not full_gene_name:
        return None

    # Genes as rows, cell lines as columns (the layout produced by the transpose).
    if ach_id in expression.columns and full_gene_name in expression.index:
        return expression.loc[full_gene_name, ach_id]
    # Cell lines as rows, genes as columns (the layout as read from the CSV).
    if ach_id in expression.index and full_gene_name in expression.columns:
        return expression.loc[ach_id, full_gene_name]
    return None

# Add mRNA expression column to main_df
def _expression_for_row(record):
    """Look up the expression for one main_df row from its gene and cell line."""
    return get_mrna_expression(record['Canonical Gene Name'], record['Cell_line'])

main_df['mRNA_expression'] = main_df.apply(_expression_for_row, axis=1)
from scipy.stats import pearsonr, spearmanr

# Correlate mRNA_expression with log_inhibition over the rows where both are present.
corr_df = main_df.loc[:, ['mRNA_expression', 'log_inhibition']].dropna(how='any')
expression_values = corr_df['mRNA_expression']
inhibition_values = corr_df['log_inhibition']

pearson_corr, pearson_p = pearsonr(expression_values, inhibition_values)
spearman_corr, spearman_p = spearmanr(expression_values, inhibition_values)

print(
    "Pearson correlation between mRNA_expression and log_inhibition: "
    f"{pearson_corr:.4f} (p={pearson_p:.4g})"
)
print(
    "Spearman correlation between mRNA_expression and log_inhibition: "
    f"{spearman_corr:.4f} (p={spearman_p:.4g})"
)


Pearson correlation between mRNA_expression and log_inhibition: 0.0383 (p=0.000158)
Spearman correlation between mRNA_expression and log_inhibition: 0.0223 (p=0.0279)


In [9]:
# Feature: RNASEH1 expression level per cell line

# ACH identifiers used by the expression table, each with the cell-line
# spellings that should resolve to it.
_ach_aliases = {
    'ACH-001328': ('A431', 'A-431'),
    'ACH-000463': ('NCI-H460',),
    'ACH-001188': ('SH_SY5Y', 'SH-SY5Y'),
    'ACH-001086': ('HeLa', 'Hela'),
    'ACH-000739': ('HepG2',),
    'ACH-000232': ('U-251MG', 'U251'),
}
# Flatten to the spelling -> ACH id lookup used by the feature function.
cellline_to_ach = {
    alias: ach_id
    for ach_id, aliases in _ach_aliases.items()
    for alias in aliases
}

# Expression table as stored on disk: the first column becomes the row index
# and genes stay in the columns (no transpose in this cell).
mrna_df = pd.read_csv(
    "OmicsExpressionProteinCodingGenesTPMLogp1.csv",
    index_col=0,
)

# Full column label of the RNASEH1 gene in that table.
rnaseh1_col = 'RNASEH1 (246243)'

# Function to get RNASEH1 expression for a cell line
def get_rnaseh1_expression(cell_line, expression=None, ach_lookup=None, gene_column=None):
    """Return the RNASEH1 expression value for ``cell_line``, or None.

    Parameters
    ----------
    cell_line : cell-line spelling, translated to an ACH id via ``ach_lookup``.
    expression : optional DataFrame to read from; defaults to the notebook-level
        ``mrna_df``.
    ach_lookup : optional dict of cell-line spelling -> ACH id; defaults to the
        notebook-level ``cellline_to_ach``.
    gene_column : optional label of the gene to read; defaults to the
        notebook-level ``rnaseh1_col``.

    Returns
    -------
    The table entry for the cell line, or None when the cell line is unknown or
    absent from the table.

    Raises
    ------
    KeyError
        If the cell line is present but the gene label is not (unchanged from
        the original behaviour, so a wrong label still fails loudly).

    Notes
    -----
    ``mrna_df`` is bound in this notebook both untransposed (cell lines as rows)
    and transposed (genes as rows), depending on which cell ran last. The lookup
    therefore accepts either orientation instead of silently returning None for
    every cell line when the table is in the other layout.
    """
    # Fall back to the notebook-level objects so existing one-argument calls keep working.
    if expression is None:
        expression = mrna_df
    if ach_lookup is None:
        ach_lookup = cellline_to_ach
    if gene_column is None:
        gene_column = rnaseh1_col

    ach_id = ach_lookup.get(cell_line)
    if not ach_id:
        return None

    # Cell lines as rows, genes as columns (the layout as read from the CSV).
    if ach_id in expression.index:
        return expression.loc[ach_id, gene_column]
    # Genes as rows, cell lines as columns (the layout after a transpose).
    if ach_id in expression.columns:
        return expression.loc[gene_column, ach_id]
    return None

# Add RNASEH1 expression column to main_df: one lookup per row, keyed on its cell line.
cell_line_names = main_df['Cell_line']
main_df['RNASEH1_expression'] = cell_line_names.map(get_rnaseh1_expression)

from scipy.stats import pearsonr, spearmanr

# Correlate RNASEH1_expression with log_inhibition over the rows where both are present.
corr_df = main_df.loc[:, ['RNASEH1_expression', 'log_inhibition']].dropna(how='any')
rnaseh1_values = corr_df['RNASEH1_expression']
inhibition_values = corr_df['log_inhibition']

pearson_corr, pearson_p = pearsonr(rnaseh1_values, inhibition_values)
spearman_corr, spearman_p = spearmanr(rnaseh1_values, inhibition_values)

print(
    "Pearson correlation between RNASEH1_expression and log_inhibition: "
    f"{pearson_corr:.4f} (p={pearson_p:.4g})"
)
print(
    "Spearman correlation between RNASEH1_expression and log_inhibition: "
    f"{spearman_corr:.4f} (p={spearman_p:.4g})"
)


Pearson correlation between RNASEH1_expression and log_inhibition: 0.1189 (p=1.946e-41)
Spearman correlation between RNASEH1_expression and log_inhibition: 0.1975 (p=1.595e-112)
