In [1]:
from pathlib import Path
import sys
import time
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor

from IPython.core.pylabtools import figsize

from notebooks.consts import *
from notebooks.notebook_utils import log_correction, read_cached_gene_to_data, read_cached_gene_to_data
from src.tauso.util import get_antisense
from src.tauso.features.rna_access.access_calculator import AccessCalculator
from src.tauso.features.rna_access.rna_access import RNAAccess
from src.tauso.new_model.populate.populate_sense_accessibility import populate_sense_accessibility, calculate_sense_accessibility_batch
from src.tauso.features.rna_access.access_calculator import get_sense_with_flanks
from notebooks.preprocessing import *
from notebooks.features.feature_extraction import save_feature

In [2]:
# Project root = the directory two levels above the current working directory.
PROJECT_ROOT = Path.cwd().parents[1]

# Put the project root first on the import path.
# NOTE(review): the first cell already imports `notebooks.*` and `src.*` before this
# line runs, so in a fresh kernel those imports cannot benefit from it — confirm the
# packages are importable without this tweak, or move it ahead of the imports.
sys.path.insert(0, str(PROJECT_ROOT))

# Input CSV lives under <PROJECT_ROOT>/data/.
csv_path = PROJECT_ROOT.joinpath("data", "data_asoptimizer_updated.csv")

In [3]:
# Build the working ASO table from the CSV at `csv_path`.
# `preprocess_aso_data` is not imported by name; presumably it arrives through one of
# the wildcard imports in the first cell (notebooks.preprocessing?) — TODO confirm.
aso_data = preprocess_aso_data(csv_path=csv_path)

Preprocessing complete. Final valid rows: 29987


In [4]:
# Preview the first rows of the preprocessed table as a sanity check.
aso_data.head()

Unnamed: 0,index,ISIS,Target_gene,Cell_line,Density(cells/well),Transfection,ASO_volume(nM),Treatment_Period(hours),Primer_probe_set,Sequence,...,Location_in_sequence,Location_div_by_length,true_length_of_seq,mod_scan,cell_line_uniform,log_inhibition,sense_sequence,pre_mrna_sequence,sense_start,sense_length
0,0,540733,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,GCTAAAACAAATGCTA,...,41212.0,0.901972,16,0,A431,-4.204842,TAGCATTTGTTTTAGC,GATTTTCCTAGGCGGCGGCCGCGGCGGCGGAGGCAGCAGCGGCGGC...,41212,16
1,1,540747,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,TATAATGGTGAATATC,...,23686.0,0.518395,16,0,A431,-4.532707,GATATTCACCATTATA,GATTTTCCTAGGCGGCGGCCGCGGCGGCGGAGGCAGCAGCGGCGGC...,23686,16
2,2,540806,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,GCATGAAGATTTCTGG,...,43363.0,0.949049,16,1,A431,-3.637849,CCAGAAATCTTCATGC,GATTTTCCTAGGCGGCGGCCGCGGCGGCGGAGGCAGCAGCGGCGGC...,43363,16
3,3,651479,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,GGTGAATATCTTCAAA,...,23680.0,0.518264,16,0,A431,-4.276805,TTTGAAGATATTCACC,GATTTTCCTAGGCGGCGGCCGCGGCGGCGGAGGCAGCAGCGGCGGC...,23680,16
4,4,651490,K-RAS,A431,5000.0,free uptake,2000.0,24,RTS3496_MGB,CACTTGTACTAGTATG,...,41168.0,0.901009,16,0,A431,-4.159039,CATACTAGTACAAGTG,GATTTTCCTAGGCGGCGGCCGCGGCGGCGGAGGCAGCAGCGGCGGC...,41168,16


In [5]:
# Work on a copy so `aso_data` itself is not the object handed to the batch helper.
df_new = aso_data.copy()

# Accessibility for every row, using the helper's own defaults for everything
# except the batch size.
batch_result = calculate_sense_accessibility_batch(df_new, batch_size=1000)

# Left-join the results back onto the copy: the copy's index is matched against the
# `rna_id` column of the results, keeping every row of the copy.
# NOTE(review): assumes `rna_id` carries the index labels of `aso_data` — confirm.
df_new = pd.merge(
    left=df_new,
    right=batch_result,
    how='left',
    left_index=True,
    right_on='rna_id',
)

# Accessibility values as a float Series.
new_result = df_new.loc[:, 'access'].astype(float)

In [6]:
# Rename the key column `rna_id` to `index` ahead of saving.
# NOTE(review): presumably `save_feature` expects the row key under `index` — confirm.
batch_result = batch_result.rename(columns={'rna_id':'index'})

In [7]:
# Rename the value column `access` to the feature's name.
# NOTE(review): the name encodes flank=120, access size=13 and seed size=13, yet the
# call that produced `batch_result` passed only `batch_size`; presumably these are the
# helper's defaults — confirm. The suffix style (`..._size_13seed_size`) also differs
# from the `..._{seeds}seed_sizes` names built by the configuration sweep cell.
batch_result = batch_result.rename(columns={'access':'access_120flank_13access_size_13seed_size'})

In [8]:
# Hand the renamed frame to `save_feature`, using the same name as its value column.
save_feature(df=batch_result, feature_name= 'access_120flank_13access_size_13seed_size')

In [4]:
# Parameter sweep: each entry describes one accessibility feature to compute and save.
#   flank  -> passed as `flank_size`
#   access -> passed as `access_size`
#   seeds  -> passed as `seed_sizes` (a list)
configurations = [
    {"flank": 120, "access": 20, "seeds": [13]},
    {"flank": 120, "access": 13, "seeds": [4, 6, 8]},
    {"flank": 120, "access": 20, "seeds": [4, 6, 8]}
]

# Work on a copy so `aso_data` itself is not the object handed to the batch helper.
df_work = aso_data.copy()

for config in configurations:
    c_flank = config["flank"]
    c_access = config["access"]
    c_seeds = config["seeds"]

    # Progress message (the stray unbalanced ")" after the seeds list was removed).
    print(f"Running: Flank={c_flank}, Access={c_access}, Seeds={c_seeds}...")

    batch_result = calculate_sense_accessibility_batch(
        df_work,
        batch_size=1000,
        flank_size=c_flank,
        access_size=c_access,
        seed_sizes=c_seeds
    )

    # Feature name encodes the configuration; seed sizes are joined with "-",
    # e.g. access_120flank_20access_4-6-8seed_sizes.
    seeds_str = "-".join(map(str, c_seeds))
    feature_name = f'access_{c_flank}flank_{c_access}access_{seeds_str}seed_sizes'

    # Rename the key column to `index` and the value column to the feature name,
    # then hand the frame to `save_feature`.
    batch_result = batch_result.rename(columns={'rna_id': 'index', 'access': feature_name})
    save_feature(df=batch_result, feature_name=feature_name)
    print(f"Saved: {feature_name}")

Running: Flank=120, Access=20, Seeds=[13])...
Saved: access_120flank_20access_13seed_sizes
Running: Flank=120, Access=13, Seeds=[4, 6, 8])...
Saved: access_120flank_13access_4-6-8seed_sizes
Running: Flank=120, Access=20, Seeds=[4, 6, 8])...
Saved: access_120flank_20access_4-6-8seed_sizes
