Imports

In [1]:
from pathlib import Path
import sys
import time
import numpy as np
import pandas as pd
from concurrent.futures import ProcessPoolExecutor
from notebooks.consts import *
from notebooks.notebook_utils import log_correction, read_cached_gene_to_data, read_cached_gene_to_data
from src.tauso.util import get_antisense
from src.tauso.features.rna_access.access_calculator import AccessCalculator
from src.tauso.features.rna_access.rna_access import RNAAccess
from src.tauso.new_model.populate.populate_sense_accessibility import populate_sense_accessibility, populate_sense_accessibility_batch
from src.tauso.features.rna_access.access_calculator import get_sense_with_flanks

Data preprocessing

In [2]:
# ---------------------------------------------------------------------------
# Data preprocessing: load the ASO table, keep human rows with a usable gene,
# locate each ASO's sense target inside the gene's pre-mRNA, and cut out the
# target together with FLANK_SIZE nucleotides of context on each side.
# ---------------------------------------------------------------------------
PROJECT_ROOT = Path.cwd().parents[1]
# Guarded so that re-running this cell does not keep growing sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))
csv_path = PROJECT_ROOT / "notebooks" / "data" / "data_asoptimizer_updated.csv"
all_data = pd.read_csv(str(csv_path), low_memory=False)

# Remove rows with missing values in the INHIBITION column
all_data_no_nan = all_data.dropna(subset=[INHIBITION]).copy()
# Called for its effect on the frame (the return value is unused); the
# correction exists to avoid log 0.
log_correction(all_data_no_nan)
# Keep only rows where the cell line organism is human
all_data_no_nan_human = all_data_no_nan[all_data_no_nan[CELL_LINE_ORGANISM] == 'human']

genes = all_data_no_nan[CANONICAL_GENE].copy()
# Non-human / negative-control labels that must not reach the gene lookup.
# Filtering (rather than list.remove) does not raise when a label is absent.
EXCLUDED_GENES = {'HBV', 'negative_control'}
genes_u = [gene for gene in set(genes) if gene not in EXCLUDED_GENES]
gene_to_data = read_cached_gene_to_data(genes_u)
# Keep only rows whose gene is in the lookup list
all_data_human_gene = all_data_no_nan_human[all_data_no_nan_human[CANONICAL_GENE].isin(genes_u)].copy()

# Names of the columns added below
SENSE_SEQUENCE = 'sense_sequence'
PRE_MRNA_SEQUENCE = 'pre_mrna_sequence'
SENSE_START = 'sense_start'
SENSE_LENGTH = 'sense_length'

# Build every new column as a plain list and assign once, instead of writing
# cell-by-cell inside an iterrows loop.
sense_sequences = []
pre_mrna_sequences = []
sense_starts = []
sense_lengths = []

for gene_name, antisense in zip(all_data_human_gene[CANONICAL_GENE], all_data_human_gene[SEQUENCE]):
    if gene_name not in gene_to_data:
        # Gene missing from the annotation cache: mark the row as "not found".
        # -1 (not 0) is used because 0 would be indistinguishable from a real
        # match at the first nucleotide, and downstream code tests `!= -1`.
        sense_sequences.append("")
        pre_mrna_sequences.append("")
        sense_starts.append(-1)
        sense_lengths.append(0)
        continue

    pre_mrna = gene_to_data[gene_name].full_mrna
    sense = get_antisense(antisense)

    sense_sequences.append(sense)
    pre_mrna_sequences.append(pre_mrna)
    # str.find returns -1 when the sense sequence is not in the pre-mRNA
    sense_starts.append(pre_mrna.find(sense))
    sense_lengths.append(len(antisense))

all_data_human_gene[SENSE_SEQUENCE] = sense_sequences
all_data_human_gene[PRE_MRNA_SEQUENCE] = pre_mrna_sequences
all_data_human_gene[SENSE_START] = sense_starts
all_data_human_gene[SENSE_LENGTH] = sense_lengths

FLANK_SIZE = 120  # Change this as needed
FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

# Create new column with flanked sequences; rows without a located target get ""
all_data_human_gene[FLANKED_SENSE_COL] = all_data_human_gene.apply(
    lambda row: get_sense_with_flanks(
        row[PRE_MRNA_SEQUENCE],
        row[SENSE_START],
        row[SENSE_LENGTH],
        flank_size=FLANK_SIZE
    ) if row[SENSE_START] != -1 else "",
    axis=1
)
# Only rows with a non-empty flanked sequence are usable downstream
valid_data = all_data_human_gene[
    all_data_human_gene[FLANKED_SENSE_COL].astype(str).str.len() > 0
].copy()

In [3]:
# Small fixed slice (first 500 valid rows) used for the quick old-vs-new comparison below.
df_test = valid_data.iloc[:500].copy()

In [5]:
# Reference run: the original single-row implementation on a private copy of the test slice.
df_old = df_test.copy()
populate_sense_accessibility(df_old)

# Preview gene name next to the accessibility value read back from df_old.
preview_columns = ['Canonical Gene Name', SENSE_AVG_ACCESSIBILITY]
print(df_old.loc[:, preview_columns].head(20))

   Canonical Gene Name  sense_avg_accessibility
0                 KRAS                 5.215761
1                 KRAS                 7.393584
2                 KRAS                 6.227737
3                 KRAS                 6.976352
4                 KRAS                 8.586243
5                 KRAS                 5.109508
6                 KRAS                 4.396005
7                 KRAS                 3.239753
8                 KRAS                 3.073216
9                 KRAS                 3.076650
10                KRAS                 3.404072
11                KRAS                 4.906226
12                KRAS                 3.792322
13                KRAS                 4.145692
14                KRAS                 4.933757
15                KRAS                 3.269480
16                KRAS                 6.914944
17                KRAS                 6.164655
18                KRAS                 6.276175
19                KRAS                 7

In [6]:
# Batch implementation on a fresh copy of the same test slice.
df_new = df_test.copy()
batch_result = populate_sense_accessibility_batch(df_new, batch_size=100)

# Attach the batch output to the input rows: the frame's index is matched
# against the 'rna_id' column of the batch result.
df_new = df_new.merge(batch_result, how='left', left_index=True, right_on='rna_id')

# Accessibility values from the batch path, as floats.
new_result = df_new['access'].astype(float)

In [7]:
# First ten batch-path values, for eyeballing against the single-row output.
new_result.iloc[:10]

0     5.215761
1     7.393584
12    6.227737
23    6.976352
34    8.586242
45    5.109508
56    4.396005
67    3.239753
78    3.073216
89    3.076650
Name: access, dtype: float64

In [8]:
# Benchmark: time the single-row method against the batch method on the same
# reproducible random sample of valid rows.
SAMPLE_SIZE = 5000
df_sample = valid_data.sample(n=SAMPLE_SIZE, random_state=42).copy()

# --- Single-row (old) accessibility calculation ---
print("\nRunning Single-Row method...")
df_single = df_sample.copy()

start_time = time.time()
populate_sense_accessibility(df_single)
single_duration = time.time() - start_time

print(f"Single processing took: {single_duration:.4f} seconds")

# --- Batch (new) accessibility calculation ---
print("\nRunning Batch method...")
df_batch = df_sample.copy()

start_time = time.time()
batch_results = populate_sense_accessibility_batch(df_batch, batch_size=1000)
batch_duration = time.time() - start_time  # only the batch call is timed, not the merge

# Join the batch output back onto the sampled rows (index <-> 'rna_id') and
# expose it under the same column name the single-row method is read from.
df_batch = df_batch.merge(batch_results, how='left', left_index=True, right_on='rna_id')
df_batch[SENSE_AVG_ACCESSIBILITY] = df_batch['access']

print(f"Batch processing took:  {batch_duration:.4f} seconds")

# Report the speedup, guarding against a zero-length measurement.
if not batch_duration > 0:
    print("\nBatch was too fast to measure speedup accurately!")
else:
    speedup = single_duration / batch_duration
    print(f"\nSpeedup Factor: x{speedup:.2f}")
    print(f"(The batch method is {speedup:.1f} times faster)")

print("\n=== Validating Results ===")


Running Single-Row method...
Single processing took: 1614.2429 seconds

Running Batch method...
Batch processing took:  1382.9087 seconds

Speedup Factor: x1.17
(The batch method is 1.2 times faster)

=== Validating Results ===


In [9]:
# Compare single-row and batch accessibility values row by row.
# Only rows whose sense target was located (sense_start != -1) are compared.
valid_indices = df_single[df_single[SENSE_START] != -1].index

res_single = df_single.loc[valid_indices, SENSE_AVG_ACCESSIBILITY].astype(float).values
# The merged batch frame is re-keyed by 'rna_id' so that both result vectors
# are selected with the same labels, in the same order.
res_batch = df_batch.set_index('rna_id').loc[valid_indices, SENSE_AVG_ACCESSIBILITY].astype(float).values

# Element-wise comparison, computed once; NaN on both sides counts as equal.
diff_mask = ~np.isclose(res_single, res_batch, equal_nan=True, atol=1e-5)

# Positions (into res_single / res_batch) of every mismatch. Always defined,
# even when empty, so cells that inspect `idx_diff` work on a clean run.
idx_diff = np.where(diff_mask)[0]
num_diffs = len(idx_diff)
are_close = num_diffs == 0

if are_close:
    # The comparison is tolerance-based, so do not claim exact equality.
    print("✅ SUCCESS: Results match within tolerance!")
else:
    print("❌ WARNING: Results differ!")
    print(f"Number of mismatches: {num_diffs} out of {len(res_single)}")

    first_diff = idx_diff[0]
    print(f"First mismatch at index {valid_indices[first_diff]}:")
    print(f"  Single: {res_single[first_diff]}")
    print(f"  Batch:  {res_batch[first_diff]}")

Number of mismatches: 49 out of 5000
First mismatch at index 26903:
  Single: 0.0
  Batch:  2.0306253846153846


In [17]:
# Show the mismatch position(s) stored in `idx_diff` by the validation cell.
idx_diff

array([  39,   70,  224,  344,  372,  395,  608,  610,  667,  675,  817,
        830,  846,  847,  958, 1234, 1284, 1413, 1443, 1587, 1667, 1750,
       1830, 1855, 1895, 2049, 2060, 2162, 2327, 2519, 2566, 2587, 3017,
       3303, 3343, 3378, 3445, 3511, 3641, 3757, 3763, 3900, 3938, 3951,
       4082, 4205, 4297, 4863, 4966])

After the check above, I fixed the `populate_sense_accessibility_batch` function; after the fix, there were no differences between the single-row and batch results.

In [20]:
SAMPLE_SIZE = 5000
N_WORKERS = 5
SENSE_AVG_ACCESSIBILITY = 'sense_avg_accessibility'

df_sample = valid_data.sample(n=SAMPLE_SIZE, random_state=42).copy()

# ---------------------------------------------------------
# 1. Old calculation - single process
# ---------------------------------------------------------
print("\n1. Running Single Process (Standard Loop)...")
df_single = df_sample.copy()
t0 = time.time()

populate_sense_accessibility(df_single)

time_single = time.time() - t0
print(f"   Done in: {time_single:.2f} seconds")


# ---------------------------------------------------------
# 2. New calculation - parallel batch processing
# ---------------------------------------------------------
print(f"\n2. Running Parallel Batch Process ({N_WORKERS} workers)...")
df_batch = df_sample.copy()
t0 = time.time()


def process_chunk(chunk_df):
    """Run the batch accessibility calculation on one chunk in a worker process.

    The whole chunk is submitted as a single batch (batch_size == len(chunk_df)).
    Returns whatever populate_sense_accessibility_batch returns for that chunk.

    NOTE(review): this function is defined in the notebook's __main__, which
    worker processes can only resolve under the 'fork' start method. On
    platforms that spawn workers, move it to an importable module.
    """
    return populate_sense_accessibility_batch(chunk_df, batch_size=len(chunk_df))


# Split row *positions* rather than the DataFrame itself, then slice with
# .iloc. The chunk boundaries are the same as np.array_split(df_batch, N_WORKERS).
# Empty chunks (N_WORKERS > number of rows) are skipped.
row_groups = np.array_split(np.arange(len(df_batch)), N_WORKERS)
chunks = [df_batch.iloc[rows] for rows in row_groups if len(rows) > 0]

with ProcessPoolExecutor(max_workers=N_WORKERS) as executor:
    # executor.map yields results in the same order as `chunks`
    results = list(executor.map(process_chunk, chunks))

if results:
    full_batch_result = pd.concat(results, ignore_index=True)

    # Join per-chunk outputs back onto the sampled rows (index <-> 'rna_id')
    df_batch = df_batch.merge(
        full_batch_result,
        left_index=True,
        right_on='rna_id',
        how='left'
    )
    df_batch[SENSE_AVG_ACCESSIBILITY] = df_batch['access']

# The parallel timing deliberately includes chunking, pool start-up and the merge
time_batch = time.time() - t0
print(f"   Done in: {time_batch:.2f} seconds")


if time_batch > 0:
    speedup = time_single / time_batch
    print(f"\nFINAL SPEEDUP: x{speedup:.2f}")
    print(f"(Parallel batch is {speedup:.1f} times faster!)")


1. Running Single Process (Standard Loop)...
   Done in: 1633.49 seconds

2. Running Parallel Batch Process (5 workers)...


  return bound(*args, **kwds)


   Done in: 536.09 seconds

FINAL SPEEDUP: x3.05
(Parallel batch is 3.0 times faster!)
