In [1]:
from scripts.data_genertion.consts import *
from scripts.features.feature_extraction import load_all_features

main_df = load_all_features()
main_df[SEQUENCE] = main_df[SEQUENCE].astype(str)

In [2]:
from asodesigner.read_human_genome import get_locus_to_data_dict
import pickle
from asodesigner.consts import CACHE_DIR

genes_u = ['HIF1A', 'APOL1', 'YAP1', 'SOD1', 'SNCA', 'IRF4', 'KRAS', 'KLKB1', 'SNHG14', 'DGAT2', 'IRF5', 'HTRA1',
           'MYH7', 'MALAT1', 'HSD17B13']
cache_path = CACHE_DIR / 'gene_to_data_simple_cache.pickle'
if not cache_path.exists():
    gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)
    with open(cache_path, 'wb') as f:
        pickle.dump(gene_to_data, f)
else:
    with open(cache_path, 'rb') as f:
        gene_to_data = pickle.load(f)

In [3]:
from scripts.data_genertion.data_handling import get_populated_df_with_structure_features

main_df = get_populated_df_with_structure_features(main_df, genes_u, gene_to_data)

In [4]:
main_df[[SENSE_TYPE, 'sense_intron', 'sense_exon', 'sense_utr', SENSE_START, CANONICAL_GENE]]

Unnamed: 0,sense_type,sense_intron,sense_exon,sense_utr,sense_start,Canonical Gene Name
0,intron,1,0,0,41212,KRAS
1,intron,1,0,0,23686,KRAS
2,intron,1,0,0,43363,KRAS
3,intron,1,0,0,23680,KRAS
4,intron,1,0,0,41168,KRAS
...,...,...,...,...,...,...
34760,intron,1,0,0,7827,APOL1
34761,intron,1,0,0,8250,APOL1
34762,intron,1,0,0,8335,APOL1
34763,utr,0,1,1,13848,APOL1


In [5]:
main_df.loc[:, 'mrna_length'] = [len(gene_to_data[gene].full_mrna) for gene in main_df[CANONICAL_GENE]]
main_df['normalized_sense_start_from_end'] = main_df['sense_start_from_end'] / main_df['mrna_length']

In [6]:
import numpy as np

first_filtered = main_df.copy()
first_filtered = first_filtered[first_filtered['Cell line organism'] == 'human']
first_filtered = first_filtered.dropna(subset=[INHIBITION]).copy()
log_correction = 1.01
first_filtered.loc[:, 'log_inhibition'] = -np.log(log_correction - first_filtered[INHIBITION] / 100)
first_filtered = first_filtered[~first_filtered[CELL_LINE].isin([
    'Hela',  # scanning modifications
    'Human Neuronal Cell',  # scanning modifications;
    'CC-2580',  # scanning modifications
    'SH-SY5Y'  # non pure PS based
])]

In [7]:

ALL_CELL_LINES = first_filtered[CELL_LINE].unique()
# non_hepa_cancer = filtered_original[
#     # (~((filtered_original[CELL_LINE] == 'HepG2') & (filtered_original[TRANSFECTION].str.contains('Lipo')))) &
#     (filtered_original[CELL_LINE] != 'HepG2')  # experiment settings too different
#     & (filtered_original[CELL_LINE] != 'HepaRG')  # not similar to cancer
#     & (filtered_original[CELL_LINE] != 'A-431')
#     & (filtered_original[CELL_LINE] != 'SNU-449')
#     # & (filtered_original[CELL_LINE] != 'KARPAS-229')
#     # & (filtered_original[CELL_LINE] != 'MM.1R')
# ]

# ['A431', 'KARPAS-229', 'MM.1R', 'SK-MEL-28',
non_hepa_cancer = first_filtered[
    # (~((filtered_original[CELL_LINE] == 'HepG2') & (filtered_original[TRANSFECTION].str.contains('Lipo')))) &
    (first_filtered[CELL_LINE] == 'A431')  # experiment settings too different
    | (first_filtered[CELL_LINE] == 'KARPAS-229')  # not similar to cancer
    | (first_filtered[CELL_LINE] == 'MM.1R')
    | (first_filtered[CELL_LINE] == 'SK-MEL-28')
    # & (filtered_original[CELL_LINE] != 'KARPAS-229')
    # & (filtered_original[CELL_LINE] != 'MM.1R')

    ].copy()

In [15]:
selected_features = [
    TREATMENT_PERIOD,
    'at_skew',
    'CAI_score_global_CDS',
    'stop_codon_count',
    'sense_avg_accessibility',
    'on_target_fold_openness_normalized40_15',
    'sense_utr',
    'nucleotide_diversity',
    'internal_fold',
    'normalized_start',
    'RNaseH1_Krel_score_R7_krel', # renamed to best
    'hairpin_score',
    'Modification_min_distance_to_3prime',
    'at_rich_region_score'
]

In [16]:
import matplotlib.pyplot as plt
from scipy.stats import spearmanr, pearsonr

metric = 'correct_log_inhibition2'

li = non_hepa_cancer['log_inhibition'].copy()
non_hepa_cancer['correct_log_inhibition'] = li / (non_hepa_cancer[VOLUME] / (non_hepa_cancer[VOLUME] + 10))
non_hepa_cancer['correct_log_inhibition2'] = li / (non_hepa_cancer[VOLUME] * 0.0000601 + 0.537)
non_hepa_cancer['correct_log_inhibition3'] = non_hepa_cancer['correct_log_inhibition2'] / (-0.032*  non_hepa_cancer[TREATMENT_PERIOD] + 2.36)


In [17]:
min_val = non_hepa_cancer[metric].min()
non_hepa_cancer[metric + 'ndcg'] = (
    non_hepa_cancer[metric] - min_val
) # for evaluation purposes, not training!

In [18]:
cell_lines = ALL_CELL_LINES
genes = non_hepa_cancer[CANONICAL_GENE].unique()

In [19]:
seed = 42


In [20]:
from xgboost import XGBRanker

model = XGBRanker(objective='rank:ndcg', ndcg_exp_gain=False, lambdarank_pair_method="topk",
                  lambdarank_num_pair_per_sample=200,
                  seed=seed, n_jobs=-1
                  )

In [21]:
# Real model, better to train on everything here
df_copy = non_hepa_cancer.copy()
df_copy['group_key'] = df_copy[CELL_LINE].astype(str)
df_copy = df_copy.sort_values('group_key')
group_train = df_copy.groupby('group_key').size().tolist()
X = df_copy[selected_features]  # CHANGED: using selected_features
y = df_copy[metric]
model.fit(X.to_numpy(), y.to_numpy(), group=group_train)

0,1,2
,objective,'rank:ndcg'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [22]:
# Check the wellness of fit
predicted = model.predict(X[selected_features].values)

corr, _ = pearsonr(predicted, y)
corrs, _ = spearmanr(predicted, y)

print(
    f"Pearson {corr}, Spearman {corrs}")


Pearson 0.5207460562690249, Spearman 0.48690341402197834


In [23]:
from sklearn.metrics import ndcg_score

y_true_corrected = df_copy[metric + 'ndcg'].to_numpy()

# ---- New metrics ----
# NDCG@50
ndcg50 = ndcg_score(y_true_corrected.reshape(1, -1), predicted.reshape(1, -1), k=200)
# NDCG (all, just drop k)
ndcg_all = ndcg_score(y_true_corrected.reshape(1, -1), predicted.reshape(1, -1))
# Precision@50 (manual overlap of top-50 by truth vs pred)
precisions = []
values = [50, 100]
for K in values:
    pred_top_idx = np.argpartition(predicted, -K)[-K:]
    true_top_idx = np.argpartition(y_true_corrected, -K)[-K:]
    precisions.append(len(set(pred_top_idx) & set(true_top_idx)) / K)

print(f"NDCG@50: {ndcg50:.4f}, NDCG(all): {ndcg_all:.4f}, Precision: {values, precisions}")

NDCG@50: 0.8041, NDCG(all): 0.9823, Precision: ([50, 100], [0.28, 0.36])


In [24]:
# Very impressive results above, but go figure

In [25]:
TARGETS = ['MALAT1']
SEQUENCES = {}
for target in TARGETS:
    SEQUENCES[target] = gene_to_data[target].full_mrna

In [26]:
from asodesigner.util import get_antisense
import pandas as pd

def get_init_df(target_mrna, end):
    candidates = []
    sense_starts = []
    sense_lengths = []
    sense_starts_from_end = []

    for i in range(0, len(target_mrna) - 19):
        target = target_mrna[i: i + 20]
        candidates.append(get_antisense(str(target)))
        sense_starts.append(i)
        sense_lengths.append(20)
        sense_starts_from_end.append(end - i)
    df = pd.DataFrame(
        {SEQUENCE: candidates, SENSE_START: sense_starts,
         SENSE_LENGTH: sense_lengths, "sense_start_from_end": sense_starts_from_end})
    return df

dfs = {}
for target in TARGETS:
    gene_info = gene_to_data[target]
    dfs[target] = get_init_df(gene_info.full_mrna, gene_info.exon_indices[-1][1] - gene_info.cds_start)

In [27]:
for gene, df in dfs.items():
    df[CANONICAL_GENE] = gene

In [28]:
for df in dfs.values():
    df[CELL_LINE_ORGANISM] = 'human'
    df[INHIBITION] = 0 # Just for the function, not important

In [29]:
for gene, df in dfs.items():
    df = get_populated_df_with_structure_features(df, TARGETS, gene_to_data)
    dfs[gene] = df

In [30]:
from scripts.data_genertion.data_handling import populate_features

for gene, df in dfs.items():
    df[TREATMENT_PERIOD] = 24  # keep constant for all
    df[VOLUME] = 1000  # keep constant for all
    df['log_volume'] = np.log(df[VOLUME])
    df['normalized_start'] = df[SENSE_START] / len(SEQUENCES[gene])
    df['normalized_sense_start_from_end'] = df['sense_start_from_end'] / len(SEQUENCES[gene])
    easy_to_populate = ['at_skew', 'gc_content', 'gc_content_3_prime_5', 'gc_skew', 'hairpin_score',
                        'homooligo_count', 'internal_fold', 'nucleotide_diversity', 'self_energy', 'stop_codon_count',
                        'at_rich_region_score', 'poly_pyrimidine_stretch']
    populate_features(df, easy_to_populate)

In [31]:
from scripts.data_genertion.data_handling import get_populate_fold

for gene, df in dfs.items():
    fold_variants = [(40, 15)]
    df = get_populate_fold(df, TARGETS, gene_to_data, fold_variants=fold_variants)
    dfs[gene] = df

In [32]:
from hybridization.hybridization_features import get_exp_psrna_hybridization

for gene, df in dfs.items():
    df.loc[:, 'exp_ps_hybr'] = [
        get_exp_psrna_hybridization(antisense.replace('T', 'U'), temp=37) for
        antisense in df[SEQUENCE]]

In [33]:
# Missing: CAI_score_global_CDS, 'sense_avg_accessibility', RNaseH1_Krel_score_R7_krel, Modification_min_distance_to_3prime

In [34]:
from features.mod_features import compute_mod_min_distance_to_3prime

# generate MOE 20-mers
for gene, df in dfs.items():
    df.loc[:, 'Modification_min_distance_to_3prime'] = compute_mod_min_distance_to_3prime('MMMMMddddddddddMMMMM')

In [45]:
from yehuda_code.Folding_Functions import get_sense_with_flanks
from yehuda_code.access_calculator import AccessCalculator



FLANK_SIZE = 120
ACCESS_SIZE = 13
SEED_SIZE = 13
SEED_SIZES = [SEED_SIZE * m for m in range(1, 4)]
ACCESS_WIN_SIZE = 80


def compute_sense_accessibility(row, flank_size, access_win_size, seed_sizes, access_size, min_gc=0, max_gc=100, gc_ranges=1):


    try:
        # Skip invalid rows
        if row['sense_start'] == -1 or pd.isna(row['sense_with_flank_120nt']) or row['sense_with_flank_120nt'] == "":
            return None

        seq = row[f'sense_with_flank_{flank_size}nt']
        sense_start = row['sense_start']
        sense_length = row['sense_length']

        # Calculate accessibility
        df_access = AccessCalculator.calc(
            seq, access_size,
            min_gc, max_gc, gc_ranges,
            access_win_size, seed_sizes
        )

        flank_start = max(0, sense_start - flank_size)
        sense_start_in_flank = sense_start - flank_start
        sense_end_in_flank = sense_start_in_flank + sense_length

        if 0 <= sense_start_in_flank < len(df_access) and sense_end_in_flank <= len(df_access):
            values = df_access['avg_access'].iloc[sense_start_in_flank:sense_end_in_flank].dropna()
            return values.mean() if not values.empty else None
        else:
            return None

    except Exception as e:
        print(f"Error at row {row.name} | seq start: {row['sense_start']} | error: {e}")
        return None


for gene, df in dfs.items():
    FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

    val = gene_to_data[gene].full_mrna
    df['pre_mrna_sequence'] = [val] * len(df)


    # Create new column with flanked sequences
    df[FLANKED_SENSE_COL] = df.apply(
    lambda row: get_sense_with_flanks(
        row['pre_mrna_sequence'],
        row['sense_start'],
        row['sense_length'],
        flank_size=FLANK_SIZE
    ) if row['sense_start'] != -1 else "",  # Handle cases where sense was not found
    axis=1
    )


    batch_size = 500
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx].copy()

        print(f"Processing rows {start_idx} to {end_idx}...")

        batch['sense_avg_accessibility'] = batch.apply(
            compute_sense_accessibility,
            axis=1,
            flank_size=FLANK_SIZE,
            access_win_size=ACCESS_WIN_SIZE,
            seed_sizes=SEED_SIZES,
            access_size=ACCESS_SIZE,
        )

        # Save batch to the new folder
        batch.to_csv(f"out/batch_{start_idx}_{end_idx}.csv", index=False)


Processing rows 0 to 500...
Processing rows 500 to 1000...
Processing rows 1000 to 1500...
Processing rows 1500 to 2000...
Processing rows 2000 to 2500...
Processing rows 2500 to 3000...
Processing rows 3000 to 3500...
Processing rows 3500 to 4000...
Processing rows 4000 to 4500...
Processing rows 4500 to 5000...
Processing rows 5000 to 5500...
Processing rows 5500 to 6000...
Processing rows 6000 to 6500...
Processing rows 6500 to 7000...
Processing rows 7000 to 7500...
Processing rows 7500 to 8000...
Processing rows 8000 to 8500...
Processing rows 8500 to 8810...


In [46]:
import pandas as pd
import glob

# Load all batch files from the new output folder
files = sorted(glob.glob(f"out/batch_*.csv"))
df_all = pd.concat([pd.read_csv(f) for f in files], axis=0)


In [47]:
df_all

Unnamed: 0,Sequence,sense_start,sense_length,sense_start_from_end,Canonical Gene Name,Cell line organism,Inhibition(%),sense_exon,sense_intron,sense_utr,...,on_target_fold_openness40_15,on_target_fold_openness_normalized40_15,exp_ps_hybr,Modification_min_distance_to_3prime,RNaseH1_Krel_score_R4a_krel_Krel,RNaseH1_Krel_score_R4b_krel_Krel,RNaseH1_Krel_score_R7_krel_Krel,pre_mrna_sequence,sense_with_flank_120nt,sense_avg_accessibility
0,GCAACCGGTGGGGCTGCGTC,0,20,8829,MALAT1,human,0,1,0,0,...,-10.062500,-0.503125,-2923,0.0,0.978831,1.004691,1.025778,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,8.867892
1,CGCAACCGGTGGGGCTGCGT,1,20,8828,MALAT1,human,0,1,0,0,...,-9.875000,-0.493750,-2941,0.0,0.949957,1.029123,1.015559,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,8.812901
2,GCGCAACCGGTGGGGCTGCG,2,20,8827,MALAT1,human,0,1,0,0,...,-9.687500,-0.484375,-2956,0.0,0.980212,1.002733,0.990129,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,8.846318
3,TGCGCAACCGGTGGGGCTGC,3,20,8826,MALAT1,human,0,1,0,0,...,-9.500000,-0.475000,-2907,0.0,0.968729,0.989912,0.972045,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,8.633534
4,CTGCGCAACCGGTGGGGCTG,4,20,8825,MALAT1,human,0,1,0,0,...,-9.312500,-0.465625,-2867,0.0,0.958289,0.988861,0.957519,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,8.560287
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,TTCCCCAATCAAGATTTTTT,8805,20,24,MALAT1,human,0,1,0,0,...,-0.333333,-0.016667,-2044,0.0,0.991729,0.957326,0.986575,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,TTGTCTTTTTCAGGTAATAGCCTGCAGCTGGTGTTTTGAGAAGCCC...,
306,TTTCCCCAATCAAGATTTTT,8806,20,23,MALAT1,human,0,1,0,0,...,-0.300000,-0.015000,-2044,0.0,0.978869,0.959173,1.013243,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,TGTCTTTTTCAGGTAATAGCCTGCAGCTGGTGTTTTGAGAAGCCCT...,
307,TTTTCCCCAATCAAGATTTT,8807,20,22,MALAT1,human,0,1,0,0,...,-0.266667,-0.013333,-2044,0.0,1.068860,1.000271,0.982115,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,GTCTTTTTCAGGTAATAGCCTGCAGCTGGTGTTTTGAGAAGCCCTA...,
308,TTTTTCCCCAATCAAGATTT,8808,20,21,MALAT1,human,0,1,0,0,...,-0.233333,-0.011667,-2044,0.0,1.008057,0.983111,0.992385,GACGCAGCCCCACCGGTTGCGCAGTCCCTCCCCGCCCCCGCTCTCC...,TCTTTTTCAGGTAATAGCCTGCAGCTGGTGTTTTGAGAAGCCCTAC...,


In [60]:
from asodesigner.features.RNaseH_features import rnaseh1_dict, compute_rnaseh1_score

best_window_start_krel = {
    'R4a_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 3,  18: 2, 19: 4, 20: 3, 21: 0, 22: 0, 25:0},
    'R4b_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 1,  18: 3, 19: 1, 20: 3, 21: 0, 22: 0, 25:0},
    'R7_krel':  {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 3, 17: 2,  18: 4, 19: 6, 20: 4, 21: 0, 22: 0, 25:0},
}

for exp in ['R4a_krel', 'R4b_krel', 'R7_krel']:
    weights = rnaseh1_dict(exp)

    def score_row(row):
        length = len(row['Sequence'])
        pos = best_window_start_krel.get(exp, {}).get(length, 0)
        return compute_rnaseh1_score(row['Sequence'], weights, window_start=pos)

    col_name = f"RNaseH1_Krel_score_{exp}"
    df_all[col_name] = df_all.apply(score_row, axis=1)

RNaseH1_Krel_features_best = [f"RNaseH1_Krel_score_{exp}" for exp in ['R4a_krel', 'R4b_krel', 'R7_krel']]


In [61]:
df_all.loc[:, 'CAI_score_global_CDS'] = 0.8526221963673779 # TODO - replace with the calculation


In [62]:
df_all.columns

Index(['Sequence', 'sense_start', 'sense_length', 'sense_start_from_end',
       'Canonical Gene Name', 'Cell line organism', 'Inhibition(%)',
       'sense_exon', 'sense_intron', 'sense_utr', 'sense_type',
       'Treatment_Period(hours)', 'ASO_volume(nM)', 'log_volume',
       'normalized_start', 'normalized_sense_start_from_end', 'self_energy',
       'internal_fold', 'gc_content', 'gc_content_3_prime_5',
       'homooligo_count', 'hairpin_score', 'gc_skew', 'at_skew',
       'nucleotide_diversity', 'stop_codon_count', 'at_rich_region_score',
       'poly_pyrimidine_stretch', 'on_target_fold_openness40_15',
       'on_target_fold_openness_normalized40_15', 'exp_ps_hybr',
       'Modification_min_distance_to_3prime',
       'RNaseH1_Krel_score_R4a_krel_Krel', 'RNaseH1_Krel_score_R4b_krel_Krel',
       'RNaseH1_Krel_score_R7_krel_Krel', 'pre_mrna_sequence',
       'sense_with_flank_120nt', 'sense_avg_accessibility',
       'CAI_score_global_CDS', 'RNaseH1_Krel_score_R4a_krel',
       

In [68]:
malat_scores = model.predict(df_all[selected_features])


In [86]:
from asodesigner.util import get_antisense

# Assuming you already have get_antisense(seq: str) -> str defined
# get_antisense, for some reason numba doesn't work well
tbl = str.maketrans("ACGTUacgtuNn", "TGCAAtgcaaNn")

df_all["score"] = malat_scores

df_all["sense"] = df_all[SEQUENCE].astype(str).str.translate(tbl).str[::-1]
(
    df_all.assign(score=malat_scores)
    .sort_values("score", ascending=False)   # sort by score
    .to_csv("malat_scores_model2.csv", index=False)
)


In [87]:
df_sorted = df_all.sort_values('score', ascending=False)


In [88]:
# pip install requests
import math, time, threading, urllib.parse, requests
from concurrent.futures import ThreadPoolExecutor, as_completed

UA = {"User-Agent": "python-requests gggenome/greedy"}

def _ggg_hits_leq_json(seq, k, db="hg38", timeout=60, retries=2):
    """Count hits with <=k mismatches via GGGenome JSON; fallback to CSV if needed."""
    s = str(seq).upper().replace("U", "T")
    q = urllib.parse.quote(s)
    url_json = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.json"
    url_csv  = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.csv?download"

    for attempt in range(retries + 1):
        try:
            r = requests.get(url_json, headers=UA, timeout=timeout)
            r.raise_for_status()
            try:
                data = r.json()
            except ValueError:
                raise RuntimeError("JSON parse failed")
            if isinstance(data, list):
                return len(data)
            if isinstance(data, dict):
                if "results" in data and isinstance(data["results"], list): return len(data["results"])
                if "hits" in data and isinstance(data["hits"], list):       return len(data["hits"])
                return sum(len(v) for v in data.values() if isinstance(v, list))
            return 0
        except Exception:
            # greedy CSV fallback
            try:
                r2 = requests.get(url_csv, headers=UA, timeout=timeout)
                r2.raise_for_status()
                return sum(1 for ln in r2.text.splitlines() if ln and not ln.startswith("#"))
            except Exception:
                if attempt < retries:
                    continue
                return 0
    return 0

def _d123_for_sequence(seq, db="hg38"):
    s = str(seq).upper().replace("U", "T")
    if not s:
        return (s, 0, 0, 0)
    L = len(s)
    k_allowed = max(0, math.floor(0.25 * L))  # GGGenome cap
    k0 = _ggg_hits_leq_json(s, 0, db=db)
    k1 = _ggg_hits_leq_json(s, 1, db=db) if k_allowed >= 1 else 0
    k2 = _ggg_hits_leq_json(s, 2, db=db) if k_allowed >= 2 else 0
    k3 = _ggg_hits_leq_json(s, 3, db=db) if k_allowed >= 3 else 0
    d1 = max(0, k1 - k0)
    d2 = max(0, k2 - k1)
    d3 = max(0, k3 - k2)
    return (s, d1, d2, d3)

def add_gggenome_d123(main_df, seq_col="SEQUENCE", db="hg38", *, max_workers=32, print_every=10):
    seqs = (main_df[seq_col].astype(str).str.upper().str.replace("U", "T", regex=False))
    uniq = seqs.dropna().unique().tolist()
    N = len(uniq)
    print(f"[GGG] Unique sequences: {N} | db={db} | workers={max_workers}")

    cache = {}
    lock = threading.Lock()
    t0 = time.perf_counter()
    errs = 0

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(_d123_for_sequence, s, db): s for s in uniq}
        done = 0
        for fut in as_completed(futs):
            s = futs[fut]
            try:
                s_key, d1, d2, d3 = fut.result()
            except Exception:
                d1 = d2 = d3 = 0
                with lock:
                    errs += 1
            with lock:
                cache[s] = (d1, d2, d3)
                done += 1
                if (done == 1) or (done % print_every == 0) or (done == N):
                    elapsed = time.perf_counter() - t0
                    rps = done / elapsed if elapsed > 0 else 0.0
                    print(f"[GGG] {done}/{N} cached | ~{rps:.1f} seq/s | errors={errs}")

    main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (0, 0, 0))[0])
    main_df["ggg_d2"] = seqs.map(lambda s: cache.get(s, (0, 0, 0))[1])
    main_df["ggg_d3"] = seqs.map(lambda s: cache.get(s, (0, 0, 0))[2])

    print(f"[GGG] Finished in {time.perf_counter() - t0:.1f}s. Added columns: ggg_d1, ggg_d2, ggg_d3")
    return main_df


# --- usage ---
# main_df = main_df[main_df[SENSE_START] != -1]
result = add_gggenome_d123(df_sorted[:100], seq_col='sense', db="hg38", max_workers=100, print_every=2)

[GGG] Unique sequences: 100 | db=hg38 | workers=100
[GGG] 1/100 cached | ~0.0 seq/s | errors=0
[GGG] 2/100 cached | ~0.0 seq/s | errors=0
[GGG] 4/100 cached | ~0.1 seq/s | errors=0
[GGG] 6/100 cached | ~0.1 seq/s | errors=0
[GGG] 8/100 cached | ~0.1 seq/s | errors=0
[GGG] 10/100 cached | ~0.2 seq/s | errors=0
[GGG] 12/100 cached | ~0.2 seq/s | errors=0
[GGG] 14/100 cached | ~0.2 seq/s | errors=0
[GGG] 16/100 cached | ~0.2 seq/s | errors=0
[GGG] 18/100 cached | ~0.3 seq/s | errors=0
[GGG] 20/100 cached | ~0.3 seq/s | errors=0
[GGG] 22/100 cached | ~0.3 seq/s | errors=0
[GGG] 24/100 cached | ~0.4 seq/s | errors=0
[GGG] 26/100 cached | ~0.4 seq/s | errors=0
[GGG] 28/100 cached | ~0.4 seq/s | errors=0
[GGG] 30/100 cached | ~0.4 seq/s | errors=0
[GGG] 32/100 cached | ~0.5 seq/s | errors=0
[GGG] 34/100 cached | ~0.5 seq/s | errors=0
[GGG] 36/100 cached | ~0.5 seq/s | errors=0
[GGG] 38/100 cached | ~0.5 seq/s | errors=0
[GGG] 40/100 cached | ~0.6 seq/s | errors=0
[GGG] 42/100 cached | ~0.6 se

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (0, 0, 0))[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d2"] = seqs.map(lambda s: cache.get(s, (0, 0, 0))[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d3"] = seqs.map(lambda s: cache.get(s, (0, 0, 0))[2])


In [98]:
# Show up to 200 characters per column
pd.set_option("display.max_colwidth", 200)

# Show more columns across the screen
pd.set_option("display.max_columns", None)

# Optionally, widen the console display
pd.set_option("display.width", 200)

print(result[:30][[SEQUENCE, 'sense_start', 'ggg_d1', 'ggg_d2', 'at_skew', 'gc_content', 'on_target_fold_openness_normalized40_15', 'sense_avg_accessibility', 'RNaseH1_Krel_score_R7_krel']])


                 Sequence  sense_start  ggg_d1  ggg_d2   at_skew  gc_content  on_target_fold_openness_normalized40_15  sense_avg_accessibility  RNaseH1_Krel_score_R7_krel
5    TATCTTCTCTATTCTTTTCT         2005      43    1000 -0.733333        0.25                                -0.004375                 0.903107                    0.974312
7    CCTATCTTCTCTATTCTTTT         2007       3       0 -0.714286        0.30                                -0.005792                 1.027160                    0.977993
12   ATTTCCCTATCTTCTCTATT         2012       5       0 -0.571429        0.30                                -0.009583                 1.266860                    0.992284
76   CTGCTTCCTACTTTTCAGGT         2076       2       0 -0.636364        0.45                                -0.084000                 1.498184                    0.973887
337  TTTCGGCTTCTTTTATTCCA         5337       1      27 -0.692308        0.35                                -0.084667                 1.489497   

In [99]:
df_sorted.tail(30)[['on_target_fold_openness_normalized40_15']]

Unnamed: 0,on_target_fold_openness_normalized40_15
383,-0.569417
438,-0.427958
427,-0.480875
444,-0.414875
443,-0.416208
425,-0.459792
456,-0.450292
455,-0.452917
459,-0.442417
460,-0.439792


In [None]:
# Now let's generate LNA GFP sequences

In [208]:
# GFP_IN_YEAST = (
#     'ATGGTtAGtAAaGGaGAaGAGTTgTTCACaGGaGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGGTGCTGGGGCAggtacCCCTAAAGATCCAGCCAAACCTCCGGCCAcGGCACAAGTTGTGGGATGGCCACCGGTGAGATCATACCGGAAGAACGTGATGGTTTCCTGCCAAAAATCAAGCGGTGGCCCGGAGGCGGCGGCGTTCGTGAAGTAA'
#     .upper())

GFP_IN_HUMAN = (
"ctttttcgcaacgggtttgccgccagaacacaggaccggtgccaccatggtgagcaagggcgaggagctgttcaccggggtggtgcccatcctggtcgagctggacggcgacgtaaacggccacaagttcagcgtgtccggcgagggcgagggcgatgccacctacggcaagctgaccctgaagttcatctgcaccaccggcaagctgcccgtgccctggcccaccctcgtgaccaccctgacctacggcgtgcagtgcttcagccgctaccccgaccacatgaagcagcacgacttcttcaagtccgccatgcccgaaggctacgtccaggagcgcaccatcttcttcaaggacgacggcaactacaagacccgcgccgaggtgaagttcgagggcgacaccctggtgaaccgcatcgagctgaagggcatcgacttcaaggaggacggcaacatcctggggcacaagctggagtacaactacaacagccacaacgtctatatcatggccgacaagcagaagaacggcatcaaggtgaacttcaagatccgccacaacatcgaggacggcagcgtgcagctcgccgaccactaccagcagaacacccccatcggcgacggccccgtgctgctgcccgacaaccactacctgagcacccagtccgccctgagcaaagaccccaacgagaagcgcgatcacatggtcctgctggagttcgtgaccgccgccgggatcactctcggcatggacgagctgtacaagcccaagaaaaagcggaaagtgggatccggcgcaacaaacttctctctgctgaaacaagccggagatgtcgaagagaatcctggaccgaccgagtacaagcccacggtgcgcctcgccacccgcgacgacgtccccagggccgtacgcaccctcgccgccgcgttcgccgactaccccgccacgcgccacaccgtcgatccggaccgccacatcgagcgggtcaccgagctgcaagaactcttcctcacgcgcgtcgggctcgacatcggcaaggtgtgggtcgcggacgacggcgccgcggtggcggtctggaccacgccggagagcgtcgaagcgggggcggtgttcgccgagatcggcccgcgcatggccgagttgagcggttcccggctggccgcgcagcaacagatggaaggcctcctggcgccgcaccggcccaaggagcccgcgtggttcctggccaccgtcggagtctcgcccgaccaccagggcaagggtctgggcagcgccgtcgtgctccccggagtggaggcggccgagcgcgccggggtgcccgccttcctggagacctccgcgccccgcaacctccccttctacgagcggctcggcttcaccgtcaccgccgacgtcgaggtgcccgaaggaccgcgcacctggtgcatgacccgcaagcccggtgcctgaacgcgttaagtcgacaatcaacctctggattacaaaatttgtgaaagattgactggtattcttaactatgttgctccttttacgctatgtggatacgctgctttaatgcctttgtatcatgctattgcttcccgtatggctttcattttctcctccttgtataaatcctggttgctgtctctttatgaggagttgtggcccgttgtcaggcaacgtggcgtggtgtgcactgtgtttgctgacgcaacccccactggttggggcattgccaccacctgtcagctcctttccgggactttcgctttccccctccctattgccacggcggaactcatcgccgcctgccttgcccgctgctggacaggggctcggctgttgggcactgacaattccgtggtgttgtcggggaaatcatcgtcctttccttggctgctcgcctgtgttgccacctggattctgcgcgggacgtccttctgctacgtcccttcggccctcaatccagcggaccttccttcccgcggcctgctgccggctctgcggcctcttccgcgtcttcgccttcgccctcagacgagtcggatctccctttgggccgcctccccgcgtcgactttaagaccaatgacttacaaggcagctgtagatcttagccactttttaaaagaaaaggggggactggaagggctaattcactcccaacgaagacaagatctgctttttgcttgtactgggtctctctggttagaccagatctgagcctgggagctctctggctaactagggaacccactgcttaagcctcaataaagcttgccttgagtgcttcaagtagtgtgtgcccgtctgttgtgtgactctggtaactagagatccctcagacccttttagtcagtgtggaaaatctctagcagggcccgtttaaacccgctgatcagcctcgactgtgccttctagttgccagccatctgttgtttgcccctcccccgtgccttccttgaccctggaaggtgccactcccactgtcctttcctaataaa".upper()
)
# GFP + Degron x <= 842
# NLS 843 <= x<= 893
# 3UTR 1615 <= x <= 1848


In [209]:
GFP_IN_HUMAN[633:633+16]


'GCCCGACAACCACTAC'

In [210]:
GFP_IN_HUMAN[679:679+16]

'CCCAACGAGAAGCGCG'

In [211]:
from asodesigner.util import get_antisense
import pandas as pd

# GFP_YEAST_END = len(GFP_IN_YEAST) - 3
GFP_HUMAN_END = len(GFP_IN_HUMAN)
LNA_SIZE = 16

def get_init_df(target_mrna, end):
    candidates = []
    sense_starts = []
    sense_lengths = []
    sense_starts_from_end = []

    for i in range(0, len(target_mrna) - (LNA_SIZE - 1)):
        target = target_mrna[i: i + LNA_SIZE]
        candidates.append(get_antisense(str(target)))
        sense_starts.append(i)
        sense_lengths.append(LNA_SIZE)
        sense_starts_from_end.append(end - i)
    df = pd.DataFrame(
        {SEQUENCE: candidates, SENSE_START: sense_starts,
         SENSE_LENGTH: sense_lengths, "sense_start_from_end": sense_starts_from_end})
    return df


# df_yeast = get_init_df(GFP_IN_YEAST, GFP_YEAST_END)
df_human = get_init_df(GFP_IN_HUMAN, GFP_HUMAN_END)

In [213]:
from asodesigner.util import get_antisense
import pandas as pd

# df_yeast[CANONICAL_GENE] = 'YEAST_GFP'
df_human[CANONICAL_GENE] = 'HUMAN_GFP'
dataframes = [df_human]

# df_yeast['sense_exon'] = 1
df_human['sense_exon'] = 1
# df_yeast['sense_intron'] = 0
df_human['sense_intron'] = 0
# df_yeast['sense_utr'] = 0
df_human['sense_utr'] = [1 if sense_start > 842 else 0 for sense_start in df_human[SENSE_START]]


In [214]:
%reload_ext autoreload
%autoreload 2

from asodesigner.process_utils import LocusInfo
from scripts.data_genertion.data_handling import get_populate_fold

genes_u = ['YEAST_GFP', 'HUMAN_GFP', 'MALAT1']
gene_to_data = {'YEAST_GFP': LocusInfo(), 'HUMAN_GFP': LocusInfo(), 'MALAT1': LocusInfo()}
# gene_to_data['YEAST_GFP'].full_mrna = GFP_IN_YEAST
gene_to_data['HUMAN_GFP'].full_mrna = GFP_IN_HUMAN

In [215]:
from scripts.data_genertion.data_handling import populate_features

dfs = dict()
# dfs['YEAST_GFP'] = df_yeast
dfs['HUMAN_GFP'] = df_human
SEQUENCES = {}
for target in ['HUMAN_GFP']:
# for target in ['YEAST_GFP', 'HUMAN_GFP']:
    SEQUENCES[target] = gene_to_data[target].full_mrna

for gene, df in dfs.items():
    df[TREATMENT_PERIOD] = 24  # keep constant for all
    df[VOLUME] = 1000  # keep constant for all
    df['log_volume'] = np.log(df[VOLUME])
    df['normalized_start'] = df[SENSE_START] / len(SEQUENCES[gene])
    df['normalized_sense_start_from_end'] = df['sense_start_from_end'] / len(SEQUENCES[gene])
    easy_to_populate = ['at_skew', 'gc_content', 'gc_content_3_prime_5', 'gc_skew', 'hairpin_score',
                        'homooligo_count', 'internal_fold', 'nucleotide_diversity', 'self_energy', 'stop_codon_count',
                        'at_rich_region_score', 'poly_pyrimidine_stretch']
    populate_features(df, easy_to_populate)

In [217]:
def get_populate_fold(df, genes_u, gene_to_data, fold_variants=[(40, 15)]):
    from asodesigner.fold import calculate_energies, get_weighted_energy
    from asodesigner.util import get_antisense

    all_data_human_gene_premrna_no_nan = df.copy()

    # Comment out the long cases for quick running
    for (window_size, step_size) in fold_variants:

        on_target_fold = 'on_target_fold_openness' + str(window_size) + '_' + str(step_size)
        on_target_fold_normalized = 'on_target_fold_openness_normalized' + str(window_size) + '_' + str(step_size)
        all_data_human_gene_premrna_no_nan[on_target_fold] = np.zeros_like(all_data_human_gene_premrna_no_nan[SEQUENCE],
                                                                           dtype=np.float64)
        all_data_human_gene_premrna_no_nan[on_target_fold_normalized] = np.zeros_like(
            all_data_human_gene_premrna_no_nan[SEQUENCE], dtype=np.float64)

        for gene in genes_u:

            target = gene_to_data[gene].full_mrna
            gene_rows = all_data_human_gene_premrna_no_nan[all_data_human_gene_premrna_no_nan[CANONICAL_GENE] == gene]
            energies = calculate_energies(str(target), step_size, window_size)

            for index, row in gene_rows.iterrows():
                antisense = row[SEQUENCE]
                sense = get_antisense(antisense)
                l = row[SENSE_LENGTH]
                sense_start = row[SENSE_START]
                mean_fold = get_weighted_energy(sense_start, l, step_size, energies, window_size)
                mean_fold_end = get_weighted_energy(sense_start, l, step_size, energies, window_size)
                mean_fold_start = get_weighted_energy(sense_start, l, step_size, energies, window_size)
                if mean_fold > 100:
                    print(energies)
                    print("Weird: ", mean_fold)
                    print("Sense_start ", sense_start)
                    print("Sense_length ", l)
                    print("Gene: ", gene)
                    mean_fold = 0
                all_data_human_gene_premrna_no_nan.loc[index, on_target_fold] = mean_fold
                all_data_human_gene_premrna_no_nan.loc[index, on_target_fold_normalized] = mean_fold / l
    return all_data_human_gene_premrna_no_nan

In [218]:
from scripts.data_genertion.data_handling import get_populate_fold

# dfs['YEAST_GFP'] = df_yeast
dfs['HUMAN_GFP'] = df_human

for gene, df in dfs.items():
    fold_variants = [(40, 15)]

    df = get_populate_fold(df, ['YEAST_GFP', 'HUMAN_GFP'], gene_to_data, fold_variants=fold_variants)
    dfs[gene] = df

[-4.69999981e+000 -8.00000000e+000 -9.39999962e+000 -1.30000000e+001
 -1.63999996e+001 -1.46000004e+001 -9.39999962e+000 -9.89999962e+000
 -1.18999996e+001 -1.43999996e+001 -1.00000000e+001 -4.30000019e+000
 -7.59999990e+000 -8.39999962e+000 -3.20000005e+000 -4.40000010e+000
 -7.69999981e+000 -7.40000010e+000 -5.00000000e+000 -7.50000000e+000
 -5.50000000e+000 -6.09999990e+000 -3.50000000e+000 -7.50000000e+000
 -1.01000004e+001 -1.38999996e+001 -8.39999962e+000 -7.30000019e+000
 -9.19999981e+000 -5.90000010e+000 -6.19999981e+000 -7.00000000e+000
 -3.00000000e+000 -4.80000019e+000 -4.00000000e+000 -4.50000000e+000
 -4.30000019e+000 -1.35000000e+001 -2.59999990e+000 -4.59999990e+000
 -1.00000000e+001 -3.79999995e+000 -1.60000002e+000 -3.59999990e+000
 -3.20000005e+000 -7.50000000e+000 -1.46000004e+001 -1.10000000e+001
 -1.38000002e+001 -6.40000010e+000 -3.40000010e+000 -4.30000019e+000
 -3.09999990e+000 -9.19999981e+000 -7.19999981e+000 -3.40000010e+000
 -6.40000010e+000 -7.40000010e+000

In [220]:
from hybridization.hybridization_features import get_exp_psrna_hybridization

for gene, df in dfs.items():
    df.loc[:, 'exp_ps_hybr'] = [
        get_exp_psrna_hybridization(antisense.replace('T', 'U'), temp=37) for
        antisense in df[SEQUENCE]]

In [221]:
from features.mod_features import compute_mod_min_distance_to_3prime

# generate LNA 16-mers
for gene, df in dfs.items():
    df.loc[:, 'Modification_min_distance_to_3prime'] = compute_mod_min_distance_to_3prime('LLLddddddddddLLL')

In [223]:
# Missing: CAI_score_global_CDS, 'sense_avg_accessibility', RNaseH1_Krel_score_R7_krel, Modification_min_distance_to_3prime

from yehuda_code.Folding_Functions import get_sense_with_flanks
from yehuda_code.access_calculator import AccessCalculator

FLANK_SIZE = 120
ACCESS_SIZE = 13
SEED_SIZE = 13
SEED_SIZES = [SEED_SIZE * m for m in range(1, 4)]
ACCESS_WIN_SIZE = 80


def compute_sense_accessibility(row, flank_size, access_win_size, seed_sizes, access_size, min_gc=0, max_gc=100,
                                gc_ranges=1):
    try:
        # Skip invalid rows
        if row['sense_start'] == -1 or pd.isna(row['sense_with_flank_120nt']) or row['sense_with_flank_120nt'] == "":
            return None

        seq = row[f'sense_with_flank_{flank_size}nt']
        sense_start = row['sense_start']
        sense_length = row['sense_length']

        # Calculate accessibility
        df_access = AccessCalculator.calc(
            seq, access_size,
            min_gc, max_gc, gc_ranges,
            access_win_size, seed_sizes
        )

        flank_start = max(0, sense_start - flank_size)
        sense_start_in_flank = sense_start - flank_start
        sense_end_in_flank = sense_start_in_flank + sense_length

        if 0 <= sense_start_in_flank < len(df_access) and sense_end_in_flank <= len(df_access):
            values = df_access['avg_access'].iloc[sense_start_in_flank:sense_end_in_flank].dropna()
            return values.mean() if not values.empty else None
        else:
            return None

    except Exception as e:
        print(f"Error at row {row.name} | seq start: {row['sense_start']} | error: {e}")
        return None


for gene, df in dfs.items():
    FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

    val = gene_to_data[gene].full_mrna
    df['pre_mrna_sequence'] = [val] * len(df)

    # Create new column with flanked sequences
    df[FLANKED_SENSE_COL] = df.apply(
        lambda row: get_sense_with_flanks(
            row['pre_mrna_sequence'],
            row['sense_start'],
            row['sense_length'],
            flank_size=FLANK_SIZE
        ) if row['sense_start'] != -1 else "",  # Handle cases where sense was not found
        axis=1
    )

    batch_size = 500
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx].copy()

        print(f"Processing rows {start_idx} to {end_idx}...")

        batch['sense_avg_accessibility'] = batch.apply(
            compute_sense_accessibility,
            axis=1,
            flank_size=FLANK_SIZE,
            access_win_size=ACCESS_WIN_SIZE,
            seed_sizes=SEED_SIZES,
            access_size=ACCESS_SIZE,
        )

        # Save batch to the new folder
        batch.to_csv(f"out/{gene}/batch_{start_idx}_{end_idx}.csv", index=False)


Processing rows 0 to 500...
Processing rows 500 to 1000...
Processing rows 1000 to 1500...
Processing rows 1500 to 2000...
Processing rows 2000 to 2468...


In [224]:
import pandas as pd
import glob

# Load all batch files from the new output folder
files = sorted(glob.glob(f"out/HUMAN_GFP/batch_*.csv"))
df_human = pd.concat([pd.read_csv(f) for f in files], axis=0)

# files = sorted(glob.glob(f"out/YEAST_GFP/batch_*.csv"))
# df_yeast = pd.concat([pd.read_csv(f) for f in files], axis=0)

In [226]:

from asodesigner.features.RNaseH_features import rnaseh1_dict, compute_rnaseh1_score

best_window_start_krel = {
    'R4a_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 3, 18: 2, 19: 4, 20: 3, 21: 0, 22: 0, 25: 0},
    'R4b_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 1, 18: 3, 19: 1, 20: 3, 21: 0, 22: 0, 25: 0},
    'R7_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 3, 17: 2, 18: 4, 19: 6, 20: 4, 21: 0, 22: 0, 25: 0},
}

for exp in ['R4a_krel', 'R4b_krel', 'R7_krel']:
    weights = rnaseh1_dict(exp)


    def score_row(row):
        length = len(row['Sequence'])
        pos = best_window_start_krel.get(exp, {}).get(length, 0)
        return compute_rnaseh1_score(row['Sequence'], weights, window_start=pos)

    col_name = f"RNaseH1_Krel_score_{exp}"

    # Yeast should have similar motifs, perhaps
    df_human[col_name] = df_human.apply(score_row, axis=1)
    # df_yeast[col_name] = df_yeast.apply(score_row, axis=1)

RNaseH1_Krel_features_best = [f"RNaseH1_Krel_score_{exp}" for exp in ['R4a_krel', 'R4b_krel', 'R7_krel']]


In [227]:
# Number is probably accurate enough
df_human.loc[:, 'CAI_score_global_CDS'] = 0.5752 # in frame =1
# df_yeast.loc[:, 'CAI_score_global_CDS'] = 0.5590591814785562

In [228]:
df_human_scores = model.predict(df_human[selected_features])

# Assuming you already have get_antisense(seq: str) -> str defined
# get_antisense, for some reason numba doesn't work well
tbl = str.maketrans("ACGTUacgtuNn", "TGCAAtgcaaNn")

df_human["score"] = df_human_scores

df_human["sense"] = df_human[SEQUENCE].astype(str).str.translate(tbl).str[::-1]
(
    df_human.assign(score=df_human_scores)
    .sort_values("score", ascending=False)  # sort by score
    .to_csv("df_human_scores_model2.csv", index=False)
)

df_sorted = df_human.sort_values('score', ascending=False)

In [231]:
# pip install requests
import math, time, threading, urllib.parse, requests
from concurrent.futures import ThreadPoolExecutor, as_completed

UA = {"User-Agent": "python-requests gggenome/greedy"}

def _ggg_hits_leq_json(seq, k, db="hg38", timeout=180, retries=2):
    """Count hits with <=k mismatches via GGGenome JSON; fallback to CSV if needed."""
    s = str(seq).upper().replace("U", "T")
    q = urllib.parse.quote(s)
    url_json = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.json"
    url_csv  = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.csv?download"

    for attempt in range(retries + 1):
        try:
            r = requests.get(url_json, headers=UA, timeout=timeout)
            r.raise_for_status()
            try:
                data = r.json()
            except ValueError:
                raise RuntimeError("JSON parse failed")
            if isinstance(data, list):
                return len(data)
            if isinstance(data, dict):
                if "results" in data and isinstance(data["results"], list): return len(data["results"])
                if "hits" in data and isinstance(data["hits"], list):       return len(data["hits"])
                return sum(len(v) for v in data.values() if isinstance(v, list))
            return 0
        except Exception:
            # greedy CSV fallback
            try:
                r2 = requests.get(url_csv, headers=UA, timeout=timeout)
                r2.raise_for_status()
                return sum(1 for ln in r2.text.splitlines() if ln and not ln.startswith("#"))
            except Exception:
                if attempt < retries:
                    continue
                return 0
    return 0

def _d123_for_sequence(seq, db="hg38"):
    s = str(seq).upper().replace("U", "T")
    if not s:
        return (s, 0, 0, 0)
    L = len(s)
    k_allowed = max(0, math.floor(0.25 * L))  # GGGenome cap
    k0 = _ggg_hits_leq_json(s, 0, db=db)
    k1 = _ggg_hits_leq_json(s, 1, db=db) if k_allowed >= 1 else 0
    # k2 = _ggg_hits_leq_json(s, 2, db=db) if k_allowed >= 2 else 0
    # k3 = _ggg_hits_leq_json(s, 3, db=db) if k_allowed >= 3 else 0
    d1 = max(0, k1 - k0)
    # d2 = max(0, k2 - k1)
    # d3 = max(0, k3 - k2)
    return (s, d1, 0, 0, k0)

cache = {}
def add_gggenome_d123(main_df, seq_col="SEQUENCE", db="hg38", *, max_workers=32, print_every=10):
    seqs = (main_df[seq_col].astype(str).str.upper().str.replace("U", "T", regex=False))
    uniq = seqs.dropna().unique().tolist()
    N = len(uniq)
    print(f"[GGG] Unique sequences: {N} | db={db} | workers={max_workers}")

    global cache
    cache = {}
    lock = threading.Lock()
    t0 = time.perf_counter()
    errs = 0

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(_d123_for_sequence, s, db): s for s in uniq}
        done = 0
        for fut in as_completed(futs):
            s = futs[fut]
            try:
                s_key, d1, d2, d3, d0 = fut.result()
            except Exception:
                d0 = d1 = d2 = d3 = 0
                with lock:
                    errs += 1
            with lock:
                cache[s] = (s, d1, d2, d3, d0)
                done += 1
                if (done == 1) or (done % print_every == 0) or (done == N):
                    elapsed = time.perf_counter() - t0
                    rps = done / elapsed if elapsed > 0 else 0.0
                    print(f"[GGG] {done}/{N} cached | ~{rps:.1f} seq/s | errors={errs}")

    main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[1])
    main_df["ggg_d0"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[4])
    # main_df["ggg_d2"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[2])
    # main_df["ggg_d3"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[3])


    print(f"[GGG] Finished in {time.perf_counter() - t0:.1f}s. Added columns: ggg_d1, ggg_d2, ggg_d3")
    return main_df


# --- usage ---
# main_df = main_df[main_df[SENSE_START] != -1]
result = add_gggenome_d123(df_sorted[:500], seq_col='sense', db="hg38", max_workers=1, print_every=2)

[GGG] Unique sequences: 500 | db=hg38 | workers=1
[GGG] 1/500 cached | ~0.2 seq/s | errors=0
[GGG] 2/500 cached | ~0.2 seq/s | errors=0
[GGG] 4/500 cached | ~0.2 seq/s | errors=0
[GGG] 6/500 cached | ~0.2 seq/s | errors=0
[GGG] 8/500 cached | ~0.2 seq/s | errors=0
[GGG] 10/500 cached | ~0.2 seq/s | errors=0
[GGG] 12/500 cached | ~0.2 seq/s | errors=0
[GGG] 14/500 cached | ~0.2 seq/s | errors=0
[GGG] 16/500 cached | ~0.2 seq/s | errors=0
[GGG] 18/500 cached | ~0.2 seq/s | errors=0
[GGG] 20/500 cached | ~0.2 seq/s | errors=0
[GGG] 22/500 cached | ~0.2 seq/s | errors=0
[GGG] 24/500 cached | ~0.2 seq/s | errors=0
[GGG] 26/500 cached | ~0.2 seq/s | errors=0
[GGG] 28/500 cached | ~0.2 seq/s | errors=0
[GGG] 30/500 cached | ~0.2 seq/s | errors=0
[GGG] 32/500 cached | ~0.2 seq/s | errors=0
[GGG] 34/500 cached | ~0.2 seq/s | errors=0
[GGG] 36/500 cached | ~0.2 seq/s | errors=0
[GGG] 38/500 cached | ~0.2 seq/s | errors=0
[GGG] 40/500 cached | ~0.2 seq/s | errors=0
[GGG] 42/500 cached | ~0.2 seq/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d0"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[4])


In [233]:
result.insert(0, "row_number", range(1, len(result) + 1))

In [234]:
result[result['ggg_d0'] ==0].head(50)[[SEQUENCE, 'row_number', 'sense_start', 'ggg_d0', 'ggg_d1', 'RNaseH1_Krel_score_R7_krel', 'at_skew', 'gc_content', 'on_target_fold_openness_normalized40_15', 'sense_avg_accessibility']]

Unnamed: 0,Sequence,row_number,sense_start,ggg_d0,ggg_d1,RNaseH1_Krel_score_R7_krel,at_skew,gc_content,on_target_fold_openness_normalized40_15,sense_avg_accessibility
250,CTCAAGGCAAGCTTTA,1,2250,0,70,1.035249,0.111111,0.4375,-0.336198,9.665283
467,TTTGTAATCCAGAGGT,4,1467,0,106,0.978683,-0.2,0.375,-0.145768,3.979304
451,TGATTGTCGACTTAAC,23,1451,0,9,0.992926,-0.2,0.375,-0.132292,3.852323
465,TGTAATCCAGAGGTTG,27,1465,0,73,1.017547,-0.111111,0.4375,-0.135221,3.636754
351,TAAACGGGCCCTGCTA,32,2351,0,13,1.04022,0.142857,0.5625,-0.385547,4.858123
459,CCAGAGGTTGATTGTC,35,1459,0,33,1.020199,-0.25,0.5,-0.128255,3.739716
448,TTGTCGACTTAACGCG,37,1448,0,0,0.988124,-0.25,0.5,-0.166732,4.067594
354,GTTTAAACGGGCCCTG,38,2354,0,9,0.949883,-0.142857,0.5625,-0.384961,4.16954
344,TGTACTCGGTCGGTCC,39,844,0,2,0.99499,-0.666667,0.625,-0.340495,5.668445
442,GGCACCTTCCAGGGTC,40,2442,0,109,1.017932,-0.2,0.6875,0.0,10.466902


In [200]:
GFP_IN_HUMAN[633:633 + 16]

'GCCCGACAACCACTAC'

In [199]:
GFP_IN_HUMAN[679 :679 + 16]


'CCCAACGAGAAGCGCG'

In [162]:
# 1. Insert row number (1,2,3,...)
result = result.copy()
# result.insert(0, "row_number", range(1, len(result) + 1))

# 2. Filter on ggg_d1 < 50
filtered = result[result["ggg_d1"] < 500]

# 3. Print selected columns
print(filtered[[
    "row_number", SEQUENCE, "sense_start", "ggg_d1", "at_skew", "gc_content",
    "on_target_fold_openness_normalized40_15",
    "sense_avg_accessibility", "RNaseH1_Krel_score_R7_krel"
]].head(60))


     row_number          Sequence  sense_start  ggg_d1   at_skew  gc_content  on_target_fold_openness_normalized40_15  sense_avg_accessibility  RNaseH1_Krel_score_R7_krel
389           1  TTTCTTTGCAAGTTAT         1889     329 -0.500000      0.2500                                -0.097656                 5.035191                    1.017614
390           2  ATTTCTTTGCAAGTTA         1890     282 -0.333333      0.2500                                -0.098307                 5.253077                    1.022290
388           3  TTCTTTGCAAGTTATA         1888     199 -0.333333      0.2500                                -0.097005                 4.828204                    0.990100
391           4  CATTTCTTTGCAAGTT         1891     348 -0.454545      0.3125                                -0.098958                 5.573556                    0.971061
387           5  TCTTTGCAAGTTATAA         1887     176 -0.166667      0.2500                                -0.096354                 4.632415   

In [115]:
# Appendix for CAI

In [None]:
from math import prod

# Yeast codon usage table (S. cerevisiae, from Codon Usage Database)
# frequency per thousand, normalized to relative adaptiveness (w values)
# For simplicity, store as preferred-codon relative adaptiveness
yeast_codon_pref = {
    'TTT':0.61,'TTC':1.0,'TTA':0.19,'TTG':0.45,'CTT':0.10,'CTC':0.08,'CTA':0.07,'CTG':0.09,
    'ATT':0.36,'ATC':1.0,'ATA':0.12,'ATG':1.0,
    'GTT':0.47,'GTC':0.15,'GTA':0.14,'GTG':0.25,
    'TCT':0.55,'TCC':0.54,'TCA':0.37,'TCG':0.17,'AGT':0.39,'AGC':1.0,
    'CCT':0.61,'CCC':0.21,'CCA':0.63,'CCG':0.15,
    'ACT':0.60,'ACC':1.0,'ACA':0.36,'ACG':0.13,
    'GCT':0.82,'GCC':0.49,'GCA':0.58,'GCG':0.13,
    'TAT':0.57,'TAC':1.0,'TAA':0,'TAG':0,'TGA':0,
    'CAT':0.57,'CAC':1.0,
    'CAA':0.34,'CAG':1.0,
    'AAT':0.53,'AAC':1.0,
    'AAA':0.44,'AAG':1.0,
    'GAT':0.46,'GAC':1.0,
    'GAA':0.38,'GAG':1.0,
    'TGT':0.44,'TGC':1.0,'TGG':1.0,
    'CGT':0.56,'CGC':0.65,'CGA':0.14,'CGG':0.14,'AGA':0.05,'AGG':0.02,
    'GGT':0.53,'GGC':0.49,'GGA':1.0,'GGG':0.27
}

# Input sequence (from user)
seq = ("ATGGTtAGtAAaGGaGAaGAGTTgTTCACaGGaGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTATATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAGGGTGCTGGGGCAggtacCCCTAAAGATCCAGCCAAACCTCCGGCCAcGGCACAAGTTGTGGGATGGCCACCGGTGAGATCATACCGGAAGAACGTGATGGTTTCCTGCCAAAAATCAAGCGGTGGCCCGGAGGCGGCGGCGTTCGTGAAGTAA").upper()

# Split into codons
codons = [seq[i:i+3] for i in range(0, len(seq), 3)]
# Drop stop codon and any incomplete codons at the end
valid_codons = [c for c in codons if c in yeast_codon_pref and yeast_codon_pref[c] > 0]

# Calculate weights
weights = [yeast_codon_pref[c] for c in valid_codons]

# Compute CAI
cai = prod(weights) ** (1/len(weights))
cai


In [None]:
from math import prod

# Human codon usage table (Homo sapiens, whole genome, from Kazusa)
# Values are normalized as relative adaptiveness (w)
# Frequencies per 1000 codons (rounded from Kazusa)
human_freqs = {
    'TTT':17.6,'TTC':20.3,'TTA':7.7,'TTG':12.9,'CTT':13.2,'CTC':19.6,'CTA':7.2,'CTG':39.6,
    'ATT':16.0,'ATC':20.8,'ATA':7.5,'ATG':22.0,
    'GTT':10.8,'GTC':14.6,'GTA':7.1,'GTG':28.1,
    'TCT':15.2,'TCC':17.6,'TCA':12.2,'TCG':4.5,'AGT':11.9,'AGC':19.5,
    'CCT':17.5,'CCC':19.8,'CCA':16.9,'CCG':6.9,
    'ACT':13.1,'ACC':21.2,'ACA':15.1,'ACG':6.1,
    'GCT':18.2,'GCC':27.7,'GCA':15.8,'GCG':7.4,
    'TAT':12.0,'TAC':15.6,'TAA':1.0,'TAG':0.8,'TGA':1.6,
    'CAT':10.9,'CAC':15.1,
    'CAA':12.3,'CAG':34.2,
    'AAT':17.0,'AAC':19.1,
    'AAA':24.4,'AAG':32.5,
    'GAT':22.3,'GAC':26.0,
    'GAA':29.0,'GAG':39.6,
    'TGT':10.8,'TGC':12.1,'TGG':13.2,
    'CGT':4.7,'CGC':10.5,'CGA':6.2,'CGG':11.4,'AGA':12.1,'AGG':11.5,
    'GGT':10.8,'GGC':22.2,'GGA':16.5,'GGG':16.5
}

# Normalize to get relative adaptiveness (w)
human_w = {}
for codon in human_freqs:
    aa_group = [c for c in human_freqs if c[0] == codon[0] or True]  # lazy grouping
# Let's just hard-code amino acid codon groupings
aa_codons = {
    'F':['TTT','TTC'],
    'L':['TTA','TTG','CTT','CTC','CTA','CTG'],
    'I':['ATT','ATC','ATA'],
    'M':['ATG'],
    'V':['GTT','GTC','GTA','GTG'],
    'S':['TCT','TCC','TCA','TCG','AGT','AGC'],
    'P':['CCT','CCC','CCA','CCG'],
    'T':['ACT','ACC','ACA','ACG'],
    'A':['GCT','GCC','GCA','GCG'],
    'Y':['TAT','TAC'],
    'H':['CAT','CAC'],
    'Q':['CAA','CAG'],
    'N':['AAT','AAC'],
    'K':['AAA','AAG'],
    'D':['GAT','GAC'],
    'E':['GAA','GAG'],
    'C':['TGT','TGC'],
    'W':['TGG'],
    'R':['CGT','CGC','CGA','CGG','AGA','AGG'],
    'G':['GGT','GGC','GGA','GGG'],
    'STOP':['TAA','TAG','TGA']
}
for aa, codon_list in aa_codons.items():
    maxf = max(human_freqs[c] for c in codon_list)
    for c in codon_list:
        if human_freqs[c] > 0:
            human_w[c] = human_freqs[c]/maxf
        else:
            human_w[c] = 0

# Input sequence (human GFP variant)
seq_human = ("atggtgagcaagggcgaggagctgttcaccggggtggtgcccatcctggtcgagctggacggcgacgtaaacggccacaagttcagcgtgtccggcgagggcgagggcgatgccacctacggcaagctgaccctgaagttcatctgcaccaccggcaagctgcccgtgccctggcccaccctcgtgaccaccctgacctacggcgtgcagtgcttcagccgctaccccgaccacatgaagcagcacgacttcttcaagtccgccatgcccgaaggctacgtccaggagcgcaccatcttcttcaaggacgacggcaactacaagacccgcgccgaggtgaagttcgagggcgacaccctggtgaaccgcatcgagctgaagggcatcgacttcaaggaggacggcaacatcctggggcacaagctggagtacaactacaacagccacaacgtctatatcatggccgacaagcagaagaacggcatcaaggtgaacttcaagatccgccacaacatcgaggacggcagcgtgcagctcgccgaccactaccagcagaacacccccatcggcgacggccccgtgctgctgcccgacaaccactacctgagcacccagtccgccctgagcaaagaccccaacgagaagcgcgatcacatggtcctgctggagttcgtgaccgccgccgggatcactctcggcatggacgagctgtacaagaagcttagccatggcttcccgccggaggtggaggagcaggatgatggcacgctgcccatgtcttgtgcccaggagagcgggatggaccgtcaccctgcagcctgtgcttctgctaggatcaatgtgaagcgacctgccgccacaaagaaggctggacaggctaagaagaagaaatgaggatcccgcgcgcgcatatgttaattaaccaactgcatggggatccacgcgttaagtcgacaatcaacctctggattacaaaatttgtgaaagattgactggtattcttaactatgttgctccttttacgctatgtggatacgctgctttaatgcctttgtatcatgctattgcttcccgtatggctttcattttctcctccttgtataaatcctggttgctgtctctttatgaggagttgtggcccgttgtcaggcaacgtggcgtggtgtgcactgtgtttgctgacgcaacccccactggttggggcattgccaccacctgtcagctcctttccgggactttcgctttccccctccctattgccacggcggaactcatcgccgcctgccttgcccgctgctggacaggggctcggctgttgggcactgacaattccgtggtgttgtcggggaaatcatcgtcctttccttggctgctcgcctgtgttgccacctggattctgcgcgggacgtccttctgctacgtcccttcggccctcaatccagcggaccttccttcccgcggcctgctgccggctctgcggcctcttccgcgtcttcgccttcgccctcagacgagtcggatctccctttgggccgcctccccgcgtcgactttaagaccaatgacttacaaggcagctgtagatcttagccactttttaaaagaaaaggggggactggaagggctaattcactcccaacgaagacaagatctgctttttgcttgtactgggtctctctggttagaccagatctgagcctgggagctctctggctaactagggaacccactgcttaagcctcaataaagcttgccttgagtgcttcaagtagtgtgtgcccgtctgttgtgtgactctggtaactagagatccctcagacccttttagtcagtgtggaaaatctctagcagtacgtatagtagttcatgtcatcttattattcagtatttataacttgcaaagaaatgaatatcagagagtgagagg")

# Split into codons
codons_h = [seq_human[i:i+3].upper() for i in range(0, len(seq_human), 3)]
valid_codons_h = [c for c in codons_h if c in human_w and human_w[c] > 0]

weights_h = [human_w[c] for c in valid_codons_h]
cai_h = prod(weights_h) ** (1/len(weights_h))
cai_h


In [236]:
from asodesigner.read_human_genome import get_locus_to_data_dict
import pickle
from asodesigner.consts import CACHE_DIR

# Adding RB1 and TP53BP1
genes_u = ['HIF1A', 'APOL1', 'YAP1', 'SOD1', 'SNCA', 'IRF4', 'KRAS', 'KLKB1', 'SNHG14', 'DGAT2', 'IRF5', 'HTRA1',
           'MYH7', 'MALAT1', 'HSD17B13', 'PRMT5', 'MAT2A', 'RIOK1', 'RB1', 'TP53BP1']
cache_path = CACHE_DIR / 'gene_to_data_simple_cache.pickle'
# if not cache_path.exists():
if True:
    gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)
    with open(cache_path, 'wb') as f:
        pickle.dump(gene_to_data, f)
else:
    with open(cache_path, 'rb') as f:
        gene_to_data = pickle.load(f)

Time took to read fasta: 8.80322551727295
Length:  639
Feature type:  transcript
Feature type:  CDS
Feature type:  start_codon
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  transcript
Feature type:  CDS
Feature type:  start_codon
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  transcript
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Feature type:  CDS
Featur

In [237]:
SL_PARTNERS = ['RB1', 'TP53BP1']
SEQUENCES = {}
for partner in SL_PARTNERS:
    SEQUENCES[partner] = gene_to_data[partner].full_mrna

In [316]:
from asodesigner.util import get_antisense
import pandas as pd

# GFP_YEAST_END = len(GFP_IN_YEAST) - 3
GFP_HUMAN_END = len(GFP_IN_HUMAN)
LNA_SIZE = 16

def get_init_df(target_mrna, end, aso_sizes=[LNA_SIZE]):
    candidates = []
    sense_starts = []
    sense_lengths = []
    sense_starts_from_end = []

    for aso_size in aso_sizes:
        for i in range(0, len(target_mrna) - (aso_size - 1)):
            target = target_mrna[i: i + aso_size]
            candidates.append(get_antisense(str(target)))
            sense_starts.append(i)
            sense_lengths.append(aso_size)
            sense_starts_from_end.append(end - i)
    df = pd.DataFrame(
        {SEQUENCE: candidates, SENSE_START: sense_starts,
         SENSE_LENGTH: sense_lengths, "sense_start_from_end": sense_starts_from_end})
    return df

dfs = dict()
# df_yeast = get_init_df(GFP_IN_YEAST, GFP_YEAST_END)
for partner in SL_PARTNERS:
    dfs[partner] = get_init_df(gene_to_data[partner].full_mrna, len(gene_to_data[partner].full_mrna), aso_sizes=[16, 20])

In [318]:
from scripts.data_genertion.data_handling import get_populated_df_with_structure_features

for gene, df in dfs.items():
    df[CELL_LINE_ORGANISM] = 'human'
    df[INHIBITION] = 0
    df[CANONICAL_GENE] = gene
    df = get_populated_df_with_structure_features(df, genes_u, gene_to_data)

In [321]:
from scripts.data_genertion.data_handling import populate_features

for gene, df in dfs.items():
    df[TREATMENT_PERIOD] = 24  # keep constant for all
    df[VOLUME] = 1000  # keep constant for all
    df['log_volume'] = np.log(df[VOLUME])
    df['normalized_start'] = df[SENSE_START] / len(SEQUENCES[gene])
    df['normalized_sense_start_from_end'] = df['sense_start_from_end'] / len(SEQUENCES[gene])
    easy_to_populate = ['at_skew', 'gc_content', 'gc_content_3_prime_5', 'gc_skew', 'hairpin_score',
                        'homooligo_count', 'internal_fold', 'nucleotide_diversity', 'self_energy', 'stop_codon_count',
                        'at_rich_region_score', 'poly_pyrimidine_stretch']
    populate_features(df, easy_to_populate)

In [322]:
def get_populate_fold(df, genes_u, gene_to_data, fold_variants=[(40, 15)]):
    from asodesigner.fold import calculate_energies, get_weighted_energy
    from asodesigner.util import get_antisense

    all_data_human_gene_premrna_no_nan = df.copy()

    # Comment out the long cases for quick running
    for (window_size, step_size) in fold_variants:

        on_target_fold = 'on_target_fold_openness' + str(window_size) + '_' + str(step_size)
        on_target_fold_normalized = 'on_target_fold_openness_normalized' + str(window_size) + '_' + str(step_size)
        all_data_human_gene_premrna_no_nan[on_target_fold] = np.zeros_like(all_data_human_gene_premrna_no_nan[SEQUENCE],
                                                                           dtype=np.float64)
        all_data_human_gene_premrna_no_nan[on_target_fold_normalized] = np.zeros_like(
            all_data_human_gene_premrna_no_nan[SEQUENCE], dtype=np.float64)

        for gene in genes_u:

            target = gene_to_data[gene].full_mrna
            gene_rows = all_data_human_gene_premrna_no_nan[all_data_human_gene_premrna_no_nan[CANONICAL_GENE] == gene]
            energies = calculate_energies(str(target), step_size, window_size)

            for index, row in gene_rows.iterrows():
                antisense = row[SEQUENCE]
                sense = get_antisense(antisense)
                l = row[SENSE_LENGTH]
                sense_start = row[SENSE_START]
                mean_fold = get_weighted_energy(sense_start, l, step_size, energies, window_size)
                mean_fold_end = get_weighted_energy(sense_start, l, step_size, energies, window_size)
                mean_fold_start = get_weighted_energy(sense_start, l, step_size, energies, window_size)
                if mean_fold > 100:
                    print(energies)
                    print("Weird: ", mean_fold)
                    print("Sense_start ", sense_start)
                    print("Sense_length ", l)
                    print("Gene: ", gene)
                    mean_fold = 0
                all_data_human_gene_premrna_no_nan.loc[index, on_target_fold] = mean_fold
                all_data_human_gene_premrna_no_nan.loc[index, on_target_fold_normalized] = mean_fold / l
    return all_data_human_gene_premrna_no_nan

In [323]:
from scripts.data_genertion.data_handling import get_populate_fold

for gene, df in dfs.items():
    fold_variants = [(40, 15)]
    df = get_populate_fold(df, SL_PARTNERS, gene_to_data, fold_variants=fold_variants)
    dfs[gene] = df

[-1.56000004e+01 -1.81000004e+01 -1.50000000e+01 ...  9.75542770e+15
  5.32916532e+06  8.25747459e+97]
Weird:  203238077072048.75
Sense_start  295665
Sense_length  16
Gene:  RB1
[-1.56000004e+01 -1.81000004e+01 -1.50000000e+01 ...  9.75542770e+15
  5.32916532e+06  8.25747459e+97]
Weird:  406476154144105.75
Sense_start  295666
Sense_length  16
Gene:  RB1
[-1.56000004e+01 -1.81000004e+01 -1.50000000e+01 ...  9.75542770e+15
  5.32916532e+06  8.25747459e+97]
Weird:  609714231216162.8
Sense_start  295667
Sense_length  16
Gene:  RB1
[-1.56000004e+01 -1.81000004e+01 -1.50000000e+01 ...  9.75542770e+15
  5.32916532e+06  8.25747459e+97]
Weird:  812952308288219.9
Sense_start  295668
Sense_length  16
Gene:  RB1
[-1.56000004e+01 -1.81000004e+01 -1.50000000e+01 ...  9.75542770e+15
  5.32916532e+06  8.25747459e+97]
Weird:  1016190385360276.9
Sense_start  295669
Sense_length  16
Gene:  RB1
[-1.56000004e+01 -1.81000004e+01 -1.50000000e+01 ...  9.75542770e+15
  5.32916532e+06  8.25747459e+97]
Weird:  1

In [324]:
from hybridization.hybridization_features import get_exp_psrna_hybridization

for gene, df in dfs.items():
    df.loc[:, 'exp_ps_hybr'] = [
        get_exp_psrna_hybridization(antisense.replace('T', 'U'), temp=37) for
        antisense in df[SEQUENCE]]

In [325]:
# Missing: CAI_score_global_CDS, 'sense_avg_accessibility', RNaseH1_Krel_score_R7_krel, Modification_min_distance_to_3prime
from features.mod_features import compute_mod_min_distance_to_3prime

for gene, df in dfs.items():
    if df[SEQUENCE].str.len().eq(20).any():
        df.loc[df[SEQUENCE].str.len() == 20, "Modification_min_distance_to_3prime"] = \
            compute_mod_min_distance_to_3prime("MMMMMddddddddddMMMMM")

    if df[SEQUENCE].str.len().eq(16).any():
        df.loc[df[SEQUENCE].str.len() == 16, "Modification_min_distance_to_3prime"] = \
            compute_mod_min_distance_to_3prime("LLLddddddddddLLL")

In [256]:
from yehuda_code.Folding_Functions import get_sense_with_flanks
from yehuda_code.access_calculator import AccessCalculator

FLANK_SIZE = 120
ACCESS_SIZE = 13
SEED_SIZE = 13
SEED_SIZES = [SEED_SIZE * m for m in range(1, 4)]
ACCESS_WIN_SIZE = 80

def compute_sense_accessibility(row, flank_size, access_win_size, seed_sizes, access_size, min_gc=0, max_gc=100,
                                gc_ranges=1):
    try:
        # Skip invalid rows
        if row['sense_start'] == -1 or pd.isna(row['sense_with_flank_120nt']) or row['sense_with_flank_120nt'] == "":
            return None

        seq = row[f'sense_with_flank_{flank_size}nt']
        sense_start = row['sense_start']
        sense_length = row['sense_length']

        # Calculate accessibility
        df_access = AccessCalculator.calc(
            seq, access_size,
            min_gc, max_gc, gc_ranges,
            access_win_size, seed_sizes
        )

        flank_start = max(0, sense_start - flank_size)
        sense_start_in_flank = sense_start - flank_start
        sense_end_in_flank = sense_start_in_flank + sense_length

        if 0 <= sense_start_in_flank < len(df_access) and sense_end_in_flank <= len(df_access):
            values = df_access['avg_access'].iloc[sense_start_in_flank:sense_end_in_flank].dropna()
            return values.mean() if not values.empty else None
        else:
            return None

    except Exception as e:
        print(f"Error at row {row.name} | seq start: {row['sense_start']} | error: {e}")
        return None


for gene, df in dfs.items():
    FLANKED_SENSE_COL = f'sense_with_flank_{FLANK_SIZE}nt'

    val = gene_to_data[gene].full_mrna
    df['pre_mrna_sequence'] = [val] * len(df)

    # Create new column with flanked sequences
    df[FLANKED_SENSE_COL] = df.apply(
        lambda row: get_sense_with_flanks(
            row['pre_mrna_sequence'],
            row['sense_start'],
            row['sense_length'],
            flank_size=FLANK_SIZE
        ) if row['sense_start'] != -1 else "",  # Handle cases where sense was not found
        axis=1
    )

    batch_size = 500
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))
        batch = df.iloc[start_idx:end_idx].copy()

        print(f"Processing rows {start_idx} to {end_idx}...")

        batch['sense_avg_accessibility'] = batch.apply(
            compute_sense_accessibility,
            axis=1,
            flank_size=FLANK_SIZE,
            access_win_size=ACCESS_WIN_SIZE,
            seed_sizes=SEED_SIZES,
            access_size=ACCESS_SIZE,
        )

        # Save batch to the new folder
        batch.to_csv(f"out/{gene}/batch_{start_idx}_{end_idx}.csv", index=False)


Processing rows 0 to 500...
Processing rows 500 to 1000...
Processing rows 1000 to 1500...
Processing rows 1500 to 2000...
Processing rows 2000 to 2500...
Processing rows 2500 to 3000...
Processing rows 3000 to 3500...
Processing rows 3500 to 4000...
Processing rows 4000 to 4500...
Processing rows 4500 to 5000...
Processing rows 5000 to 5500...
Processing rows 5500 to 6000...
Processing rows 6000 to 6500...
Processing rows 6500 to 7000...
Processing rows 7000 to 7500...
Processing rows 7500 to 8000...
Processing rows 8000 to 8500...
Processing rows 8500 to 9000...
Processing rows 9000 to 9500...
Processing rows 9500 to 10000...
Processing rows 10000 to 10500...
Processing rows 10500 to 11000...
Processing rows 11000 to 11500...
Processing rows 11500 to 12000...
Processing rows 12000 to 12500...
Processing rows 12500 to 13000...
Processing rows 13000 to 13500...
Processing rows 13500 to 14000...
Processing rows 14000 to 14500...
Processing rows 14500 to 15000...
Processing rows 15000 to

In [326]:
import pandas as pd
import glob

files = sorted(glob.glob(f"out/RB1/batch_*.csv"))
df_rb1_accessibility = pd.concat([pd.read_csv(f) for f in files], axis=0)

In [332]:
df_rb1 = df_rb1_accessibility

In [333]:
files = sorted(glob.glob(f"out/TP53BP1/batch_*.csv"))
df_tp53bp1_accessibility = pd.concat([pd.read_csv(f) for f in files], axis=0)

In [334]:
df_tp53bp1 = df_tp53bp1_accessibility

In [335]:
from asodesigner.features.RNaseH_features import rnaseh1_dict, compute_rnaseh1_score

best_window_start_krel = {
    'R4a_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 3, 18: 2, 19: 4, 20: 3, 21: 0, 22: 0, 25: 0},
    'R4b_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 0, 17: 1, 18: 3, 19: 1, 20: 3, 21: 0, 22: 0, 25: 0},
    'R7_krel': {10: 0, 11: 0, 12: 0, 13: 0, 14: 0, 15: 0, 16: 3, 17: 2, 18: 4, 19: 6, 20: 4, 21: 0, 22: 0, 25: 0},
}

for exp in ['R4a_krel', 'R4b_krel', 'R7_krel']:
    weights = rnaseh1_dict(exp)

    def score_row(row):
        length = len(row['Sequence'])
        pos = best_window_start_krel.get(exp, {}).get(length, 0)
        return compute_rnaseh1_score(row['Sequence'], weights, window_start=pos)


    col_name = f"RNaseH1_Krel_score_{exp}"

    # Yeast should have similar motifs, perhaps
    df_rb1[col_name] = df_rb1.apply(score_row, axis=1)
    df_tp53bp1[col_name] = df_tp53bp1.apply(score_row, axis=1)

RNaseH1_Krel_features_best = [f"RNaseH1_Krel_score_{exp}" for exp in ['R4a_krel', 'R4b_krel', 'R7_krel']]

In [338]:
df_rb1_bak = df_rb1.copy()
df_tp53bp1_bak = df_tp53bp1.copy()

In [339]:
# =========================
# External mRNA integration
# =========================
import pandas as pd
from pathlib import Path
import re

def _norm_rna_to_dna(seq: str) -> str:
    """Normalize RNA to DNA alphabet (U->T), uppercase, strip whitespace."""
    return str(seq).upper().replace('U', 'T').replace(' ', '').replace('\t', '').replace('\n', '')


def load_mrna_by_gene_from_files(files: list[str | Path],seq_column: str = "Original Transcript Sequence" ) -> dict[str, str]:
    """
    Load {Gene -> <seq_column>} from a manual list of CSV paths.
    - Expects columns: 'Gene' and <seq_column>
    - Returns DNA alphabet (A/C/G/T) after U->T via _norm_rna_to_dna
    - If multiple rows per gene: keeps the *longest* sequence
    """
    files = [Path(f) for f in files]
    rows = []
    for f in files:
        df = pd.read_csv(f, usecols=['Gene', seq_column])
        df[seq_column] = df[seq_column].map(_norm_rna_to_dna)
        # Keep only clean sequences
        df = df[df[seq_column].str.fullmatch(r'[ACGT]+', na=False)]
        rows.append(df)

    if not rows:
        return {}

    big = pd.concat(rows, ignore_index=True)
    big['len'] = big[seq_column].str.len()
    # Pick longest per gene
    chosen = big.sort_values(['Gene', 'len'], ascending=[True, False]).drop_duplicates('Gene')
    return dict(zip(chosen['Gene'], chosen[seq_column]))

# ---- Choose which mRNA to use for mRNA-based features (tAI/windows on mRNA, etc.) ----
def choose_preferred_mrna(gene_name: str, mrna_built_from_exons: str, gene_to_mrna_real: dict[str,str]) -> str:
    """
    Prefer the real (external) mRNA when available; otherwise fall back to exon-joined.
    Does NOT touch your genome->(mRNA/CDS) mappings or pre-mRNA flanks.
    """
    ext = gene_to_mrna_real.get(gene_name)
    return ext if ext else mrna_built_from_exons


In [340]:
from pathlib import Path
import pandas as pd
from asodesigner.consts import *

DATA_mRNA_PATH = PROJECT_PATH / "scripts" / "data_genertion" / "cell_line_expression"

FILENAMES = [
    "ACH-000232_transcriptome.csv",
    "ACH-000463_transcriptome.csv",
    "ACH-000739_transcriptome.csv",
    "ACH-001086_transcriptome.csv",
    "ACH-001188_transcriptome.csv",
    "ACH-001328_transcriptome.csv",
]

EXTERNAL_MRNA_FILES = [DATA_mRNA_PATH / fn for fn in FILENAMES]

In [341]:
d_orig = load_mrna_by_gene_from_files(
    [str(p) for p in EXTERNAL_MRNA_FILES],
    seq_column="Original Transcript sequence"
)

In [342]:

import pandas as pd

df1 = pd.read_csv(DATA_mRNA_PATH / "ACH-000232_transcriptome.csv")
df2 = pd.read_csv(DATA_mRNA_PATH / "ACH-000463_transcriptome.csv")
df3 = pd.read_csv(DATA_mRNA_PATH / "ACH-000739_transcriptome.csv")
df4 = pd.read_csv(DATA_mRNA_PATH / "ACH-001086_transcriptome.csv")
df5 = pd.read_csv(DATA_mRNA_PATH / "ACH-001188_transcriptome.csv")
df6 = pd.read_csv(DATA_mRNA_PATH / "ACH-001328_transcriptome.csv")
transcript_df = pd.concat([df1, df2, df3, df4, df5, df6], ignore_index=True)
transcript_df = transcript_df.drop_duplicates()

In [343]:
transcript_df = transcript_df.copy()

transcript_df.loc[:, "ref sequence"] = (transcript_df["Mutated Transcript sequence"].fillna(transcript_df["Original Transcript sequence"]))

In [344]:
# Build CAI reference weights directly from top-N transcript sequences (no CDS extraction)

from asodesigner.features.cai import calc_CAI_weight  # make sure cai.py is importable

TOP_N   = 300
SEQ_COL = "ref sequence"
EXPR_COL = "expression_norm"

# Basic checks
assert EXPR_COL in transcript_df.columns, f"Missing '{EXPR_COL}' column"
assert SEQ_COL  in transcript_df.columns, f"Missing '{SEQ_COL}' column"

# 1) Pick top-N by expression_norm
ref_df = transcript_df.sort_values(EXPR_COL, ascending=False).head(TOP_N).copy()

# 2) Take their sequences as-is (mRNA with U's; calc_CAI_weight handles U->T internally)
reference_seqs = ref_df[SEQ_COL].dropna().astype(str).tolist()

# 3) Build CAI weights
weights_list, weights_flat = calc_CAI_weight(reference_seqs)

print(f"Built CAI weights from {len(reference_seqs)} transcript sequences (top {TOP_N} by {EXPR_COL}).")

Built CAI weights from 300 transcript sequences (top 300 by expression_norm).


In [274]:
df_rb1_bak = df_rb1.copy()
df_tp53bp1_bak = df_tp53bp1.copy()

In [345]:
from pathlib import Path
import pandas as pd

DATA_mRNA_PATH = PROJECT_PATH / "scripts" / "data_genertion" / "cell_line_expression"

FILENAMES = [
    "ACH-000232_transcriptome.csv",
    "ACH-000463_transcriptome.csv",
    "ACH-000739_transcriptome.csv",
    "ACH-001086_transcriptome.csv",
    "ACH-001188_transcriptome.csv",
    "ACH-001328_transcriptome.csv",
]

EXTERNAL_MRNA_FILES = [DATA_mRNA_PATH / fn for fn in FILENAMES]
missing = [p.name for p in EXTERNAL_MRNA_FILES if not p.exists()]
assert not missing, f"Missing files in {DATA_mRNA_PATH}: {missing}"

# 1) Load Original
try:
    d_orig = load_mrna_by_gene_from_files(
        [str(p) for p in EXTERNAL_MRNA_FILES],
        seq_column="Original Transcript sequence"
    )
except ValueError:
    # in case the S is capitalized in your headers
    d_orig = load_mrna_by_gene_from_files(
        [str(p) for p in EXTERNAL_MRNA_FILES],
        seq_column="Original Transcript Sequence"
    )

# 2) Load Mutated
try:
    d_mut = load_mrna_by_gene_from_files(
        [str(p) for p in EXTERNAL_MRNA_FILES],
        seq_column="Mutated Transcript sequence"
    )
except ValueError:
    d_mut = load_mrna_by_gene_from_files(
        [str(p) for p in EXTERNAL_MRNA_FILES],
        seq_column="Mutated Transcript Sequence"
    )

# 3) Prefer Mutated when available
gene_to_mrna_real = {**d_orig, **d_mut}

print(f"Loaded {len(gene_to_mrna_real)} real mRNA sequences (Gene -> mRNA).")



Loaded 17546 real mRNA sequences (Gene -> mRNA).


In [348]:
from asodesigner.util import get_antisense
import numpy as np

# Column names
SENSE_LENGTH      = 'sense_length'         # Length of the ASO (nt)
SENSE_TYPE        = 'sense_type'           # exon / intron
CDS_SEQUENCE      = 'cds_sequence'         # CDS string (joined exons within CDS range)
IN_CODING_REGION  = 'in_coding_region'     # site is within CDS on a real exon

# Flank sizes
FLANK_SIZES_PREMRNA = [20, 30, 40, 50, 60, 70]
FLANK_SIZES_CDS     = [20, 30, 40, 50, 60, 70]

df_rb1[CDS_SEQUENCE]     = ""
df_rb1[IN_CODING_REGION] = False

for fs in FLANK_SIZES_PREMRNA:
    df_rb1[f"flank_sequence_{fs}"] = ""
for fs in FLANK_SIZES_CDS:
    df_rb1[f"local_coding_region_around_ASO_{fs}"] = ""

# ---- helpers (local to Part B) ----
def _to_str_seq(x) -> str:
    """
    Coerce sequence-like (list/np.array/Series) or string to a clean uppercase DNA string.
    Converts U->T and strips whitespace. Ensures slicing returns a plain string (avoids pandas iterable assignment).
    """
    if isinstance(x, str):
        s = x
    else:
        try:
            s = ''.join(list(x))
        except Exception:
            s = str(x)
    return s.replace(' ', '').replace('\t', '').replace('\n', '').replace('U', 'T').upper()

def _build_spliced_mrna_from_exons(pre_mrna: str, exon_indices):
    """
    Build exon-joined mRNA by concatenating exon slices out of pre_mrna.
    Keeps your original assumptions: pre_mrna corresponds to genomic strand and
    starts at exon_indices[0][0]; exon intervals are used directly.
    """
    if not exon_indices:
        return ""
    pre_genome_start = exon_indices[0][0]
    parts = []
    for exon_start, exon_end in exon_indices:
        pm_start = exon_start - pre_genome_start
        pm_end   = exon_end   - pre_genome_start
        parts.append(pre_mrna[pm_start:pm_end])
    return "".join(parts)

# Cache CDS per gene
gene_to_cds_info = {}

# ---- main loop ----
for index, row in df_rb1.iterrows():
    gene_name  = row[CANONICAL_GENE]
    locus_info = gene_to_data[gene_name]

    # Keep using your current pre-mRNA for flanks/exon-intron logic (coerced to clean string)
    pre_mrna  = _to_str_seq(locus_info.full_mrna)
    antisense = _to_str_seq(row[SEQUENCE])
    sense     = _to_str_seq(get_antisense(antisense))

    # Locate site on pre-mRNA
    idx = pre_mrna.find(sense)
    df_rb1.at[index, SENSE_LENGTH] = len(antisense)

    if idx != -1:
        # Genomic correction (kept as-is)
        genome_corrected_index = idx + locus_info.exon_indices[0][0]

        # pre-mRNA flanks (now using .at and guaranteed string slices)
        for fs in FLANK_SIZES_PREMRNA:
            flank_start = max(0, idx - fs)
            flank_end   = min(len(pre_mrna), idx + len(sense) + fs)
            flank_seq   = pre_mrna[flank_start:flank_end]
            df_rb1.at[index, f"flank_sequence_{fs}"] = flank_seq

        # Build CDS + genome->mRNA map (kept identical to your approach)
        if gene_name not in gene_to_cds_info:
            cds_seq = []  # build as list for speed, join at end
            genome_to_mrna_map = {}
            mrna_idx = 0
            for exon_start, exon_end in locus_info.exon_indices:
                for gpos in range(exon_start, exon_end):
                    if mrna_idx >= len(pre_mrna):
                        break
                    if locus_info.cds_start <= gpos <= locus_info.cds_end:
                        cds_seq.append(pre_mrna[mrna_idx])
                        genome_to_mrna_map[gpos] = len(cds_seq) - 1
                    mrna_idx += 1
            cds_seq = ''.join(cds_seq)
            gene_to_cds_info[gene_name] = (cds_seq, genome_to_mrna_map)
        else:
            cds_seq, genome_to_mrna_map = gene_to_cds_info[gene_name]

        # Save CDS
        df_rb1.at[index, CDS_SEQUENCE] = _to_str_seq(cds_seq)

        #  NEW: prefer real mRNA for mRNA-based features (fallback to exon-joined)
        mrna_built        = _build_spliced_mrna_from_exons(pre_mrna, locus_info.exon_indices)
        mrna_for_features = choose_preferred_mrna(gene_name, mrna_built, gene_to_mrna_real)

        # If within CDS, extract local CDS context (unchanged logic; .at + str)
        if (
            locus_info.cds_start <= genome_corrected_index <= locus_info.cds_end
            and genome_corrected_index in genome_to_mrna_map
        ):
            df_rb1.at[index, IN_CODING_REGION] = True
            cds_idx = genome_to_mrna_map[genome_corrected_index]
            for fs in FLANK_SIZES_CDS:
                start = max(0, cds_idx - fs)
                end   = min(len(cds_seq), cds_idx + len(sense) + fs)
                local_seq = cds_seq[start:end]
                df_rb1.at[index, f"local_coding_region_around_ASO_{fs}"] = _to_str_seq(local_seq)



In [346]:
from asodesigner.util import get_antisense
import numpy as np

# Column names
SENSE_LENGTH      = 'sense_length'         # Length of the ASO (nt)
SENSE_TYPE        = 'sense_type'           # exon / intron
CDS_SEQUENCE      = 'cds_sequence'         # CDS string (joined exons within CDS range)
IN_CODING_REGION  = 'in_coding_region'     # site is within CDS on a real exon

# Flank sizes
FLANK_SIZES_PREMRNA = [20, 30, 40, 50, 60, 70]
FLANK_SIZES_CDS     = [20, 30, 40, 50, 60, 70]

df_tp53bp1[CDS_SEQUENCE]     = ""
df_tp53bp1[IN_CODING_REGION] = False

for fs in FLANK_SIZES_PREMRNA:
    df_tp53bp1[f"flank_sequence_{fs}"] = ""
for fs in FLANK_SIZES_CDS:
    df_tp53bp1[f"local_coding_region_around_ASO_{fs}"] = ""

# ---- helpers (local to Part B) ----
def _to_str_seq(x) -> str:
    """
    Coerce sequence-like (list/np.array/Series) or string to a clean uppercase DNA string.
    Converts U->T and strips whitespace. Ensures slicing returns a plain string (avoids pandas iterable assignment).
    """
    if isinstance(x, str):
        s = x
    else:
        try:
            s = ''.join(list(x))
        except Exception:
            s = str(x)
    return s.replace(' ', '').replace('\t', '').replace('\n', '').replace('U', 'T').upper()

def _build_spliced_mrna_from_exons(pre_mrna: str, exon_indices):
    """
    Build exon-joined mRNA by concatenating exon slices out of pre_mrna.
    Keeps your original assumptions: pre_mrna corresponds to genomic strand and
    starts at exon_indices[0][0]; exon intervals are used directly.
    """
    if not exon_indices:
        return ""
    pre_genome_start = exon_indices[0][0]
    parts = []
    for exon_start, exon_end in exon_indices:
        pm_start = exon_start - pre_genome_start
        pm_end   = exon_end   - pre_genome_start
        parts.append(pre_mrna[pm_start:pm_end])
    return "".join(parts)

# Cache CDS per gene
gene_to_cds_info = {}

# ---- main loop ----
for index, row in df_tp53bp1.iterrows():
    gene_name  = row[CANONICAL_GENE]
    locus_info = gene_to_data[gene_name]

    # Keep using your current pre-mRNA for flanks/exon-intron logic (coerced to clean string)
    pre_mrna  = _to_str_seq(locus_info.full_mrna)
    antisense = _to_str_seq(row[SEQUENCE])
    sense     = _to_str_seq(get_antisense(antisense))

    # Locate site on pre-mRNA
    idx = pre_mrna.find(sense)
    df_tp53bp1.at[index, SENSE_LENGTH] = len(antisense)

    if idx != -1:
        # Genomic correction (kept as-is)
        genome_corrected_index = idx + locus_info.exon_indices[0][0]

        # pre-mRNA flanks (now using .at and guaranteed string slices)
        for fs in FLANK_SIZES_PREMRNA:
            flank_start = max(0, idx - fs)
            flank_end   = min(len(pre_mrna), idx + len(sense) + fs)
            flank_seq   = pre_mrna[flank_start:flank_end]
            df_tp53bp1.at[index, f"flank_sequence_{fs}"] = flank_seq

        # Build CDS + genome->mRNA map (kept identical to your approach)
        if gene_name not in gene_to_cds_info:
            cds_seq = []  # build as list for speed, join at end
            genome_to_mrna_map = {}
            mrna_idx = 0
            for exon_start, exon_end in locus_info.exon_indices:
                for gpos in range(exon_start, exon_end):
                    if mrna_idx >= len(pre_mrna):
                        break
                    if locus_info.cds_start <= gpos <= locus_info.cds_end:
                        cds_seq.append(pre_mrna[mrna_idx])
                        genome_to_mrna_map[gpos] = len(cds_seq) - 1
                    mrna_idx += 1
            cds_seq = ''.join(cds_seq)
            gene_to_cds_info[gene_name] = (cds_seq, genome_to_mrna_map)
        else:
            cds_seq, genome_to_mrna_map = gene_to_cds_info[gene_name]

        # Save CDS
        df_tp53bp1.at[index, CDS_SEQUENCE] = _to_str_seq(cds_seq)

        #  NEW: prefer real mRNA for mRNA-based features (fallback to exon-joined)
        mrna_built        = _build_spliced_mrna_from_exons(pre_mrna, locus_info.exon_indices)
        mrna_for_features = choose_preferred_mrna(gene_name, mrna_built, gene_to_mrna_real)

        # If within CDS, extract local CDS context (unchanged logic; .at + str)
        if (
            locus_info.cds_start <= genome_corrected_index <= locus_info.cds_end
            and genome_corrected_index in genome_to_mrna_map
        ):
            df_tp53bp1.at[index, IN_CODING_REGION] = True
            cds_idx = genome_to_mrna_map[genome_corrected_index]
            for fs in FLANK_SIZES_CDS:
                start = max(0, cds_idx - fs)
                end   = min(len(cds_seq), cds_idx + len(sense) + fs)
                local_seq = cds_seq[start:end]
                df_tp53bp1.at[index, f"local_coding_region_around_ASO_{fs}"] = _to_str_seq(local_seq)



In [347]:
df_tp53bp1

Unnamed: 0,Sequence,sense_start,sense_length,sense_start_from_end,Cell line organism,Inhibition(%),Canonical Gene Name,Treatment_Period(hours),ASO_volume(nM),log_volume,normalized_start,normalized_sense_start_from_end,self_energy,internal_fold,gc_content,gc_content_3_prime_5,homooligo_count,hairpin_score,gc_skew,at_skew,nucleotide_diversity,stop_codon_count,at_rich_region_score,poly_pyrimidine_stretch,on_target_fold_openness40_15,on_target_fold_openness_normalized40_15,exp_ps_hybr,Modification_min_distance_to_3prime,pre_mrna_sequence,sense_with_flank_120nt,sense_avg_accessibility,RNaseH1_Krel_score_R4a_krel,RNaseH1_Krel_score_R4b_krel,RNaseH1_Krel_score_R7_krel,cds_sequence,in_coding_region,flank_sequence_20,flank_sequence_30,flank_sequence_40,flank_sequence_50,flank_sequence_60,flank_sequence_70,local_coding_region_around_ASO_20,local_coding_region_around_ASO_30,local_coding_region_around_ASO_40,local_coding_region_around_ASO_50,local_coding_region_around_ASO_60,local_coding_region_around_ASO_70
0,CACCAATGGGTCCCCCAGAG,0,20,107668,human,0,TP53BP1,24,1000,6.907755,0.000000,1.000000,-5351.056145,-3.6,0.65,0.6,0.380952,0.10,-0.230769,0.428571,0.6875,0.0,0.00,0.05,-15.775000,-0.788750,-3031,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTG,8.666435,1.014921,0.999820,1.000072,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,TCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCT,CACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTA,CTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATT,TAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTA,CTATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATA,GATTAACAACCTATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATTAGGGATT,CTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTG,GTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGG,CTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCA,GTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCA,GTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTA,TTGAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTTTGT
1,CCACCAATGGGTCCCCCAGA,1,20,107667,human,0,TP53BP1,24,1000,6.907755,0.000009,0.999991,-5351.056145,-3.6,0.65,0.8,0.380952,0.10,-0.384615,0.428571,0.6875,0.0,0.00,0.05,-15.790000,-0.789500,-3243,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGC,8.932784,0.966488,1.012563,0.988831,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,CAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTA,ACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAA,TGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTA,AGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAA,TATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAA,ATTAACAACCTATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATTAGGGATTA,TCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGA,TGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGA,TGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCAC,TCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCAC,TCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAA,TGAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTTTGTA
2,CCCACCAATGGGTCCCCCAG,2,20,107666,human,0,TP53BP1,24,1000,6.907755,0.000019,0.999981,-5351.056145,-3.8,0.70,0.8,0.523810,0.15,-0.428571,0.333333,0.6250,0.0,0.00,0.05,-15.805000,-0.790250,-3363,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCC,9.260415,0.984112,1.035757,0.951979,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,AAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAG,CTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAAT,GGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAG,GCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAG,ATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAAT,TTAACAACCTATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATTAGGGATTAA,CGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAG,GGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGAC,GGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACC,CGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACG,CTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAAT,GAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTTTGTAT
3,CCCCACCAATGGGTCCCCCA,3,20,107665,human,0,TP53BP1,24,1000,6.907755,0.000028,0.999972,-5351.056145,-4.2,0.70,0.8,0.571429,0.15,-0.571429,0.333333,0.5625,0.0,0.00,0.10,-15.820000,-0.791000,-3575,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCT,9.674513,0.964350,1.046950,0.964403,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,AAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGA,TTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATA,GGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGA,CATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGG,TTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATT,TAACAACCTATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATTAGGGATTAAA,GGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGT,GCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACT,GAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCT,GCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGC,TCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATT,AGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTTTGTATT
4,GCCCCACCAATGGGTCCCCC,4,20,107664,human,0,TP53BP1,24,1000,6.907755,0.000037,0.999963,-5351.056145,-4.2,0.75,1.0,0.571429,0.10,-0.466667,0.200000,0.6250,0.0,0.00,0.10,-15.835000,-0.791750,-3611,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTC,9.884820,0.962133,0.972448,0.996025,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,AAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGAC,TATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATAC,GCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGAC,ATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGA,TCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATTA,AACAACCTATTCATTTTAGCATCTGGCTGGGCAGGGCACTTATACCTCAAAAATGAAGAAATTGGAACCCAGACTATTTGAGAAAAATTAGAGAAACACACTGCCTAGACAGGGTAATACTTCATTAGACAAATTAAGGAAATATAATTAGGGATTAAAA,GCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTA,CGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTA,AGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTG,CCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCC,CGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTT,GGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTTTGTATTT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,TCACACCCAGTAGTAAAACT,99995,20,7673,human,0,TP53BP1,24,1000,6.907755,0.928735,0.071265,-1481.852458,0.0,0.40,0.4,0.333333,0.00,-0.500000,0.333333,0.5625,0.1,0.05,0.00,-4.833333,-0.241667,-2019,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,TTGGCTAGTCAGAACTAAGTGGAGATGGAGACCCCTGACCTGGGGCCTCGGCGGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTT...,4.374578,0.999968,1.032662,0.982713,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,GTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACC,AGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAAC,AGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACT,GCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTT,AGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAA,GCGGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAATTTGTGATA,GCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCC,GTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAG,CCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTAC,GCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGC,GTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCC,GTTTTTTGAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTT
496,TTCACACCCAGTAGTAAAAC,99996,20,7672,human,0,TP53BP1,24,1000,6.907755,0.928744,0.071256,-1069.506229,0.0,0.40,0.4,0.333333,0.00,-0.500000,0.333333,0.5625,0.1,0.05,0.00,-4.787500,-0.239375,-1939,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,TGGCTAGTCAGAACTAAGTGGAGATGGAGACCCCTGACCTGGGGCCTCGGCGGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTT...,4.348574,0.959126,1.010640,1.002306,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,TCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCG,GGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACC,GGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTG,CAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTG,GCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAA,CGGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAATTTGTGATAA,CGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCT,TGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGC,CAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACA,CTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCC,TGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCA,TTTTTTGAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTT
497,CTTCACACCCAGTAGTAAAA,99997,20,7671,human,0,TP53BP1,24,1000,6.907755,0.928753,0.071247,-1069.506229,0.0,0.40,0.4,0.333333,0.00,-0.500000,0.333333,0.6250,0.1,0.05,0.05,-4.741667,-0.237083,-1976,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,GGCTAGTCAGAACTAAGTGGAGATGGAGACCCCTGACCTGGGGCCTCGGCGGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTA...,4.294734,1.001588,1.011691,0.968183,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,CATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGG,GGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCT,GGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGG,AGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGG,CTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAAT,GGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAATTTGTGATAAA,GATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTC,GCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCT,AGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAG,TCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCA,GGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAG,TTTTTGAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTT
498,CCTTCACACCCAGTAGTAAA,99998,20,7670,human,0,TP53BP1,24,1000,6.907755,0.928762,0.071238,-1069.506229,0.0,0.45,0.6,0.285714,0.00,-0.555556,0.272727,0.6250,0.1,0.05,0.05,-4.695833,-0.234792,-2268,0.0,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,GCTAGTCAGAACTAAGTGGAGATGGAGACCCCTGACCTGGGGCCTCGGCGGTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAA...,4.403815,0.967296,0.975422,0.985045,CTCTGGGGGACCCATTGGTGGGGCGAGGCGGTTGCCTCGGCGCCGGAAGAAGCTTCTAGTTAAGATGTATCCCTGATAAGTACTGTCTGGCAATTCCGCAGCCTCGCCCGAACTGAGGGGAGCCATCTTGTCCCTCTCTGCCTCCGAGTTTGTATCGTTGCCTCCATCTGTGTCTGTGTGTCTGTATTATCGTTGT...,True,ATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGC,GGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTT,GCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGA,GACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGG,TCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAATT,GTGCCTGAGCTCCTCCAGCAGACAACGAGGGCACTGAAGGGGGAGCCGTCATAAGGGATCTGCTCTAAGTTTTACTACTGGGTGTGAAGGGGAAGGGGAGTGGTACCGGCATTGAACCTTATGCACTGGATTGGGTTGGGTTTTAAAATTTGTGATAAAA,ATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCC,CAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTG,GGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGG,CTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCAC,GAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGC,TTTTGAGGTGGAGTCTCGCTCTGTCGCCCAGGCTGGAGTGCAGTGGCGCGATCTCGGCTCACTGCAAGCTCTGCCTCCCGGGTTCACGCCATTCTACTGCCTCAGCCTCCTGAGTAGCTGGGACTACAGGCACCTGCCACCACGCCCAGCTAATTTTTTT


In [349]:

from sklearn.preprocessing import StandardScaler

# Define flank sizes
CDS_WINDOWS = FLANK_SIZES_CDS

# Loop over each flank window size
for flank in CDS_WINDOWS:
    local_col = f"local_coding_region_around_ASO_{flank}"
    is_local_flag_col = f"region_is_local_{flank}"

    # Create the binary flag: 1 if local exists, 0 otherwise
    df_tp53bp1[is_local_flag_col] = df_tp53bp1[local_col].apply(
        lambda x: isinstance(x, str) and x.strip() != ""
    ).astype(int)

    # Create the binary flag: 1 if local exists, 0 otherwise
    df_rb1[is_local_flag_col] = df_rb1[local_col].apply(
        lambda x: isinstance(x, str) and x.strip() != ""
    ).astype(int)



In [350]:
from asodesigner.features.cai import calc_CAI

# weights_list, weights_flat = calc_CAI_weight(reference_seqs)

for flank in CDS_WINDOWS:
    local_col = f"local_coding_region_around_ASO_{flank}"
    CAI_col   = f"CAI_score_{flank}_CDS"
    df_rb1[CAI_col] = (
        df_rb1[local_col].astype(str).apply(lambda s: calc_CAI(s, weights_flat))
    )
    df_tp53bp1[CAI_col] = (
        df_tp53bp1[local_col].astype(str).apply(lambda s: calc_CAI(s, weights_flat))
    )
df_rb1["CAI_score_global_CDS"] = (
    df_rb1["cds_sequence"].astype(str).apply(lambda s: calc_CAI(s, weights_flat))
)
df_tp53bp1["CAI_score_global_CDS"] = (
    df_tp53bp1["cds_sequence"].astype(str).apply(lambda s: calc_CAI(s, weights_flat))
)


CAI_list = [f"CAI_score_{flank}_CDS" for flank in CDS_WINDOWS] + ["CAI_score_global_CDS"]

In [353]:
df_rb1_bak = df_rb1.copy()
df_tp53bp1_bak = df_tp53bp1.copy()

In [357]:
df_tp53bp1 = df_tp53bp1_bak

In [364]:
SENSE_START = 'sense_start'
SENSE_START_FROM_END = 'sense_start_from_end'
SENSE_LENGTH = 'sense_length'
SENSE_TYPE = 'sense_type'
SENSE_EXON = 'sense_exon'
SENSE_INTRON = 'sense_intron'
SENSE_UTR = 'sense_utr'

for index, row in df_tp53bp1.iterrows():
    gene_name = row[CANONICAL_GENE]
    locus_info = gene_to_data[gene_name]
    pre_mrna = locus_info.full_mrna
    antisense = row[SEQUENCE]
    idx = row[SENSE_START]
    df_tp53bp1.at[index, SENSE_START_FROM_END] = np.abs(
        locus_info.exon_indices[-1][1] - locus_info.cds_start - idx
    )
    if idx != -1:
        genome_corrected_index = idx + locus_info.cds_start
        found = False
        for exon_indices in locus_info.exon_indices:
            # print(exon[0], exon[1])
            if exon_indices[0] <= genome_corrected_index <= exon_indices[1]:
                df_tp53bp1.at[index, SENSE_TYPE] = 'exon'
                df_tp53bp1.at[index, SENSE_EXON] = 1
                found = True
                break
        for intron_indices in locus_info.intron_indices:
            # print(exon[0], exon[1])
            if intron_indices[0] <= genome_corrected_index <= intron_indices[1]:
                df_tp53bp1.at[index, SENSE_TYPE] = 'intron'
                df_tp53bp1.at[index, SENSE_INTRON] = 1
                found = True
                break
        for i, utr_indices in enumerate(locus_info.utr_indices):
                if utr_indices[0] <= genome_corrected_index <= utr_indices[1]:
                    df_tp53bp1.at[index, SENSE_TYPE] = 'utr'
                    df_tp53bp1.at[index, SENSE_UTR] = 1

                    found = True
                    break
    if not found:
        df_tp53bp1.loc[index, SENSE_TYPE] = 'intron'


In [371]:
SENSE_START = 'sense_start'
SENSE_START_FROM_END = 'sense_start_from_end'
SENSE_LENGTH = 'sense_length'
SENSE_TYPE = 'sense_type'
SENSE_EXON = 'sense_exon'
SENSE_INTRON = 'sense_intron'
SENSE_UTR = 'sense_utr'

for index, row in df_rb1.iterrows():
    gene_name = row[CANONICAL_GENE]
    locus_info = gene_to_data[gene_name]
    pre_mrna = locus_info.full_mrna
    antisense = row[SEQUENCE]
    idx = row[SENSE_START]
    df_rb1.at[index, SENSE_START_FROM_END] = np.abs(
        locus_info.exon_indices[-1][1] - locus_info.cds_start - idx
    )
    if idx != -1:
        genome_corrected_index = idx + locus_info.cds_start
        found = False
        for exon_indices in locus_info.exon_indices:
            # print(exon[0], exon[1])
            if exon_indices[0] <= genome_corrected_index <= exon_indices[1]:
                df_rb1.at[index, SENSE_TYPE] = 'exon'
                df_rb1.at[index, SENSE_EXON] = 1
                found = True
                break
        for intron_indices in locus_info.intron_indices:
            # print(exon[0], exon[1])
            if intron_indices[0] <= genome_corrected_index <= intron_indices[1]:
                df_rb1.at[index, SENSE_TYPE] = 'intron'
                df_rb1.at[index, SENSE_INTRON] = 1
                found = True
                break
        for i, utr_indices in enumerate(locus_info.utr_indices):
                if utr_indices[0] <= genome_corrected_index <= utr_indices[1]:
                    df_rb1.at[index, SENSE_TYPE] = 'utr'
                    df_rb1.at[index, SENSE_UTR] = 1

                    found = True
                    break
    if not found:
        df_rb1.loc[index, SENSE_TYPE] = 'intron'


In [372]:
df_tp53bp1_scores = model.predict(df_tp53bp1[selected_features])
df_rb1_scores = model.predict(df_rb1[selected_features])


In [374]:
# Assuming you already have get_antisense(seq: str) -> str defined
# get_antisense, for some reason numba doesn't work well
tbl = str.maketrans("ACGTUacgtuNn", "TGCAAtgcaaNn")

df_tp53bp1["score"] = df_tp53bp1_scores
df_rb1["score"] = df_rb1_scores

df_tp53bp1["sense"] = df_tp53bp1[SEQUENCE].astype(str).str.translate(tbl).str[::-1]
(
    df_tp53bp1.assign(score=df_tp53bp1_scores)
    .sort_values("score", ascending=False)  # sort by score
    .to_csv("df_tp53bp1_scores_model2.csv", index=False)
)

df_rb1["sense"] = df_rb1[SEQUENCE].astype(str).str.translate(tbl).str[::-1]
(
    df_rb1.assign(score=df_rb1_scores)
    .sort_values("score", ascending=False)  # sort by score
    .to_csv("df_rb1_scores_model2.csv", index=False)
)

df_tp53bp1_sorted = df_tp53bp1.sort_values('score', ascending=False)
df_rb1_sorted = df_rb1.sort_values('score', ascending=False)

In [375]:
df_tp53bp1_sorted.insert(0, "row_number", range(1, len(df_tp53bp1_sorted) + 1))
df_rb1_sorted.insert(0, "row_number", range(1, len(df_rb1_sorted) + 1))

In [380]:
# pip install requests
import math, time, threading, urllib.parse, requests
from concurrent.futures import ThreadPoolExecutor, as_completed

UA = {"User-Agent": "python-requests gggenome/greedy"}


def _ggg_hits_leq_json(seq, k, db="hg38", timeout=60, retries=2):
    """Count hits with <=k mismatches via GGGenome JSON; fallback to CSV if needed."""
    s = str(seq).upper().replace("U", "T")
    q = urllib.parse.quote(s)
    url_json = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.json"
    url_csv = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.csv?download"

    for attempt in range(retries + 1):
        try:
            r = requests.get(url_json, headers=UA, timeout=timeout)
            r.raise_for_status()
            try:
                data = r.json()
            except ValueError:
                raise RuntimeError("JSON parse failed")
            if isinstance(data, list):
                return len(data)
            if isinstance(data, dict):
                if "results" in data and isinstance(data["results"], list): return len(data["results"])
                if "hits" in data and isinstance(data["hits"], list):       return len(data["hits"])
                return sum(len(v) for v in data.values() if isinstance(v, list))
            return 10000
        except Exception:
            # greedy CSV fallback
            try:
                r2 = requests.get(url_csv, headers=UA, timeout=timeout)
                r2.raise_for_status()
                return sum(1 for ln in r2.text.splitlines() if ln and not ln.startswith("#"))
            except Exception:
                if attempt < retries:
                    continue
                return 10000
    return 0


def _d123_for_sequence(seq, db="hg38"):
    s = str(seq).upper().replace("U", "T")
    if not s:
        return (s, 0, 0, 0)
    L = len(s)
    k_allowed = max(0, math.floor(0.25 * L))  # GGGenome cap
    k0 = _ggg_hits_leq_json(s, 0, db=db)
    k1 = _ggg_hits_leq_json(s, 1, db=db) if k_allowed >= 1 else 0
    # k2 = _ggg_hits_leq_json(s, 2, db=db) if k_allowed >= 2 else 0
    # k3 = _ggg_hits_leq_json(s, 3, db=db) if k_allowed >= 3 else 0
    d1 = max(0, k1 - k0)
    # d2 = max(0, k2 - k1)
    # d3 = max(0, k3 - k2)
    return (s, d1, 0, 0, k0)


cache = {}


def add_gggenome_d123(main_df, seq_col="SEQUENCE", db="hg38", *, max_workers=32, print_every=10):
    seqs = (main_df[seq_col].astype(str).str.upper().str.replace("U", "T", regex=False))
    uniq = seqs.dropna().unique().tolist()
    N = len(uniq)
    print(f"[GGG] Unique sequences: {N} | db={db} | workers={max_workers}")

    global cache
    cache = {}
    lock = threading.Lock()
    t0 = time.perf_counter()
    errs = 0

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(_d123_for_sequence, s, db): s for s in uniq}
        done = 0
        for fut in as_completed(futs):
            s = futs[fut]
            try:
                s_key, d1, d2, d3, d0 = fut.result()
            except Exception:
                d0 = d1 = d2 = d3 = 0
                with lock:
                    errs += 1
            with lock:
                cache[s] = (s, d1, d2, d3, d0)
                done += 1
                if (done == 1) or (done % print_every == 0) or (done == N):
                    elapsed = time.perf_counter() - t0
                    rps = done / elapsed if elapsed > 0 else 0.0
                    print(f"[GGG] {done}/{N} cached | ~{rps:.1f} seq/s | errors={errs}")

    main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[1])
    main_df["ggg_d0"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[4])
    # main_df["ggg_d2"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[2])
    # main_df["ggg_d3"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[3])

    print(f"[GGG] Finished in {time.perf_counter() - t0:.1f}s. Added columns: ggg_d1, ggg_d2, ggg_d3")
    return main_df




In [381]:
# --- usage ---
# main_df = main_df[main_df[SENSE_START] != -1]
result = add_gggenome_d123(df_tp53bp1_sorted[:150], seq_col='sense', db="hg38", max_workers=1, print_every=2)

[GGG] Unique sequences: 149 | db=hg38 | workers=1
[GGG] 1/149 cached | ~0.2 seq/s | errors=0
[GGG] 2/149 cached | ~0.2 seq/s | errors=0
[GGG] 4/149 cached | ~0.2 seq/s | errors=0
[GGG] 6/149 cached | ~0.3 seq/s | errors=0
[GGG] 8/149 cached | ~0.3 seq/s | errors=0
[GGG] 10/149 cached | ~0.3 seq/s | errors=0
[GGG] 12/149 cached | ~0.3 seq/s | errors=0
[GGG] 14/149 cached | ~0.3 seq/s | errors=0
[GGG] 16/149 cached | ~0.2 seq/s | errors=0
[GGG] 18/149 cached | ~0.2 seq/s | errors=0
[GGG] 20/149 cached | ~0.2 seq/s | errors=0
[GGG] 22/149 cached | ~0.2 seq/s | errors=0
[GGG] 24/149 cached | ~0.2 seq/s | errors=0
[GGG] 26/149 cached | ~0.2 seq/s | errors=0
[GGG] 28/149 cached | ~0.2 seq/s | errors=0
[GGG] 30/149 cached | ~0.2 seq/s | errors=0
[GGG] 32/149 cached | ~0.2 seq/s | errors=0
[GGG] 34/149 cached | ~0.2 seq/s | errors=0
[GGG] 36/149 cached | ~0.2 seq/s | errors=0
[GGG] 38/149 cached | ~0.2 seq/s | errors=0
[GGG] 40/149 cached | ~0.2 seq/s | errors=0
[GGG] 42/149 cached | ~0.2 seq/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d0"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[4])


In [393]:
result[['row_number', SEQUENCE, 'sense_start', 'ggg_d0', 'ggg_d1', 'sense_avg_accessibility', 'on_target_fold_openness_normalized40_15', 'at_skew', 'gc_content', 'sense_utr']]

Unnamed: 0,row_number,Sequence,sense_start,ggg_d0,ggg_d1,sense_avg_accessibility,on_target_fold_openness_normalized40_15,at_skew,gc_content,sense_utr
21,1,TTTCTCTTTTGTAGTTTCGG,105521,1,1,1.102138,-0.015792,-0.846154,0.35,1.0
119,2,TTTCAATTTCCTTGGCCAGC,103619,1,0,5.867987,-0.150000,-0.454545,0.45,1.0
20,3,TTCTCTTTTGTAGTTTCGGG,105520,1,3,1.356582,-0.013125,-0.833333,0.40,1.0
497,4,CTTTCGGGTAGGGGTTGTGC,94997,1,0,1.472698,-0.146750,-0.750000,0.60,1.0
498,5,TCTTTCGGGTAGGGGTTGTG,94998,1,0,1.493798,-0.145750,-0.777778,0.55,1.0
...,...,...,...,...,...,...,...,...,...,...
257,146,TCTGAGCATTCTTTTTTTGG,54257,1,0,3.496343,-0.109458,-0.692308,0.35,1.0
144,147,ATTTGCAGGAGGTAATGTTG,5644,1,1,1.167447,-0.005250,-0.166667,0.40,1.0
105,148,TAACCAAGTATTGTTTTCAT,97105,1,2,3.620442,-0.130833,-0.200000,0.25,1.0
381,149,GTTATTTTGTTTTTGGAGCT,56381,1,4,1.303577,-0.029750,-0.714286,0.30,1.0


In [394]:
result_rb1 = add_gggenome_d123(df_rb1_sorted[:150], seq_col='sense', db="hg38", max_workers=1, print_every=2)

[GGG] Unique sequences: 150 | db=hg38 | workers=1
[GGG] 1/150 cached | ~0.3 seq/s | errors=0
[GGG] 2/150 cached | ~0.2 seq/s | errors=0
[GGG] 4/150 cached | ~0.2 seq/s | errors=0
[GGG] 6/150 cached | ~0.1 seq/s | errors=0
[GGG] 8/150 cached | ~0.1 seq/s | errors=0
[GGG] 10/150 cached | ~0.0 seq/s | errors=0
[GGG] 12/150 cached | ~0.0 seq/s | errors=0
[GGG] 14/150 cached | ~0.0 seq/s | errors=0
[GGG] 16/150 cached | ~0.0 seq/s | errors=0
[GGG] 18/150 cached | ~0.0 seq/s | errors=0
[GGG] 20/150 cached | ~0.0 seq/s | errors=0
[GGG] 22/150 cached | ~0.0 seq/s | errors=0
[GGG] 24/150 cached | ~0.0 seq/s | errors=0
[GGG] 26/150 cached | ~0.0 seq/s | errors=0
[GGG] 28/150 cached | ~0.0 seq/s | errors=0
[GGG] 30/150 cached | ~0.0 seq/s | errors=0
[GGG] 32/150 cached | ~0.0 seq/s | errors=0
[GGG] 34/150 cached | ~0.0 seq/s | errors=0
[GGG] 36/150 cached | ~0.0 seq/s | errors=0
[GGG] 38/150 cached | ~0.0 seq/s | errors=0
[GGG] 40/150 cached | ~0.0 seq/s | errors=0
[GGG] 42/150 cached | ~0.0 seq/

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d1"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[1])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  main_df["ggg_d0"] = seqs.map(lambda s: cache.get(s, (s, 0, 0, 0, 0))[4])


In [None]:
r

In [395]:
tresult_rb1[['row_number', SEQUENCE, 'sense_start', 'ggg_d0', 'ggg_d1', 'sense_avg_accessibility', 'on_target_fold_openness_normalized40_15', 'at_skew', 'gc_content', 'sense_utr']]

Unnamed: 0,row_number,Sequence,sense_start,ggg_d0,ggg_d1,sense_avg_accessibility,on_target_fold_openness_normalized40_15,at_skew,gc_content,sense_utr
436,1,TTTTGCATATCCCCTGTTCT,119936,1,0,1.485608,-0.015625,-0.666667,0.40,1.0
241,2,ATTTGATTTAGTTTTTTATT,262741,1,19,0.054253,-0.003500,-0.555556,0.10,1.0
244,3,TGGATTTGATTTAGTTTTTT,262744,1,7,0.132395,-0.005250,-0.625000,0.20,1.0
236,4,ATTTAGTTTTTTATTGTTTT,262736,1,44,0.056511,-0.000583,-0.666667,0.10,1.0
234,5,TTAGTTTTTTATTGTTTTTT,262734,2,105,0.057739,-0.001833,-0.777778,0.10,1.0
...,...,...,...,...,...,...,...,...,...,...
491,146,TACTATTTTGATTTTCTGTA,106991,1,11,1.128023,-0.014167,-0.500000,0.20,1.0
409,147,GCATTGACTTCTGTTTTTAC,254409,1,2,1.642736,-0.000250,-0.538462,0.35,1.0
41,148,TGATTTTGATGATTTATGTC,254541,1,4,1.309937,0.000000,-0.466667,0.25,1.0
90,149,TATTTATTGCTATTTCTGAG,265090,1,3,1.064463,-0.096042,-0.466667,0.25,1.0
