In [1]:
from scripts.data_genertion.consts import *
from scripts.features.feature_extraction import load_all_features

# Build the working frame from the project's feature loader.
# NOTE(review): presumably this returns one row per ASO with all saved features — confirm in feature_extraction.
main_df = load_all_features()
# Force the sequence column to str so later string operations behave uniformly.
main_df[SEQUENCE] = main_df[SEQUENCE].astype(str)

In [2]:
from asodesigner.read_human_genome import get_locus_to_data_dict
import pickle
from asodesigner.consts import CACHE_DIR

# Genes for which locus data is requested from the genome reader.
genes_u = [
    'HIF1A', 'APOL1', 'YAP1', 'SOD1', 'SNCA',
    'IRF4', 'KRAS', 'KLKB1', 'SNHG14', 'DGAT2',
    'IRF5', 'HTRA1', 'MYH7', 'MALAT1', 'HSD17B13',
]

# The locus lookup is persisted as a pickle so later runs can skip it.
# NOTE(review): the cache file name does not depend on genes_u, so an existing
# pickle is loaded as-is even if the gene list above has changed.
cache_path = CACHE_DIR / 'gene_to_data_simple_cache.pickle'
if cache_path.exists():
    # Cache hit: reuse the result of an earlier run (only load pickles you trust).
    with open(cache_path, 'rb') as f:
        gene_to_data = pickle.load(f)
else:
    # Cache miss: compute the mapping (introns included) and store it.
    gene_to_data = get_locus_to_data_dict(include_introns=True, gene_subset=genes_u)
    with open(cache_path, 'wb') as f:
        pickle.dump(gene_to_data, f)

In [3]:
from scripts.data_genertion.data_handling import get_populated_df_with_structure_features

# Rebind main_df to the helper's result, computed from the frame, the gene list and the per-gene locus data.
# NOTE(review): judging by its name the helper adds structure features and fills position columns
# such as SENSE_START (used by the next cell) — confirm in data_handling.
main_df = get_populated_df_with_structure_features(main_df, genes_u, gene_to_data)

In [4]:
# Keep only the ASOs that were found (rows with SENSE_START == -1 are dropped).
# .copy() gives the filtered frame its own data, so the feature columns that
# later cells assign onto main_df do not trigger SettingWithCopyWarning.
main_df = main_df[main_df[SENSE_START] != -1].copy()
main_df[SEQUENCE]


0        GCTAAAACAAATGCTA
1        TATAATGGTGAATATC
2        GCATGAAGATTTCTGG
3        GGTGAATATCTTCAAA
4        CACTTGTACTAGTATG
               ...       
34760    GTTATGAAATTATTGG
34761    ATTCTATTAGAGGGCT
34762    GCTTTAAACTCAGGTG
34763    CGTCAATATATTCTTT
34764    TTTTGTAAGTGCAACC
Name: Sequence, Length: 29987, dtype: object

In [None]:
# pip install requests
import math, time, threading, urllib.parse, requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# HTTP headers passed with every GGGenome request made by the helpers below.
UA = {"User-Agent": "python-requests gggenome/greedy"}

def _ggg_hits_leq_json(seq, k, db="hg38", timeout=60, retries=2):
    """Return how many GGGenome hits ``seq`` has with at most ``k`` mismatches.

    The sequence is upper-cased and U is rewritten to T before it is placed in
    the ``nogap`` query URL for database ``db``. Every attempt requests the
    JSON result first; if that request or its parsing fails, the CSV download
    is tried instead and its non-empty, non-``#`` lines are counted.

    Args:
        seq: Query sequence (anything convertible with ``str``).
        k: Mismatch budget placed in the URL.
        db: GGGenome database name placed in the URL.
        timeout: Timeout in seconds handed to ``requests.get``.
        retries: Number of additional attempts after the first one.

    Returns:
        The hit count, or 0 when every attempt failed. A failed lookup is
        therefore indistinguishable from a query without hits.
    """
    normalized = str(seq).upper().replace("U", "T")
    q = urllib.parse.quote(normalized)
    url_json = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.json"
    url_csv  = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.csv?download"

    for attempt in range(retries + 1):
        try:
            response = requests.get(url_json, headers=UA, timeout=timeout)
            response.raise_for_status()
            try:
                payload = response.json()
            except ValueError:
                raise RuntimeError("JSON parse failed")

            # A bare list is counted directly.
            if isinstance(payload, list):
                return len(payload)
            if not isinstance(payload, dict):
                return 0
            # Prefer the well-known keys, in this order ...
            for key in ("results", "hits"):
                if key in payload and isinstance(payload[key], list):
                    return len(payload[key])
            # ... otherwise add up every list-valued entry.
            return sum(len(value) for value in payload.values() if isinstance(value, list))
        except Exception:
            # Anything that went wrong above falls through to the CSV download.
            try:
                fallback = requests.get(url_csv, headers=UA, timeout=timeout)
                fallback.raise_for_status()
                rows = [ln for ln in fallback.text.splitlines() if ln and not ln.startswith("#")]
                return len(rows)
            except Exception:
                # Out of attempts: give up with 0; otherwise loop again.
                if attempt >= retries:
                    return 0
    return 0

def _d123_for_sequence(seq, db="hg38"):
    """Return ``(sequence, d1, d2, d3)`` for one query sequence.

    The sequence is upper-cased with U rewritten to T. Cumulative hit counts
    for 0..3 mismatches are fetched with ``_ggg_hits_leq_json``; a mismatch
    level above ``floor(0.25 * len)`` is not queried and counts as 0. Each
    ``d`` value is the difference between consecutive cumulative counts,
    clamped at 0. An empty sequence yields all zeros without any lookup.
    """
    s = str(seq).upper().replace("U", "T")
    if not s:
        return (s, 0, 0, 0)

    k_allowed = max(0, math.floor(0.25 * len(s)))  # GGGenome cap

    # Cumulative counts in order k=0,1,2,3 (queries happen in that order).
    cumulative = [_ggg_hits_leq_json(s, 0, db=db)]
    for level in (1, 2, 3):
        cumulative.append(_ggg_hits_leq_json(s, level, db=db) if k_allowed >= level else 0)

    d1, d2, d3 = (max(0, upper - lower) for lower, upper in zip(cumulative, cumulative[1:]))
    return (s, d1, d2, d3)

def add_gggenome_d123(main_df, seq_col="SEQUENCE", db="hg38", *, max_workers=32, print_every=10):
    """Add the GGGenome columns ``ggg_d1``, ``ggg_d2`` and ``ggg_d3`` to ``main_df``.

    Every distinct sequence in ``main_df[seq_col]`` (upper-cased, U rewritten
    to T) is passed once to ``_d123_for_sequence`` on a thread pool, and the
    resulting triple is copied to each row carrying that sequence.

    Args:
        main_df: Frame holding the sequences; it is modified in place.
        seq_col: Name of the sequence column.
        db: GGGenome database name forwarded to the worker.
        max_workers: Size of the thread pool.
        print_every: Print a progress line every this many finished
            sequences; 0 or None prints only the first and last line.

    Returns:
        The same ``main_df`` object with the three columns added. Rows with a
        missing sequence, and sequences whose worker raised, get 0 in all
        three columns.
    """
    raw = main_df[seq_col]
    seqs = raw.astype(str).str.upper().str.replace("U", "T", regex=False)
    # astype(str) turns missing values into text such as "NAN"/"NONE"; mask
    # them out again so dropna() really removes them and no bogus query is sent.
    seqs = seqs.where(raw.notna())
    uniq = seqs.dropna().unique().tolist()
    N = len(uniq)
    print(f"[GGG] Unique sequences: {N} | db={db} | workers={max_workers}")

    cache = {}
    errs = 0
    done = 0
    t0 = time.perf_counter()

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(_d123_for_sequence, s, db): s for s in uniq}
        # as_completed is consumed on this thread only, so the bookkeeping
        # below is never touched concurrently and needs no lock.
        for fut in as_completed(futs):
            s = futs[fut]
            try:
                _, d1, d2, d3 = fut.result()
            except Exception:
                # Best effort: a failed worker is counted and recorded as zeros.
                d1 = d2 = d3 = 0
                errs += 1
            cache[s] = (d1, d2, d3)
            done += 1
            # Guard the modulo so print_every=0/None cannot raise.
            periodic = bool(print_every) and done % print_every == 0
            if done == 1 or periodic or done == N:
                elapsed = time.perf_counter() - t0
                rps = done / elapsed if elapsed > 0 else 0.0
                print(f"[GGG] {done}/{N} cached | ~{rps:.1f} seq/s | errors={errs}")

    no_hits = (0, 0, 0)
    for pos, col in enumerate(("ggg_d1", "ggg_d2", "ggg_d3")):
        # pos is bound as a default so each column reads its own tuple slot.
        main_df[col] = seqs.map(lambda s, pos=pos: cache.get(s, no_hits)[pos])

    print(f"[GGG] Finished in {time.perf_counter() - t0:.1f}s. Added columns: ggg_d1, ggg_d2, ggg_d3")
    return main_df

# --- usage ---
# main_df was already restricted to found ASOs (SENSE_START != -1) in an earlier cell.
# print_every=500 keeps the progress log short; with ~14k unique sequences
# print_every=2 would emit thousands of lines into the cell output.
# NOTE(review): 100 parallel workers against the public GGGenome server is aggressive — confirm this is acceptable.
main_df = add_gggenome_d123(main_df, seq_col=SEQUENCE, db="hg38", max_workers=100, print_every=500)


[GGG] Unique sequences: 14123 | db=hg38 | workers=100


In [None]:
from scripts.util.print import print_correlations

# Report how each GGGenome column relates to the INHIBITION column.
# NOTE(review): the statistics computed are defined in scripts.util.print — check there for the exact measures.
print_correlations(main_df, 'ggg_d1', INHIBITION)
print_correlations(main_df, 'ggg_d2', INHIBITION)
print_correlations(main_df, 'ggg_d3', INHIBITION)

In [None]:
from scripts.features.feature_extraction import save_feature
# Hand each GGGenome column to the project's feature store, one call per column.
# NOTE(review): presumably these are the features load_all_features() reads back — confirm in feature_extraction.
save_feature(main_df, 'ggg_d1')
save_feature(main_df, 'ggg_d2')
save_feature(main_df, 'ggg_d3')

In [None]:

# main_df = main_df[main_df[SENSE_START] != -1]
# main_df["ggg_d1"] = main_df[SEQUENCE].map(_d1_cached)

In [None]:
# Show the non-zero ggg_d3 values. The original key `['ggg_d3',]` is the
# one-element tuple ('ggg_d3',), which is not a column label and raises KeyError;
# a single .loc call selects rows and column without chained indexing.
main_df.loc[main_df['ggg_d3'] != 0, 'ggg_d3']

In [None]:
# pip install requests
import math, time, threading, urllib.parse, requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# HTTP headers passed with every GGGenome request made by the helpers below
# (same value as the definition in the earlier genome cell).
UA = {"User-Agent": "python-requests gggenome/greedy"}

def _ggg_hits_leq_json(seq, k, db="refseq", timeout=120, retries=2):
    """Return how many GGGenome hits ``seq`` has with at most ``k`` mismatches.

    The sequence is upper-cased with U rewritten to T; an empty sequence
    returns 0 without any request. Each attempt queries the JSON endpoint of
    database ``db`` first and, on any failure, the CSV download, whose
    non-empty, non-``#`` lines are counted. Failures are printed, never
    raised: after ``retries`` extra attempts the function returns 0 so the
    surrounding pipeline keeps running.

    Args:
        seq: Query sequence (anything convertible with ``str``).
        k: Mismatch budget placed in the URL.
        db: GGGenome database name placed in the URL.
        timeout: Timeout in seconds handed to ``requests.get``.
        retries: Number of additional attempts after the first one.
    """
    s = str(seq).upper().replace("U", "T")
    if s == "":
        return 0

    q = urllib.parse.quote(s)
    url_json = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.json"
    url_csv  = f"https://gggenome.dbcls.jp/{db}/{k}/nogap/{q}.csv?download"

    for attempt in range(retries + 1):
        try:
            response = requests.get(url_json, headers=UA, timeout=timeout)
            response.raise_for_status()
            try:
                payload = response.json()
            except ValueError as e:
                print(f"[ERROR] JSON parse failed for seq={s}, k={k}, url={url_json}: {e}")
                raise

            # A bare list is counted directly.
            if isinstance(payload, list):
                return len(payload)
            if not isinstance(payload, dict):
                return 0
            # Prefer the well-known keys, in this order ...
            for key in ("results", "hits"):
                if key in payload and isinstance(payload[key], list):
                    return len(payload[key])
            # ... otherwise add up every list-valued entry.
            return sum(len(value) for value in payload.values() if isinstance(value, list))

        except Exception as e:
            print(f"[WARN] JSON failed (attempt {attempt+1}) for seq={s}, k={k}, url={url_json}: {e}")
            # Second chance for this attempt: the CSV download.
            try:
                fallback = requests.get(url_csv, headers=UA, timeout=timeout)
                fallback.raise_for_status()
                rows = [ln for ln in fallback.text.splitlines() if ln and not ln.startswith("#")]
                return len(rows)
            except Exception as e2:
                print(f"[ERROR] CSV fallback failed for seq={s}, k={k}, url={url_csv}: {e2}")
                # Out of attempts: give up with 0; otherwise loop again.
                if attempt >= retries:
                    return 0
    return 0

def _k_counts_for_sequence(seq, db="refseq"):
    """Return cumulative counts (k0..k3) and derived d1..d3 for a sequence.

    The sequence is upper-cased with U rewritten to T. ``k0``..``k3`` are the
    hit counts with at most 0..3 mismatches as reported by
    ``_ggg_hits_leq_json``; a mismatch level above ``floor(0.25 * len)`` is
    not queried and inherits the previous level's count. ``d1``..``d3`` are
    the differences between consecutive cumulative counts, clamped at 0.

    Returns:
        An 8-tuple ``(sequence, k0, k1, k2, k3, d1, d2, d3)``. An empty
        sequence yields the same 8-field shape with all counts 0.
    """
    s = str(seq).upper().replace("U", "T")
    if not s:
        # Must have the same 8 fields as the normal return: the caller unpacks
        # eight values, and the previous 7-field tuple made that unpacking fail.
        return (s, 0, 0, 0, 0, 0, 0, 0)
    L = len(s)
    k_allowed = max(0, math.floor(0.25 * L))  # GGGenome cap
    k0 = _ggg_hits_leq_json(s, 0, db=db)
    k1 = _ggg_hits_leq_json(s, 1, db=db) if k_allowed >= 1 else k0
    k2 = _ggg_hits_leq_json(s, 2, db=db) if k_allowed >= 2 else k1
    k3 = _ggg_hits_leq_json(s, 3, db=db) if k_allowed >= 3 else k2
    d1 = max(0, k1 - k0)
    d2 = max(0, k2 - k1)
    d3 = max(0, k3 - k2)
    return (s, k0, k1, k2, k3, d1, d2, d3)

def add_gggenome_tx_d123(
    main_df, seq_col="SEQUENCE", db="refseq", *,
    max_workers=32, print_every=10, return_k=False
):
    """
    Add transcriptome off-target bins (nogap mismatches).

    Every distinct sequence in ``main_df[seq_col]`` (upper-cased, U rewritten
    to T) is passed once to ``_k_counts_for_sequence`` on a thread pool and
    the result is copied to each row carrying that sequence.

    tx_d1, tx_d2, tx_d3 = differences between consecutive cumulative counts,
    i.e. hits attributed to exactly 1/2/3 mismatches.
    If return_k=True, also adds the cumulative counts tx_k0..tx_k3.

    Args:
        main_df: Frame holding the sequences; it is modified in place.
        seq_col: Name of the sequence column.
        db: GGGenome database name forwarded to the worker.
        max_workers: Size of the thread pool.
        print_every: Print a progress line every this many finished
            sequences; 0 or None prints only the first and last line.
        return_k: Also add the tx_k0..tx_k3 columns.

    Returns:
        The same ``main_df`` object with the new columns. Rows with a missing
        sequence, and sequences whose worker raised, get 0 everywhere.
    """
    raw = main_df[seq_col]
    seqs = raw.astype(str).str.upper().str.replace("U", "T", regex=False)
    # astype(str) turns missing values into text such as "NAN"/"NONE"; mask
    # them out again so dropna() really removes them and no bogus query is sent.
    seqs = seqs.where(raw.notna())
    uniq = seqs.dropna().unique().tolist()
    N = len(uniq)
    print(f"[GGG-TX] Unique sequences: {N} | db={db} | workers={max_workers}")

    cache = {}
    errs = 0
    done = 0
    t0 = time.perf_counter()

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futs = {ex.submit(_k_counts_for_sequence, s, db): s for s in uniq}
        # as_completed is consumed on this thread only, so the bookkeeping
        # below is never touched concurrently and needs no lock.
        for fut in as_completed(futs):
            s = futs[fut]
            try:
                _, k0, k1, k2, k3, d1, d2, d3 = fut.result()
            except Exception as e:
                # Best effort: report, count, and record zeros for this sequence.
                print(f"[ERROR] Worker failed for seq={s}: {e}")
                k0 = k1 = k2 = k3 = d1 = d2 = d3 = 0
                errs += 1
            cache[s] = (k0, k1, k2, k3, d1, d2, d3)
            done += 1
            # Guard the modulo so print_every=0/None cannot raise.
            periodic = bool(print_every) and done % print_every == 0
            if done == 1 or periodic or done == N:
                elapsed = time.perf_counter() - t0
                rps = done / elapsed if elapsed > 0 else 0.0
                print(f"[GGG-TX] {done}/{N} cached | ~{rps:.1f} seq/s | errors={errs}")

    # (column name, slot in the cached 7-tuple), in the order the columns are added.
    columns = [("tx_d1", 4), ("tx_d2", 5), ("tx_d3", 6)]
    if return_k:
        columns += [("tx_k0", 0), ("tx_k1", 1), ("tx_k2", 2), ("tx_k3", 3)]

    no_hits = (0, 0, 0, 0, 0, 0, 0)
    for col, pos in columns:
        # pos is bound as a default so each column reads its own tuple slot.
        main_df[col] = seqs.map(lambda s, pos=pos: cache.get(s, no_hits)[pos])

    finish = time.perf_counter() - t0
    added = "tx_d1, tx_d2, tx_d3" + (", tx_k0..tx_k3" if return_k else "")
    print(f"[GGG-TX] Finished in {finish:.1f}s. Added columns: {added}")
    return main_df

# --- usage ---
# main_df was already restricted to found ASOs (SENSE_START != -1) in an earlier cell.
# print_every=500 keeps the progress log short; print_every=2 would emit one
# line for every second unique sequence and flood the cell output.
# NOTE(review): 256 parallel workers against the public GGGenome server is aggressive — confirm this is acceptable.
# RefSeq transcripts (default):
main_df = add_gggenome_tx_d123(main_df, seq_col=SEQUENCE, db="refseq", max_workers=256, print_every=500, return_k=True)
# Or GENCODE transcripts:
# main_df = add_gggenome_tx_d123(main_df, seq_col=SEQUENCE, db="GENCODE_47", max_workers=256, print_every=500, return_k=True)
