Imports

In [19]:
from xgboost import XGBRanker
import pandas as pd
from IDO_Seq import IDO1_sequence
import requests

# import necessary feature modules
from scripts.data_genertion.consts import SEQUENCE
from scripts.features.feature_extraction import load_all_features
from scripts.data_genertion.data_handling import get_populated_df_with_structure_features

Load model

In [None]:
# Create an empty XGBoost ranker and load a previously saved model into it.
# The path is relative, so 'ranker_model2.json' must be in the kernel's
# current working directory.
model = XGBRanker()
model.load_model('ranker_model2.json')

Functions

In [20]:
def dna_to_dna_reverse_complement(seq: str) -> str:
    """Return the upper-cased reverse complement of a DNA sequence.

    ``A``/``T`` and ``G``/``C`` are swapped; every other character (for
    example ``N``) is passed through unchanged. The input is upper-cased
    before complementing, so the result is always upper case.
    """
    pairing = {"A": "T", "T": "A", "G": "C", "C": "G"}
    # Walk the sequence back-to-front, complementing each base on the way.
    return "".join(pairing.get(base, base) for base in reversed(seq.upper()))

In [21]:
def find_aso_binding_positions(aso_seq, mrna_seq):
    """Locate the sense-strand site that an antisense oligo (ASO) pairs with.

    The ASO is reverse-complemented to obtain its sense-strand target, and
    the first occurrence of that target in ``mrna_seq`` is reported.
    Matching is case-insensitive: the helper upper-cases the ASO, so the
    reference is upper-cased too before searching (otherwise a lower-case or
    soft-masked reference could never match).

    Args:
        aso_seq: ASO sequence in the DNA alphabet (any case).
        mrna_seq: Sense-strand reference sequence to search (any case).

    Returns:
        ``(start, normalized_start)`` where ``start`` is the 0-based index of
        the first match and ``normalized_start`` is ``start / len(mrna_seq)``;
        ``None`` when there is no match or when either input is empty.
    """
    # An empty target would trivially "match" at index 0, and an empty
    # reference would make the normalisation divide by zero.
    if not aso_seq or not mrna_seq:
        return None

    target = dna_to_dna_reverse_complement(aso_seq)
    idx = mrna_seq.upper().find(target)
    if idx == -1:
        return None
    return idx, idx / len(mrna_seq)

The data

In [35]:
exp_data = pd.read_csv("IDO1_exp_inhibition.csv")
# Keep only the columns used downstream. The explicit copy makes the column
# assignments below write to an independent frame instead of a slice of the
# raw CSV frame (avoids pandas' SettingWithCopyWarning).
exp_data = exp_data[["ASO", "Sequence", "Length"]].copy()

# Locate each ASO's target site on the IDO1 reference sequence. ASOs with no
# match yield None; they become NaN rows here and are dropped just below.
binding_sites = [
    find_aso_binding_positions(aso, IDO1_sequence) for aso in exp_data["Sequence"]
]
exp_data[["sense_start", "normalized_start"]] = pd.DataFrame(
    [site if site is not None else (float("nan"), float("nan")) for site in binding_sites],
    index=exp_data.index,
    columns=["sense_start", "normalized_start"],
    dtype=float,  # both columns stay float, as with the previous per-row Series approach
)
exp_data = exp_data.dropna(subset=["sense_start"]).copy()

# Constant metadata columns added to every row.
exp_data["Cell line organism"] = "human"
exp_data["Canonical Gene Name"] = "IDO1"
exp_data["Inhibition(%)"] = 0.0  # dummy placeholder
exp_data

Unnamed: 0,ASO,Sequence,Length,sense_start,normalized_start,Cell line organism,Canonical Gene Name,Inhibition(%)
2,A06055H,CCGCAGGCCAGCATCAC,17,1002.0,0.541915,human,IDO1,0.0
3,A06049H,ACAAAACGTCCATGTTC,17,469.0,0.253651,human,IDO1,0.0
4,A06037H,CAGGACGTCAAAGCAC,16,844.0,0.456463,human,IDO1,0.0
5,A06017H,AGGACGTCAAAGCAC,15,844.0,0.456463,human,IDO1,0.0
6,A06048H,GTTGGCAGTAAGGAACA,17,355.0,0.191996,human,IDO1,0.0
...,...,...,...,...,...,...,...,...
69,A06001H,GGCGCTGTGACTTG,14,250.0,0.135208,human,IDO1,0.0
70,A06030H,AGGCGCTGTGACTTGT,16,249.0,0.134667,human,IDO1,0.0
71,A06045H,AGGCGCTGTGACTTGTG,17,248.0,0.134127,human,IDO1,0.0
72,A06029H,GGCGCTGTGACTTGTG,16,248.0,0.134127,human,IDO1,0.0


In [23]:
class TranscriptStructure:
    """Plain container for a transcript's sequence and annotation coordinates.

    All constructor arguments are stored on the instance exactly as given;
    nothing is validated, converted, or copied.
    """

    def __init__(self, transcript_id, full_mrna, exon_indices, intron_indices, utr_indices, cds_start):
        # Identity and sequence.
        self.transcript_id, self.full_mrna = transcript_id, full_mrna
        # Coordinate collections for the annotated regions.
        self.exon_indices, self.intron_indices, self.utr_indices = (
            exon_indices,
            intron_indices,
            utr_indices,
        )
        self.cds_start = cds_start

    def __repr__(self):
        """Summarise the object as its id, sequence length and exon count."""
        sequence_length = len(self.full_mrna)
        exon_count = len(self.exon_indices)
        return f"TranscriptStructure({self.transcript_id}, {sequence_length} nt, {exon_count} exons)"


In [25]:
def get_transcript_structure(transcript_id: str, timeout: float = 30.0):
    """Fetch a transcript's sequence and exon/intron/UTR layout from Ensembl.

    Two requests are made against the Ensembl REST server: a ``lookup/id``
    call for the annotation and a ``sequence/id`` call (``type=genomic``) for
    the unspliced sequence.

    Args:
        transcript_id: Ensembl stable transcript ID, e.g. ``"ENST00000518237"``.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely.

    Returns:
        A ``TranscriptStructure`` holding the sequence, the exon ``(start,
        end)`` pairs sorted by ascending start, the intron pairs derived from
        the gaps between consecutive exons, the UTR pairs, and the
        translation start (``None`` when the transcript has no translation).

    Raises:
        RuntimeError: If either request returns a non-OK HTTP status.
        requests.exceptions.RequestException: On connection errors/timeouts.
    """
    server = "https://rest.ensembl.org"
    headers = {"Content-Type": "application/json"}

    # 1. Transcript annotation. `expand=1` attaches the Exon/Translation
    #    children; `utr=1` is additionally needed for Ensembl to include the
    #    "UTR" list (it is only honoured together with `expand`). Without it
    #    the UTR list below was always empty.
    lookup = requests.get(
        f"{server}/lookup/id/{transcript_id}",
        params={"expand": 1, "utr": 1},
        headers=headers,
        timeout=timeout,
    )
    if not lookup.ok:
        raise RuntimeError(f"Failed to fetch transcript data: {lookup.text}")
    data = lookup.json()

    # 2. Full unspliced (pre-mRNA) sequence.
    seq_r = requests.get(
        f"{server}/sequence/id/{transcript_id}",
        params={"type": "genomic"},
        headers=headers,
        timeout=timeout,
    )
    if not seq_r.ok:
        raise RuntimeError(f"Failed to fetch sequence: {seq_r.text}")
    full_mrna = seq_r.json()["seq"]

    # 3. Exon coordinates, ordered by ascending start.
    # NOTE(review): these are the start/end values exactly as Ensembl reports
    # them; they look like genomic positions rather than offsets into
    # `full_mrna` -- confirm which convention the downstream feature code
    # expects (and how reverse-strand transcripts should be handled).
    exons = sorted(data["Exon"], key=lambda exon: exon["start"])
    exon_indices = [(exon["start"], exon["end"]) for exon in exons]

    # 4. Introns are the gaps between consecutive exons.
    intron_indices = [
        (upstream["end"] + 1, downstream["start"] - 1)
        for upstream, downstream in zip(exons, exons[1:])
    ]

    # 5. CDS start and UTRs. `or {}` / `or []` also cover keys that are
    #    present but null.
    translation = data.get("Translation") or {}
    cds_start = translation.get("start")
    utr_indices = [(utr["start"], utr["end"]) for utr in data.get("UTR") or []]

    return TranscriptStructure(
        transcript_id=transcript_id,
        full_mrna=full_mrna,
        exon_indices=exon_indices,
        intron_indices=intron_indices,
        utr_indices=utr_indices,
        cds_start=cds_start,
    )


In [26]:
if __name__ == "__main__":
    # Ensembl transcript ID used as the IDO1 reference in this notebook.
    transcript_id = "ENST00000518237"
    # Performs HTTP requests against the Ensembl REST server.
    IDO1_structure_data_object = get_transcript_structure(transcript_id)

    print("✅ Successfully retrieved transcript structure!")
    print(IDO1_structure_data_object)

    # Gene list and gene -> TranscriptStructure mapping; both are passed to
    # get_populated_df_with_structure_features in a later cell.
    genes_u = ["IDO1"]
    gene_to_data = {"IDO1": IDO1_structure_data_object}

✅ Successfully retrieved transcript structure!
TranscriptStructure(ENST00000518237, 14900 nt, 10 exons)


In [36]:
# Rebinds exp_data to the frame returned by the project helper, so re-running
# this cell feeds the already-populated frame back in.
# NOTE(review): judging by the displayed output, the helper appears to add the
# sense_* columns and to recompute sense_start -- confirm against the helper.
exp_data = get_populated_df_with_structure_features(exp_data, genes_u, gene_to_data)
print("✅ Structural features added")

✅ Structural features added


In [41]:
# Normalise the start position by the length of the fetched transcript
# sequence. Deriving it from the data (instead of hard-coding 14900) keeps the
# feature correct if the transcript ID or the Ensembl release changes.
exp_data["normalized_start"] = exp_data["sense_start"] / len(gene_to_data["IDO1"].full_mrna)
exp_data

Unnamed: 0,ASO,Sequence,Length,sense_start,normalized_start,Cell line organism,Canonical Gene Name,Inhibition(%),sense_start_from_end,sense_length,sense_exon,sense_intron,sense_utr,sense_type
2,A06055H,CCGCAGGCCAGCATCAC,17,14053,0.943154,human,IDO1,0.0,814,17,1,0,0,exon
3,A06049H,ACAAAACGTCCATGTTC,17,8661,0.581275,human,IDO1,0.0,6206,17,1,0,0,exon
4,A06037H,CAGGACGTCAAAGCAC,16,11437,0.767584,human,IDO1,0.0,3430,16,1,0,0,exon
5,A06017H,AGGACGTCAAAGCAC,15,11437,0.767584,human,IDO1,0.0,3430,15,1,0,0,exon
6,A06048H,GTTGGCAGTAAGGAACA,17,4944,0.331812,human,IDO1,0.0,9923,17,1,0,0,exon
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69,A06001H,GGCGCTGTGACTTG,14,4232,0.284027,human,IDO1,0.0,10635,14,1,0,0,exon
70,A06030H,AGGCGCTGTGACTTGT,16,4231,0.283960,human,IDO1,0.0,10636,16,1,0,0,exon
71,A06045H,AGGCGCTGTGACTTGTG,17,4230,0.283893,human,IDO1,0.0,10637,17,1,0,0,exon
72,A06029H,GGCGCTGTGACTTGTG,16,4230,0.283893,human,IDO1,0.0,10637,16,1,0,0,exon
