### **Merging RxNorm IDs into the DTI dataset**


In [1]:
%pip install PyYAML
%pip isntall pandas

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


ERROR: unknown command "isntall" - maybe you meant "install"



In [15]:
import pandas as pd
import yaml
import gc
from tqdm import tqdm
from pathlib import Path
import re
import time
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
# Load the notebook configuration from config.yaml in the working directory.
# yaml.safe_load parses the document into plain Python objects; later cells
# read the dataset locations from config["paths"].
with open('config.yaml', 'r') as file:
    config = yaml.safe_load(file)

### Run this section only if you don't already have the *scope_subset.parquet* file

In [4]:
# Read the full DTI dataset from the parquet file configured under
# config["paths"]["LINK_TO_DTI_DATASET"].
scope_pd = pd.read_parquet(config["paths"]["LINK_TO_DTI_DATASET"])


In [5]:
# Summarise the columns, dtypes and memory use of the full DTI frame.
scope_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2208221 entries, 0 to 2208220
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   drug_chembl_id     object 
 1   target_uniprot_id  object 
 2   positive_times     int64  
 3   negative_times     int64  
 4   drug_pubchem_id    float64
 5   label              int64  
 6   smiles             object 
 7   sequence           object 
 8   molfile_3d         object 
dtypes: float64(1), int64(3), object(5)
memory usage: 151.6+ MB


In [8]:
# Keep only the columns used in the rest of the notebook; this leaves out
# positive_times, negative_times and drug_pubchem_id from the full frame.
scope_subset = scope_pd[
    [
        "drug_chembl_id",
        "target_uniprot_id",
        "label",
        "smiles",
        "sequence",
        "molfile_3d",
    ]
]


In [9]:
# Cache the reduced frame in the working directory (row index not stored) so
# it can be reloaded later without reading the full DTI dataset again.
scope_subset.to_parquet("scope_subset.parquet", index=False)

In [None]:
# Release both large frames before the merge section. The subset is reloaded
# from scope_subset.parquet below and the full DTI frame is not used again;
# deleting only `scope_subset` would leave the much larger `scope_pd` alive.
del scope_pd, scope_subset

In [13]:
# Force a garbage-collection pass after dropping the large frame reference;
# the value echoed by the cell is the number of unreachable objects found.
gc.collect()

487

### Merging RxNorm IDs (rxcui) with the scope subset

In [14]:
# Load the cached column subset of the DTI dataset from the working directory.
scope_subset_pd = pd.read_parquet("scope_subset.parquet")

In [15]:
# Confirm the reloaded subset has the expected six columns and row count.
scope_subset_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2208221 entries, 0 to 2208220
Data columns (total 6 columns):
 #   Column             Dtype 
---  ------             ----- 
 0   drug_chembl_id     object
 1   target_uniprot_id  object
 2   label              int64 
 3   smiles             object
 4   sequence           object
 5   molfile_3d         object
dtypes: int64(1), object(5)
memory usage: 101.1+ MB


In [16]:
# Load the rxcui <-> ChEMBL mapping table from the parquet file configured
# under config["paths"]["LINK_TO_RXCUI_DATASET"].
rxcui_chembl_pd = pd.read_parquet(config["paths"]["LINK_TO_RXCUI_DATASET"])

In [17]:
# Inspect the mapping table; `chembl_id` and `rxcui` are the two columns the
# merge below relies on.
rxcui_chembl_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1062 entries, 0 to 1061
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rxcui        1062 non-null   object 
 1   rxnorm_name  1062 non-null   object 
 2   tty          1062 non-null   object 
 3   pubchem_cid  1045 non-null   float64
 4   inchikey     1062 non-null   object 
 5   drugbank_id  1048 non-null   object 
 6   unii         977 non-null    object 
 7   confidence   1062 non-null   object 
 8   provenance   1045 non-null   object 
 9   chembl_id    1062 non-null   object 
 10  pubchem_id   1062 non-null   Int64  
dtypes: Int64(1), float64(1), object(9)
memory usage: 92.4+ KB


In [18]:
# Inner-join the DTI rows with the rxcui mapping on the ChEMBL id, so only
# drugs present in both tables survive. The right-hand key column
# (`chembl_id`) duplicates `drug_chembl_id` after the join and is dropped.
scope_onside_common = (
    scope_subset_pd
    .merge(
        rxcui_chembl_pd[["chembl_id", "rxcui"]],
        left_on="drug_chembl_id",
        right_on="chembl_id",
        how="inner",
    )
    .drop(columns="chembl_id")
)

In [None]:
# Check the size of the joined table and that `rxcui` was attached to every row.
scope_onside_common.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34923 entries, 0 to 34922
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   drug_chembl_id     34923 non-null  object
 1   target_uniprot_id  34923 non-null  object
 2   label              34923 non-null  int64 
 3   smiles             34923 non-null  object
 4   sequence           34923 non-null  object
 5   molfile_3d         34866 non-null  object
 6   rxcui              34923 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.9+ MB


In [21]:
# Persist the merged DTI + rxcui table (row index not stored); the AlphaFold
# section below reloads it from this path.
scope_onside_common.to_parquet("../Data/scope_onside_common.parquet", index=False)

### Fetch AlphaFold PDB files for the target proteins

In [3]:
# Reload the merged DTI + rxcui table written by the merge section.
scope_onside_pd = pd.read_parquet("../Data/scope_onside_common.parquet")

In [4]:
# Verify the reloaded table; `target_uniprot_id` drives the structure downloads.
scope_onside_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34923 entries, 0 to 34922
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   drug_chembl_id     34923 non-null  object
 1   target_uniprot_id  34923 non-null  object
 2   label              34923 non-null  int64 
 3   smiles             34923 non-null  object
 4   sequence           34923 non-null  object
 5   molfile_3d         34866 non-null  object
 6   rxcui              34923 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.9+ MB


In [7]:
def unique_uniprot_ids(df: pd.DataFrame, col="target_uniprot_id"):
    """Return the distinct, normalised UniProt accessions found in ``df[col]``.

    Values are stripped of surrounding whitespace and upper-cased before
    de-duplication, so ``" p12345 "`` and ``"P12345"`` collapse to one entry.
    Order of first appearance is preserved.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing the accession column.
    col : str
        Name of the column holding UniProt accessions.

    Returns
    -------
    list[str]
        Accessions matching the UniProt format: the 6-character forms, the
        10-character form (e.g. ``A0A024R161``), each optionally followed by
        an isoform suffix such as ``-2``. Anything else is discarded.
    """
    ids = (
        df[col]
        # Drop missing values *before* the string cast; casting first turns
        # NaN/None into the literal strings "nan"/"None" and dropna() then
        # has nothing left to remove.
        .dropna()
        .astype(str)
        .str.strip()
        .str.upper()
        .unique()
        .tolist()
    )
    # Keep only plausible UniProt accessions. The optional trailing group on
    # the second alternative admits 10-character accessions, which the
    # 6-character-only pattern silently filtered out.
    pat = re.compile(
        r"^(?:[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9][A-Z0-9]{3}[0-9](?:[A-Z][A-Z0-9]{2}[0-9])?)"
        r"(?:-\d+)?$"
    )
    return [u for u in ids if pat.fullmatch(u)]

In [13]:
def alphafold_pdb_urls(uniprot_id: str, versions=(4, 3)):
    """Build the candidate AlphaFold DB download URLs for one accession.

    Parameters
    ----------
    uniprot_id : str
        UniProt accession, inserted verbatim into the file name.
    versions : iterable of int
        Model versions to try, in priority order. The default ``(4, 3)``
        reproduces the original hard-coded behaviour (v4 first, v3 as a
        fallback); pass another sequence to target other releases.

    Returns
    -------
    list[str]
        One URL per requested version, each pointing at the ``F1`` model PDB
        file for the accession.
    """
    base = "https://alphafold.ebi.ac.uk/files"
    return [f"{base}/AF-{uniprot_id}-F1-model_v{version}.pdb" for version in versions]

In [8]:
# Collect the distinct, format-checked UniProt accessions of all targets in
# the merged table; each one gets a structure download below.
uniprot_ids = unique_uniprot_ids(scope_onside_pd, "target_uniprot_id")

In [16]:
def download_one(uid: str, out_dir: Path, retries=3, backoff=2.0, timeout=30, skip_existing=True):
    """Download the AlphaFold PDB file for one UniProt accession.

    Each URL from ``alphafold_pdb_urls(uid)`` is tried in order. A 404 moves
    straight on to the next URL; any other failure (non-200 status, network
    error, empty or unwritable file) is retried up to ``retries`` times with
    an exponential pause of ``backoff ** (attempt - 1)`` seconds in between.

    Parameters
    ----------
    uid : str
        UniProt accession; the file is stored as ``<out_dir>/<uid>.pdb``.
    out_dir : Path
        Existing directory that receives the file.
    retries : int
        Maximum attempts per URL.
    backoff : float
        Base of the exponential pause between attempts.
    timeout : float
        Timeout in seconds passed to ``requests.get``.
    skip_existing : bool
        When true, a non-empty ``<uid>.pdb`` already on disk counts as success
        and nothing is downloaded.

    Returns
    -------
    tuple[str, bool]
        ``(uid, True)`` when the file is on disk afterwards, ``(uid, False)``
        when every URL and attempt failed. Download and file errors are
        reported through the flag rather than raised, so one bad accession
        cannot abort a batch.
    """
    dest = out_dir / f"{uid}.pdb"
    if skip_existing and dest.exists() and dest.stat().st_size > 0:
        return uid, True  # already have it

    # Data is streamed into a side file and renamed only once complete, so a
    # partial download is never mistaken for a finished .pdb.
    tmp = dest.with_suffix(".part")

    for url in alphafold_pdb_urls(uid):
        for attempt in range(1, retries + 1):
            try:
                # The context manager closes the streamed response on every
                # path, returning the connection to the pool.
                with requests.get(url, stream=True, timeout=timeout) as r:
                    if r.status_code == 404:
                        break  # this model version does not exist; try fallback url
                    if r.status_code == 200:
                        with open(tmp, "wb") as f:
                            for chunk in r.iter_content(chunk_size=8192):
                                if chunk:
                                    f.write(chunk)
                        if tmp.stat().st_size == 0:
                            raise IOError("Empty file")
                        tmp.replace(dest)
                        return uid, True
            except (requests.RequestException, OSError):
                # Covers network errors as well as the empty-file / write
                # failures above; discard any partial file and retry.
                tmp.unlink(missing_ok=True)
            # Pause only when another attempt on this URL will follow.
            if attempt < retries:
                time.sleep(backoff ** (attempt - 1))
    return uid, False

In [18]:
def download_alphafold_parallel(uniprot_ids, out_dir="../AlphaFoldData", max_workers=6):
    """Download AlphaFold structures for many accessions using a thread pool.

    Parameters
    ----------
    uniprot_ids : iterable of str
        Accessions to fetch; each is handed to ``download_one``.
    out_dir : str or Path
        Target directory, created (with parents) if it does not exist.
    max_workers : int
        Number of concurrent download threads.

    Returns
    -------
    tuple[int, int]
        ``(successes, misses)``. Files that were already on disk count as
        successes because ``download_one`` reports them as such. A download
        that raises unexpectedly is counted as a miss and reported with a
        warning instead of aborting the remaining downloads.
    """
    import warnings  # local import: not part of the notebook's import cell

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    successes, misses = 0, 0
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(download_one, uid, out_dir): uid for uid in uniprot_ids}
        for future in tqdm(as_completed(futures), total=len(futures), desc="AlphaFold downloads"):
            try:
                _, ok = future.result()
            except Exception as exc:
                # future.result() re-raises whatever the worker raised; keep
                # the tally intact and surface the failing accession.
                warnings.warn(f"AlphaFold download failed for {futures[future]}: {exc!r}")
                ok = False
            if ok:
                successes += 1
            else:
                misses += 1
    return successes, misses

In [21]:
# Fetch structures for every target accession with 6 worker threads; the
# result is (number of successes, number of failures).
ok, failed = download_alphafold_parallel(uniprot_ids, max_workers=6)

AlphaFold downloads: 100%|██████████| 2422/2422 [00:04<00:00, 498.60it/s]


In [23]:
# Report the tally. `ok` also includes files that were already on disk, since
# download_one reports an existing non-empty file as a success.
print(f"Done. Downloaded: {ok:,} | Missing: {failed:,}")

Done. Downloaded: 2,394 | Missing: 28


In [27]:
def find_missing_ids(uniprot_ids, out_dir="../AlphaFoldData"):
    """List the accessions that have no usable PDB file in ``out_dir``.

    A file counts as present when it is named ``<accession>.pdb`` and is not
    empty. The result is the sorted set of requested accessions without such
    a file (duplicates in ``uniprot_ids`` are collapsed).
    """
    folder = Path(out_dir)

    # Accessions with a non-empty .pdb file on disk (file name minus suffix).
    downloaded = set()
    for pdb_path in folder.glob("*.pdb"):
        if pdb_path.stat().st_size > 0:
            downloaded.add(pdb_path.stem)

    # Requested accessions that are not covered by a file.
    wanted = set(uniprot_ids)
    return sorted(wanted.difference(downloaded))

In [28]:
# Cross-check the download tally against the files actually on disk.
missing = find_missing_ids(uniprot_ids)

print(f"Total expected: {len(uniprot_ids):,}")
# "Found locally" is derived as expected minus missing; this is exact because
# uniprot_ids holds distinct accessions.
print(f"Found locally: {len(uniprot_ids) - len(missing):,}")
print(f"Missing: {len(missing):,}")
print("Missing UniProt IDs:")
for uid in missing:
    print(uid)


Total expected: 2,422
Found locally: 2,394
Missing: 28
Missing UniProt IDs:
P01266
P04114
P07203
P08519
P15924
P18283
P21817
P22352
P35556
P36969
P46939
P49908
P55073
P59796
P78527
Q03164
Q13315
Q14571
Q14643
Q15413
Q16881
Q75N90
Q86YZ3
Q8WXI7
Q92736
Q96QA9
Q9NU22
Q9NZV6
