In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from scipy import sparse
import re
import os
from pubchempy import get_compounds
import time

In [18]:
base_dir = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data"

### Metadata file

In [19]:
metadata = pd.read_csv(os.path.join(base_dir, "combined_long_table.csv"))

In [20]:
metadata

Unnamed: 0,species,sample_id,drug,response,dataset
0,HOP62,95_001_111-lib_1681,Doxorubicin,1.0,any
1,HOP62,95_001_111-lib_1681,Doxorubicin,1.0,any
2,HOP62,95_001_111-lib_1681,Etoposide,1.0,any
3,HOP62,95_001_111-lib_1681,Gemcitabine,1.0,any
4,HOP62,95_001_111-lib_1681,Gemcitabine,1.0,any
...,...,...,...,...,...
51895436,HT29,96_192_081-lib_2608,Nelarabine,0.0,any
51895437,HT29,96_192_081-lib_2608,Vincristine,0.0,any
51895438,HT29,96_192_081-lib_2608,Venetoclax,0.0,any
51895439,HT29,96_192_081-lib_2608,Osimertinib,0.0,any


In [21]:
unique_lines = np.array(metadata["species"].unique())
unique_drugs = np.array(metadata["drug"].unique())

In [22]:
print(f"Number unique lines: {len(unique_lines)}")
print(f"Unique lines: {unique_lines}")
print(f"Number unique drugs: {len(unique_drugs)}")
print(f"Unique drugs: {unique_drugs}")

Number unique lines: 46
Unique lines: ['HOP62' 'A172' 'NCIH2030' 'NCISNU1' 'HCT15' 'A498' 'ASPC1' 'MIAPACA2'
 'HT29' 'SW900' 'COLO205' 'CFPAC1' 'LOXIMVI' 'LS180' 'RKO' 'J82' 'C33A'
 'NCIH1792' 'C32' 'LOVO' 'NCIH2347' 'RPMI7951' 'A549' 'NCIH23' 'SKMEL2'
 'AN3CA' 'HS578T' 'KATOIII' 'SNU423' 'HEC1A' 'BT474' 'A427' 'SHP77'
 'CHP212' 'HS766T' 'C3A' 'SW1417' 'SW48' 'PANC0327' 'NCIH1573' 'H4'
 'SW1088' 'NCIH596' 'NCIH661' 'SW1271' 'NCIH2122']
Number unique drugs: 65
Unique drugs: ['Doxorubicin' 'Etoposide' 'Gemcitabine' 'Mitomycin-C' 'Vinorelbine'
 'Bicalutamide' 'Ponatinib' '5-Fluorouracil' 'Bexarotene' 'Bleomycin'
 'Pazopanib' 'Ruxolitinib' 'Idelalisib' 'Cabozantinib' 'Belinostat'
 'Alectinib' 'Pemetrexed' 'Panobinostat' 'Vinblastine' 'Cisplatin'
 'Cytarabine' 'Docetaxel' 'Methotrexate' 'Tretinoin' 'Gefitinib'
 'Vorinostat' 'Nilotinib' 'Temsirolimus' 'Olaparib' 'Bosutinib'
 'Lenalidomide' 'Axitinib' 'Afatinib' 'Vismodegib' 'Palbociclib'
 'Rucaparib' 'Tamoxifen' 'Trametinib' 'Dabrafenib' 'Te

In [23]:
# Determine zeroshot subsets
train_lines = unique_lines[:35]
seen_lines = unique_lines[35:40]
zeroshot_lines = unique_lines[40:]
print(f"Zeroshot cell lines: {zeroshot_lines}")

zeroshot_drugs = ['Erlotinib', 'Rapamycin', 'Sunitinib', 'Paclitaxel', 'Sorafenib', 'Imatinib', 'Crizotinib', 'Dasatinib', 'Bortezomib', 'Lapatinib']
print(f"Zeroshot drugs: {zeroshot_drugs}")

Zeroshot cell lines: ['H4' 'SW1088' 'NCIH596' 'NCIH661' 'SW1271' 'NCIH2122']
Zeroshot drugs: ['Erlotinib', 'Rapamycin', 'Sunitinib', 'Paclitaxel', 'Sorafenib', 'Imatinib', 'Crizotinib', 'Dasatinib', 'Bortezomib', 'Lapatinib']


In [24]:
# Test if all zeroshot drugs have been removed from training set
print("All Zeroshot Drugs have been removed from training set!")
metadata[(metadata["species"].isin(train_lines)) & (metadata["drug"].isin(zeroshot_drugs))]

All Zeroshot Drugs have been removed from training set!


Unnamed: 0,species,sample_id,drug,response,dataset


### Data Split File

In [25]:
splits = pd.read_csv(os.path.join(base_dir, "data_splits.csv"))

In [26]:
# Ensure train lines are only in train set
print("train_lines are only in train set!")
splits[splits["species"].isin(train_lines)]["Set"].unique()

train_lines are only in train set!


array(['train'], dtype=object)

In [27]:
# Ensure zeroshot lines are only in test set
print("zeroshot_lines are only in test set!")
splits[splits["species"].isin(zeroshot_lines)]["Set"].unique()

zeroshot_lines are only in test set!


array(['test'], dtype=object)

In [28]:
# Ensure "seen" lines are present in train and test at the adequate ratio
print("seen_lines:")
print(f"Sets: {splits[splits['species'].isin(seen_lines)]['Set'].unique()}")
# splits[(splits['species'].isin(seen_lines) & splits['Set'] == 'test')]
len(splits[(splits['species'].isin(seen_lines)) & (splits['Set'] == 'test')]) / len(splits[(splits['species'].isin(seen_lines))])

seen_lines:
Sets: ['test' 'train']


0.20004186801109503

## Original controls file

In [18]:
data_path = "/cluster/work/bewi/data/tahoe100/h5ad/controls_merged.h5ad"
adata = sc.read_h5ad(data_path)

## Drug Fingerprints

In [3]:
fps = pd.read_csv(os.path.join(base_dir, "drug_fingerprints_Mol_selfies.csv"))

In [5]:
fps["drug"].unique()

array(['Talc', 'Bortezomib', 'Ixazomib', 'Ixazomib citrate',
       'Lactate (calcium)', 'Bisoprolol (hemifumarate)', 'Fumaric acid',
       'Hydroxyurea', 'L-Eflornithine (monohydrochloride)',
       'Cysteamine (hydrochloride)', 'Darinaparsin',
       'Entecavir (monohydrate)', 'Allantoin', '5-Fluorouracil',
       'L-Thyroxine (sodium salt pentahydrate)', 'Gallic acid',
       'Gallic acid (hydrate)', 'ERK5-IN-2', 'Vilanterol',
       'Niclosamide (olamine)', 'Norepinephrine (hydrochloride)',
       'Triclosan', 'Mitoxantrone (dihydrochloride)',
       'Pentamidine (isethionate)', 'Folic acid',
       'Balsalazide (sodium hydrate)', 'Resveratrol', 'PF-06260933',
       'Daidzin', 'Pemetrexed', 'Econazole', 'XRK3F2', 'Arbutin',
       'Tucidinostat', 'Pexidartinib (hydrochloride)',
       'Sodium Salicylate', 'Salicylic acid', 'Ataluren', '4EGI-1',
       'Clotrimazole', 'Phenytoin (sodium)', 'SBI-0640756', 'Oxaprozin',
       'Baicalin', 'Belinostat', 'Carbamazepine', 'HI-TOPK-032',

In [11]:
# Normalize drug names for the 2 sets
def norm_drug(s):
    """Normalize drug names for matching."""
    if s is None:
        return None
    s = s.lower()
    
    # Remove GDSC suffix ___123
    s = re.sub(r"___\d+", "", s)

    # Remove parentheses content (e.g., (hydrochloride))
    s = re.sub(r"\([^)]*\)", "", s)

    # Remove salts / hydrate words
    remove_words = [
        "hydrochloride", "monohydrochloride", "dihydrochloride",
        "sodium", "hydrate", "monohydrate", "pentahydrate",
        "hemifumarate", "citrate", "acid", "salt"
    ]
    for w in remove_words:
        s = s.replace(w, "")

    # Remove punctuation
    s = re.sub(r"[^a-z0-9]+", "", s)

    return s.strip()

In [13]:
tahoe_drugs = fps["drug"].unique()
gdsc_drugs = np.array(metadata["drug"].unique())

In [17]:
for t_drug in tahoe_drugs:
    if "Etoposide" in t_drug:
        print(drug)

In [16]:
# Build mapping from GDSC → Tahoe using normalized names.
norm_tahoe = {norm_drug(d): d for d in tahoe_drugs}

mapping = {}
for d in gdsc_drugs:
    key = norm_drug(d)
    mapping[d] = norm_tahoe.get(key, None)

mapping

{'Doxorubicin___133': 'Doxorubicin (hydrochloride)',
 'Doxorubicin___1386': 'Doxorubicin (hydrochloride)',
 'Etoposide': None,
 'Gemcitabine___135': 'Gemcitabine',
 'Gemcitabine___1393': 'Gemcitabine',
 'Mitomycin-C': None,
 'Vinorelbine': None,
 'Bicalutamide___150': 'Bicalutamide',
 'Bicalutamide___1502': 'Bicalutamide',
 'Ponatinib': 'Ponatinib',
 '5-Fluorouracil': '5-Fluorouracil',
 'Bexarotene': 'Bexarotene',
 'Bleomycin': None,
 'Pazopanib': None,
 'Ruxolitinib': None,
 'Idelalisib': None,
 'Cabozantinib': 'Cabozantinib (S-malate)',
 'Belinostat': 'Belinostat',
 'Alectinib': None,
 'Pemetrexed': 'Pemetrexed',
 'Panobinostat': 'Panobinostat',
 'Vinblastine': 'Vinblastine (sulfate)',
 'Cisplatin___1005': None,
 'Cisplatin___1496': None,
 'Cytarabine': 'Cytarabine (hydrochloride)',
 'Docetaxel': 'Docetaxel (Trihydrate)',
 'Methotrexate': 'Methotrexate',
 'Tretinoin': None,
 'Gefitinib': 'Gefitinib',
 'Vorinostat': None,
 'Nilotinib': None,
 'Temsirolimus': 'Temsirolimus',
 'Olaparib

### Generate new Dataframe

In [26]:
def get_smiles_from_name(drug_name):
    try:
        results = get_compounds(drug_name, 'name')
        if len(results) > 0:
            return results[0].canonical_smiles
    except:
        print(f"No match found for: {drug_name}")
        return None

def fetch_smiles(drug_list):
    smiles_dict = {}
    for drug in drug_list:
        smi = get_smiles_from_name(drug)
        smiles_dict[drug] = smi
        print(f"{drug}: {smi}")
        time.sleep(0.2)  # avoid rate-limiting
    return pd.DataFrame.from_dict(smiles_dict, orient="index", columns=["SMILES"])

In [27]:
# Clean names (remove concentration suffixes like ___133)
clean_drugs = list(set([d.split("___")[0] for d in gdsc_drugs]))

df_smiles = fetch_smiles(clean_drugs)
df_smiles.to_csv("gdsc_smiles.csv")

  return results[0].canonical_smiles


Fulvestrant: CC12CCC3C(C1CCC2O)C(CC4=C3C=CC(=C4)O)CCCCCCCCCS(=O)CCCC(C(F)(F)F)(F)F
Paclitaxel: CC1=C2C(C(=O)C3(C(CC4C(C3C(C(C2(C)C)(CC1OC(=O)C(C(C5=CC=CC=C5)NC(=O)C6=CC=CC=C6)O)O)OC(=O)C7=CC=CC=C7)(CO4)OC(=O)C)O)C)OC(=O)C
Bortezomib: B(C(CC(C)C)NC(=O)C(CC1=CC=CC=C1)NC(=O)C2=NC=CN=C2)(O)O
Rucaparib: CNCC1=CC=C(C=C1)C2=C3CCNC(=O)C4=C3C(=CC(=C4)F)N2
Vismodegib: CS(=O)(=O)C1=CC(=C(C=C1)C(=O)NC2=CC(=C(C=C2)Cl)C3=CC=CC=N3)Cl
Lenalidomide: C1CC(=O)NC(=O)C1N2CC3=C(C2=O)C=CC=C3N
Zoledronate: C1=CN(C=N1)CC(O)(P(=O)(O)O)P(=O)(O)O
Mitoxantrone: C1=CC(=C2C(=C1NCCNCCO)C(=O)C3=C(C=CC(=C3C2=O)O)O)NCCNCCO
Temozolomide: CN1C(=O)N2C=NC(=C2N=N1)C(=O)N
Cyclophosphamide: C1CNP(=O)(OC1)N(CCCl)CCCl
Dabrafenib: CC(C)(C)C1=NC(=C(S1)C2=NC(=NC=C2)N)C3=C(C(=CC=C3)NS(=O)(=O)C4=C(C=CC=C4F)F)F
Osimertinib: CN1C=C(C2=CC=CC=C21)C3=NC(=NC=C3)NC4=C(C=C(C(=C4)NC(=O)C=C)N(C)CCN(C)C)OC
Palbociclib: CC1=C(C(=O)N(C2=NC(=NC=C12)NC3=NC=C(C=C3)N4CCNCC4)C5CCCC5)C(=O)C
Tretinoin: CC1=C(C(CCC1)(C)C)C=CC(=CC=CC(=CC(=O)O)C)C
Alectini

In [2]:
df_smiles = pd.read_csv("gdsc_smiles.csv", index=False)

## Eliminate Cetuximab and Normalize Drug Names in Metadata

Elimiante Cetuximab:

In [4]:
metadata = pd.read_csv(os.path.join(base_dir, "combined_long_table.csv"))

In [7]:
metadata[metadata["drug"].str.split("___").str[0] == "Cetuximab"]

Unnamed: 0,species,sample_id,drug,response,dataset
41,HOP62,95_001_111-lib_1681,Cetuximab,0.0,any
127,A172,95_002_071-lib_1681,Cetuximab,0.0,any
214,NCIH2030,95_003_116-lib_1681,Cetuximab,0.0,any
263,NCISNU1,95_004_100-lib_1681,Cetuximab,0.0,any
437,A498,95_005_118-lib_1681,Cetuximab,0.0,any
...,...,...,...,...,...
52469516,LOVO,96_185_048-lib_2608,Cetuximab,0.0,any
52469604,HOP62,96_186_088-lib_2608,Cetuximab,0.0,any
52469761,NCIH2030,96_187_181-lib_2608,Cetuximab,0.0,any
52469811,SHP77,96_191_125-lib_2608,Cetuximab,0.0,any


In [8]:
metadata = metadata[~(metadata["drug"].str.split("___").str[0] == "Cetuximab")]
metadata.to_csv(os.path.join(base_dir, "combined_long_table.csv", index=False))

Normalize drug names:

In [9]:
metadata = pd.read_csv(os.path.join(base_dir, "combined_long_table.csv"))

In [10]:
metadata[metadata["drug"].str.split("___").str[0] == "Cetuximab"]

Unnamed: 0.1,Unnamed: 0,species,sample_id,drug,response,dataset


In [15]:
def norm_drug_gdsc(d):
    return d.split("___")[0]

metadata["drug"] = metadata["drug"].map(norm_drug_gdsc)

In [16]:
metadata

Unnamed: 0,species,sample_id,drug,response,dataset
0,HOP62,95_001_111-lib_1681,Doxorubicin,1.0,any
1,HOP62,95_001_111-lib_1681,Doxorubicin,1.0,any
2,HOP62,95_001_111-lib_1681,Etoposide,1.0,any
3,HOP62,95_001_111-lib_1681,Gemcitabine,1.0,any
4,HOP62,95_001_111-lib_1681,Gemcitabine,1.0,any
...,...,...,...,...,...
51895436,HT29,96_192_081-lib_2608,Nelarabine,0.0,any
51895437,HT29,96_192_081-lib_2608,Vincristine,0.0,any
51895438,HT29,96_192_081-lib_2608,Venetoclax,0.0,any
51895439,HT29,96_192_081-lib_2608,Osimertinib,0.0,any


In [17]:
metadata.to_csv(os.path.join(base_dir, "combined_long_table.csv"), index=False)