In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
from scipy import sparse
import re
import os

In [64]:
# Hard-coded list of cell names; the recorded output of this cell reports 48 entries.
# NOTE(review): `cell_lines` is also bound to df["cell_line"].unique() in another cell,
# so its content depends on which of the two cells ran last — confirm the intended one.
cell_lines= ["RPMI-7951", "PANC-1", "SW480", "H4", "hTERT-HPNE", "A549", "A498", "CHP-212", "NCI-H2030", "LoVo", "NCI-H1573", "SNU-423", "AsPC-1", "NCI-H460", "NCI-H23", "CFPAC-1", "HT-29", "SW1417", "NCI-H1792", "RKO", "J82", "C32", "NCI-H2347", "HEC-1-A", "SK-MEL-2", "SW48", "BT-474", "SHP-77", "NCI-H661", "NCI-H596", "NCI-H2122", "Hs 766T", "HOP62", "MIA PaCa-2", "LOX-IMVI", "KATO III", "HS-578T", "SW 900", "LS 180", "SNU-1", "A-172", "Panc 03.27", "C-33 A", "COLO 205", "HepG2/C3A", "HCT15", "A-427", "AN3 CA"]
len(cell_lines)

48

In [2]:
# Absolute cluster paths. `parent_dir` is not referenced by any other visible cell;
# `adata_path` is the .h5ad file read by the following cell.
parent_dir = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/test"
adata_path = "/cluster/work/bewi/data/tahoe100/h5ad/controls_merged_hvg_log1p_umap.h5ad"

In [4]:
# Read the .h5ad file at `adata_path` into an AnnData object (no backed mode requested).
adata = sc.read_h5ad(adata_path)

In [5]:
# Show the AnnData summary: dimensions plus the obs/var/uns/obsm/varm/obsp keys.
adata

AnnData object with n_obs × n_vars = 2330156 × 3000
    obs: 'sample', 'gene_count', 'tscp_count', 'mread_count', 'drugname_drugconc', 'drug', 'cell_line', 'sublibrary', 'BARCODE', 'pcnt_mito', 'S_score', 'G2M_score', 'phase', 'pass_filter', 'cell_name', 'plate', 'source_plate'
    var: 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'cell_name_colors', 'hvg', 'log1p', 'neighbors', 'pca', 'plate_colors', 'umap'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [37]:
# Sum the `highly_variable` column of var; the recorded result (3000) equals n_vars,
# i.e. every gene kept in this file is flagged.
sum(adata.var["highly_variable"].values)

3000

In [9]:
# Convert the expression matrix to a dense array and display it.
# NOTE(review): for the 2330156 x 3000 float32 matrix reported above, the dense copy
# takes roughly 28 GB of RAM; slicing rows before .toarray() would be far cheaper.
X = adata.X.toarray()
X

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        2.4719892],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        3.1220112],
       ...,
       [0.       , 1.9810015, 0.       , ..., 0.       , 0.       ,
        1.9810015],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]], dtype=float32)

## TAHOE vs EXTRACTED controls

In [2]:
# Inputs for the comparison: the Tahoe controls AnnData file and the extracted controls table.
tahoe_path = "/cluster/work/bewi/data/tahoe100/h5ad/controls_Trametinib_merged.h5ad"
extracted_path = "/cluster/work/bewi/data/tahoe100/metadata/controls_col_cellCode_with_sensitivity.csv"

# backed="r" opens the .h5ad in read-only backed mode instead of loading it fully.
adata = sc.read_h5ad(tahoe_path, backed="r")
df = pd.read_csv(extracted_path)

In [3]:
# Row identifiers on each side: the AnnData obs index and the distinct
# BARCODE_SUB_LIB_ID values of the table. Neither is used by another visible cell.
tahoe_indices = adata.obs.index
df_indices = df["BARCODE_SUB_LIB_ID"].unique()

In [4]:
# Cell names in Dataframe
def norm(x):
    """Normalize a name for matching.

    The input is converted with ``str()``, every character other than an
    ASCII digit or ASCII letter is removed, and the remainder is upper-cased
    (e.g. ``"MIA PaCa-2"`` becomes ``"MIAPACA2"``).
    """
    text = str(x)
    alnum_only = re.sub(r"[^0-9A-Za-z]", "", text)
    return alnum_only.upper()

# Add a `_norm` column with the normalized form of every `cell_name` (modifies df in place).
df["_norm"] = df["cell_name"].map(norm)

In [5]:
# Compare the number of distinct identifiers per source, before and after normalization.
print(f"Tahoe 'GDSC' dataset has {adata.obs['cell_name'].nunique()} cell names")
print(f"Dataframe has {df['cell_line'].nunique()} cell lines")
print(f"Dataframe has {df['cell_name'].nunique()} cell names")
print(f"Dataframe has {df['_norm'].nunique()} nomalized cell names")

Tahoe 'GDSC' dataset has 50 cell names
Dataframe has 50 cell lines
Dataframe has 68 cell names
Dataframe has 53 nomalized cell names


In [6]:
# Distinct identifiers from the table (df) and from the AnnData obs, reused by later cells.
# NOTE: this rebinds `cell_lines`, which another cell also defines as a hard-coded list.
cell_norm_names = df['_norm'].unique()
cell_lines = df["cell_line"].unique()
cell_names = df["cell_name"].unique()
adata_names = adata.obs['cell_name'].unique()
adata_lines = adata.obs['cell_line'].unique()

In [63]:
## Check which names come from tahoe, and which ones they are
# Keep the table's cell names (in their original order) that also occur verbatim
# among the AnnData cell names.
t_names = [name for name in cell_names if name in adata_names]

print(f"Tahoe names in the Dataframe: {len(t_names)}")
print(f"Tahoe names in the Dataframe: {sorted(t_names)}")

Tahoe names in the Dataframe: 48
Tahoe names in the Dataframe: ['A-172', 'A-427', 'A498', 'A549', 'AN3 CA', 'AsPC-1', 'BT-474', 'C-33 A', 'C32', 'CFPAC-1', 'CHP-212', 'COLO 205', 'H4', 'HCT15', 'HEC-1-A', 'HOP62', 'HS-578T', 'HT-29', 'HepG2/C3A', 'Hs 766T', 'J82', 'KATO III', 'LOX-IMVI', 'LS 180', 'LoVo', 'MIA PaCa-2', 'NCI-H1573', 'NCI-H1792', 'NCI-H2030', 'NCI-H2122', 'NCI-H23', 'NCI-H2347', 'NCI-H460', 'NCI-H596', 'NCI-H661', 'PANC-1', 'Panc 03.27', 'RKO', 'RPMI-7951', 'SHP-77', 'SK-MEL-2', 'SNU-1', 'SNU-423', 'SW 900', 'SW1417', 'SW48', 'SW480', 'hTERT-HPNE']


In [7]:
# For every cell-line ID, record (and print) the distinct raw cell names attached to it.
mapping = {}
for line in cell_lines:
    names_for_line = df.loc[df["cell_line"] == line, "cell_name"].unique().tolist()
    print(f"{line}: ({len(names_for_line)}, {names_for_line})")
    mapping[line] = names_for_line

CVCL_0428: (2, ['MIA-PaCa-2', 'MIA PaCa-2'])
CVCL_0332: (2, ['Hs-578-T', 'HS-578T'])
CVCL_1666: (1, ['RPMI-7951'])
CVCL_0480: (1, ['PANC-1'])
CVCL_1285: (2, ['HOP-62', 'HOP62'])
CVCL_0131: (2, ['A172', 'A-172'])
CVCL_0546: (1, ['SW480'])
CVCL_1381: (2, ['LOXIMVI', 'LOX-IMVI'])
CVCL_1239: (1, ['H4'])
CVCL_C466: (1, ['hTERT-HPNE'])
CVCL_0023: (1, ['A549'])
CVCL_1056: (1, ['A498'])
CVCL_1125: (1, ['CHP-212'])
CVCL_1517: (1, ['NCI-H2030'])
CVCL_0399: (1, ['LoVo'])
CVCL_0397: (2, ['LS-180', 'LS 180'])
CVCL_0099: (2, ['NCI-SNU-1', 'SNU-1'])
CVCL_0292: (2, ['HCT-15', 'HCT15'])
CVCL_0334: (2, ['Hs766T', 'Hs 766T'])
CVCL_1478: (1, ['NCI-H1573'])
CVCL_0366: (1, ['SNU-423'])
CVCL_0152: (1, ['AsPC-1'])
CVCL_0459: (1, ['NCI-H460'])
CVCL_1547: (1, ['NCI-H23'])
CVCL_1119: (1, ['CFPAC-1'])
CVCL_0320: (1, ['HT-29'])
CVCL_1731: (2, ['SW900', 'SW 900'])
CVCL_0218: (2, ['COLO-205', 'COLO 205'])
CVCL_1717: (1, ['SW1417'])
CVCL_1098: (2, ['C3A', 'HepG2/C3A'])
CVCL_1495: (1, ['NCI-H1792'])
CVCL_0504: (1, ['R

In [5]:
# For every cell-line ID, record (and print) the distinct normalized names (`_norm`)
# attached to it.
mapping_norm = {}
for line in cell_lines:
    mask = df["cell_line"] == line
    mapped_names = list(df[mask]["_norm"].unique())
    print(f"{line}: ({len(mapped_names)}, {mapped_names})")
    # Store in mapping_norm: writing to `mapping` here would leave mapping_norm empty
    # and overwrite the raw-name lists collected for the same IDs.
    mapping_norm[line] = mapped_names

CVCL_0428: (1, ['MIAPACA2'])
CVCL_0332: (1, ['HS578T'])
CVCL_1666: (1, ['RPMI7951'])
CVCL_0480: (1, ['PANC1'])
CVCL_1285: (1, ['HOP62'])
CVCL_0131: (1, ['A172'])
CVCL_0546: (1, ['SW480'])
CVCL_1381: (1, ['LOXIMVI'])
CVCL_1239: (1, ['H4'])
CVCL_C466: (1, ['HTERTHPNE'])
CVCL_0023: (1, ['A549'])
CVCL_1056: (1, ['A498'])
CVCL_1125: (1, ['CHP212'])
CVCL_1517: (1, ['NCIH2030'])
CVCL_0399: (1, ['LOVO'])
CVCL_0397: (1, ['LS180'])
CVCL_0099: (2, ['NCISNU1', 'SNU1'])
CVCL_0292: (1, ['HCT15'])
CVCL_0334: (1, ['HS766T'])
CVCL_1478: (1, ['NCIH1573'])
CVCL_0366: (1, ['SNU423'])
CVCL_0152: (1, ['ASPC1'])
CVCL_0459: (1, ['NCIH460'])
CVCL_1547: (1, ['NCIH23'])
CVCL_1119: (1, ['CFPAC1'])
CVCL_0320: (1, ['HT29'])
CVCL_1731: (1, ['SW900'])
CVCL_0218: (1, ['COLO205'])
CVCL_1717: (1, ['SW1417'])
CVCL_1098: (2, ['C3A', 'HEPG2C3A'])
CVCL_1495: (1, ['NCIH1792'])
CVCL_0504: (1, ['RKO'])
CVCL_0359: (1, ['J82'])
CVCL_1094: (1, ['C33A'])
CVCL_1097: (1, ['C32'])
CVCL_1550: (1, ['NCIH2347'])
CVCL_0371: (1, ['KATOIII

### Name Normalization

In [20]:
# Cell-line ID -> list of normalized name variants for that line.
# The entries agree with the (truncated) per-line listing of normalized names printed
# earlier in the notebook, so this appears to be a hand-copied version of that output.
# NOTE(review): 'PANC03270327' under CVCL_1635 looks like a doubled token — confirm it
# is a variant that really occurs in the data.
cvcl_map = {
    "CVCL_0428": ['MIAPACA2'],
    "CVCL_0332": ['HS578T'],
    "CVCL_1666": ['RPMI7951'],
    "CVCL_0480": ['PANC1'],
    "CVCL_1285": ['HOP62'],
    "CVCL_0131": ['A172'],
    "CVCL_0546": ['SW480'],
    "CVCL_1381": ['LOXIMVI'],
    "CVCL_1239": ['H4'],
    "CVCL_C466": ['HTERTHPNE'],
    "CVCL_0023": ['A549'],
    "CVCL_1056": ['A498'],
    "CVCL_1125": ['CHP212'],
    "CVCL_1517": ['NCIH2030'],
    "CVCL_0399": ['LOVO'],
    "CVCL_0397": ['LS180'],
    "CVCL_0099": ['NCISNU1', 'SNU1'],
    "CVCL_0292": ['HCT15'],
    "CVCL_0334": ['HS766T'],
    "CVCL_1478": ['NCIH1573'],
    "CVCL_0366": ['SNU423'],
    "CVCL_0152": ['ASPC1'],
    "CVCL_0459": ['NCIH460'],
    "CVCL_1547": ['NCIH23'],
    "CVCL_1119": ['CFPAC1'],
    "CVCL_0320": ['HT29'],
    "CVCL_1731": ['SW900'],
    "CVCL_0218": ['COLO205'],
    "CVCL_1717": ['SW1417'],
    "CVCL_1098": ['C3A', 'HEPG2C3A'],
    "CVCL_1495": ['NCIH1792'],
    "CVCL_0504": ['RKO'],
    "CVCL_0359": ['J82'],
    "CVCL_1094": ['C33A'],
    "CVCL_1097": ['C32'],
    "CVCL_1550": ['NCIH2347'],
    "CVCL_0371": ['KATOIII'],
    "CVCL_1055": ['A427'],
    "CVCL_0293": ['HEC1A'],
    "CVCL_0069": ['SKMEL2'],
    "CVCL_0028": ['AN3CA'],
    "CVCL_1724": ['SW48'],
    "CVCL_1635": ['PANC03270327', 'PANC0327'],
    "CVCL_0179": ['BT474'],
    "CVCL_1693": ['SHP77'],
    "CVCL_1715": ['SW1088'],
    "CVCL_1716": ['SW1271'],
    "CVCL_1577": ['NCIH661'],
    "CVCL_1571": ['NCIH596'],
    "CVCL_1531": ['NCIH2122'],
}

In [32]:
# Map every name variant to the canonical name of its group, where the canonical
# name is the shortest variant (the first one listed wins a tie).
variant_to_canonical = {}

for variants in cvcl_map.values():
    shortest = min(variants, key=len)
    variant_to_canonical.update(dict.fromkeys(variants, shortest))

In [51]:
def map_variant(x, variant_to_canonical):
    """Look up the canonical name for ``x``.

    Returns ``variant_to_canonical[x]`` when ``x`` is a key of the mapping,
    otherwise the placeholder string ``"_"``.
    """
    return variant_to_canonical.get(x, "_")

## Control Name Extraction

In [28]:
# Binarized viability table; the visible cells only read its `cell_name` column.
bin_path = "/cluster/work/bewi/data/tahoe100/metadata/Cmax_viability_binarized_FULL_with_cell_name_thresh0p5.csv"
# control_adata_path = "/cluster/work/bewi/data/tahoe100/h5ad/controls_Trametinib_merged.h5ad"

bin_df = pd.read_csv(bin_path)
# control_adata = sc.read_h5ad(control_adata_path)

In [48]:
# Distinct names per source.
# NOTE(review): `adata` is whatever AnnData object is currently bound in the kernel;
# the explicit controls load in the previous cell is commented out — confirm it is the intended file.
gdsc_names = bin_df["cell_name"].unique()
tahoe_names = adata.obs["cell_name"].unique()
tahoe_lines = adata.obs["cell_line"].unique()

In [57]:
# Print each Tahoe cell-line ID that is missing from `cell_lines`; when only the final
# count is printed, no ID was missing.
for line in tahoe_lines:
    if line not in cell_lines:
        print(line)
# So all tahoe cell_lines are in GDSC?? YES, AS IT SEEMS (or at least in the mapping)
print(len(cell_lines))

50


In [108]:
# Earlier variant (kept for reference): canonicalize through variant_to_canonical.
# gdsc_norm_names = [map_variant(norm(name), variant_to_canonical) for name in gdsc_names]
# tahoe_norm_names = [map_variant(norm(name), variant_to_canonical) for name in tahoe_names]
# tahoe_norm_names = norm_names

# Current variant: compare the two sources on plain normalized names.
gdsc_norm_names = [norm(name) for name in gdsc_names]
tahoe_norm_names = [norm(name) for name in tahoe_names]

# Print a numbered, alphabetically sorted list of the Tahoe names also present in GDSC.
i=1
for name in sorted(tahoe_norm_names):
    if name in gdsc_norm_names:
        print(f"{i}. {name}")
        i += 1

1. A172
2. A427
3. A498
4. A549
5. ASPC1
6. BT474
7. C32
8. C33A
9. CFPAC1
10. CHP212
11. COLO205
12. H4
13. HCT15
14. HOP62
15. HS578T
16. HS766T
17. HT29
18. J82
19. KATOIII
20. LOVO
21. LOXIMVI
22. LS180
23. MIAPACA2
24. NCIH1573
25. NCIH1792
26. NCIH2030
27. NCIH2122
28. NCIH23
29. NCIH2347
30. NCIH460
31. NCIH596
32. NCIH661
33. RKO
34. RPMI7951
35. SHP77
36. SKMEL2
37. SNU1
38. SNU423
39. SW1417


In [98]:
# Size of `cell_lines` as bound when this cell was executed.
len(cell_lines)

50

## Normalized DF

In [30]:
# Controls table variant whose `norm_name` column is read by the following cells.
path = "/cluster/work/bewi/data/tahoe100/metadata/controls_col_cellCode_with_sensitivity_norm_names.csv"
df_norm = pd.read_csv(path)

In [31]:
# Distinct cell-line IDs and distinct normalized names in df_norm.
norm_lines = df_norm["cell_line"].unique()
norm_names = df_norm["norm_name"].unique()

In [33]:
# Number of distinct normalized names.
len(norm_names)

50

In [57]:
### Reconstruct variant to canonical dict
# Build the raw cell_name -> norm_name lookup from the whole frame in a single pass.
# This replaces an unseeded 50M-row random subsample, which was not reproducible,
# failed on frames smaller than the sample size, and could miss a name entirely.
# When a cell_name carries several norm_name values, the first one in frame order is kept.
print("Building map")
name_to_norm = (
    df_norm.drop_duplicates(subset="cell_name")
    .set_index("cell_name")["norm_name"]
)

# Fail with an explicit message if a requested name has no row in df_norm.
missing_names = [name for name in cell_names if name not in name_to_norm.index]
if missing_names:
    raise KeyError(f"cell names absent from df_norm: {missing_names}")

variant_to_canonical = {name: name_to_norm[name] for name in cell_names}

Building map
....................................................................

In [67]:
## Check which names come from tahoe, and which ones they are
# Canonicalize the table's names once, instead of rebuilding the list for every Tahoe name.
canonical_df_names = {map_variant(n, variant_to_canonical) for n in cell_names}
# "_" is the placeholder map_variant returns for unmapped names; two unmapped names
# must not count as a match, so it is excluded from the comparison set.
canonical_df_names.discard("_")

# Canonical Tahoe names (in adata_names order) that also occur among the table's names.
t_names = [
    canonical
    for canonical in (map_variant(n, variant_to_canonical) for n in adata_names)
    if canonical in canonical_df_names
]

print(f"Tahoe names in the Dataframe: {len(t_names)}")
print(f"Tahoe names in the Dataframe: {sorted(t_names)}")

Tahoe names in the Dataframe: 48
Tahoe names in the Dataframe: ['A172', 'A427', 'A498', 'A549', 'AN3CA', 'ASPC1', 'BT474', 'C32', 'C33A', 'C3A', 'CFPAC1', 'CHP212', 'COLO205', 'H4', 'HCT15', 'HEC1A', 'HOP62', 'HS578T', 'HS766T', 'HT29', 'HTERTHPNE', 'J82', 'KATOIII', 'LOVO', 'LOXIMVI', 'LS180', 'MIAPACA2', 'NCIH1573', 'NCIH1792', 'NCIH2030', 'NCIH2122', 'NCIH23', 'NCIH2347', 'NCIH460', 'NCIH596', 'NCIH661', 'PANC0327', 'PANC1', 'RKO', 'RPMI7951', 'SHP77', 'SKMEL2', 'SNU1', 'SNU423', 'SW1417', 'SW48', 'SW480', 'SW900']


In [95]:
# Replace the "C33A" entry of norm_names with "HepG2", then print the normalized names
# and the Tahoe names side by side, each list sorted independently.
# NOTE(review): norm_names is modified in place, so running this cell a second time
# fails because .index("C33A") no longer finds the entry; confirm the intent of the patch.
i = norm_names.tolist().index("C33A")
norm_names[i] = "HepG2"

sorted_norm = sorted(norm_names.tolist())
sorted_tahoe = sorted(tahoe_names.tolist())

# NOTE(review): rows are paired purely by sorted position, so the two columns drift
# apart wherever the sort orders differ (see the output from "C32, C-33 A" onward).
# `i` is reused here as the loop counter.
for i in range(len(norm_names)):
    print(f"{sorted_norm[i]}, {sorted_tahoe[i]}")

A172, A-172
A427, A-427
A498, A498
A549, A549
AN3CA, AN3 CA
ASPC1, AsPC-1
BT474, BT-474
C32, C-33 A
C3A, C32
CFPAC1, CFPAC-1
CHP212, CHP-212
COLO205, COLO 205
H4, H4
HCT15, HCT15
HEC1A, HEC-1-A
HOP62, HOP62
HS578T, HS-578T
HS766T, HT-29
HT29, HepG2/C3A
HTERTHPNE, Hs 766T
HepG2, J82
J82, KATO III
KATOIII, LOX-IMVI
LOVO, LS 180
LOXIMVI, LoVo
LS180, MIA PaCa-2
MIAPACA2, NCI-H1573
NCIH1573, NCI-H1792
NCIH1792, NCI-H2030
NCIH2030, NCI-H2122
NCIH2122, NCI-H23
NCIH23, NCI-H2347
NCIH2347, NCI-H460
NCIH460, NCI-H596
NCIH596, NCI-H661
NCIH661, PANC-1
PANC0327, Panc 03.27
PANC1, RKO
RKO, RPMI-7951
RPMI7951, SHP-77
SHP77, SK-MEL-2
SKMEL2, SNU-1
SNU1, SNU-423
SNU423, SW 1088
SW1088, SW 1271
SW1271, SW 900
SW1417, SW1417
SW48, SW48
SW480, SW480
SW900, hTERT-HPNE


## NPY Files

In [2]:
# Load the three artifacts inspected in this section: the long-format table (CSV),
# the feature matrix (.npy) and the split assignment (CSV).
table_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/combined_long_table.csv"
npy_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/raw_data_fcr.npy"
splits_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/data_splits.csv"

table = pd.read_csv(table_path)
npy = np.load(npy_path)
splits = pd.read_csv(splits_path)

In [3]:
# Report the shapes; in the recorded output the .npy matrix and the splits table have
# the same number of rows.
print(f"Table shape: {table.shape}")
print(f"NPY shape: {npy.shape}")
print(f"Splits shape: {splits.shape}")

Table shape: (51895441, 5)
NPY shape: (622935, 256)
Splits shape: (622935, 2)


## Marina's Matching

In [3]:
# Load the controls table (same file as `extracted_path` in the earlier comparison section).
df = pd.read_csv("/cluster/work/bewi/data/tahoe100/metadata/controls_col_cellCode_with_sensitivity.csv")

# Same normalizer as in the "TAHOE vs EXTRACTED controls" section, defined again here.
def norm(x):
    """Return str(x) with every non-alphanumeric ASCII character removed, upper-cased."""
    return re.sub(r"[^0-9A-Za-z]", "", str(x)).upper()

df["_norm"] = df["cell_name"].map(norm)

# Keep only the rows that have a `drug` value and count their distinct normalized names.
matched = df[df["drug"].notna()]
print("TRUE matched cells (normalized):", matched["_norm"].nunique())

TRUE matched cells (normalized): 46


In [5]:
# Open the Tahoe controls file in read-only backed mode (backed="r").
tahoe_path = "/cluster/work/bewi/data/tahoe100/h5ad/controls_Trametinib_merged.h5ad"
adata = sc.read_h5ad(tahoe_path, backed ="r")

In [12]:
# Raw Tahoe cell names vs. normalized names of the rows that have a drug value.
tahoe_names = adata.obs["cell_name"].unique()
norm_names = matched["_norm"].unique()

In [21]:
# Sorted raw Tahoe names, for visual comparison with the normalized list.
print(sorted(list(tahoe_names)))

['A-172', 'A-427', 'A498', 'A549', 'AN3 CA', 'AsPC-1', 'BT-474', 'C-33 A', 'C32', 'CFPAC-1', 'CHP-212', 'COLO 205', 'H4', 'HCT15', 'HEC-1-A', 'HOP62', 'HS-578T', 'HT-29', 'HepG2/C3A', 'Hs 766T', 'J82', 'KATO III', 'LOX-IMVI', 'LS 180', 'LoVo', 'MIA PaCa-2', 'NCI-H1573', 'NCI-H1792', 'NCI-H2030', 'NCI-H2122', 'NCI-H23', 'NCI-H2347', 'NCI-H460', 'NCI-H596', 'NCI-H661', 'PANC-1', 'Panc 03.27', 'RKO', 'RPMI-7951', 'SHP-77', 'SK-MEL-2', 'SNU-1', 'SNU-423', 'SW 1088', 'SW 1271', 'SW 900', 'SW1417', 'SW48', 'SW480', 'hTERT-HPNE']


In [48]:
# Sorted normalized names of the matched rows.
print(sorted(list(norm_names)))

['A172', 'A427', 'A498', 'A549', 'AN3CA', 'ASPC1', 'BT474', 'C32', 'C33A', 'C3A', 'CFPAC1', 'CHP212', 'COLO205', 'H4', 'HCT15', 'HEC1A', 'HOP62', 'HS578T', 'HS766T', 'HT29', 'J82', 'KATOIII', 'LOVO', 'LOXIMVI', 'LS180', 'MIAPACA2', 'NCIH1573', 'NCIH1792', 'NCIH2030', 'NCIH2122', 'NCIH23', 'NCIH2347', 'NCIH596', 'NCIH661', 'NCISNU1', 'PANC0327', 'RKO', 'RPMI7951', 'SHP77', 'SKMEL2', 'SNU423', 'SW1088', 'SW1271', 'SW1417', 'SW48', 'SW900']


In [59]:
# Collect the Tahoe names that have a counterpart among the normalized names.
# A Tahoe name counts as matched when
#   * its normalized form is a substring of some normalized name, or
#   * it contains "/" and one of its "/"-separated parts equals a normalized name.
# NOTE(review): the substring rule is permissive (a short name matches any longer
# name containing it) — confirm this is intended.
matched_tahoe = []

for t_name in tahoe_names:
    t_norm = norm(t_name)
    t_parts = t_name.split("/") if "/" in t_name else []
    # A separate flag name keeps the `matched` DataFrame defined earlier intact, and
    # any() records each Tahoe name at most once.
    is_matched = any(t_norm in name or name in t_parts for name in norm_names)

    if is_matched:
        matched_tahoe.append(t_name)
    else:
        print(f"Not matched: {t_name}")

print("Matched", len(matched_tahoe))
print(sorted(matched_tahoe))

Not matched: PANC-1
Not matched: SW480
Not matched: hTERT-HPNE
Not matched: NCI-H460
Matched 46
['A-172', 'A-427', 'A498', 'A549', 'AN3 CA', 'AsPC-1', 'BT-474', 'C-33 A', 'C32', 'CFPAC-1', 'CHP-212', 'COLO 205', 'H4', 'HCT15', 'HEC-1-A', 'HOP62', 'HS-578T', 'HT-29', 'HepG2/C3A', 'Hs 766T', 'J82', 'KATO III', 'LOX-IMVI', 'LS 180', 'LoVo', 'MIA PaCa-2', 'NCI-H1573', 'NCI-H1792', 'NCI-H2030', 'NCI-H2122', 'NCI-H23', 'NCI-H2347', 'NCI-H596', 'NCI-H661', 'Panc 03.27', 'RKO', 'RPMI-7951', 'SHP-77', 'SK-MEL-2', 'SNU-1', 'SNU-423', 'SW 1088', 'SW 1271', 'SW 900', 'SW1417', 'SW48']


In [52]:
# Sorted normalized names again, for reference next to the matching result.
print(sorted(list(norm_names)))

['A172', 'A427', 'A498', 'A549', 'AN3CA', 'ASPC1', 'BT474', 'C32', 'C33A', 'C3A', 'CFPAC1', 'CHP212', 'COLO205', 'H4', 'HCT15', 'HEC1A', 'HOP62', 'HS578T', 'HS766T', 'HT29', 'J82', 'KATOIII', 'LOVO', 'LOXIMVI', 'LS180', 'MIAPACA2', 'NCIH1573', 'NCIH1792', 'NCIH2030', 'NCIH2122', 'NCIH23', 'NCIH2347', 'NCIH596', 'NCIH661', 'NCISNU1', 'PANC0327', 'RKO', 'RPMI7951', 'SHP77', 'SKMEL2', 'SNU423', 'SW1088', 'SW1271', 'SW1417', 'SW48', 'SW900']


## GDSC Lines in TAHOE:

In [61]:
# Final list of Tahoe names that were matched, with its size.
print("Matched", len(matched_tahoe))
print(sorted(matched_tahoe))

Matched 46
['A-172', 'A-427', 'A498', 'A549', 'AN3 CA', 'AsPC-1', 'BT-474', 'C-33 A', 'C32', 'CFPAC-1', 'CHP-212', 'COLO 205', 'H4', 'HCT15', 'HEC-1-A', 'HOP62', 'HS-578T', 'HT-29', 'HepG2/C3A', 'Hs 766T', 'J82', 'KATO III', 'LOX-IMVI', 'LS 180', 'LoVo', 'MIA PaCa-2', 'NCI-H1573', 'NCI-H1792', 'NCI-H2030', 'NCI-H2122', 'NCI-H23', 'NCI-H2347', 'NCI-H596', 'NCI-H661', 'Panc 03.27', 'RKO', 'RPMI-7951', 'SHP-77', 'SK-MEL-2', 'SNU-1', 'SNU-423', 'SW 1088', 'SW 1271', 'SW 900', 'SW1417', 'SW48']


## Polish datasplits.csv

In [2]:
# Remove the `species` column from data_splits.csv and write the file back in place.
# The column check makes the cell safe to re-run: once the column is gone, the file
# is left untouched instead of raising KeyError on the drop.
path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/data_splits.csv"
splits = pd.read_csv(path)
if "species" in splits.columns:
    splits = splits.drop(columns=["species"])
    splits.to_csv(path, index=False)

In [3]:
# Re-read the rewritten file to confirm which columns remain.
splits = pd.read_csv(path)
splits

Unnamed: 0,sample_id,Set
0,95_001_111-lib_1681,train
1,95_002_071-lib_1681,train
2,95_003_116-lib_1681,train
3,95_004_100-lib_1681,train
4,95_004_118-lib_1681,train
...,...,...
622930,96_186_088-lib_2608,train
622931,96_187_065-lib_2608,train
622932,96_187_181-lib_2608,train
622933,96_191_125-lib_2608,train


## Ensure the order of sample_ids is consistent

In [2]:
# Inputs for the ordering check: long-format metadata, split assignment and a feature matrix.
base_dir = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data"
metadata_path = os.path.join(base_dir, "combined_long_table.csv")
splits_path = os.path.join(base_dir, "data_splits.csv")
raw_data_path = os.path.join(base_dir, "raw_data_pcs_10.npy")

metadata = pd.read_csv(metadata_path)
splits = pd.read_csv(splits_path)
raw_data = np.load(raw_data_path)

In [5]:
# Distinct sample IDs of the metadata table (order of first appearance); show the first ten.
ordered_ids = metadata["sample_id"].unique()
ordered_ids[:10]

array(['95_001_111-lib_1681', '95_002_071-lib_1681',
       '95_003_116-lib_1681', '95_004_100-lib_1681',
       '95_004_118-lib_1681', '95_005_118-lib_1681',
       '95_005_173-lib_1681', '95_006_043-lib_1681',
       '95_006_119-lib_1681', '95_006_180-lib_1681'], dtype=object)

In [6]:
# Check the complete ordering rather than eyeballing the first rows: the sample IDs in
# data_splits.csv must equal, element by element, the distinct sample IDs of the metadata.
split_ids = splits["sample_id"].values
assert np.array_equal(split_ids, ordered_ids), (
    "sample_id order in data_splits.csv differs from combined_long_table.csv"
)
split_ids[:10]

array(['95_001_111-lib_1681', '95_002_071-lib_1681',
       '95_003_116-lib_1681', '95_004_100-lib_1681',
       '95_004_118-lib_1681', '95_005_118-lib_1681',
       '95_005_173-lib_1681', '95_006_043-lib_1681',
       '95_006_119-lib_1681', '95_006_180-lib_1681'], dtype=object)

In [7]:
# Display the loaded feature matrix (float32 according to the recorded output).
raw_data

array([[-20.876472  ,  61.55096   ,   0.69299126, ...,  -3.4878244 ,
         -5.769747  ,  -3.2680035 ],
       [-31.642021  , -15.300543  ,   8.374955  , ...,  -5.9579697 ,
         -4.089421  ,  -4.496514  ],
       [-32.7939    , -43.668045  ,   6.595552  , ...,  -0.9419194 ,
         -7.8659644 ,   9.137633  ],
       ...,
       [134.20972   , -95.82281   ,  24.512474  , ...,  16.040594  ,
         21.88488   ,  24.234621  ],
       [-21.950214  ,  -2.6763594 ,  -9.915962  , ...,   5.8302073 ,
          3.9614544 ,   1.4512347 ],
       [212.29276   ,  73.57664   ,  55.4049    , ...,  17.794415  ,
         -2.0499473 ,   0.60318065]], dtype=float32)

## Drug Fingerprints

In [6]:
# Fingerprint tables: this project's file and the tutorial's file of the same name.
# (`tutotrial_fp_path` is spelled this way in both places where it appears in this cell.)
fp_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/drug_fingerprints_Mol_selfies.csv"
tutotrial_fp_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/tutorial/data/drug_fingerprints_Mol_selfies.csv"
fp = pd.read_csv(fp_path)
tutorial_fp = pd.read_csv(tutotrial_fp_path)

In [5]:
# Display the project fingerprint table.
fp

Unnamed: 0,drug,morgan_512_fp,morgan_1024_fp,MACCS_fp
0,Fulvestrant,0001000100000001000001000000100001001000000000...,0001000000000001000000000000100001001000000000...,0000000000000000000000000000000000000000001000...
1,Paclitaxel,0100000001010000000000000000000001001000000001...,0100000000010000000000000000000001001000000000...,0000000010010000000000000010000000000000000000...
2,Bortezomib,0100000000000000000000000000010001000000101000...,0100000000000000000000000000000001000000001000...,0000000000000000001000000000000000000000000110...
3,Rucaparib,0000000000001000000000000000000001000100000000...,0000000000000000000000000000000001000000000000...,0000000000000000000100000000000000000000001000...
4,Vismodegib,0000000000000000000000000000000001000000000000...,0000000000000000000000000000000001000000000000...,0000000000000000000000000000000000000000000000...
...,...,...,...,...
60,Docetaxel,0100000000010000100000000000000001001010000001...,0100000000010000100000000000000001001000000000...,0000000010010000000000010010000000000000000000...
61,Venetoclax,0000000100010000000000000000011001001000100001...,0000000100000000000000000000010001001000000001...,0000000000000000000000001000000011000010000000...
62,Bexarotene,0000000000000000010000000000000001001000001000...,0000000000000000010000000000000001001000001000...,0000000000000000000000000000000000100000000000...
63,Trametinib,0000000000000001001000000100000001000000000000...,0000000000000001000000000100000001000000000000...,0000000000000000000000100001000000000110001000...


In [7]:
# Display the tutorial fingerprint table for a column-by-column comparison.
tutorial_fp

Unnamed: 0,drug,MACCS_fp,morgan_512_fp,morgan_1024_fp,pubchem_fp,molformer_github,molformer_huggingFace,selfies_label,selfies_flattened_one_hot
0,5-Fluorocytosine,0000000000000000000000000000000000000110001100...,0000000000000000000000000000000000000000000000...,0000000000000000000000000000000000000000000000...,1101110001100000000110001100100001000000000000...,"-0.0951872841,-0.636924803,-0.331326634,0.5368...","0.5046858787536621,0.4210881292819977,-0.08740...",27 19 30 8 27 17 30 23 27 19 30 2 27 17 30 2...,0000000000000000000000000001000000000000000000...
1,Amikacin,0000000000000000000000000000000000000000000000...,0100000000000100000001000000001001010000000000...,0100000000000100000000000000001000010000000000...,1101110001111100000111101110111100000000000000...,"0.794443607,0.012108786,0.0602180995,0.8221735...","0.19468684494495392,-0.5759977698326111,-0.192...",27 19 30 8 27 27 19 30 18 18 22 30 2 27 27 1...,0000000000000000000000000001000000000000000000...
2,Amoxicillin,0000000010010000000100000000000000001000000000...,0100010000000000001000000000000001001000000000...,0100010000000000001000000000000001001000000000...,1101110001111000000111101100111000000000000100...,"-0.277433664,0.456426084,0.418058097,0.2918549...","0.7805112600326538,0.3971932530403137,0.067437...",27 19 30 27 19 30 8 2 27 27 19 30 18 18 22 3...,0000000000000000000000000001000000000000000000...
3,Amphotericin B,0000000000000000000000000000000000000000000000...,0100000000000000000001000010000001011011100000...,0100000000000000000000000010000001011000100000...,1101110001111100000111111000111110000000000000...,"0.440224081,-0.506301224,0.470279306,0.1629443...","0.9433532953262329,-0.13454332947731018,1.1354...",27 19 30 27 27 19 30 18 22 30 8 7 27 19 30 2...,0000000000000000000000000001000000000000000000...
4,Ampicillin,0000000010010000000100000000000000001000000000...,0100010000000000001000000000000001001000000001...,0100010000000000001000000000000001001000000000...,1101110001111000000111101100111000000000000100...,"-0.651477933,0.708402514,0.419002682,0.8218743...","0.8034102916717529,0.3502717614173889,-0.03101...",27 19 30 27 19 30 8 2 27 27 19 30 18 18 22 3...,0000000000000000000000000001000000000000000000...
...,...,...,...,...,...,...,...,...,...
60,Ticarcillin,0000000010010000000100000000000000001000000000...,0100010000000010001000000000000001001000100000...,0100010000000010001000000000000001001000000000...,1101110001111000000111001100111000000000000110...,"-0.416813314,0.33261463,0.812118948,0.62490588...","0.7444556355476379,0.4137423634529114,0.243047...",27 19 30 27 19 30 8 2 27 27 19 30 18 18 22 3...,0000000000000000000000000001000000000000000000...
61,Tigecycline,0000000000000000000000000010000000000000000000...,0000000100000100000000000000010001001100011001...,0000000000000000000000000000000001001100010000...,1101110001111100000111101110111100000000000000...,"0.265602767,-0.041300226,0.903219342,0.1615294...","0.6586130261421204,0.11598372459411621,0.12656...",27 19 30 27 19 30 2 27 19 30 3 2 27 19 30 ...,0000000000000000000000000001000000000000000000...
62,Tobramycin,0000000000000000000000000000000000000000000000...,0000000000010000000001000000000000010000000000...,0000000000010000000000000000000000010000000000...,1101110001111100000111101110111100000000000000...,"0.935077727,0.235020116,0.335049093,0.92905676...","0.3344675600528717,-0.3647572696208954,0.02367...",27 19 30 8 27 27 19 30 18 18 22 30 2 27 27 1...,0000000000000000000000000001000000000000000000...
63,Vancomycin,0000000000000000000000000000000000000000000000...,0101000000111000000010000000000001001001000010...,0101000000110000000000000000000001001000000010...,1101110001111100000111111111111110000000000000...,"0.655232847,0.684636295,0.521011412,0.81529390...","0.8847649097442627,0.35866332054138184,0.79055...",27 19 30 27 27 19 30 18 22 30 8 27 27 19 30 1...,0000000000000000000000000001000000000000000000...


In [9]:
# Compare type and length of the first `morgan_1024_fp` entry in both tables.
# NOTE(review): this cell is repeated below for two more columns; a small helper taking
# the column name would remove the duplication.
print("Morgan 1024:")
print(f"Type: {type(fp['morgan_1024_fp'].values[0])}")
print(f"Lenght : {len(fp['morgan_1024_fp'].values[0])}")
print(f"Type tutorial: {type(tutorial_fp['morgan_1024_fp'].values[0])}")
print(f"Lenght : {len(tutorial_fp['morgan_1024_fp'].values[0])}")

Morgan 1024:
Type: <class 'str'>
Lenght : 1024
Type tutorial: <class 'str'>
Lenght : 1024


In [11]:
# Compare type and length of the first `morgan_512_fp` entry in both tables.
print("Morgan 512:")
print(f"Type: {type(fp['morgan_512_fp'].values[0])}")
print(f"Lenght : {len(fp['morgan_512_fp'].values[0])}")
print(f"Type tutorial: {type(tutorial_fp['morgan_512_fp'].values[0])}")
print(f"Lenght : {len(tutorial_fp['morgan_512_fp'].values[0])}")

Morgan 512:
Type: <class 'str'>
Lenght : 512
Type tutorial: <class 'str'>
Lenght : 512


In [12]:
# Compare type and length of the first `MACCS_fp` entry in both tables.
print("MACCS:")
print(f"Type: {type(fp['MACCS_fp'].values[0])}")
print(f"Lenght : {len(fp['MACCS_fp'].values[0])}")
print(f"Type tutorial: {type(tutorial_fp['MACCS_fp'].values[0])}")
print(f"Lenght : {len(tutorial_fp['MACCS_fp'].values[0])}")

MACCS:
Type: <class 'str'>
Lenght : 167
Type tutorial: <class 'str'>
Lenght : 167


## NPY files

In [14]:
# Load the "raw_data_piscvi.npy" feature matrix from the data directory.
base_dir = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data"
raw_data_path = os.path.join(base_dir, "raw_data_piscvi.npy")
raw_data = np.load(raw_data_path)

In [15]:
# Display the loaded matrix.
raw_data

array([[ 0.30881578, -0.14809877, -1.1304791 , ...,  0.9338942 ,
        -0.24623129,  0.9067464 ],
       [ 0.30683222, -1.2602931 ,  1.3540354 , ..., -0.54319316,
        -0.74291515,  1.2977934 ],
       [ 0.03457972, -0.04553065, -1.1440802 , ..., -2.477941  ,
        -1.2291691 ,  0.32076746],
       ...,
       [ 0.016106  ,  0.39079282, -0.41521865, ...,  0.03211251,
        -0.58732617,  0.70907813],
       [ 0.13133311,  1.0478519 ,  0.8778622 , ..., -0.3308695 ,
         1.5471386 , -0.40646625],
       [-0.7676844 ,  0.45143938,  0.2846542 , ..., -0.89967483,
        -0.78235483, -0.51485986]], dtype=float32)

In [16]:
# Shape of the matrix as (rows, columns).
raw_data.shape

(622935, 100)

## Predictions file

In [2]:
# Paths for the prediction analysis; pred_path is assembled from the group and
# experiment names below.
splits_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/data_splits.csv"
metadata_path = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data/combined_long_table.csv"
base_dir = "/cluster/work/bewi/members/rquiles/zeroshot_amr/data"
GROUP_NAME = "morgan_1024"
EXPERIMENT_NAME = "pcs_25"
pred_path = f"{base_dir}/experiments/outputs/{GROUP_NAME}/{EXPERIMENT_NAME}_DRIAMS-any_specific_results/test_set_seed0.csv"

# NOTE(review): the recorded run of this cell ended with FileNotFoundError for pred_path,
# so `pred` was not loaded by that run even though later cells use it.
df = pd.read_csv(metadata_path)
splits = pd.read_csv(splits_path)
pred = pd.read_csv(pred_path)

FileNotFoundError: [Errno 2] No such file or directory: '/cluster/work/bewi/members/rquiles/zeroshot_amr/data/experiments/outputs/morgan_1024/pcs_25_DRIAMS-any_specific_results/test_set_seed0.csv'

In [None]:
# Display the split assignment table.
splits

In [51]:
# Sample IDs assigned to each split.
train_ids = splits[splits["Set"] == "train"]["sample_id"].values
test_ids = splits[splits["Set"] == "test"]["sample_id"].values

# Cell lines (`species` column) seen overall and within each split.
all_lines = df["species"].unique()
test_lines = df[df["sample_id"].isin(test_ids)]["species"].unique()
train_lines = df[df["sample_id"].isin(train_ids)]["species"].unique()

# Derive the two train subsets by membership instead of fixed positions ([:35], [35:40]),
# which only held for one particular row order and line count:
#   train-only   = train lines that never occur in the test split
#   train-random = train lines that also occur in the test split
shared_with_test = np.isin(train_lines, test_lines)
train_only_lines = train_lines[~shared_with_test]
train_random_lines = train_lines[shared_with_test]
# Zeroshot lines occur in the test split only.
zeroshot_lines = [line for line in test_lines if line not in train_lines]

print(f"All lines ({len(all_lines)}): {all_lines}")
print("\n")
print(f"Train-Only lines ({len(train_only_lines)}): {train_only_lines}")
print("\n")
print(f"Test lines ({len(test_lines)}): {test_lines}")
print("\n")
print(f"Train-Random Split lines ({len(train_random_lines)}): {train_random_lines}")
print("\n")
print(f"Zeroshot lines ({len(zeroshot_lines)}): {zeroshot_lines}")

All lines (46): ['HOP62' 'A172' 'NCIH2030' 'NCISNU1' 'HCT15' 'A498' 'ASPC1' 'MIAPACA2'
 'HT29' 'SW900' 'COLO205' 'CFPAC1' 'LOXIMVI' 'LS180' 'RKO' 'J82' 'C33A'
 'NCIH1792' 'C32' 'LOVO' 'NCIH2347' 'RPMI7951' 'A549' 'NCIH23' 'SKMEL2'
 'AN3CA' 'HS578T' 'KATOIII' 'SNU423' 'HEC1A' 'BT474' 'A427' 'SHP77'
 'CHP212' 'HS766T' 'C3A' 'SW1417' 'SW48' 'PANC0327' 'NCIH1573' 'H4'
 'SW1088' 'NCIH596' 'NCIH661' 'SW1271' 'NCIH2122']


Train-Only lines (35): ['HOP62' 'A172' 'NCIH2030' 'NCISNU1' 'HCT15' 'A498' 'ASPC1' 'MIAPACA2'
 'HT29' 'SW900' 'COLO205' 'CFPAC1' 'LOXIMVI' 'LS180' 'RKO' 'J82' 'C33A'
 'NCIH1792' 'C32' 'LOVO' 'NCIH2347' 'RPMI7951' 'A549' 'NCIH23' 'SKMEL2'
 'AN3CA' 'HS578T' 'KATOIII' 'SNU423' 'HEC1A' 'BT474' 'A427' 'SHP77'
 'CHP212' 'HS766T']


Test lines (11): ['C3A' 'SW1417' 'PANC0327' 'H4' 'SW1088' 'NCIH596' 'NCIH661' 'NCIH1573'
 'SW1271' 'SW48' 'NCIH2122']


Train-Random Split lines (5): ['SW48' 'NCIH1573' 'C3A' 'PANC0327' 'SW1417']


Zeroshot lines (6): ['H4', 'SW1088', 'NCIH596', 'NCIH6

In [53]:
# Drugs seen overall and drugs seen for the train-only cell lines.
all_drugs = df["drug"].unique()
train_drugs = df[df["species"].isin(train_only_lines)]["drug"].unique()
# Zeroshot drugs are those never seen with a train-only line. This compares against
# `train_drugs`, the name defined in this cell; the previous reference to
# `train_only_drugs` is defined nowhere in the notebook and fails on a fresh kernel.
zeroshot_drugs = [drug for drug in all_drugs if drug not in train_drugs]

print(f"All Drugs: ({len(all_drugs)}): {sorted(all_drugs)}")
print("\n")
print(f"Train Drugs: ({len(train_drugs)}): {sorted(train_drugs)}")
print("\n")
print(f"Zeroshot Drugs: ({len(zeroshot_drugs)}): {sorted(zeroshot_drugs)}")

All Drugs: (65): ['5-Fluorouracil', 'Afatinib', 'Alectinib', 'Axitinib', 'Belinostat', 'Bexarotene', 'Bicalutamide', 'Bleomycin', 'Bortezomib', 'Bosutinib', 'Cabozantinib', 'Carmustine', 'Cisplatin', 'Crizotinib', 'Cyclophosphamide', 'Cytarabine', 'Dabrafenib', 'Dactinomycin', 'Dasatinib', 'Docetaxel', 'Doxorubicin', 'Epirubicin', 'Erlotinib', 'Etoposide', 'Fulvestrant', 'Gefitinib', 'Gemcitabine', 'Ibrutinib', 'Idelalisib', 'Irinotecan', 'Lapatinib', 'Lenalidomide', 'Methotrexate', 'Mitomycin-C', 'Mitoxantrone', 'Nelarabine', 'Nilotinib', 'Olaparib', 'Osimertinib', 'Oxaliplatin', 'Paclitaxel', 'Palbociclib', 'Panobinostat', 'Pazopanib', 'Pemetrexed', 'Ponatinib', 'Rapamycin', 'Rucaparib', 'Ruxolitinib', 'SN-38', 'Sorafenib', 'Tamoxifen', 'Temozolomide', 'Temsirolimus', 'Teniposide', 'Topotecan', 'Trametinib', 'Tretinoin', 'Venetoclax', 'Vinblastine', 'Vincristine', 'Vinorelbine', 'Vismodegib', 'Vorinostat', 'Zoledronate']


Train Drugs: (57): ['5-Fluorouracil', 'Afatinib', 'Alectinib'

Note: compared to the output of the "generate_inputs.py" script, 2 zeroshot drugs are missing: Imatinib and Sunitinib. The only explanation is that they were removed from the train_only set and the remaining cell lines had no data for these drugs, so they dropped out of the dataset entirely (the two queries below return no rows for either drug).

In [48]:
# Look for Imatinib rows; the recorded output contains only the header, i.e. no rows.
df[(df["drug"] == "Imatinib")]

Unnamed: 0,species,sample_id,drug,response,dataset


In [49]:
# Look for Sunitinib rows; the recorded output contains only the header, i.e. no rows.
df[(df["drug"] == "Sunitinib")]

Unnamed: 0,species,sample_id,drug,response,dataset


In [56]:
# Label every prediction row with the evaluation scenario it belongs to.
pred["experiment"] = "_"  # placeholder for rows that fall into no scenario

# Membership flags shared by the four scenarios.
line_is_random = pred["species"].isin(train_random_lines)
line_is_zeroshot = pred["species"].isin(zeroshot_lines)
drug_is_train = pred["drug"].isin(train_drugs)
drug_is_zeroshot = pred["drug"].isin(zeroshot_drugs)

# Applied in the order a) random split, b) cell line zeroshot, c) drug zeroshot,
# d) drug and cell line zeroshot; on overlap, a later scenario overwrites an earlier one.
scenario_masks = [
    (line_is_random & drug_is_train, "random_split"),
    (line_is_zeroshot & drug_is_train, "cell_line_zeroshot"),
    (line_is_random & drug_is_zeroshot, "drug_zeroshot"),
    (line_is_zeroshot & drug_is_zeroshot, "cell_line_drug_zeroshot"),
]
for mask, label in scenario_masks:
    pred.loc[mask, "experiment"] = label

In [58]:
# Every row must have received a scenario label; none may keep the "_" placeholder.
assert not (pred["experiment"] == "_").any()

In [59]:
# Generate 4 different dataframes
# Each one is an independent copy of the prediction rows carrying one scenario label.
experiment_label = pred["experiment"]
random_split = pred.loc[experiment_label == "random_split"].copy()
cell_line_zeroshot = pred.loc[experiment_label == "cell_line_zeroshot"].copy()
drug_zeroshot = pred.loc[experiment_label == "drug_zeroshot"].copy()
cell_line_drug_zeroshot = pred.loc[experiment_label == "cell_line_drug_zeroshot"].copy()