In [1]:
import os
import sys

In [7]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import re
import chemprop
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import Descriptors
import numpy as np
import pandas as pd

In [18]:
ood_drugs = [
    "Dacinostat",
    "Givinostat",
    "Belinostat",
    "Hesperadin",
    "Quisinostat",
    "Alvespimycin",
    "Tanespimycin",
    "TAK-901",
    "Flavopiridol",
]

validation_drugs = [
    "CUDC-907",
    "Luminespib",
    "Epothilone",
    "Raltitrexed",
    "CUDC-101",
    "Abexinostat",
    "Panobinostat",
    "AR-42",
    "Trichostatin",
    "M344",
    "Resminostat",
    "Pirarubicin", #ood
    "Mocetinostat", #no_ood
    "Pracinostat", #no_ood
    "Entinostat", #no_ood
    "Tucidinostat", #no_ood
    "Tacedinaline", #no_ood
    "Patupilone", #no_ood
    "GSK1070916", #no_ood
    "JNJ-26854165", #no_ood
    "Trametinib",  #no_ood
    "Dasatinib" #no_ood
    "YM155", #apoptosis
    "Barasertib", #cell cycle
    "Fulvestrant", #nuclear receptor
    "Nintedanib", #tyrosine
    "Rigosertib", #tyrosine
    "BMS-754807", #tyrosine
    "KW-2449", #tyrosine
    "Crizotinib", #tyrosin
    "ENMD-2076", #cell cycle
    "Alisertib", #cell cycle
    "(+)-JQ1", #epigenetic
]


groups = [
    "Epigenetic regulation",
    "Tyrosine kinase signaling",
    "DNA damage & DNA repair",
    "Cell cycle regulation"
]

groups_colors = {
    "Epigenetic regulation": "#FFE162",
    "Histone deacetylation" : "#FFE162",
    "Tyrosine kinase signaling": "#FF6464",
    "DNA damage & DNA repair":"#91C483",
    "Cell cycle regulation": "#7EC8E3",
    "NA":"#EEEEEE",
}


decoder_width = 4096
decoder_depth = 4
latent_lr = 1e-4
latent_wd = 1e-4
decoder_lr = 1e-4
decoder_wd = 1e-4
attribute_dropout_rate = 0.1
attribute_nn_width = 2048
attribute_nn_depth = 2
attribute_nn_lr = 1e-2
attribute_nn_wd = 4e-8
unknown_attribute_noise_param = 2e+1
unknown_attribute_penalty = 1e-1
cosine_scheduler = True
train_classifiers = False

gene_likelihood = "normal"
n_latent_attribute_ordered = 256
n_latent_attribute_categorical = 3
reconstruction_penalty = 1e+4
cosine_scheduler = True
scheduler_final_lr = 1e-5
step_size_lr = 45
use_batch_norm = False
use_layer_norm = False


module_params = {
    "decoder_width": decoder_width,
    "decoder_depth": decoder_depth,
    "attribute_nn_width":  attribute_nn_width,
    "attribute_nn_depth": attribute_nn_depth,
    "use_batch_norm": use_batch_norm,
    "use_layer_norm": use_layer_norm,
    "unknown_attribute_noise_param": unknown_attribute_noise_param,
    "seed": 42,
    "n_latent_attribute_ordered": n_latent_attribute_ordered,
    "n_latent_attribute_categorical": n_latent_attribute_categorical,
    "gene_likelihood": gene_likelihood,
    "reconstruction_penalty": reconstruction_penalty,
    "unknown_attribute_penalty": unknown_attribute_penalty,
    "attribute_dropout_rate": attribute_dropout_rate,
}

trainer_params = {
    "n_epochs_warmup": 0,
    "latent_lr": latent_lr,
    "latent_wd": latent_wd,
    "decoder_lr": decoder_lr,
    "decoder_wd": decoder_wd,
    "attribute_nn_lr": attribute_nn_lr,
    "attribute_nn_wd": attribute_nn_wd,
    "step_size_lr": step_size_lr,
    "cosine_scheduler": cosine_scheduler,
    "scheduler_final_lr": scheduler_final_lr
}

In [5]:
adata = ad.read_h5ad("C:\\Users\\admin\\Documents\\scFiLM\\data\\sciplex_complete_middle_subset.h5ad")

In [11]:
# Function to compute 2D descriptors
def compute_rdkit_2d_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    descriptor_names = [desc[0] for desc in Descriptors._descList]
    descriptor_funcs = [desc[1] for desc in Descriptors._descList]
    descriptors = [func(mol) for func in descriptor_funcs]
    return descriptors

# Get descriptor names (once)
descriptor_names = [desc[0] for desc in Descriptors._descList]

# Build the features dict
features = {}
for smiles in adata.obs["SMILES"].unique():
    desc = compute_rdkit_2d_descriptors(smiles)
    if desc is not None:
        features[smiles] = desc

# Convert to DataFrame
features_df = pd.DataFrame.from_dict(features, orient="index", columns=descriptor_names)

# Handle missing values
features_df = features_df.fillna(0)

In [12]:
features_df

Unnamed: 0,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
Cn1cc(C2=C(c3cn(C4CCN(Cc5ccccn5)CC4)c4ccccc34)C(=O)NC2=O)c2ccccc21,13.339103,13.339103,0.310559,-0.341654,0.337027,17.051282,515.617,486.385,515.232125,194,...,0,0,0,0,0,0,0,0,0,0
CS(C)=O,9.555556,9.555556,0.611111,-0.611111,0.398185,8.750000,78.136,72.088,78.013936,26,...,0,0,0,0,0,0,0,0,0,0
CCCC(CCC)C(=O)O.CCCC(CCC)C(=O)[O-].[Na+],10.479352,10.479352,0.000000,-0.885463,0.586605,9.809524,310.410,279.162,310.212004,120,...,0,0,0,0,0,0,0,0,0,0
O=C(O)c1ccc(Nc2ncc3c(n2)-c2ccc(Cl)cc2C(c2c(F)cccc2F)=NC3)cc1,14.647223,14.647223,0.109453,-1.022970,0.384505,12.264706,476.870,461.750,476.085160,168,...,0,0,0,0,0,0,0,0,0,0
O=C(CCCc1ccccc1)O[Na],10.814514,10.814514,0.067039,-0.067039,0.667944,9.692308,186.186,175.098,186.065674,64,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CC1COc2c(N3CCN(C)CC3)c(F)cc3c(=O)c(C(=O)O)cn1c23,14.983653,14.983653,0.043359,-1.315329,0.874740,20.269231,361.373,341.213,361.143784,138,...,0,0,0,0,0,0,0,0,0,0
CCN(CC)Cc1ccc2cc(COC(=O)Nc3ccc(C(=O)NO)cc3)ccc2c1.Cl.O,12.071346,12.071346,0.000000,-0.619810,0.333434,10.181818,475.973,445.733,475.187399,178,...,0,0,0,0,0,0,0,0,0,0
S=P(N1CC1)(N1CC1)N1CC1,5.750000,5.750000,1.254244,-1.268519,0.461380,32.363636,189.224,177.128,189.048955,62,...,0,0,0,0,0,0,0,0,0,0
CNCc1ccc(-c2[nH]c3cc(F)cc4c3c2CCNC4=O)cc1.O=P(O)(O)O,13.888363,13.888363,0.214659,-4.638889,0.358599,13.482759,421.365,400.197,421.120286,154,...,0,0,0,0,0,0,0,0,0,0


In [13]:
threshold = 0.001
cols_keep = list(np.where(features_df.std() > threshold)[0])
features_df = features_df.iloc[:, np.where(features_df.std() > threshold)[0]]

In [14]:
normalized_df = (features_df - features_df.mean()) / features_df.std()

In [15]:
features_cells = np.zeros((adata.shape[0], normalized_df.shape[1] + 1))
for mol, rdkit_2d in normalized_df.iterrows():
    features_cells[adata.obs["SMILES"].isin([mol]), :-1] = rdkit_2d.values

In [16]:
dose = adata.obs["dose"] / np.max(adata.obs["dose"])
features_cells[:, -1] = dose
adata.obsm["rdkit2d"] = features_cells[:, :-1]
adata.obsm["rdkit2d_dose"] = features_cells

In [19]:
adata.obs["split_ood"] = "train"

# ood
adata.obs.loc[adata.obs.condition.isin(ood_drugs), "split_ood"] = "ood"

# test
validation_cond = (adata.obs["condition"].isin(validation_drugs)) & (
    adata.obs.dose.isin([1e3, 1e4])
)
val_idx = sc.pp.subsample(adata[validation_cond], 0.4, copy=True).obs.index
adata.obs.loc[val_idx, "split_ood"] = "test"

validation_cond = (adata.obs["condition"].isin(validation_drugs)) & (
    adata.obs.dose.isin([1e1, 1e2])
)

val_idx = sc.pp.subsample(adata[validation_cond], 0.2, copy=True).obs.index
adata.obs.loc[val_idx, "split_ood"] = "test"

validation_cond = adata.obs["split_ood"] == "train"
val_idx = sc.pp.subsample(adata[validation_cond], 0.04, copy=True).obs.index
adata.obs.loc[val_idx, "split_ood"] = "test"

validation_cond = (adata.obs["split_ood"] == "train") & (adata.obs.control.isin([1]))

val_idx = sc.pp.subsample(adata[validation_cond], 0.05, copy=True).obs.index
adata.obs.loc[val_idx, "split_ood"] = "test"

In [20]:
adata.obs["split_ood"].value_counts()

split_ood
train    313598
test      29192
ood       11850
Name: count, dtype: int64

In [21]:
adata.obs["pathway_level_1_groups"] = adata.obs["pathway_level_1"].copy().astype(str)
adata.obs.loc[
    ~adata.obs["pathway_level_1"].isin(groups), "pathway_level_1_groups"
] = "NA"
adata.obs["pathway_level_1_groups"] = adata.obs["pathway_level_1_groups"].astype(
    "category"
)
adata.obs["pathway_level_1_groups"] = adata.obs[
    "pathway_level_1_groups"
].cat.reorder_categories(
    [
        "Cell cycle regulation",
        "DNA damage & DNA repair",
        "Epigenetic regulation",
        "Tyrosine kinase signaling",
        "NA",
    ]
)

adata.uns["pathway_level_1_groups_colors"] = [
    groups_colors[cat] for cat in adata.obs["pathway_level_1_groups"].cat.categories
]

In [22]:
adata.obs["pathway_groups"] = adata.obs["pathway_level_1_groups"].copy().astype(str)
adata.obs.loc[
    adata.obs["pathway_groups"].isin(["Epigenetic regulation"])
    & ~adata.obs["pathway_level_2"].isin(["Histone deacetylation"]),
    "pathway_groups",
] = "NA"
adata.obs.loc[
    adata.obs["pathway_groups"].isin(["Epigenetic regulation"]), "pathway_groups"
] = "Histone deacetylation"

adata.obs["pathway_groups"] = adata.obs["pathway_groups"].astype("category")

adata.obs["pathway_groups"] = adata.obs["pathway_groups"].cat.reorder_categories(
    [
        "Cell cycle regulation",
        "DNA damage & DNA repair",
        "Histone deacetylation",
        "Tyrosine kinase signaling",
        "NA",
    ]
)

adata.uns["pathway_groups_colors"] = [
    groups_colors[cat] for cat in adata.obs["pathway_groups"].cat.categories
]

In [23]:
titles = {
    "A549": "A549 (Lung)",
    "K562": "K562 (Leukemia)",
    "MCF7": "MCF7 (Breast epithelial)",
}

In [24]:
adata.write_h5ad("C:\\Users\\admin\\Documents\\scFiLM\\data\\sciplex3_biolord.h5ad")