In [4]:
from openadmet.toolkit.database.chembl import PermissiveChEMBLTargetCurator
from openadmet.toolkit.chemoinformatics.rdkit_funcs import canonical_smiles, smiles_to_inchikey
from tqdm.auto import tqdm
tqdm.pandas()
import datamol as dm

  from .autonotebook import tqdm as notebook_tqdm


# Curating basic pChEMBL data and pushing to a remote intake catalog

Our goal is to curate activity data from ChEMBL and push it to a remote location, together with a catalog that others can use to look up our data. This enables consistency and rapid dissemination of our work, and lets our data curation practices evolve over time.

We use the `Intake` package for a lightweight self-describing data parsing workflow. Read more about intake here: https://intake.readthedocs.io/en/latest/index.html


Here we gather `pChEMBL` data permissively from ChEMBL (i.e. without activity-based curation) for our 5 main targets (AHR, PXR, CYP3A4, CYP2C9, CYP2D6) and for one additional target, CYP2J2.

We then aggregate `pChEMBL` measurements on the same compound by taking the mean. This is the most basic form of curation available, but serves as a good baseline for our initial models. 


## Gather ChEMBL data

First we need to gather in our data from ChEMBL using our SQL API defined in `openadmet-toolkit`

We use `OPENADMET_SMILES` and `OPENADMET_INCHIKEY` to distinguish our ML ready representation from the source SMILES

In [5]:
def gather_chembl_data_for_target(target_name: str, chembl_tid: str, chembl_ver: int):
    """Fetch permissive ChEMBL activity data for one target, raw and aggregated.

    Parameters
    ----------
    target_name
        Label for the target; only used in the progress message.
    chembl_tid
        ChEMBL target identifier handed to ``PermissiveChEMBLTargetCurator``.
    chembl_ver
        ChEMBL release number handed to the curator as ``version``.

    Returns
    -------
    tuple
        ``(aggregated_activity, activity_data)``: the curator's per-compound
        aggregation, followed by the raw activity table with the
        ``OPENADMET_SMILES`` and ``OPENADMET_INCHIKEY`` columns added here.
    """
    print(f"working on target {target_name}")
    curator = PermissiveChEMBLTargetCurator(chembl_target_id=chembl_tid, version=chembl_ver)
    raw_activity = curator.get_activity_data(return_as="df")

    print("canonicalising raw data")
    # Keep RDKit's logging quiet while every row is processed.
    with dm.without_rdkit_log():
        raw_activity["OPENADMET_SMILES"] = raw_activity["canonical_smiles"].progress_apply(canonical_smiles)
        raw_activity["OPENADMET_INCHIKEY"] = raw_activity["OPENADMET_SMILES"].progress_apply(smiles_to_inchikey)

    # important to canonicalise here so compound deduplication is done correctly
    aggregated = curator.aggregate_activity_data_by_compound(canonicalise=True)

    # Report any compounds that still appear more than once after aggregation.
    for label, column in (("smiles", "OPENADMET_SMILES"), ("inchikey", "OPENADMET_INCHIKEY")):
        print(f"{label} duplicates", aggregated[column].duplicated().sum())

    return aggregated, raw_activity
        

## Define target metadata

We need the ChEMBL target IDs for our targets

In [6]:
# Target name -> ChEMBL target ID, in the order the targets are processed.
targets = dict(
    AHR="CHEMBL3201",
    PXR="CHEMBL3401",
    CYP3A4="CHEMBL340",
    CYP2C9="CHEMBL3397",
    CYP2D6="CHEMBL289",
    CYP2J2="CHEMBL3491",
)

In [7]:
# ChEMBL release to query; also embedded in the export folder name below.
chembl_ver = 35

In [8]:
from openadmet.toolkit.webservices.credentials import S3Settings
from openadmet.toolkit.webservices.s3 import S3Bucket

# Setup S3

After curating our data we would like to push to a remote bucket to save both the raw data and the catalog

In [None]:
# Must be run before the S3Bucket.from_settings cell below: in the saved run this
# cell was not executed and that cell failed with a NameError on `settings`.
# NOTE(review): presumably S3Settings() picks up credentials from the environment — confirm.
settings = S3Settings()

In [9]:
# Name of the destination bucket. The next cell rebinds `bucket` to the bucket object.
bucket = "openadmet-data-public-dev"

In [10]:
# NOTE(review): this rebinds `bucket` from the bucket-name string to the object
# returned by S3Bucket.from_settings, so the cell is not idempotent: re-running it
# without first re-running the name cell passes that object in as the name.
# It also requires `settings` from the S3Settings() cell to exist.
bucket = S3Bucket.from_settings(settings, bucket)

NameError: name 'settings' is not defined

In [11]:
import datetime

In [12]:
# Time of this run (naive local time); only its date is used, to label the export.
t = datetime.datetime.now()

In [13]:
# ISO-style YYYY-MM-DD date stamp for the export folder name.
date = t.strftime("%Y-%m-%d")

In [14]:
# Name of this export; used both as the local directory and as the S3 key prefix.
location = f"ChEMBL{chembl_ver}_permissive_{date}"

In [17]:
import os
from pathlib import Path

# Local directory (relative to the working directory) where the parquet files are written.
location_path = Path(location)

In [20]:
# exist_ok=False makes this raise FileExistsError if the directory already exists,
# e.g. on a second run on the same date.
# NOTE(review): presumably deliberate so an earlier export is not overwritten — confirm.
location_path.mkdir(exist_ok=False)

# Main loop

Generate the data for each target and save to parquet, then push to S3 data lake with parquet files. 

We use parquet here for improved performance and reduced size on disk.

In [None]:
# S3 URIs of the uploaded parquet files, keyed by target name.
uris_raw = {}
uris_agg = {}
for target, chembl_tid in targets.items():
    agg, raw = gather_chembl_data_for_target(target, chembl_tid, chembl_ver)

    # TODO: make a function this is clunky
    fname_stem = f"ChEMBL_permissive_{target}_{chembl_tid}"
    fname_agg = f"{fname_stem}_aggregated.parquet"
    fname_raw = f"{fname_stem}_raw.parquet"
    local_agg = location_path / fname_agg
    local_raw = location_path / fname_raw

    # Write both frames locally before anything is uploaded.
    agg.to_parquet(local_agg)
    raw.to_parquet(local_raw)

    # Upload under "<location>/<file name>" in the bucket.
    bucket_destination_agg = f"{location}/{fname_agg}"
    bucket.push_file(local_agg, bucket_destination_agg)
    bucket_destination_raw = f"{location}/{fname_raw}"
    bucket.push_file(local_raw, bucket_destination_raw)

    # get S3 URIs
    uri_agg = bucket.to_uri(bucket_destination_agg)
    uris_agg[target] = uri_agg

    uri_raw = bucket.to_uri(bucket_destination_raw)
    uris_raw[target] = uri_raw

    


# Build the Intake Catalog

We have successfully aggregated our data and pushed it to a remote destination. Now, for others to consume our data, we are going to make an `Intake` catalog so that our data is readily available.

The workflow here is drawn from the `creator` walkthrough from the main intake tutorials https://intake.readthedocs.io/en/latest/walkthrough2.html

TODO: add descriptions to the catalog

In [None]:
import intake

In [None]:
intake.Catalog?

In [None]:
# Start with a new catalog; one entry per uploaded parquet file is added below.
cat = intake.entry.Catalog()

In [None]:
uris_agg

In [None]:
uris_raw

In [None]:
# Register every aggregated parquet file under the key "<target>_aggregated".
for entry_name, parquet_uri in uris_agg.items():
    cat[f"{entry_name}_aggregated"] = intake.readers.PandasParquet(parquet_uri)

In [None]:
# Register every raw parquet file under the key "<target>_raw".
for entry_name, parquet_uri in uris_raw.items():
    cat[f"{entry_name}_raw"] = intake.readers.PandasParquet(parquet_uri)

## Push the Catalog

OK, now that we have made the catalog, let's push it to the remote location so it can live alongside the data.

The catalog can then be used from S3 or from github etc, anything that exposes a file-like API. 

In [None]:
cat

In [None]:
# File name of the catalog; it carries the same export name as the data folder.
catname = f"CATALOG_{location}.yaml"

In [None]:
# Write the catalog to a YAML file; `catname` is a bare file name, so it lands in
# the current working directory rather than inside the export folder.
cat.to_yaml_file(catname)

In [None]:
# Bucket key for the catalog: it sits in the same folder as the parquet files.
cat_location = f"{location}/{catname}"

In [None]:
cat_location

In [None]:
# Upload the local catalog file to its key in the bucket.
bucket.push_file(catname, cat_location)

In [None]:
# URI of the uploaded catalog, i.e. what consumers need in order to open it.
cat_uri = bucket.to_uri(cat_location)

In [None]:
# The catalog can now be read back from its URI.
# NOTE(review): the example below points at an earlier ChEMBL34 export, not the
# one built in this notebook; substitute the URI of the catalog you want to read.
# cat = intake.Catalog.from_yaml_file("s3://openadmet-data-public-dev/ChEMBL34_permissive_2025-02-12/CATALOG_ChEMBL34_permissive_2025-02-12.yaml")