In [1]:
## testing CDDVault's API pull for future push into Scispot
## lkp 2023/07/17

import sys
import requests
from pathlib import Path

# Preferred key location is the shared drive; fall back to a per-user file so
# the notebook also runs on machines without access to that drive.
api_key_file = "G:/My Drive/Lindsay Pino/proj/2023_scispot_utils/data/cddvault_api_key2.txt"
if not Path(api_key_file).is_file():
    # I am adding here the path to my api key file
    # just so I do not have to use Lindsay's shared drive
    api_key_file = Path("~/.cddvault").expanduser()

# The key is the first line of the file. The context manager closes the file
# handle deterministically, and readline() yields "" for an empty file so the
# length check below reports the problem instead of a bare IndexError.
with open(api_key_file, "r") as key_file:
    API_KEY = key_file.readline().strip()

# Sanity-check the key length to catch a truncated or wrong file early.
assert (
    len(API_KEY) >= 70
), "API key is too short. Please check your API key file."
assert (
    len(API_KEY) <= 80
), "API key is too long. Please check your API key file."
# Do not print API_KEY: cell outputs are saved together with the notebook.

In [2]:
from pprint import pprint
# API documentation
## https://support.collaborativedrug.com/hc/en-us/categories/115001259423-API-Documentation

## Notes from Sebastian :)
# Identifiers that did NOT work on the single-molecule endpoint:
#   'TB-0000001' -> this does not seem to be in the vault
#   "TAL0066"    -> aliases do not seem to be supported
#   "TB-0000066" -> names do not seem to be supported either
# The numeric molecule id below is what the endpoint accepts.
molecule = "126908061"
vault = "7476"

base_url = f"https://app.collaborativedrug.com/api/v1/vaults/{vault}/"
headers = {'X-CDD-token': API_KEY}
url = f"{base_url}molecules/{molecule}"

response = requests.get(url, headers=headers)

# Status line first, e.g. <Response [200]>
print(response)

# Decode the body once, then show the full payload and its top-level keys
molecule_payload = response.json()
pprint(molecule_payload)
print(list(molecule_payload))

<Response [200]>
{'batches': [{'batch_fields': {'Alias 1': 'TBT-033',
                               'Current Amount': 4.88,
                               'Date Received': '2023-07-03',
                               'Initial Amount': 4.88,
                               'Purity': 95.91,
                               'Supplier ID': 'EC17466-4-P1',
                               'Task ID': 'TALUS-20230516',
                               'Vendor': 'Wuxi'},
              'class': 'batch',
              'created_at': '2023-07-13T21:45:04.000Z',
              'formula_weight': 702.278,
              'id': 139380758,
              'modified_at': '2023-07-13T21:45:04.000Z',
              'molecule_batch_identifier': 'TB-0000066-001',
              'name': '001',
              'owner': 'Margaux McBirney',
              'projects': [{'id': 25250, 'name': 'Drug Discovery Team'}],
              'salt_name': 'No Salt, free base or acid'}],
 'class': 'molecule',
 'cns_mpo_score': 1.3,
 'collecti

Here I am exploring the response we get from the API.

In [3]:
# One line per top-level key of the payload, with the Python type of its value
for field_name, field_value in response.json().items():
    print(field_name, type(field_value))

id <class 'int'>
class <class 'str'>
created_at <class 'str'>
modified_at <class 'str'>
name <class 'str'>
synonyms <class 'list'>
registration_type <class 'str'>
projects <class 'list'>
collections <class 'list'>
owner <class 'str'>
smiles <class 'str'>
cxsmiles <class 'str'>
inchi <class 'str'>
inchi_key <class 'str'>
iupac_name <class 'str'>
molfile <class 'str'>
molecular_weight <class 'float'>
log_p <class 'float'>
log_d <class 'float'>
log_s <class 'float'>
num_h_bond_donors <class 'int'>
num_h_bond_acceptors <class 'int'>
num_rule_of_5_violations <class 'int'>
formula <class 'str'>
isotope_formula <class 'str'>
p_k_a <class 'float'>
p_k_a_type <class 'str'>
p_k_a_acidic <class 'float'>
p_k_a_basic <class 'float'>
exact_mass <class 'float'>
heavy_atom_count <class 'int'>
composition <class 'str'>
isotope_composition <class 'str'>
topological_polar_surface_area <class 'float'>
num_rotatable_bonds <class 'int'>
cns_mpo_score <class 'float'>
fsp3 <class 'float'>
batches <class 'list

Even more exploration on the nested fields

In [4]:
# Emit a pydantic model skeleton for every dict-valued field of the payload
for field_name, field_value in response.json().items():
    if not isinstance(field_value, dict):
        continue
    print(f"class {field_name}(BaseModel):")
    for sub_name, sub_value in field_value.items():
        print(f"\t{sub_name}: {type(sub_value).__name__}")

class molecule_fields(BaseModel):
	Covalent: str
class udfs(BaseModel):
	Covalent: str


yet another exploration of the fields that are lists

In [5]:
from pprint import pprint

# For every list-valued field, report the element types found in the list and
# show one sample element so the nested structure can be modelled.
for k, v in response.json().items():
    if isinstance(v, list):
        print(k, type(v))
        types = {type(x).__name__ for x in v}
        print(types)
        if v:
            pprint(v[0])
        else:
            # An empty list has no first element; v[0] would raise IndexError
            print("(empty list)")

synonyms <class 'list'>
{'str'}
'TAL0066'
projects <class 'list'>
{'dict'}
{'id': 25250, 'name': 'Drug Discovery Team'}
collections <class 'list'>
{'dict'}
{'id': 554552, 'name': 'TBXT'}
batches <class 'list'>
{'dict'}
{'batch_fields': {'Alias 1': 'TBT-033',
                  'Current Amount': 4.88,
                  'Date Received': '2023-07-03',
                  'Initial Amount': 4.88,
                  'Purity': 95.91,
                  'Supplier ID': 'EC17466-4-P1',
                  'Task ID': 'TALUS-20230516',
                  'Vendor': 'Wuxi'},
 'class': 'batch',
 'created_at': '2023-07-13T21:45:04.000Z',
 'formula_weight': 702.278,
 'id': 139380758,
 'modified_at': '2023-07-13T21:45:04.000Z',
 'molecule_batch_identifier': 'TB-0000066-001',
 'name': '001',
 'owner': 'Margaux McBirney',
 'projects': [{'id': 25250, 'name': 'Drug Discovery Team'}],
 'salt_name': 'No Salt, free base or acid'}
source_files <class 'list'>
{'dict'}
{'id': 19061332, 'name': 'CDD import_parts.xlsx'}


Here I am exploring the use of pydantic to parse and validate what we get out
of the API.

In [6]:
from pydantic import BaseModel, Field
from typing import List

class SourceFile(BaseModel):
    """Reference to a source file of a molecule: an id/name pair.

    Example element of the ``source_files`` list in the API response:
    ``{'id': 19061332, 'name': 'CDD import_parts.xlsx'}``
    """
    id: int
    name: str
	
class Project(BaseModel):
    """Reference to a CDD project: a name/id pair.

    Example element of a ``projects`` list in the API response:
    ``{'name': 'Drug Discovery Team', 'id': 25250}``
    """
    name: str
    id: int

class Collection(BaseModel):
    """Reference to a CDD collection: a name/id pair.

    Example element of the ``collections`` list in the API response:
    ``{'name': 'TBXT', 'id': 554552}``
    """
    name: str
    id: int

class BatchField(BaseModel):
    """The ``batch_fields`` mapping attached to one batch.

    The API keys are human-readable labels containing spaces, so every
    attribute is mapped to its label through an alias.

    Example payload::

        {'Alias 1': 'TBT-033',
         'Current Amount': 4.88,
         'Date Received': '2023-07-03',
         'Initial Amount': 4.88,
         'Purity': 95.91,
         'Supplier ID': 'EC17466-4-P1',
         'Task ID': 'TALUS-20230516',
         'Vendor': 'Wuxi'}
    """

    # NOTE(review): every field is required, so a batch that lacks one of
    # these labels fails validation - confirm whether they should be optional.
    alias_1: str = Field(alias="Alias 1")
    current_amount: float = Field(alias="Current Amount")
    date_received: str = Field(alias="Date Received")
    initial_amount: float = Field(alias="Initial Amount")
    purity: float = Field(alias="Purity")
    supplier_id: str = Field(alias="Supplier ID")
    task_id: str = Field(alias="Task ID")
    vendor: str = Field(alias="Vendor")

    # Accept both the API labels (aliases) and the attribute names as input.
    # Uses the pydantic v2 ``model_config`` form; the class-based ``Config``
    # is deprecated in v2 and emits a warning.
    model_config = {"populate_by_name": True}

class Batch(BaseModel):
    """One element of the ``batches`` list of a molecule.

    Example payload::

        {'batch_fields': {'Alias 1': 'TBT-033',
                          'Current Amount': 4.88,
                          'Date Received': '2023-07-03',
                          'Initial Amount': 4.88,
                          'Purity': 95.91,
                          'Supplier ID': 'EC17466-4-P1',
                          'Task ID': 'TALUS-20230516',
                          'Vendor': 'Wuxi'},
         'class': 'batch',
         'created_at': '2023-07-13T21:45:04.000Z',
         'formula_weight': 702.278,
         'id': 139380758,
         'modified_at': '2023-07-13T21:45:04.000Z',
         'molecule_batch_identifier': 'TB-0000066-001',
         'name': '001',
         'owner': 'Margaux McBirney',
         'projects': [{'id': 25250, 'name': 'Drug Discovery Team'}],
         'salt_name': 'No Salt, free base or acid'}
    """

    # Accept both the API key ("class") and the attribute name ("class_").
    model_config = {"populate_by_name": True}

    # The API sends ONE mapping here (see the example above), not a list of
    # mappings; typing it as List[BatchField] made every payload fail.
    batch_fields: BatchField
    # "class" is a Python keyword, hence the trailing underscore plus alias.
    class_: str = Field(alias="class")
    # The remaining attribute names equal the API keys, so no alias is needed.
    created_at: str
    formula_weight: float
    id: int
    modified_at: str
    molecule_batch_identifier: str
    name: str
    owner: str
    projects: List[Project]
    salt_name: str

class molecule_fields(BaseModel):
    """Skeleton of the ``molecule_fields`` mapping of a molecule payload.

    The payload explored in this notebook carries a single key, ``Covalent``.
    """
    Covalent: str

class udfs(BaseModel):
    """Skeleton of the ``udfs`` mapping of a molecule payload.

    The payload explored in this notebook carries a single key, ``Covalent``.
    """
    Covalent: str

class Molecule(BaseModel):
    """A molecule record as returned by the CDD Vault ``molecules`` endpoint.

    Attribute names mirror the keys of the JSON payload. The only exception is
    ``class``, exposed as ``class_`` because ``class`` is a Python keyword.
    """

    id: int
    class_: str = Field(alias="class")
    created_at: str
    modified_at: str
    name: str
    synonyms: List[str]
    registration_type: str
    # Untyped containers: the elements stay plain dicts after validation.
    projects: list
    collections: list
    owner: str
    smiles: str
    cxsmiles: str
    inchi: str
    inchi_key: str
    iupac_name: str
    molfile: str
    molecular_weight: float
    log_p: float
    log_d: float
    log_s: float
    num_h_bond_donors: int
    num_h_bond_acceptors: int
    num_rule_of_5_violations: int
    formula: str
    isotope_formula: str
    p_k_a: float
    p_k_a_type: str
    p_k_a_acidic: float
    p_k_a_basic: float
    exact_mass: float
    heavy_atom_count: int
    composition: str
    isotope_composition: str
    topological_polar_surface_area: float
    num_rotatable_bonds: int
    cns_mpo_score: float
    fsp3: float
    # Untyped as well: batch elements stay plain dicts after validation.
    batches: list
    source_files: List[SourceFile]
    molecule_fields: dict
    udfs: dict

    # Accept both the API key ("class") and the attribute name ("class_").
    # Uses the pydantic v2 ``model_config`` form; the class-based ``Config``
    # is deprecated in v2 and emits a warning.
    model_config = {"populate_by_name": True}

    @classmethod
    def from_cddvault(
        cls,
        molecule_id: str,
        vault_id: str,
        api_key: str,
        timeout: float = 30.0,
    ):
        """Fetch one molecule from CDD Vault and validate it.

        Args:
            molecule_id: Identifier placed in the ``molecules/<id>`` URL path.
            vault_id: Identifier of the vault that holds the molecule.
            api_key: Token sent in the ``X-CDD-token`` header.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A ``Molecule`` built from the JSON body of the response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = (
            "https://app.collaborativedrug.com/api/v1/vaults/"
            f"{vault_id}/molecules/{molecule_id}"
        )
        response = requests.get(
            url, headers={"X-CDD-token": api_key}, timeout=timeout
        )
        # Fail on 4xx/5xx here; otherwise the error body would be fed to the
        # model and surface as an unrelated validation error.
        response.raise_for_status()
        return cls(**response.json())



# Validate the payload held in `response`; being the last expression of the
# cell, the resulting model's repr is displayed as the cell output.
Molecule(**response.json())


Molecule(id=126908061, class_='molecule', created_at='2023-07-13T21:45:04.000Z', modified_at='2023-07-13T21:45:04.000Z', name='TB-0000066', synonyms=['TAL0066', 'TB-0000066'], registration_type='CHEMICAL_STRUCTURE', projects=[{'name': 'Drug Discovery Team', 'id': 25250}], collections=[{'name': 'TBXT', 'id': 554552}], owner='Margaux McBirney', smiles='CC(C)N(CC)S(=O)(=O)c1cc(Nc2ncnc3cc(OC)c(NC(=O)/C=C/CN4CCN(C(=O)OC(C)(C)C)CC4)cc23)ccc1Cl', cxsmiles='CCN(C(C)C)S(=O)(=O)C1=CC(NC2=NC=NC3=CC(OC)=C(NC(=O)/C=C/CN4CCN(C(=O)OC(C)(C)C)CC4)C=C23)=CC=C1Cl', inchi='InChI=1S/C33H44ClN7O6S/c1-8-41(22(2)3)48(44,45)29-18-23(11-12-25(29)34)37-31-24-19-27(28(46-7)20-26(24)35-21-36-31)38-30(42)10-9-13-39-14-16-40(17-15-39)32(43)47-33(4,5)6/h9-12,18-22H,8,13-17H2,1-7H3,(H,38,42)(H,35,36,37)/b10-9+', inchi_key='XKGURMHJCWLGJO-MDZDMXLPSA-N', iupac_name='tert-butyl 4-[(E)-3-(N-4-{3-[N-ethyl(isopropyl)aminosulfonyl]-4-chlorophenylamino}-7-methoxy-6-quinazolinylcarbamoyl)-2-propenyl]-1-piperazinecarboxylate', 

Now that we have the validator, we can use it in one line to query the api!

In [7]:
# Fetch and validate in one call, then show the model as a plain dict
fetched_molecule = Molecule.from_cddvault(molecule, vault, API_KEY)
pprint(fetched_molecule.model_dump())

{'batches': [{'batch_fields': {'Alias 1': 'TBT-033',
                               'Current Amount': 4.88,
                               'Date Received': '2023-07-03',
                               'Initial Amount': 4.88,
                               'Purity': 95.91,
                               'Supplier ID': 'EC17466-4-P1',
                               'Task ID': 'TALUS-20230516',
                               'Vendor': 'Wuxi'},
              'class': 'batch',
              'created_at': '2023-07-13T21:45:04.000Z',
              'formula_weight': 702.278,
              'id': 139380758,
              'modified_at': '2023-07-13T21:45:04.000Z',
              'molecule_batch_identifier': 'TB-0000066-001',
              'name': '001',
              'owner': 'Margaux McBirney',
              'projects': [{'id': 25250, 'name': 'Drug Discovery Team'}],
              'salt_name': 'No Salt, free base or acid'}],
 'class_': 'molecule',
 'cns_mpo_score': 1.3,
 'collections': [{'id': 55

# Exploration of other api functions

## Vaults

In theory this gives all vaults we have access to

In [8]:
vaults_url = "https://app.collaborativedrug.com/api/v1/vaults/"
response = requests.get(vaults_url, headers=headers)

# Status line, e.g. <Response [200]>
print(response)

# Decoded JSON body
pprint(response.json())

<Response [200]>
[{'id': 7476, 'name': 'Talus Bio Sandbox'}]


## Fields

In theory this would be what we use for stuff like ... getting all of our molecules.
I was wrong: it gives the fields that are defined for each data type, and the options when a field is a "pick list".

In [9]:
VAULT_ID = 7476
fields_url = f"https://app.collaborativedrug.com/api/v1/vaults/{VAULT_ID}/fields"
response = requests.get(fields_url, headers=headers)

# Status line, e.g. <Response [200]>
print(response)

# Decoded JSON body
pprint(response.json())

<Response [200]>
{'batch': [{'data_type_name': 'Text',
            'id': 118012,
            'name': 'Place',
            'overwritable': False,
            'required_group_number': None,
            'type': 'BatchFieldDefinition',
            'unique_value': False},
           {'data_type_name': 'Text',
            'id': 118013,
            'name': 'Vendor',
            'overwritable': False,
            'required_group_number': None,
            'type': 'BatchFieldDefinition',
            'unique_value': False},
           {'data_type_name': 'Text',
            'id': 118014,
            'name': 'Note',
            'overwritable': False,
            'required_group_number': None,
            'type': 'BatchFieldDefinition',
            'unique_value': False},
           {'data_type_name': 'Number',
            'id': 118015,
            'name': 'Initial Amount',
            'overwritable': False,
            'required_group_number': None,
            'type': 'BatchFieldDefinition',
    

# Getting molecules by name

In [10]:
vault_url = f"https://app.collaborativedrug.com/api/v1/vaults/{vault}/"

# Comma-separated molecule names sent as the `names` query parameter
name_query = {'names': 'TAL0066,TAL0067,SOMEFAKENAME'}
response = requests.get(vault_url + "molecules", headers=headers, params=name_query)

# Status line, e.g. <Response [200]>
print(response)

# Decode the body once, then show the full payload and its top-level keys
search_payload = response.json()
pprint(search_payload)
print(list(search_payload))

<Response [200]>
{'count': 2,
 'objects': [{'batches': [{'batch_fields': {'Alias 1': 'TBT-033',
                                            'Current Amount': 4.88,
                                            'Date Received': '2023-07-03',
                                            'Initial Amount': 4.88,
                                            'Purity': 95.91,
                                            'Supplier ID': 'EC17466-4-P1',
                                            'Task ID': 'TALUS-20230516',
                                            'Vendor': 'Wuxi'},
                           'class': 'batch',
                           'created_at': '2023-07-13T21:45:04.000Z',
                           'formula_weight': 702.278,
                           'id': 139380758,
                           'modified_at': '2023-07-13T21:45:04.000Z',
                           'molecule_batch_identifier': 'TB-0000066-001',
                           'name': '001',
                       

It is worth noting that this endpoint has a page size limit and silently ignores the names that do not have
a match in the data (`SOMEFAKENAME` in this case).

## Testing the async interface

This interface is meant to be used for large queries (although it can be used for small ones as well).
It differs in that the response is not the data we actually asked for but only a progress record, which is
polled through the `export_progress` API; once it is finished, the result is downloaded from the `exports` API.

Pseudocode of the workflow:

```pseudocode
request_id = MAKE_REQUEST
while not request_id.is_done():
    wait
response = request_results(request_id)
```

In [11]:
# Same name search as before, with `async` added to the query parameters
async_query = {'names': 'TAL0066,TAL0067,SOMEFAKENAME', 'async': 'true'}
response = requests.get(vault_url + "molecules", headers=headers, params=async_query)

# Status line, e.g. <Response [200]>
print(response)

# Decode the body once, then show the full payload and its top-level keys
export_ticket = response.json()
pprint(export_ticket)
print(list(export_ticket))

<Response [200]>
{'created_at': '2023-07-19T14:32:10.000Z',
 'id': 29025467,
 'modified_at': '2023-07-19T14:32:10.000Z',
 'status': 'new'}
['id', 'created_at', 'modified_at', 'status']


In [12]:
# Id of the export created by the previous request
response_id = response.json()['id']

progress_url = f'https://app.collaborativedrug.com/api/v1/vaults/{VAULT_ID}/export_progress/{response_id}'
status_response = requests.get(progress_url, headers=headers)

pprint(status_response.json())


{'created_at': '2023-07-19T14:32:10.000Z',
 'id': 29025467,
 'modified_at': '2023-07-19T14:32:10.000Z',
 'status': 'new'}


In [14]:
from typing import Literal
from pydantic import BaseModel
import time

# Status values this notebook knows about; validation rejects any other value.
AsyncResponseStatus = Literal["new", "started", "finished"]


class AsyncResponse(BaseModel):
    """Progress record of an asynchronous CDD Vault export.

    Built from the JSON body returned by an ``async=true`` request or by the
    ``export_progress`` endpoint.
    """

    id: int
    status: AsyncResponseStatus
    created_at: str
    modified_at: str

    # pydantic v2 form of the config; the class-based ``Config`` is deprecated
    # in v2 and emits a warning.
    model_config = {"populate_by_name": True}

    def is_finished(self):
        """Return True when the recorded status is "finished"."""
        return self.status == "finished"

    def check_status(self, VAULT_ID, API_KEY, timeout=30.0):
        """Query ``export_progress`` and return a fresh progress record.

        Args:
            VAULT_ID: Identifier of the vault the export belongs to.
            API_KEY: Token sent in the ``X-CDD-token`` header.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A new instance of the same class; ``self`` is not modified.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url = f"https://app.collaborativedrug.com/api/v1/vaults/{VAULT_ID}/export_progress/{self.id}"
        headers = {"X-CDD-token": API_KEY}
        response = requests.get(url, headers=headers, timeout=timeout)
        # Fail on 4xx/5xx here instead of feeding an error body to the model.
        response.raise_for_status()
        return type(self)(**response.json())

    def download_export(self, VAULT_ID, API_KEY, timeout=30.0):
        """Download the result of a finished export.

        Args:
            VAULT_ID: Identifier of the vault the export belongs to.
            API_KEY: Token sent in the ``X-CDD-token`` header.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The raw ``requests`` response; its HTTP status is not checked here.

        Raises:
            ValueError: If this record's status is not "finished".
        """
        if not self.is_finished():
            raise ValueError("Export is not finished yet.")
        url = f"https://app.collaborativedrug.com/api/v1/vaults/{VAULT_ID}/exports/{self.id}"
        headers = {"X-CDD-token": API_KEY}
        return requests.get(url, headers=headers, timeout=timeout)
    
# Poll the export until it is finished, then download and show the result.
MAX_WAIT_SECONDS = 300  # give up instead of looping forever if the export stalls
POLL_INTERVAL_SECONDS = 1  # 1 second might be a lot ... who knows ...

async_response = AsyncResponse(**status_response.json())
deadline = time.monotonic() + MAX_WAIT_SECONDS
while not async_response.is_finished():
    if time.monotonic() > deadline:
        raise TimeoutError(
            f"Export {async_response.id} did not finish within {MAX_WAIT_SECONDS} seconds."
        )
    print(".... waiting for export to finish ....")
    # Sleep BEFORE re-checking so that no extra wait happens once the export
    # is reported as finished.
    time.sleep(POLL_INTERVAL_SECONDS)
    async_response = async_response.check_status(VAULT_ID, API_KEY)

export_response = async_response.download_export(VAULT_ID, API_KEY)
pprint(export_response.json())

{'count': 2,
 'objects': [{'batches': [{'batch_fields': {'Alias 1': 'TBT-033',
                                            'Current Amount': 4.88,
                                            'Date Received': '2023-07-03',
                                            'Initial Amount': 4.88,
                                            'Purity': 95.91,
                                            'Supplier ID': 'EC17466-4-P1',
                                            'Task ID': 'TALUS-20230516',
                                            'Vendor': 'Wuxi'},
                           'class': 'batch',
                           'created_at': '2023-07-13T21:45:04.000Z',
                           'formula_weight': 702.278,
                           'id': 139380758,
                           'modified_at': '2023-07-13T21:45:04.000Z',
                           'molecule_batch_identifier': 'TB-0000066-001',
                           'name': '001',
                           'owner': 'Mar

In [15]:
class MoleculeGroup(BaseModel):
    """Envelope of a multi-molecule response: a count plus the molecules."""
    count: int
    objects: List[Molecule]


# Validate the downloaded export; the model repr is displayed as cell output
MoleculeGroup(**export_response.json())

MoleculeGroup(count=2, objects=[Molecule(id=126908061, class_='molecule', created_at='2023-07-13T21:45:04.000Z', modified_at='2023-07-13T21:45:04.000Z', name='TB-0000066', synonyms=['TAL0066', 'TB-0000066'], registration_type='CHEMICAL_STRUCTURE', projects=[{'name': 'Drug Discovery Team', 'id': 25250}], collections=[{'name': 'TBXT', 'id': 554552}], owner='Margaux McBirney', smiles='CC(C)N(CC)S(=O)(=O)c1cc(Nc2ncnc3cc(OC)c(NC(=O)/C=C/CN4CCN(C(=O)OC(C)(C)C)CC4)cc23)ccc1Cl', cxsmiles='CCN(C(C)C)S(=O)(=O)C1=CC(NC2=NC=NC3=CC(OC)=C(NC(=O)/C=C/CN4CCN(C(=O)OC(C)(C)C)CC4)C=C23)=CC=C1Cl', inchi='InChI=1S/C33H44ClN7O6S/c1-8-41(22(2)3)48(44,45)29-18-23(11-12-25(29)34)37-31-24-19-27(28(46-7)20-26(24)35-21-36-31)38-30(42)10-9-13-39-14-16-40(17-15-39)32(43)47-33(4,5)6/h9-12,18-22H,8,13-17H2,1-7H3,(H,38,42)(H,35,36,37)/b10-9+', inchi_key='XKGURMHJCWLGJO-MDZDMXLPSA-N', iupac_name='tert-butyl 4-[(E)-3-(N-4-{3-[N-ethyl(isopropyl)aminosulfonyl]-4-chlorophenylamino}-7-methoxy-6-quinazolinylcarbamoyl)-2-prop