# 🗜 CLAMP demo

This notebook shows how to run a pretrained CLAMP model or train one yourself.

In [1]:
# Step up one directory so the relative ./data/... paths used later resolve
# (presumably to the repository root -- confirm where this notebook is stored).
%cd ..

/system/user/publicwork/seidl/projects/projects/clamp


In [2]:
import torch
import clamp

# Instantiate the pretrained CLAMP model on the CPU.
model = clamp.CLAMP(device='cpu')
# Evaluation mode: the Dropout layers listed in the model repr are disabled for inference.
model.eval()

# Bare last expression so the notebook displays the model's module tree.
model

PretrainedCLAMP(
  (compound_encoder): NetworkLayerNorm(
    (linear_input): Linear(in_features=8192, out_features=4096, bias=True)
    (linear_hidden_l): ModuleList(
      (0): Linear(in_features=4096, out_features=2048, bias=True)
    )
    (linear_output): Linear(in_features=2048, out_features=768, bias=True)
    (normalization_input): LayerNorm((4096,), eps=1e-05, elementwise_affine=False)
    (normalization_hidden_l): ModuleList(
      (0): LayerNorm((2048,), eps=1e-05, elementwise_affine=False)
    )
    (nonlinearity): ReLU()
    (dropout_input): Dropout(p=0.1, inplace=False)
    (dropout_hidden): Dropout(p=0.2, inplace=False)
  )
  (assay_encoder): NetworkLayerNorm(
    (linear_input): Linear(in_features=512, out_features=4096, bias=True)
    (linear_hidden_l): ModuleList(
      (0): Linear(in_features=4096, out_features=2048, bias=True)
    )
    (linear_output): Linear(in_features=2048, out_features=768, bias=True)
    (normalization_input): LayerNorm((4096,), eps=1e-05, elem

In [3]:
# Featurize a single SMILES string and inspect the shape of the result; the
# feature width should match the compound encoder's input size in the model repr.
model.encode_smiles(['CCC']).shape

torch.Size([1, 8192])

In [4]:
# Candidate molecules, given as SMILES strings.
molecules = [
    'CC1=CC(=NN1C2=CC3=CC=CC=C3C=C2)OCCN4CCOCC4',  # CID 44247568
    'CC1=CC2=C(S1)NC3=CC=CC=C3N=C2N4CCN(CC4)C',  # CID 135398745
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # aspirin
    'C=CC(=O)N',  # acrylamide
]
# A single free-text assay description to score the molecules against.
assay_descriptions = ['Inflammation reduction in humans']

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model.forward_dense(molecules, assay_descriptions)
    # Softmax along dim 0 (the molecule axis): the values are relative to the
    # other molecules in the list and sum to 1 for each assay.
    probs = torch.softmax(logits, dim=0).cpu().numpy()

# Column 0 belongs to the only assay description supplied above.
print("Mol probs for assay:", probs[:, 0])

Mol probs for assay: [0.2677869  0.28258166 0.22538778 0.22424364]


In [5]:
# Get the molecule embeddings: featurize the SMILES strings, then project them
# with the compound encoder. Run under torch.no_grad() (as the scoring cells do)
# so no autograd graph is built for this inference-only call; the result can then
# be converted to NumPy without a separate .detach().
with torch.no_grad():
    mol_encodings = model.compound_encoder(model.encode_smiles(molecules)).cpu().numpy()

# One row per molecule; the width equals the encoder's output size.
mol_encodings.shape

(4, 768)

In [6]:
# Load the three parquet tables of the HIV downstream dataset. Paths are relative
# to the current working directory.
import pandas as pd

activity_df, smiles_df, assay_df = (
    pd.read_parquet('./data/downstream/hiv/activity.parquet'),
    pd.read_parquet('./data/downstream/hiv/compound_smiles.parquet'),
    pd.read_parquet('./data/downstream/hiv/assay_names.parquet'),
)

In [7]:
# Select the SMILES column of the HIV compound table. Note that this rebinds
# `molecules` from a plain list of strings to a pandas Series.
molecules = smiles_df.CanonicalSMILES
# Disabled: embedding the whole column in a single call
# (presumably too slow/heavy for a demo -- confirm before enabling).
#mol_encodings = model.compound_encoder(model.encode_smiles(molecules)).detach().cpu().numpy()

In [8]:
# Display the SMILES column of the HIV compound table (pandas truncates the view).
smiles_df['CanonicalSMILES']

0                               CCOP(=O)(Nc1cccc(Cl)c1)OCC
1                                          O=C(O)c1ccccc1O
2                                      CCOP(N)(=O)c1ccccc1
3                                      NNP(=S)(NN)c1ccccc1
4                                  Nc1c(Cl)cc(Cl)cc1C(=O)O
                               ...                        
41122                   O=P(Nc1ccccc1)(Nc1ccccc1)Nc1ccccc1
41123                CC1=C2C(=COC(C)C2C)C(O)=C(C(=O)O)C1=O
41124                     CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21
41125    C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...
41126    CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...
Name: CanonicalSMILES, Length: 41127, dtype: object

In [9]:
# Display the free-text description(s) stored in the assay table's
# `molnet_paper_description` column as a NumPy array.
assay_df['molnet_paper_description'].values

array([' HIV. The HIV dataset was introduced by the Drug Therapeutics Program (DTP) AIDS Antiviral Screen, which tested the ability to inhibit HIV replication for over 40[thin space (1/6-em)]000 compounds.47 Screening results were evaluated and placed into three categories: confirmed inactive (CI), confirmed active (CA) and confirmed moderately active (CM). We further combine the latter two labels, making it a classification task between inactive (CI) and active (CA and CM). As we are more interested in discover new categories of HIV inhibitors, scaffold splitting (introduced in the next subsection) is recommended for this dataset.'],
      dtype=object)

In [10]:
# Zero-shot scoring of a few molecules against an HIV replication-inhibition assay.
# This cell reuses `torch` and the pretrained, eval-mode `model` created at the top
# of the notebook rather than re-importing the packages and reloading the model.
molecules = [
    'CCOP(=O)(Nc1cccc(Cl)c1)OCC',  # inactive
    'O=C(O)c1ccccc1O',  # inactive
    'NNP(=S)(NN)c1ccccc1',  # active
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # aspirin
]
assay_descriptions = [
    'HIV: Experimentally measured abilities to inhibit HIV replication.',
]

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model.forward_dense(molecules, assay_descriptions)
    # Softmax along dim 0 (the molecule axis): the values are relative to the
    # other molecules in the list and sum to 1 for each assay.
    probs = logits.softmax(dim=0).cpu().numpy()

# Column 0 belongs to the only assay description supplied above.
print("Mol probs for assay:", probs[:,0])

Mol probs for assay: [0.2582146  0.23540357 0.26953387 0.236848  ]
