In [2]:
import os
import pickle
import sys
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import AllChem

from external.gdsc_utils.GDSC_utils import get_demorgen_fingerprints

# Set seaborn's global plotting theme to the "white" style.
sns.set_theme(style="white")

In [6]:
!conda env list 
!conda activate master-thesis-log

# conda environments:
#
base                     /users/cwoest/Applications/anaconda3
gdsctools_env            /users/cwoest/Applications/anaconda3/envs/gdsctools_env
master-thesis-log     *  /users/cwoest/Applications/anaconda3/envs/master-thesis-log
r-env                    /users/cwoest/Applications/anaconda3/envs/r-env


CommandNotFoundError: Your shell has not been properly configured to use 'conda activate'.
To initialize your shell, run

    $ conda init <SHELL_NAME>

Currently supported shells are:
  - bash
  - fish
  - tcsh
  - xonsh
  - zsh
  - powershell

See 'conda init --help' for more information and options.

IMPORTANT: You may need to close and restart your shell after running 'conda init'.




Run the following if `rdkit` is not installed.
```bash
    !conda install -c rdkit rdkit
```

In [10]:
# Show the current working directory; the relative `../../datasets/...` paths
# used throughout this notebook are resolved against it.
!pwd
# List (with file details) every CSV below the GDSC dataset folder whose name
# contains "smiles".
!find ../../datasets/gdsc -name '*smiles*.csv' -ls

/Users/cwoest/Documents/Academics/Data_Science_UP/master_thesis/material/GNN-material
37474177       88 -rw-r--r--    1 cwoest           staff               41866 Apr 28 16:31 ../../datasets/gdsc/GDSC_compounds_inchi_key_with_smiles.csv


__Background__: 
  Conventionally, molecular fingerprints (numerical representations of molecules) are calculated through rule-based algorithms that map molecules to a sparse discrete space.
However, these algorithms perform poorly for shallow prediction models or small datasets. To address this issue, we present SMILES Transformer. Inspired by Transformer and pre-trained language models from natural language processing, SMILES Transformer learns molecular fingerprints through unsupervised pre-training of the sequence-to-sequence language model using a huge corpus of SMILES, a text representation system for molecules.

__Conclusion__: 
- fingerprint = numerical representation of molecules (via SMILES)

In [3]:
# Read the pickled GDSC base table, then show its dimensions and first rows.
PATH_TO_SAVE_DATA_TO = '../../datasets/gdsc/my_datasets/'
gdsc_base = pd.read_pickle(PATH_TO_SAVE_DATA_TO + 'gdsc_base.pkl')
print(gdsc_base.shape)
gdsc_base.head(3)

(446521, 14)


Unnamed: 0,DRUG_ID,CELL_LINE_NAME,AUC,CELL_ID,LN_IC50,CONC,MASTER_CELL_ID,INTENSITY,DATASET,Z_SCORE,DRUG_NAME,RMSE,COSMIC_ID,POSITION
0,1,MC-CAR,0.982114,3137,2.395685,2.0,49,544404,GDSC1,-0.189576,Erlotinib,0.022521,683665,14
9,1,ES3,0.984816,2366,3.140923,2.0,1342,404197,GDSC1,0.508635,Erlotinib,0.03184,684055,14
27,1,ES5,0.985693,2368,3.968757,2.0,610,797378,GDSC1,1.284229,Erlotinib,0.026052,684057,14


In [4]:
# Collect the distinct (sorted) drug names of the base table and report how many there are.
uniq_drug_names = np.unique(gdsc_base['DRUG_NAME'])
print(
    "\n"
    f"    There are {uniq_drug_names.size} different DRUG_NAME's in the GDSC base table.\n"
)


    There are 449 different DRUG_NAME's in the GDSC base table.



The required dataset is located at the path given below, and the code used to compute the fingerprints is taken from [this repository](https://github.com/PascalIversen/GDSC_utils/blob/main/GDSC_utils.py#L50-L105).

In [5]:
# Folder and file name of the CSV that holds the SMILES string per drug name.
PATH_TO_GDSC_DATA = '../../datasets/gdsc/'
SMILES_FILE = 'GDSC_compounds_inchi_key_with_smiles.csv'

# Time the import so the cost of reading the file is visible in the output.
start = time.time()
smiles_data = pd.read_csv(PATH_TO_GDSC_DATA + SMILES_FILE, sep=",", header=0)
print(
    f"File `{SMILES_FILE}` took {time.time()-start:.5f} seconds to import. \n"
    f"Shape: {smiles_data.shape}"
)
smiles_data.head(3)

File `GDSC_compounds_inchi_key_with_smiles.csv` took 0.00679 seconds to import. 
Shape: (425, 4)


Unnamed: 0.1,Unnamed: 0,drug_name,inchi_key,smiles
0,0,(5Z)-7-Oxozeaenol,NEQZWEXWOFPKOT-BYRRXHGESA-N,C[C@H]1CC=CC(=O)[C@H]([C@H](CC=Cc2cc(cc(c2C(=O...
1,1,5-Fluorouracil,GHASVSINZRGABV-UHFFFAOYSA-N,c1c(c(nc(n1)O)O)F
2,2,A-443654,YWTBGJGMTBHQTM-IBGZPJMESA-N,Cc1c2cc(ccc2n[nH]1)c1cc(cnc1)OC[C@H](Cc1c[nH]c...


In [7]:
# Peek at the first unique drug name (`np.unique` returns the names sorted).
uniq_drug_names[0]

'(5Z)-7-Oxozeaenol'

In [8]:
# `get_demorgen_fingerprints` is imported with the other dependencies in the
# first cell of the notebook.
# It returns one entry per drug name: a fingerprint array, or None when no
# fingerprint could be obtained for that drug.
fps = get_demorgen_fingerprints(
    drugs=uniq_drug_names,
    n_bits=256,  # length of each fingerprint vector
    path_drug_smiles=f'{PATH_TO_GDSC_DATA}{SMILES_FILE}'
)

# Inspect the first fingerprint: its shape and its raw bit vector.
print(fps[0].shape)
fps[0]

(256,)


array([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0])

In [10]:
# Summarise the fingerprint computation: vector shape, number of drugs and
# how many drugs ended up without a fingerprint (entries that are None).
n_missing_fps = sum(fp is None for fp in fps)
print(
    "\n"
    f"    Each fingerprint has a shape of {fps[0].shape}.\n"
    f"    There are {len(uniq_drug_names)} unique DRUG_NAME's in the GDSC table. Each will get a fingerprint.\n"
    f"    {n_missing_fps} ({100*n_missing_fps/len(fps):2.2f}%) out of {len(fps)} fingerprints are None.\n"
    "        - Thus, they didn't get a fingerprint.\n"
    "        - They are None if the fingerprint is inaccessible.\n"
)


    Each fingerprint has a shape of (256,).
    There are 449 unique DRUG_NAME's in the GDSC table. Each will get a fingerprint.
    82 (18.26%) out of 449 fingerprints are None.
        - Thus, they didn't get a fingerprint.
        - They are None if the fingerprint is inaccessible.



In [11]:
# Append the fingerprints to the corresponding drug names. 
drug_name_fps = {uniq_drug_name: fps[i] for i, uniq_drug_name in enumerate(uniq_drug_names)}
assert len(uniq_drug_names) == len(uniq_drug_names) == len(fps)

In [14]:
# Number of entries in the mapping; expected to equal the number of unique drug names.
len(drug_name_fps)

449

In [13]:
# Preview only the first two entries: displaying the whole dictionary would
# flood the notebook with one array per drug.
dict(list(drug_name_fps.items())[:2])

{'(5Z)-7-Oxozeaenol': array([1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0]),
 '5-Fluorouracil': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0

Stacked row-wise, the fingerprints would correspond to a $449 \times 256$ matrix, with each row containing the fingerprint of one drug.

The dictionary `drug_name_fps` now consists of the following:
- _Keys_: the `DRUG_NAME`s
- _Values_: the corresponding fingerprints computed from the drugs' SMILES strings (`None` if no fingerprint is available)

In [67]:
# Persist the DRUG_NAME -> fingerprint dictionary to disk.
# (`pickle` is imported together with the other dependencies in the first cell.)
with open(f'{PATH_TO_SAVE_DATA_TO}drug_name_fingerprints.pkl', 'wb') as f:
    pickle.dump(drug_name_fps, f)

In [95]:
# Round-trip check: reload the pickle written above and verify that it matches
# the in-memory dictionary.
# NOTE: only unpickle files you created yourself; `pickle.load` can execute
# arbitrary code when given an untrusted file.
with open(f'{PATH_TO_SAVE_DATA_TO}drug_name_fingerprints.pkl', 'rb') as f:
    loaded_drug_name_fps = pickle.load(f)

# Same set of drug names ...
assert loaded_drug_name_fps.keys() == drug_name_fps.keys()
# ... and the same fingerprint per drug. Values are either None or a NumPy array,
# so they are compared entry by entry (`np.array_equal` on the `.values()` views
# would not compare the arrays).
assert all(
    (fp is None and loaded_drug_name_fps[name] is None)
    or (fp is not None and np.array_equal(loaded_drug_name_fps[name], fp))
    for name, fp in drug_name_fps.items()
), "Reloaded fingerprints differ from the in-memory fingerprints."