In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.9/34.9 MB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.6


In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

import ast

from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
from rdkit.Chem import AllChem, MACCSkeys, RDKFingerprint, LayeredFingerprint

In [None]:
# Load the dataset stored in df_balanced.csv and preview its first two rows.
# NOTE(review): the absolute path presumably points at a mounted Google Drive
# (Colab) — confirm, and consider a configurable data directory.
df = pd.read_csv('/content/drive/MyDrive/Dataframes/df_balanced.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,subset,docking_score,pdb_id,zinc_id,smiles
0,0,test,-10.467345,5EK0,ZINC001686499439,CO[C@@H](CNC(=O)NCCC[C@@H]1CCCN1C(=O)OC(C)(C)C...
1,1,validation,-9.679859,5MZJ,ZINC001422246376,C[C@H](O)CN(C)C[C@H]1CCCN1C(=O)CCc1nccs1


### Lipinski

In [None]:
def calculate_lipinski_descriptors(smiles):
    """Compute four Lipinski-style descriptors for a single SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    list
        ``[molecular weight, logP, H-bond donor count, H-bond acceptor count]``
        in that order. When ``Chem.MolFromSmiles`` cannot build a molecule
        (it returns ``None``), a list of four ``np.nan`` values is returned
        instead so that every input still yields exactly four entries.
    """
    molecule = Chem.MolFromSmiles(smiles)

    # Guard clause: unparseable input yields a NaN row of the same width.
    if molecule is None:
        return [np.nan for _ in range(4)]

    weight = Descriptors.MolWt(molecule)
    log_p = Descriptors.MolLogP(molecule)
    donors = Lipinski.NumHDonors(molecule)
    acceptors = Lipinski.NumHAcceptors(molecule)
    return [weight, log_p, donors, acceptors]

# Column labels, in the same order as the values returned by
# calculate_lipinski_descriptors.
column_names = ['molecular_weight', 'logP', 'numH_donors', 'numH_acceptors']

# One four-element descriptor row per SMILES string in df.
lipinski_descriptors = [
    calculate_lipinski_descriptors(smiles_string) for smiles_string in df['smiles']
]
result_df = pd.DataFrame(lipinski_descriptors, columns=column_names)

# Place the descriptor columns next to the original columns and preview.
df_lipinski = pd.concat((df, result_df), axis='columns')
df_lipinski.head(2)

Unnamed: 0.1,Unnamed: 0,interval,subset,docking_score,pdb_id,zinc_id,smiles,molecular_weight,logP,numH_donors,numH_acceptors
0,0,"(-15.0, -14.0]",validation,-14.881847,6IIU,ZINC001129722346,C#C[C@@H](NC(=O)[C@@H]1CCCN(c2nc3ccccc3s2)C1)c...,409.942,4.6568,1,4
1,1,"(-15.0, -14.0]",validation,-14.196672,6IIU,ZINC001600492567,Cc1ccc2c(CN3[C@@H]4C[C@H](C(=O)O)O[C@H]4CC[C@H...,357.406,2.69642,1,5


In [None]:
# Display the number of rows in the descriptor-augmented frame.
len(df_lipinski)

1200000

In [None]:
# Write the descriptor-augmented frame to CSV; index=False keeps the row index
# out of the file.
df_lipinski.to_csv('/content/drive/MyDrive/Dataframes/df_lipinski.csv', index=False)

### Fingerprints

In [None]:
def calculate_fingerprints(smiles_list):
    """Compute three bit-vector fingerprints for every SMILES string.

    For each molecule the following are generated and converted to plain
    Python lists:

    * a Morgan fingerprint (radius 2, 1024 bits),
    * MACCS keys,
    * an RDKit fingerprint (``maxPath=5``, ``fpSize=1024``).

    Parameters
    ----------
    smiles_list : iterable of str
        SMILES strings to process, e.g. a DataFrame column.

    Returns
    -------
    tuple of (list, list, list)
        ``(extended_connectivity_fps, maccs_keys_fps, rdkit_fps)``. Each list
        has exactly one entry per input SMILES, in input order. For a SMILES
        that RDKit cannot parse, the entry is ``None`` in all three lists.

    Notes
    -----
    Unparseable SMILES used to be skipped entirely, which made the returned
    lists shorter than the input. Any caller that pairs the result with the
    source rows by position (e.g. ``pd.concat(..., axis=1)``) then attached
    fingerprints to the wrong molecules. Keeping a ``None`` placeholder
    preserves the one-to-one alignment.
    """
    extended_connectivity_fps = []
    maccs_keys_fps = []
    rdkit_fps = []

    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)

        if mol is None:
            print(f"Unable to process SMILES: {smiles}")
            # Placeholder keeps the outputs index-aligned with the input.
            extended_connectivity_fps.append(None)
            maccs_keys_fps.append(None)
            rdkit_fps.append(None)
            continue

        # NOTE(review): recent RDKit releases flag GetMorganFingerprintAsBitVect
        # as deprecated in favour of rdFingerprintGenerator — consider
        # migrating once the RDKit version is pinned.
        extended_connectivity_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=1024)
        maccs_keys_fp = MACCSkeys.GenMACCSKeys(mol)
        rdkit_fp = RDKFingerprint(mol, maxPath=5, fpSize=1024)

        extended_connectivity_fps.append(list(extended_connectivity_fp))
        maccs_keys_fps.append(list(maccs_keys_fp))
        rdkit_fps.append(list(rdkit_fp))

    return extended_connectivity_fps, maccs_keys_fps, rdkit_fps

# Compute the three fingerprint lists for every SMILES string in df.
extended_connectivity_fps, maccs_keys_fps, rdkit_fps = calculate_fingerprints(df['smiles'])

# One list-valued column per fingerprint type.
result_df = pd.DataFrame(dict(
    extended_connectivity_fps=extended_connectivity_fps,
    maccs_keys_fps=maccs_keys_fps,
    rdkit_fps=rdkit_fps,
))

# Place the fingerprint columns next to the original columns and preview.
df_fingerprints = pd.concat((df, result_df), axis='columns')
df_fingerprints.head(2)

Unnamed: 0.1,Unnamed: 0,interval,subset,docking_score,pdb_id,zinc_id,smiles,extended_connectivity_fps,maccs_keys_fps,rdkit_fps
0,0,"(-15.0, -14.0]",validation,-14.881847,6IIU,ZINC001129722346,C#C[C@@H](NC(=O)[C@@H]1CCCN(c2nc3ccccc3s2)C1)c...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, ..."
1,1,"(-15.0, -14.0]",validation,-14.196672,6IIU,ZINC001600492567,Cc1ccc2c(CN3[C@@H]4C[C@H](C(=O)O)O[C@H]4CC[C@H...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [None]:
# Display the number of rows in the fingerprint-augmented frame.
len(df_fingerprints)

1200000

In [None]:
# Write the fingerprint frame to CSV (index=False keeps the row index out of the
# file). The list-valued fingerprint columns are stored as text, which is why
# they are parsed back with ast.literal_eval after the file is reloaded.
df_fingerprints.to_csv('/content/drive/MyDrive/Dataframes/df_fingerprints.csv', index=False)

In [None]:
# Reload the fingerprint frame from CSV and preview it. After this read the
# fingerprint columns hold strings such as "[0, 1, 0, ...]", not Python lists.
df_fingerprints = pd.read_csv('/content/drive/MyDrive/Dataframes/df_fingerprints.csv')
df_fingerprints.head(2)

Unnamed: 0.1,Unnamed: 0,interval,subset,docking_score,pdb_id,zinc_id,smiles,extended_connectivity_fps,maccs_keys_fps,rdkit_fps
0,0,"(-15.0, -14.0]",validation,-14.881847,6IIU,ZINC001129722346,C#C[C@@H](NC(=O)[C@@H]1CCCN(c2nc3ccccc3s2)C1)c...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, ..."
1,1,"(-15.0, -14.0]",validation,-14.196672,6IIU,ZINC001600492567,Cc1ccc2c(CN3[C@@H]4C[C@H](C(=O)O)O[C@H]4CC[C@H...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [None]:
# The CSV round trip stored each fingerprint as text; parse every value back
# into a Python list with ast.literal_eval.
df_fingerprints = df_fingerprints.assign(
    extended_connectivity_fps=df_fingerprints['extended_connectivity_fps'].map(ast.literal_eval),
    maccs_keys_fps=df_fingerprints['maccs_keys_fps'].map(ast.literal_eval),
    rdkit_fps=df_fingerprints['rdkit_fps'].map(ast.literal_eval),
)

# Pickle keeps the list objects intact, so no re-parsing is needed on reload.
df_fingerprints.to_pickle("/content/drive/MyDrive/Dataframes/df_fingerprints.pkl")

In [None]:
# Reload the parsed fingerprint frame from its pickle and preview it.
# Unpickling can execute arbitrary code, so only load pickle files that were
# produced by a trusted source (here: the to_pickle call in this notebook).
df_fingerprints = pd.read_pickle('/content/drive/MyDrive/Dataframes/df_fingerprints.pkl')
df_fingerprints.head(2)

Unnamed: 0.1,Unnamed: 0,interval,subset,docking_score,pdb_id,zinc_id,smiles,extended_connectivity_fps,maccs_keys_fps,rdkit_fps
0,0,"(-15.0, -14.0]",validation,-14.881847,6IIU,ZINC001129722346,C#C[C@@H](NC(=O)[C@@H]1CCCN(c2nc3ccccc3s2)C1)c...,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, ..."
1,1,"(-15.0, -14.0]",validation,-14.196672,6IIU,ZINC001600492567,Cc1ccc2c(CN3[C@@H]4C[C@H](C(=O)O)O[C@H]4CC[C@H...,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [None]:
# Columns to keep for the reduced frame: the docking score plus the chosen
# fingerprint(s). Exactly one assignment below should be active at a time.
#columns_to_read_fp = ['docking_score', 'extended_connectivity_fps', 'maccs_keys_fps', 'rdkit_fps']
columns_to_read_fp = ['docking_score', 'extended_connectivity_fps']
#columns_to_read_fp = ['docking_score', 'maccs_keys_fps']
#columns_to_read_fp = ['docking_score', 'rdkit_fps']

# Independent copy, so later column additions do not touch df_fingerprints.
df_fingerprints_opt = df_fingerprints.loc[:, columns_to_read_fp].copy()

In [None]:
# Copy the receptor column(s) named in columns_to_read_rec from df_receptors
# into the reduced fingerprint frame, then preview the result.
# NOTE(review): df_receptors is not defined anywhere in the visible part of this
# notebook, so this cell raises NameError on a fresh kernel unless the frame is
# created elsewhere — TODO confirm its source and load it explicitly.
# NOTE(review): presumably df_receptors is row-aligned with df_fingerprints
# (same index, same order) — verify before trusting the combined rows.
columns_to_read_rec = ['encoded_seq']
df_fingerprints_opt[columns_to_read_rec] = df_receptors[columns_to_read_rec].copy()

df_fingerprints_opt.head(2)

Unnamed: 0,docking_score,rdkit_fps,encoded_seq
0,-14.881847,"[1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, ...","[3, 20, 9, 3, 3, 3, 3, 6, 1, 13, 1, 3, 10, 4, ..."
1,-14.196672,"[0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...","[3, 20, 9, 3, 3, 3, 3, 6, 1, 13, 1, 3, 10, 4, ..."


In [None]:
# Pickle the reduced frame. The active file name has to be switched by hand so
# that it matches the fingerprint chosen in columns_to_read_fp; the commented
# lines are the targets for the other fingerprint types.
df_fingerprints_opt.to_pickle("/content/drive/MyDrive/Dataframes/df_fingerprints_extended_connectivity_opt.pkl")
#df_fingerprints_opt.to_pickle("/content/drive/MyDrive/Dataframes/df_fingerprints_maccs_keys_opt.pkl")
#df_fingerprints_opt.to_pickle("/content/drive/MyDrive/Dataframes/df_fingerprints_rdkit_opt.pkl")

### Graph

In [None]:
def extract_graph_features(smiles):
    """Build a simple graph description of a molecule from its SMILES string.

    Parameters
    ----------
    smiles : str
        SMILES representation of the molecule.

    Returns
    -------
    tuple or None
        ``(node_features, edge_features, adjacency_matrix)`` where

        * ``node_features`` is a NumPy array with the atomic number of every
          atom, in ``mol.GetAtoms()`` order,
        * ``edge_features`` is a NumPy array with the bond order of every
          bond (``GetBondTypeAsDouble``), in ``mol.GetBonds()`` order,
        * ``adjacency_matrix`` is the result of ``Chem.GetAdjacencyMatrix``.

        ``None`` is returned when ``Chem.MolFromSmiles`` cannot build a
        molecule from ``smiles``.

    Notes
    -----
    ``edge_features`` does not record which atoms each bond connects; it only
    lists bond orders in bond-iteration order.
    """
    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        return None

    # The former ``num_atoms = mol.GetNumAtoms()`` local was never read, so it
    # has been dropped.
    node_features = np.array([atom.GetAtomicNum() for atom in mol.GetAtoms()])
    edge_features = np.array([bond.GetBondTypeAsDouble() for bond in mol.GetBonds()])
    adjacency_matrix = Chem.GetAdjacencyMatrix(mol)

    return node_features, edge_features, adjacency_matrix

# Build one entry per row of df for each graph feature, then attach them as
# list-valued columns of a copy of df.
node_features_list, edge_features_list, adjacency_matrix_list = [], [], []

df_graphs = df.copy()

for smiles in df_graphs['smiles']:
    features = extract_graph_features(smiles)
    if features is None:
        # Keep a placeholder for unparseable SMILES. Skipping the row would
        # make the lists shorter than df_graphs and the column assignments
        # below would fail with a length-mismatch ValueError.
        node_features_list.append(None)
        edge_features_list.append(None)
        adjacency_matrix_list.append(None)
        continue

    # .tolist() yields plain Python numbers. list(ndarray) would keep NumPy
    # scalars, whose repr under NumPy >= 2 (e.g. "np.int64(6)") cannot be
    # parsed by ast.literal_eval after the CSV round trip.
    node_features_list.append(features[0].tolist())
    edge_features_list.append(features[1].tolist())
    adjacency_matrix_list.append(features[2].tolist())

df_graphs['node_features'] = node_features_list
df_graphs['edge_features'] = edge_features_list
df_graphs['adjacency_matrix'] = adjacency_matrix_list

df_graphs.head(2)

Unnamed: 0.1,Unnamed: 0,subset,docking_score,pdb_id,zinc_id,smiles,node_features,edge_features,adjacency_matrix
0,0,test,-10.467345,5EK0,ZINC001686499439,CO[C@@H](CNC(=O)NCCC[C@@H]1CCCN1C(=O)OC(C)(C)C...,"[6, 8, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 6, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,1,validation,-9.679859,5MZJ,ZINC001422246376,C[C@H](O)CN(C)C[C@H]1CCCN1C(=O)CCc1nccs1,"[6, 6, 8, 6, 7, 6, 6, 6, 6, 6, 6, 7, 6, 8, 6, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# Write the graph-feature frame to CSV (index=False keeps the row index out of
# the file). The list-valued columns are stored as text and are parsed back
# with ast.literal_eval after the file is reloaded.
df_graphs.to_csv('/content/drive/MyDrive/Dataframes/df_graphs.csv', index=False)

In [None]:
# Reload the graph-feature frame from CSV and preview it. After this read the
# node/edge/adjacency columns hold strings, not Python lists.
df_graphs = pd.read_csv('/content/drive/MyDrive/Dataframes/df_graphs.csv')
df_graphs.head(2)

Unnamed: 0.1,Unnamed: 0,subset,docking_score,pdb_id,zinc_id,smiles,node_features,edge_features,adjacency_matrix
0,0,test,-10.467345,5EK0,ZINC001686499439,CO[C@@H](CNC(=O)NCCC[C@@H]1CCCN1C(=O)OC(C)(C)C...,"[6, 8, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 6, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,1,validation,-9.679859,5MZJ,ZINC001422246376,C[C@H](O)CN(C)C[C@H]1CCCN1C(=O)CCc1nccs1,"[6, 6, 8, 6, 7, 6, 6, 6, 6, 6, 6, 7, 6, 8, 6, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# The CSV round trip stored each graph feature as text; parse every value back
# into (nested) Python lists with ast.literal_eval.
df_graphs = df_graphs.assign(
    node_features=df_graphs['node_features'].map(ast.literal_eval),
    edge_features=df_graphs['edge_features'].map(ast.literal_eval),
    adjacency_matrix=df_graphs['adjacency_matrix'].map(ast.literal_eval),
)

df_graphs.head(2)

Unnamed: 0.1,Unnamed: 0,subset,docking_score,pdb_id,zinc_id,smiles,node_features,edge_features,adjacency_matrix
0,0,test,-10.467345,5EK0,ZINC001686499439,CO[C@@H](CNC(=O)NCCC[C@@H]1CCCN1C(=O)OC(C)(C)C...,"[6, 8, 6, 6, 7, 6, 8, 7, 6, 6, 6, 6, 6, 6, 6, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
1,1,validation,-9.679859,5MZJ,ZINC001422246376,C[C@H](O)CN(C)C[C@H]1CCCN1C(=O)CCc1nccs1,"[6, 6, 8, 6, 7, 6, 6, 6, 6, 6, 6, 7, 6, 8, 6, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."


In [None]:
# Pickle the parsed graph-feature frame so the list-valued columns can be
# reloaded without re-parsing.
df_graphs.to_pickle("/content/drive/MyDrive/Dataframes/df_graphs.pkl")