In [1]:
import pandas as pd
from rdkit import Chem
import numpy as np
from src.data.descriptors.rdNormalizedDescriptors import RDKit2DNormalized
from src.data.featurizer import smiles_to_graph_tune
from scipy import sparse as sp
from multiprocessing import Pool
import dgl.backend as F
from dgl.data.utils import load_graphs
from dgl.data.utils import save_graphs
from dgllife.utils.io import pmap
import torch
from torch.utils.data import DataLoader
from src.utils import set_random_seed
from src.data.featurizer import Vocab, N_ATOM_TYPES, N_BOND_TYPES
from src.data.finetune_dataset import MoleculeDataset
from src.data.collator import Collator_tune
from src.model.light import LiGhTPredictor as LiGhT
from src.model_config import config_dict



# 1. Data Preparation

## 1.1 Load SMILES file

In [None]:
# Read the raw table and keep its 'smiles' column as a plain Python list;
# both names are reused by the data-preparation cells below.
df = pd.read_csv('datasets/smiles/ori_smiles.csv')
smiless = df['smiles'].values.tolist()

## 1.2 Generate molecular graph file

In [13]:
# Number of parallel workers handed to pmap.
n_jobs = 8
# Every column other than 'smiles' is used as a label column.
task_names = df.columns.drop(['smiles']).tolist()
print('constructing graphs')
graphs = pmap(
    smiles_to_graph_tune,
    smiless,
    max_length=5,
    n_virtual_nodes=2,
    n_jobs=n_jobs,
)
# Partition row positions by whether a graph was produced for the row.
# Index without smiles (as recorded for this dataset): [935, 950, 975, 990, 1039, 1080]
valid_ids = [idx for idx, graph in enumerate(graphs) if graph is not None]
valid_graphs = [graph for graph in graphs if graph is not None]
none_smiles_index = [idx for idx, graph in enumerate(graphs) if graph is None]
# Keep only the label rows that belong to a successfully built graph.
_label_values = df[task_names].values
label_tensor = F.zerocopy_from_numpy(_label_values.astype(np.float32))
labels = label_tensor[valid_ids]
print('saving graphs')
save_graphs('datasets/smiles/smiles_5.pkl', valid_graphs, labels={'labels': labels})

constructing graphs


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 1184 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1222 out of 1237 | elapsed:    0.9s remaining:    0.0s
[Parallel(n_jobs=8)]: Done 1237 out of 1237 | elapsed:    0.9s finished


saving graphs


In [14]:
# Reload the saved graph file to sanity-check what was written.
graphs, label_dict = load_graphs('datasets/smiles/smiles_5.pkl')

print(f"Number of graphs: {len(graphs)}")

# Show the schema of the first graph, when at least one was loaded.
if graphs:
    first_graph = graphs[0]
    print(first_graph)

# Show the label tensors stored next to the graphs.
print(label_dict)

Number of graphs: 1231
Graph(num_nodes=31, num_edges=563,
      ndata_schemes={'begin_end': Scheme(shape=(2, 137), dtype=torch.float32), 'vavn': Scheme(shape=(), dtype=torch.int64), 'edge': Scheme(shape=(14,), dtype=torch.float32)}
      edata_schemes={'path': Scheme(shape=(5,), dtype=torch.int64), 'lgp': Scheme(shape=(), dtype=torch.uint8), 'mgp': Scheme(shape=(), dtype=torch.uint8), 'vp': Scheme(shape=(), dtype=torch.uint8), 'sl': Scheme(shape=(), dtype=torch.uint8)})
{'labels': tensor([], size=(1231, 0))}


## 1.3 Generate fingerprint file 

In [15]:
print('extracting fingerprints')
FP_list = []
for smiles in smiless:
    # Rows holding the placeholder string 'None' have no molecule, so they
    # contribute no fingerprint row. Appending only inside this guard keeps
    # the previous molecule's fingerprint from being stored a second time
    # for such rows (and avoids a NameError when the first row is 'None').
    if smiles == 'None':
        continue
    mol = Chem.MolFromSmiles(smiles, sanitize=False)
    fp = list(Chem.RDKFingerprint(mol, minPath=1, maxPath=7, fpSize=512))
    FP_list.append(fp)

# Stack the bit vectors and store them as a compressed sparse matrix.
FP_arr = np.array(FP_list)
FP_sp_mat = sp.csc_matrix(FP_arr)
print('saving fingerprints')
sp.save_npz('datasets/smiles/rdkfp1-7_512.npz', FP_sp_mat)

extracting fingerprints
saving fingerprints


## 1.4 Generate molecular descriptor file

In [16]:
print('extracting molecular descriptors')
n_jobs = 8
generator = RDKit2DNormalized()
# 201-dimension vector, the first dimension is a boolean value, and the next 200 dimensions are floating number.
# The pool is used as a context manager so its worker processes are always
# released. The results must be materialised inside the block because
# leaving it terminates the workers.
with Pool(n_jobs) as pool:
    features_list = list(pool.imap(generator.process, smiless))
# Handle missing values: drop the entries for which no descriptor vector was
# returned. Their positions can be recovered with
# [i for i, x in enumerate(features_list) if x is None].
features = [feature for feature in features_list if feature is not None]
arr = np.array(features)
# Column 0 is dropped; only the remaining columns are saved under the key 'md'.
np.savez_compressed('datasets/smiles/molecular_descriptors.npz',md=arr[:,1:])

extracting molecular descriptors


# 2. Extract Embeddings

## 2.1 Load data

In [None]:
# Seed before building anything else (argument meaning is defined in
# src.utils.set_random_seed).
set_random_seed(22, 1)
config = config_dict['base']
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
vocab = Vocab(N_ATOM_TYPES, N_BOND_TYPES)
collator = Collator_tune(config['path_length'])
mol_dataset = MoleculeDataset(root_path='datasets', dataset='smiles', dataset_type=None)
# shuffle=False and drop_last=False: batches follow dataset order and no
# sample is dropped.
loader = DataLoader(
    mol_dataset,
    batch_size=32,
    shuffle=False,
    num_workers=4,
    drop_last=False,
    collate_fn=collator,
)

## 2.2 Load pre-trained KPGT model

In [None]:
# Build the network from the selected config; all dropout rates are set to 0.
model = LiGhT(
    d_node_feats=config['d_node_feats'],
    d_edge_feats=config['d_edge_feats'],
    d_g_feats=config['d_g_feats'],
    d_hpath_ratio=config['d_hpath_ratio'],
    n_mol_layers=config['n_mol_layers'],
    path_length=config['path_length'],
    n_heads=config['n_heads'],
    n_ffn_dense_layers=config['n_ffn_dense_layers'],
    input_drop=0,
    attn_drop=0,
    feat_drop=0,
    n_node_types=vocab.vocab_size
    ).to(device)

# map_location remaps the stored tensors onto `device`, so a checkpoint that
# was saved from a GPU can still be loaded when this notebook falls back to CPU.
checkpoint = torch.load('models/KPGT/base.pth', map_location=device)
# Strip the 'module.' substring from the parameter names (presumably left by a
# DataParallel-style wrapper at save time - confirm) so they match this model.
model.load_state_dict({k.replace('module.',''):v for k,v in checkpoint.items()})

## 2.3 Generate embeddings

In [None]:
# One embedding (as a list of floats) per molecule, in loader order.
fps_list = []
# Nothing is back-propagated here, so autograd is disabled to avoid building
# graphs and holding activations for every batch.
with torch.no_grad():
    for batched_data in loader:
        # Each batch is a 5-tuple; only the graph, fingerprint and descriptor
        # entries are needed to compute the embeddings.
        (_, g, ecfp, md, batch_labels) = batched_data
        ecfp = ecfp.to(device)
        md = md.to(device)
        g = g.to(device)
        fps = model.generate_fps(g, ecfp, md)
        fps_list.extend(fps.detach().cpu().numpy().tolist())

## 2.4 Randomly generate embeddings for components without SMILES

In [None]:
# These are the indices of the data without SMILES
none_indices = [935, 950, 975, 990, 1039, 1080]

# Use the width of the embeddings actually produced, so the placeholder rows
# always stack with them into a rectangular array; 2304 is only the fallback
# for an empty list.
embedding_dim = len(fps_list[0]) if fps_list else 2304

# Generate random embeddings for the data without SMILES
random_fps = np.random.normal(size=(len(none_indices), embedding_dim)).tolist()

# Insert the random embeddings into the fps_list at the correct indices.
# Insertion goes in ascending index order so that each earlier insertion has
# already shifted the list by the time a later index is used.
# NOTE: this mutates fps_list in place; running the cell twice inserts the
# placeholder rows twice.
for idx, random_fp in zip(sorted(none_indices), random_fps):
    fps_list.insert(idx, random_fp)

## 2.5 Generate embeddings files (.npz and .csv)

In [None]:
# Stack the per-molecule embeddings and store them under the key 'fps'.
embedding_matrix = np.array(fps_list)
np.savez_compressed('datasets/smiles/kpgt_embeddings.npz', fps=embedding_matrix)
print("The extracted features were saving at 'datasets/smiles/kpgt_embeddings.npz'")

In [None]:
# We also provide embeddings files in ".csv" format
# The archive is opened as a context manager so its file handle is closed as
# soon as the 'fps' array has been read into memory.
with np.load('datasets/smiles/kpgt_embeddings.npz') as ac:
    arr = ac['fps']
# NOTE: `arr` and `df` rebind names that earlier cells use for the descriptor
# array and the raw SMILES table; re-run those cells before reusing them.
df = pd.DataFrame(arr)
df.to_csv('datasets/smiles/smiles_embeddings.csv', index=False)