# Preprocessing

In [16]:
from typing import List, Union
import pandas as pd
import ast

from rdkit import Chem

import torch
from torch_geometric.data import Data, Dataset

from tqdm import tqdm


from joblib import Parallel, delayed


In [17]:
def convert_string_to_list(string):
    """Parse the text of a CSV cell as a Python literal.

    Intended as a ``converters=`` callback for ``pandas.read_csv`` so that a
    cell holding a stringified list comes back as a real list.

    Args:
        string: Raw cell text, e.g. ``"[0.1, -0.2]"``.

    Returns:
        Whatever ``ast.literal_eval`` produces for the text, or an empty list
        when the text is not a valid Python literal.
    """
    try:
        return ast.literal_eval(string)
    except (ValueError, SyntaxError):
        # literal_eval raises SyntaxError for text that does not parse at all
        # (an empty cell, a truncated "[1, 2") and ValueError for text that
        # parses but is not a plain literal; both mean "no usable value".
        return []
    
# Only the first 10,000 rows are used below, so let pandas stop reading after
# that many data rows instead of parsing the whole file (including running the
# 'CDD' converter on every row) and discarding the rest afterwards.
data = pd.read_csv(
    '../data/QM_137k.csv',
    converters={'CDD': convert_string_to_list},
    nrows=10000,
)

In [18]:
class FeaturizationParameters:
    """Category tables used to one-hot encode atoms.

    Attributes:
        max_atomic_num: Number of atomic-number categories
            (values ``0 .. max_atomic_num - 1``).
        atom_features: Maps a feature name to the list of values that each
            get their own one-hot slot.
        atom_fdim: Total length of a single atom feature vector.
    """

    def __init__(self):
        hyb = Chem.rdchem.HybridizationType

        self.max_atomic_num = 100
        self.atom_features = {
            'atomic_num': [*range(self.max_atomic_num)],
            'degree': [0, 1, 2, 3, 4, 5],
            'formal_charge': [-1, -2, 1, 2, 0],
            'chiral_tag': [0, 1, 2, 3],
            'num_Hs': [0, 1, 2, 3, 4],
            'hybridization': [hyb.SP, hyb.SP2, hyb.SP3, hyb.SP3D, hyb.SP3D2],
        }

        # Each one-hot group is one slot wider than its table: the extra
        # trailing slot is used for values that are not in the table.
        group_widths = [len(allowed) + 1 for allowed in self.atom_features.values()]
        # The final "+ 2" covers the two scalar entries that atom_features()
        # appends after the one-hot groups (aromatic flag and scaled mass).
        self.atom_fdim = sum(group_widths) + 2

def onek_encoding_unk(value, choices):
    """One-hot encode ``value`` against ``choices`` with an extra "unknown" slot.

    Args:
        value: The observed value.
        choices: Sequence of known values; position in this sequence is the
            position of the hot slot.

    Returns:
        A list of ``len(choices) + 1`` ints containing exactly one ``1``.
        The last slot is set when ``value`` is not among ``choices``.
    """
    n_known = len(choices)
    one_hot = [0 for _ in range(n_known + 1)]
    if value in choices:
        one_hot[choices.index(value)] = 1
    else:
        # Trailing slot is reserved for anything outside the known choices.
        one_hot[n_known] = 1
    return one_hot

def atom_features(atom, params):
    """Build the flat feature list for one atom.

    Args:
        atom: Atom object exposing the RDKit-style getters used below.
        params: Object whose ``atom_features`` dict supplies the category
            table for each one-hot group.

    Returns:
        A list made of six one-hot groups (atomic number, degree, formal
        charge, chiral tag, hydrogen count, hybridization) followed by an
        aromatic flag and the atom mass multiplied by 0.01.
    """
    tables = params.atom_features
    # (observed value, table key) pairs, in the order the groups are emitted.
    categorical = (
        (atom.GetAtomicNum() - 1, 'atomic_num'),
        (atom.GetTotalDegree(), 'degree'),
        (atom.GetFormalCharge(), 'formal_charge'),
        (int(atom.GetChiralTag()), 'chiral_tag'),
        (int(atom.GetTotalNumHs()), 'num_Hs'),
        (int(atom.GetHybridization()), 'hybridization'),
    )

    features = []
    for observed, key in categorical:
        features.extend(onek_encoding_unk(observed, tables[key]))

    features.append(1 if atom.GetIsAromatic() else 0)
    # Mass is scaled to about the same range as the other features.
    features.append(atom.GetMass() * 0.01)
    return features

PARAMS = {
    # Length of the vector returned by bond_features(): 7 leading entries
    # (null flag, four bond-type flags, conjugation, ring membership) plus
    # 7 stereo entries (6 stereo codes + 1 "unknown" slot). The placeholder
    # for a missing bond is sized from this value, so it must match.
    'BOND_FDIM': 14
}


def bond_features(bond: Union["Chem.rdchem.Bond", None]) -> List[Union[bool, int, float]]:
    """Build the feature vector for one bond.

    Args:
        bond: An RDKit bond, or ``None`` to request the placeholder vector
            for "no bond".

    Returns:
        A list of ``PARAMS['BOND_FDIM']`` entries. For ``None`` only the
        first entry is set. Otherwise the first entry is 0 and the rest hold
        the bond-type flags, conjugation flag, ring flag and a one-hot
        encoding of the stereo code.
    """
    if bond is None:
        return [1] + [0] * (PARAMS['BOND_FDIM'] - 1)

    bt = bond.GetBondType()
    fbond = [
        0,  # bond is not None
        bt == Chem.rdchem.BondType.SINGLE,
        bt == Chem.rdchem.BondType.DOUBLE,
        bt == Chem.rdchem.BondType.TRIPLE,
        bt == Chem.rdchem.BondType.AROMATIC,
        bond.GetIsConjugated(),
        bond.IsInRing(),
    ]
    # Stereo codes 0..5 each get a slot; anything else lands in the extra one.
    fbond += onek_encoding_unk(int(bond.GetStereo()), list(range(6)))
    return fbond


class MoleculeData:
    """A parsed molecule together with its precomputed bond graph.

    Attributes:
        smiles: The SMILES string the molecule was built from.
        target: ``target`` converted to a float tensor.
        mol: The RDKit molecule (hydrogens added when ``addHs`` is true).
        params: ``FeaturizationParameters`` used for atom features.
        edge_index: Long tensor of shape ``[2, 2 * n_bonds]``.
        edge_attr: Float tensor with one row of bond features per directed
            edge.
    """

    def __init__(self, smiles, target, addHs=True):
        """Parse ``smiles`` and build the bond graph.

        Args:
            smiles: SMILES string of the molecule.
            target: Target value(s); anything ``torch.tensor`` accepts.
            addHs: When true, hydrogens are added to the parsed molecule.

        Raises:
            ValueError: If RDKit cannot parse ``smiles``.
        """
        self.smiles = smiles
        self.target = torch.tensor(target, dtype=torch.float)
        self.mol = Chem.MolFromSmiles(smiles)
        if self.mol is None:
            # MolFromSmiles reports a parse failure by returning None; fail
            # here with the offending string instead of with an opaque error
            # from a later call on None.
            raise ValueError(f"RDKit could not parse SMILES: {smiles!r}")
        if addHs:
            self.mol = Chem.AddHs(self.mol)
        self.params = FeaturizationParameters()
        self.edge_index, self.edge_attr = self.construct_graph()

    def construct_graph(self):
        """Return ``(edge_index, edge_attr)`` for the molecule's bonds.

        Every bond contributes two directed edges (begin->end, end->begin)
        that share the same feature vector.
        """
        edge_index = []
        edge_attr = []
        for bond in self.mol.GetBonds():
            start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
            feats = bond_features(bond)  # computed once, used for both directions
            edge_index.extend([[start, end], [end, start]])
            edge_attr.extend([feats, feats])
        # Fix dtype and the (pairs, 2) layout explicitly so that a molecule
        # without bonds still yields a [2, 0] integer tensor rather than an
        # empty 1-D float tensor.
        pairs = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2)
        return pairs.t().contiguous(), torch.tensor(edge_attr, dtype=torch.float)

    def generate_atom_features(self):
        """Return a float tensor with one row of atom features per atom."""
        features = [atom_features(atom, self.params) for atom in self.mol.GetAtoms()]
        return torch.tensor(features, dtype=torch.float)

class MoleculeDataset(Dataset):
    """Dataset of molecule graphs built from a DataFrame.

    Each row is turned into a ``MoleculeData`` up front (in parallel); atom
    features are generated when an item is fetched.
    """

    def __init__(self, dataframe, smiles_column='smiles', target_column='target', addHs=True, n_jobs=-1):
        """Featurize every row of ``dataframe``.

        Args:
            dataframe: Table holding one molecule per row.
            smiles_column: Name of the column with SMILES strings.
            target_column: Name of the column with target values.
            addHs: Forwarded to ``MoleculeData``.
            n_jobs: Forwarded to ``joblib.Parallel``.
        """
        super().__init__()

        # Hand each task only the two values it needs: iterating the two
        # columns avoids building a full row Series per molecule via
        # iterrows(), and passing the class itself to delayed() removes the
        # lambda wrapper.
        pairs = zip(dataframe[smiles_column], dataframe[target_column])
        self.data_list = Parallel(n_jobs=n_jobs)(
            delayed(MoleculeData)(smiles, target, addHs)
            for smiles, target in tqdm(pairs, total=len(dataframe))
        )

    def len(self):
        """Return the number of molecules in the dataset."""
        return len(self.data_list)

    def get(self, idx):
        """Build the ``Data`` object for the molecule at position ``idx``."""
        molecule_data = self.data_list[idx]
        data = Data(
            x=molecule_data.generate_atom_features(),
            edge_index=molecule_data.edge_index,
            edge_attr=molecule_data.edge_attr,
            y=molecule_data.target,
        )
        # Keep the source SMILES on the sample for later inspection.
        data.smiles = molecule_data.smiles
        return data


In [19]:
# Featurize every row of `data`: SMILES come from the 'smiles' column and the
# targets from the 'CDD' column.
molecule_dataset = MoleculeDataset(data, smiles_column='smiles', target_column='CDD')

100%|██████████| 10000/10000 [03:33<00:00, 46.74it/s]


In [20]:
# Fetch the sample at index 2; the bare name on the last line makes the
# notebook display it.
datapoint = molecule_dataset[2]
datapoint

Data(x=[26, 133], edge_index=[2, 54], edge_attr=[54, 14], y=[26], smiles='C=C(C)[C@H]1C[C@@H]2OO[C@H]1C=C2C')

In [21]:
# Print the shape of the atom feature matrix.
print(f"Shape of atom features (x): {datapoint.x.shape}")

# Print the shape of edge_index, which lists the bonds between atoms,
# and of the matching bond feature matrix.
print(f"Shape of edge index: {datapoint.edge_index.shape}")
print(f"Shape of edge attr: {datapoint.edge_attr.shape}")

# Print the target values and their shape.
print(f"Target value (y): {datapoint.y}")
print(f"Shape of target value: {datapoint.y.shape}")

# Print the number of atoms in the molecule, i.e. the size of the first
# dimension (rows) of the atom feature matrix.
print(f"Number of atoms in the molecule: {datapoint.x.size(0)}")

# Print the number of bonds in the molecule: half the size of the second
# dimension of edge_index, because every bond is stored twice (once per
# direction of the undirected graph).
print(f"Number of bonds in the molecule: {datapoint.edge_index.size(1) // 2}")

Shape of atom features (x): torch.Size([26, 133])
Shape of edge index: torch.Size([2, 54])
Shape of edge attr: torch.Size([54, 14])
Target value (y): tensor([-0.3516, -0.0627, -0.1982, -0.0723, -0.1465,  0.0553, -0.4408, -0.4395,
         0.0524, -0.2450, -0.1293, -0.1994, -0.0198,  0.0029,  0.0258,  0.0208,
         0.0184,  0.0178, -0.0059,  0.0097,  0.0272,  0.0341, -0.0099,  0.0146,
         0.0269,  0.0153])
Shape of target value: torch.Size([26])
Number of atoms in the molecule: 26
Number of bonds in the molecule: 27


In [22]:
# Write the whole dataset object to disk.
# NOTE(review): this serializes the Python object itself, so loading it
# presumably requires the same class definitions to be available — confirm.
torch.save(molecule_dataset, '../data/QM_10k.pt')
