# Preprocessing

In [26]:
import pandas as pd
import torch
from skipatom import SkipAtomInducedModel

from utils.prepare import MoleculeDataset, convert_string_to_list

In [27]:
# Read the QM CSV, parsing the 'CDD' column with convert_string_to_list,
# and keep only the first 100 rows.
data = pd.read_csv(
    '../data/QM_100.csv',
    converters={'CDD': convert_string_to_list},
).head(100)

In [28]:
# Load the induced SkipAtom model from the saved model file and its training
# data; both paths are relative to the notebook's working directory.
# NOTE(review): min_count / top_n presumably follow the SkipAtom README
# example for the induced model — TODO confirm.
skipatom_model = SkipAtomInducedModel.load(
    "../skipatom/data/mp_2020_10_09.dim200.model",
    "../skipatom/data/mp_2020_10_09.training.data",
    min_count=2e7,
    top_n=5,
)
# Kirill Vanya Pikulin  -- NOTE(review): stray note listing names; purpose unclear.

In [29]:
# Build the dataset from the loaded frame: SMILES strings are read from the
# 'smiles' column, targets from 'CDD', and the SkipAtom model is handed to
# MoleculeDataset.
dataset = MoleculeDataset(
    data,
    smiles_column='smiles',
    target_column='CDD',
    skipatom_model=skipatom_model,
)

100%|██████████| 100/100 [00:58<00:00,  1.72it/s]


In [30]:
# Take the first sample of the dataset for inspection.
datapoint = dataset[0]
# Bare expression: the notebook renders the sample's repr as the cell output.
datapoint

Data(x=[31, 333], edge_index=[2, 64], edge_attr=[64, 414], y=[31], smiles='CNC(=S)N/N=C/c1c(O)ccc2ccccc12')

In [31]:
# Print a one-line-per-field summary of the inspected sample: tensor shapes,
# the target values, and atom/bond counts.
print(
    f"Shape of atom features (x): {datapoint.x.shape}",
    f"Shape of edge index: {datapoint.edge_index.shape}",
    f"Shape of edge attr: {datapoint.edge_attr.shape}",
    f"Target value (y): {datapoint.y}",
    f"Shape of target value: {datapoint.y.shape}",
    # Atom count is the number of rows in the node-feature matrix.
    f"Number of atoms in the molecule: {datapoint.x.size(0)}",
    # Halving the edge count assumes each bond is stored once per direction
    # in edge_index — TODO confirm against MoleculeDataset.
    f"Number of bonds in the molecule: {datapoint.edge_index.size(1) // 2}",
    sep="\n",
)

Shape of atom features (x): torch.Size([31, 333])
Shape of edge index: torch.Size([2, 64])
Shape of edge attr: torch.Size([64, 414])
Target value (y): tensor([-0.0756, -0.1880,  0.1527, -0.9194, -0.1204, -0.2333, -0.0461, -0.1620,
         0.1055, -0.4826, -0.1695, -0.1810, -0.0448, -0.1405, -0.1686, -0.1502,
        -0.1488, -0.0190,  0.0474,  0.0435,  0.0400,  0.2173,  0.2133,  0.0341,
         0.1984,  0.0413,  0.0254,  0.0345,  0.0283,  0.0312,  0.0369])
Shape of target value: torch.Size([31])
Number of atoms in the molecule: 31
Number of bonds in the molecule: 32


In [32]:
# Persist the whole processed dataset object to disk for later reuse.
# NOTE(review): torch.save pickles the object, so loading it back presumably
# requires the MoleculeDataset class to be importable — confirm.
torch.save(
    dataset,
    "../data/QM_100.pt",
)