# Preprocessing

This notebook preprocesses molecular data for model training. We load, process, and save the dataset, with and without SkipAtom features.


### Import necessary libraries and functions.


In [5]:
import pandas as pd
import torch
from tqdm import tqdm
tqdm.pandas()

from fukui_net.utils.prepare import MoleculeDataset, convert_string_to_list

### Load the dataset and parse specific columns.

In [6]:
data = pd.read_parquet('../data/processed/qm_calc_data.parquet')

data['CDD'] = data['CDD'].progress_apply(convert_string_to_list)
display(data)

100%|██████████| 136219/136219 [00:09<00:00, 13999.23it/s]


Unnamed: 0,smiles,hirshfeld_charges,hirshfeld_fukui_elec,hirshfeld_fukui_neu,CDD
0,CNC(=S)N/N=C/c1c(O)ccc2ccccc12,[-0.026644 -0.075508 0.096217 -0.287798 -0.02...,[0.010333 0.015418 0.022324 0.111353 0.01318 ...,[0.011959 0.021591 0.017375 0.232471 0.050276 ...,"[-0.07558000000000001, -0.18802500000000003, 0..."
1,O=C(NCCn1cccc1)c1cccc2ccccc12,[-0.292411 0.170263 -0.085754 0.002736 0.01...,[ 0.04781 0.029278 0.02064 0.00908 0.00...,[0.026718 0.004006 0.010781 0.008124 0.007945 ...,"[-0.6593499999999999, 0.30724199999999996, -0...."
2,C=C(C)[C@H]1C[C@@H]2OO[C@H]1C=C2C,[-0.101749 0.012339 -0.07947 -0.020027 -0.05...,[0.082324 0.05751 0.020546 0.011972 0.01684 ...,[0.065786 0.029895 0.018721 0.020284 0.022533 ...,"[-0.35160800000000003, -0.062727, -0.198206999..."
3,OCCCc1cc[nH]n1,[-0.268379 0.027614 -0.050745 -0.045047 0.03...,[0.025129 0.01567 0.021808 0.028649 0.085482 ...,[0.210686 0.044551 0.042072 0.021857 0.059646 ...,"[-0.772573, -0.004993000000000011, -0.16537000..."
4,CC(=N)NCc1cccc(CNCc2ccncc2)c1,[-8.31620e-02 1.14954e-01 -2.74544e-01 -1.003...,[0.005584 0.007569 0.010781 0.00532 0.010963 ...,[0.011283 0.029042 0.090907 0.087975 0.015502 ...,"[-0.183191, 0.193297, -0.650776, -0.294033, -0..."
...,...,...,...,...,...
136214,CC(C)(O)C#Cc1ccc(B(O)O)c([C@H](Cc2cc(F)cc(F)c2...,[-0.085936 0.106032 -0.079299 -0.227724 -0.04...,[ 0.012857 0.009235 0.009797 0.019639 0.09...,[ 1.4045e-02 7.3160e-03 1.0070e-02 1.9841e-...,"[-0.198774, 0.195513, -0.17846499999999998, -0..."
136215,CC(C)(C)OC(=O)N1CC(CC#N)(n2cc(B3OC(C)(C)C(C)(C...,[-0.092575 0.108615 -0.092711 -0.08598 -0.12...,[ 0.002865 0.000517 0.002298 0.002298 0.00...,[ 0.007811 0.002285 0.006834 0.007181 0.02...,"[-0.195826, 0.214428, -0.194554, -0.1814390000..."
136216,CC1(C)OB(C2=CCNCC2)OC1(C)C,[-0.085222 0.09662 -0.0883 -0.199958 0.20...,[0.00882 0.005026 0.008375 0.026625 0.143174 ...,[0.008056 0.007976 0.013125 0.033209 0.022619 ...,"[-0.18732000000000001, 0.180238, -0.1981, -0.4..."
136217,CC(C)(C)OC(=O)Nc1cc(B2OC(C)(C)C(C)(C)O2)ccn1,[-0.085576 0.108086 -0.092508 -0.092366 -0.13...,[ 0.004339 0.00159 0.003044 0.003057 0.01...,[0.005991 0.003475 0.007144 0.007182 0.033176 ...,"[-0.181482, 0.211107, -0.19520400000000002, -0..."


SkipAtom is a machine learning model designed to predict material properties by analyzing the structural and chemical environments of atoms within a material. It uses word embedding techniques to represent atomic environments and make predictions about material properties based on these embeddings. Despite its innovative approach, in our case, using SkipAtom did not improve the accuracy or performance of our predictions.

In [7]:
import subprocess
from pathlib import Path
from skipatom import SkipAtomInducedModel

model_dir = Path("../data/exrernal/skipatom/data")

if not model_dir.exists():

    subprocess.run(["git", "clone", "https://github.com/lantunes/skipatom.git", "../data/external/skipatom"])

skipatom_model = SkipAtomInducedModel.load(
    "../data/external/skipatom/data/mp_2020_10_09.dim200.model", 
    "../data/external/skipatom/data/mp_2020_10_09.training.data", 
    min_count=2e7, top_n=5
)

fatal: destination path '../data/external/skipatom' already exists and is not an empty directory.


In [8]:
dataset = MoleculeDataset(data, 
                          smiles_column='smiles',
                          target_column='CDD',
                          addHs=True,
                          n_jobs=-1,
                          skipatom_model=None)

100%|██████████| 136219/136219 [01:47<00:00, 1270.19it/s]


In [9]:
datapoint = dataset[0]
datapoint

Data(x=[31, 133], edge_index=[2, 64], edge_attr=[64, 14], y=[31], smiles='CNC(=S)N/N=C/c1c(O)ccc2ccccc12')

In [10]:
print(f"Shape of atom features (x): {datapoint.x.shape}")
print(f"Shape of edge index: {datapoint.edge_index.shape}")
print(f"Shape of edge attr: {datapoint.edge_attr.shape}")
print(f"Target value (y): {datapoint.y}")
print(f"Shape of target value: {datapoint.y.shape}")
print(f"Number of atoms in the molecule: {datapoint.x.size(0)}")
print(f"Number of bonds in the molecule: {datapoint.edge_index.size(1) // 2}")

Shape of atom features (x): torch.Size([31, 133])
Shape of edge index: torch.Size([2, 64])
Shape of edge attr: torch.Size([64, 14])
Target value (y): tensor([-0.0756, -0.1880,  0.1527, -0.9194, -0.1204, -0.2333, -0.0461, -0.1620,
         0.1055, -0.4826, -0.1695, -0.1810, -0.0448, -0.1405, -0.1686, -0.1502,
        -0.1488, -0.0190,  0.0474,  0.0435,  0.0400,  0.2173,  0.2133,  0.0341,
         0.1984,  0.0413,  0.0254,  0.0345,  0.0283,  0.0312,  0.0369])
Shape of target value: torch.Size([31])
Number of atoms in the molecule: 31
Number of bonds in the molecule: 32


In [12]:
torch.save(dataset, "../data/processed/QM_137k.pt")