# Preprocessing

In [None]:
import pandas as pd
import torch
from utils.prepare import MoleculeDataset, convert_string_to_list

In [None]:
# Load the raw QM table. pandas applies `convert_string_to_list` to every
# cell of the 'CDD' column while parsing, so that column arrives already
# converted instead of as raw CSV text.
qm_csv_path = '../data/QM_137k.csv'
column_converters = {'CDD': convert_string_to_list}
data = pd.read_csv(qm_csv_path, converters=column_converters)

In [None]:
from skipatom import SkipAtomInducedModel

# Pretrained SkipAtom artifacts: the embedding file and the training data
# it was built from. Both are passed to `load` below.
# NOTE(review): "dim200" in the file name presumably means 200-dimensional
# embeddings -- confirm against the downstream feature width.
skipatom_embedding_file = "../skipatom/data/mp_2020_10_09.dim200.model"
skipatom_training_file = "../skipatom/data/mp_2020_10_09.training.data"

# `min_count` and `top_n` are forwarded unchanged to the induced model;
# see the SkipAtom documentation for their exact meaning.
skipatom_model = SkipAtomInducedModel.load(
    skipatom_embedding_file,
    skipatom_training_file,
    min_count=2e7,
    top_n=5,
)


In [None]:
# Wrap the dataframe in the project's MoleculeDataset: molecules are read
# from the 'smiles' column and targets from the 'CDD' column.
# NOTE(review): how `skipatom_model` is used for featurisation is defined in
# utils.prepare and not visible here.
dataset = MoleculeDataset(
    data,
    smiles_column='smiles',
    target_column='CDD',
    skipatom_model=skipatom_model,
)

In [None]:
# Take the first sample as a quick sanity check. The bare expression on the
# last line makes the notebook render the object's repr.
sample_index = 0
datapoint = dataset[sample_index]
datapoint

In [None]:
# Summarise the sample picked above: tensor shapes, the target, and the
# atom/bond counts derived from those shapes.
atom_features = datapoint.x
bond_index = datapoint.edge_index
target = datapoint.y

print(f"Shape of atom features (x): {atom_features.shape}")
print(f"Shape of edge index: {bond_index.shape}")
print(f"Shape of edge attr: {datapoint.edge_attr.shape}")
print(f"Target value (y): {target}")
print(f"Shape of target value: {target.shape}")
# One row of `x` per atom.
print(f"Number of atoms in the molecule: {atom_features.size(0)}")
# NOTE(review): halving the edge count assumes every bond is stored as two
# directed edges (i->j and j->i) -- confirm against MoleculeDataset.
print(f"Number of bonds in the molecule: {bond_index.size(1) // 2}")

In [None]:
# Persist the fully built dataset so later notebooks can skip preprocessing.
# NOTE(review): torch.save pickles the whole object, so loading it back
# requires `utils.prepare.MoleculeDataset` to be importable; recent PyTorch
# versions may also need `torch.load(..., weights_only=False)` -- confirm.
processed_dataset_path = "../data/QM_137k_only200.pt"
torch.save(dataset, processed_dataset_path)