In [31]:
import pandas as pd
import numpy as np
from pathlib import Path
import ast

import torch
from utils.prepare import MoleculeDataset, convert_string_to_list

def convert_string_to_float_list(s):
    return [float(x) for x in ast.literal_eval(s)]


new_cdd_data = pd.read_csv('../data/extended_CDD.csv')
new_cdd_data['CDD'] = new_cdd_data['CDD'].apply(convert_string_to_float_list)


In [32]:
new_cdd_data

Unnamed: 0,Molecule,Conformation,CDD
0,11912,2,"[-0.9169406870495196, -1.8858158447225923, -1...."
1,11912,3,"[-0.8993202700876205, -2.047958693156822, -1.6..."
2,12149,0,"[-2.3218197556688422, -2.284684797997657, -2.4..."
3,12149,1,"[-2.7245031940845785, -2.537871992539606, -2.0..."
4,12149,2,"[-2.238486757496124, -1.4640037934792547, -1.4..."
...,...,...,...
915,9565,0,"[-1.0387456154020016, -0.20998639727657742, -0..."
916,9565,1,"[-1.0238404671709036, -0.21270022964559143, -0..."
917,9565,2,"[-1.0391089346549305, -0.22464907019702152, -0..."
918,9565,3,"[-1.0465600420354508, -0.21426118209312595, -0..."


In [33]:
average_cdd_data = new_cdd_data.groupby('Molecule')['CDD'].apply(lambda x: np.mean(x.tolist(), axis=0)).reset_index()
average_cdd_data

Unnamed: 0,Molecule,CDD
0,63,"[-1.0718126417200082, -0.3950268079450434, -0...."
1,126,"[-0.7976550415172552, -1.3643236555025653, -0...."
2,485,"[-0.4025085754204631, -0.667003190744647, -0.9..."
3,489,"[-1.3737476624856164, -1.1614149865567525, -0...."
4,514,"[-1.5605711921910963, -2.1242695065405837, -1...."
...,...,...
236,51509,"[-1.6136102248475788, -1.000006584493352, -0.9..."
237,51590,"[-0.31267078960289024, -1.319711155065329, -1...."
238,52396,"[-1.936559241337586, -2.4883118958360995, -2.3..."
239,52416,"[-0.7063632497802963, -1.1841816067757227, -0...."


In [34]:
smiles_data = pd.read_csv('../data/QM_137k.csv', usecols=['smiles'])
smiles_data = smiles_data.reset_index().rename(columns={'index': 'Molecule'})


In [35]:
smiles_data

Unnamed: 0,Molecule,smiles
0,0,CNC(=S)N/N=C/c1c(O)ccc2ccccc12
1,1,O=C(NCCn1cccc1)c1cccc2ccccc12
2,2,C=C(C)[C@H]1C[C@@H]2OO[C@H]1C=C2C
3,3,OCCCc1cc[nH]n1
4,4,CC(=N)NCc1cccc(CNCc2ccncc2)c1
...,...,...
136214,136214,CC(C)(O)C#Cc1ccc(B(O)O)c([C@H](Cc2cc(F)cc(F)c2...
136215,136215,CC(C)(C)OC(=O)N1CC(CC#N)(n2cc(B3OC(C)(C)C(C)(C...
136216,136216,CC1(C)OB(C2=CCNCC2)OC1(C)C
136217,136217,CC(C)(C)OC(=O)Nc1cc(B2OC(C)(C)C(C)(C)O2)ccn1


In [36]:
final_data = pd.merge(smiles_data, new_cdd_data, on='Molecule')


In [37]:
dataset = MoleculeDataset(final_data, smiles_column='smiles', target_column='CDD')

datapoint = dataset[0]
datapoint

print(f"Shape of atom features (x): {datapoint.x.shape}")
print(f"Shape of edge index: {datapoint.edge_index.shape}")
print(f"Shape of edge attr: {datapoint.edge_attr.shape}")
print(f"Target value (y): {datapoint.y}")
print(f"Shape of target value: {datapoint.y.shape}")
print(f"Number of atoms in the molecule: {datapoint.x.size(0)}")
print(f"Number of bonds in the molecule: {datapoint.edge_index.size(1) // 2}")

torch.save(dataset, "../data/QM_cool_no_conf.pt")

100%|██████████| 920/920 [00:09<00:00, 94.33it/s] 


Shape of atom features (x): torch.Size([30, 133])
Shape of edge index: torch.Size([2, 60])
Shape of edge attr: torch.Size([60, 14])
Target value (y): tensor([-1.0891, -0.3920, -0.4455, -0.7294, -0.4249, -0.4090, -1.0854, -0.7796,
        -1.1161, -1.7733, -2.2821, -1.8900, -1.1697, -1.0854, -0.9848, -0.8177,
        -0.9240, -0.9834, -0.9214, -1.1069, -0.8317, -0.9432, -1.7105, -1.7346,
        -1.5855, -1.2647, -1.7561, -1.6749, -1.7402, -1.8857])
Shape of target value: torch.Size([30])
Number of atoms in the molecule: 30
Number of bonds in the molecule: 30
