In [1]:
import ast
import numpy as np
import rdkit
from rdkit import Chem
from rdkit.Chem import BondType
import torch
import pandas as pd
import pickle

# 필요한 함수 정의

1. 동적 matrix(adjacency, feature) 생성 함수

In [2]:
def smiles_to_graph(smiles):
    molecule = Chem.MolFromSmiles(smiles)
    
    num_atoms = molecule.GetNumAtoms()
    
    adjacency = np.zeros((num_atoms, num_atoms), 'int64')
    features = np.zeros((num_atoms), 'int64')
    
    for atom in molecule.GetAtoms():
        i = atom.GetIdx()
        atom_type = atom_mapping[atom.GetSymbol()]
        features[i] = atom_type
        
        for neighbor in atom.GetNeighbors():
            j = neighbor.GetIdx()
            bond = molecule.GetBondBetweenAtoms(i, j)
            bond_type_idx = bond_mapping[bond.GetBondType().name]
            adjacency[[i, j], [j, i]] = bond_type_idx
            
    adjacency = torch.tensor(adjacency.tolist(), dtype=torch.int8)
    features = torch.tensor(features.tolist(), dtype=torch.int8)
    
    return adjacency, features

2. adjacency, features 정보로 분자 복구하는 함수

In [3]:
def graph_to_molecule(features, adjacency):
    molecule = Chem.RWMol()  # 편집 가능한 빈 molecule 추가
    num_atoms = len(features)
    atom_index_map = {}
    
    # 'features' 이용해서 molecule에 atom 추가
    for i in range(num_atoms):
        atom_type = features[i].item() # Convert tensor to int
        atom_symbol = atom_mapping[atom_type]
        new_atom_index = molecule.AddAtom(Chem.Atom(atom_symbol))
        atom_index_map[i] = new_atom_index

    # 'adjacency' 이용해서 molecule에 bond 추가
    for i in range(num_atoms):
        for j in range(i + 1, num_atoms):
            bond_type_idx = adjacency[i, j].item()  # Convert tensor to int

            if bond_type_idx == 0:
                continue

            bond_type = bond_mapping[bond_type_idx]

            new_i = atom_index_map[i]
            new_j = atom_index_map[j]

            if bond_type != 0:
                molecule.AddBond(new_i, new_j, bond_type)

    return molecule

3. df 받아서 list of dict 생성<br>

!!! 추출하고자 하는 속성명 df 내 컬럼명과 동일하게 바꾸기 !!!

In [4]:
def generate_pickle(df):
    mol_list = []
    for index, row in df.iterrows():
        smiles = row['smiles']
        homo = row['homo']     ### 꼭 맞춰서 바꾸기!!!!!!###

        adjacency, features = smiles_to_graph(smiles)

        molecule_dict = {
            'num_atom' : adjacency.shape[0],
            'atom_type' : torch.tensor(features, dtype=torch.int8),
            'bond_type' : torch.tensor(adjacency, dtype=torch.int8),
            'homo' : torch.tensor([homo], dtype=torch.float32) ### 꼭 맞춰서 바꾸기!!!!!!###
        }

        mol_list.append(molecule_dict)
        
    return mol_list

# Data load

### From QM9

In [5]:
data = pd.read_csv('/home/sjang/MILESTONE/1_DatasetForDescriptor/QM9_allfeatures.csv')
data.columns

Index(['num_atoms', 'atomic_symbols', 'pos', 'charges',
       'harmonic_oscillator_frequencies', 'smiles', 'inchi', 'A', 'B', 'C',
       'mu', 'alpha', 'homo', 'lumo', 'gap', 'r2', 'zpve', 'u0', 'u', 'h', 'g',
       'cv'],
      dtype='object')

In [6]:
df = data[['smiles', 'charges', 'homo', 'lumo', 'mu']]
df.columns

Index(['smiles', 'charges', 'homo', 'lumo', 'mu'], dtype='object')

In [7]:
df.head(3)

Unnamed: 0,smiles,charges,homo,lumo,mu
0,CCC12CCC(C)C1O2,[-0.379761 -0.27778 0.431848 -0.288567 -0.22...,-0.2497,0.089,1.7168
1,CC1NCC2OC2C1=O,[-0.408106 -0.060018 -0.300466 -0.213953 0.06...,-0.2283,-0.0338,3.5647
2,CCC(O)CC1CC1C,[-0.381135 -0.183739 0.13924 -0.424479 -0.23...,-0.2514,0.0706,1.3788


# bond, atom Dictionary  생성

In [8]:
# bond_mapping Dictionary 생성
bond_mapping = {"SINGLE": 1, "DOUBLE": 2, "TRIPLE": 3, "AROMATIC": 4}
bond_mapping.update({1:BondType.SINGLE, 2: BondType.DOUBLE, 3: BondType.TRIPLE, 4: BondType.AROMATIC})

# 새로운 DF의 'smiles' 컬럼에서 원자 종류 뽑는 과정
atom_type = df['smiles'].apply(lambda x: [symbol for symbol in x if symbol.isalpha()]).explode().unique()
SMILE_CHARSET = atom_type.tolist()
SMILE_CHARSET.sort()
print('SMILE_CHARSET: ', SMILE_CHARSET)

# atom_mapping_Dictionary 생성
SMILE_to_index = dict((c, i) for i, c in enumerate(SMILE_CHARSET))
index_to_SMILE = dict((i, c) for i, c in enumerate(SMILE_CHARSET))
atom_mapping = dict(SMILE_to_index)
atom_mapping.update(index_to_SMILE)

SMILE_CHARSET:  ['C', 'F', 'H', 'N', 'O']


# Train, Val, Test 분리

In [9]:
from sklearn.model_selection import train_test_split

shuffled_indices = df.sample(frac=1, random_state=100).index
shuffled_df = df.sample(frac=1, random_state=100)

train_ratio = 0.9
val_ratio = 0.07
test_ratio = 0.03

# train, val, test indices 분리
train_indices, val_test_indices = train_test_split(shuffled_indices, 
                                                   train_size=train_ratio, 
                                                   test_size=val_ratio+test_ratio, 
                                                   random_state=100)
val_indices, test_indices = train_test_split(val_test_indices, 
                                             train_size=val_ratio/(val_ratio+test_ratio), 
                                             test_size=test_ratio/(val_ratio+test_ratio), 
                                             random_state=100)

train_index = [train_indices]
val_index = [val_indices]
test_index = [test_indices]

# train, val, test data 분리
train_data = df.loc[train_indices]
val_data = df.loc[val_indices]
test_data = df.loc[test_indices]

print("Train set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))

Train set size: 120496
Validation set size: 9372
Test set size: 4017


In [10]:
print("Train indices length:", len(train_index))
print("Validation indices length:", len(val_index))
print("Test indices length:", len(test_index))
print('-'*50)
print("train_index[0] length:", len(train_index[0]))
print("val_index[0] length:", len(val_index[0]))
print("test_index[0] length:", len(test_index[0]))

Train indices length: 1
Validation indices length: 1
Test indices length: 1
--------------------------------------------------
train_index[0] length: 120496
val_index[0] length: 9372
test_index[0] length: 4017


# 각 데이터셋으로 Dict 생성

In [11]:
print('Generating train_list_of_dict...')
train_list_of_dict = generate_pickle(train_data)
print('train_list_of_dict generated')
print('Generating val_list_of_dict...')
val_list_of_dict = generate_pickle(val_data)
print('val_list_of_dict generated')
print('Generating test_list_of_dict...')
test_list_of_dict = generate_pickle(test_data)
print('test_list_of_dict generated')

Generating train_list_of_dict...


  'atom_type' : torch.tensor(features, dtype=torch.int8),
  'bond_type' : torch.tensor(adjacency, dtype=torch.int8),


train_list_of_dict generated
Generating val_list_of_dict...
val_list_of_dict generated
Generating test_list_of_dict...
test_list_of_dict generated


# index, pickle 저장

!!! 경로 수정 !!!

In [12]:
# Define the file paths for saving the data and indices
file_path = "/home/sjang/MILESTONE/2_GeneratePickles/1_homo/"
train_index_path = file_path + "train.index"
val_index_path = file_path + "val.index"
test_index_path = file_path + "test.index"
train_data_path = file_path + "train.pickle"
val_data_path = file_path + "val.pickle"
test_data_path = file_path + "test.pickle"

# Save the train, validation, and test indices as separate .INDEX files
np.savetxt(train_index_path, train_index, delimiter=',', fmt='%d')
np.savetxt(val_index_path, val_index, delimiter=',', fmt='%d')
np.savetxt(test_index_path, test_index, delimiter=',', fmt='%d')

# Save train data
with open(train_data_path, 'wb') as f:
    pickle.dump(train_list_of_dict, f)

# Save validation data
with open(val_data_path, 'wb') as f:
    pickle.dump(val_list_of_dict, f)

# Save test data
with open(test_data_path, 'wb') as f:
    pickle.dump(test_list_of_dict, f)

In [13]:
train_data.head(3)

Unnamed: 0,smiles,charges,homo,lumo,mu
87258,CC1(C)C2CC1COC2,[-0.454891 0.35487 -0.475437 -0.069233 -0.24...,-0.2405,0.078,1.432
57910,NC1=CC=CC(=O)N1,[-0.578486 0.457564 -0.255664 -0.026827 -0.27...,-0.1962,-0.0199,5.4369
10792,CC1C2N1C(C)C2=NO,[-0.428464 0.019683 -0.15804 -0.154306 -0.04...,-0.227,0.0008,1.0382


In [14]:
train_indices[:3]

Index([87258, 57910, 10792], dtype='int64')

In [15]:
train_list_of_dict[:3]

[{'num_atom': 9,
  'atom_type': tensor([0, 0, 0, 0, 0, 0, 0, 4, 0], dtype=torch.int8),
  'bond_type': tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0],
          [1, 0, 1, 1, 0, 1, 0, 0, 0],
          [0, 1, 0, 0, 0, 0, 0, 0, 0],
          [0, 1, 0, 0, 1, 0, 0, 0, 1],
          [0, 0, 0, 1, 0, 1, 0, 0, 0],
          [0, 1, 0, 0, 1, 0, 1, 0, 0],
          [0, 0, 0, 0, 0, 1, 0, 1, 0],
          [0, 0, 0, 0, 0, 0, 1, 0, 1],
          [0, 0, 0, 1, 0, 0, 0, 1, 0]], dtype=torch.int8),
  'homo': tensor([-0.2405])},
 {'num_atom': 8,
  'atom_type': tensor([3, 0, 0, 0, 0, 0, 4, 3], dtype=torch.int8),
  'bond_type': tensor([[0, 1, 0, 0, 0, 0, 0, 0],
          [1, 0, 4, 0, 0, 0, 0, 4],
          [0, 4, 0, 4, 0, 0, 0, 0],
          [0, 0, 4, 0, 4, 0, 0, 0],
          [0, 0, 0, 4, 0, 4, 0, 0],
          [0, 0, 0, 0, 4, 0, 2, 4],
          [0, 0, 0, 0, 0, 2, 0, 0],
          [0, 4, 0, 0, 0, 4, 0, 0]], dtype=torch.int8),
  'homo': tensor([-0.1962])},
 {'num_atom': 9,
  'atom_type': tensor([0, 0, 0, 3, 0, 0, 0, 3, 

In [15]:
import csv
import pickle

In [17]:
data_dir = '/home/sjang/MILESTONE/2_GeneratePickles/1_homo'

In [28]:
with open(data_dir + "/train.pickle", "rb") as f:
    data = pickle.load(f)

In [29]:
with open(data_dir + "/train.index", "r") as f:
    data_idx = [list(map(int, idx)) for idx in csv.reader(f)]
    data = [data[i] for i in range(len(data_idx[0]))]
    print(len(data))

120496


In [32]:
data_idx[0][1]

57910

In [None]:
data_idx[2]

In [None]:
len(data_idx)

In [21]:
data[0]

{'num_atom': 9,
 'atom_type': tensor([0, 0, 0, 0, 0, 0, 0, 4, 0], dtype=torch.int8),
 'bond_type': tensor([[0, 1, 0, 0, 0, 0, 0, 0, 0],
         [1, 0, 1, 1, 0, 1, 0, 0, 0],
         [0, 1, 0, 0, 0, 0, 0, 0, 0],
         [0, 1, 0, 0, 1, 0, 0, 0, 1],
         [0, 0, 0, 1, 0, 1, 0, 0, 0],
         [0, 1, 0, 0, 1, 0, 1, 0, 0],
         [0, 0, 0, 0, 0, 1, 0, 1, 0],
         [0, 0, 0, 0, 0, 0, 1, 0, 1],
         [0, 0, 0, 1, 0, 0, 0, 1, 0]], dtype=torch.int8),
 'homo': tensor([-0.2405])}