Runs the oxidation reaction but extracts only the initial and final states of the reaction (with the H2 molecule placed far away), both in terms of the distance matrix and in terms of the embeddings. There is no need to save the transformation files; just check them.

In [None]:
from schnetpack.datasets import QM9
import numpy as np

# ---- input files -----------------------------------------------------------
# QM9 database read through schnetpack.
qm9_filepath = '../data/datasets/QM9/qm9.db'

# Label tables used to pick out the hydrogens involved in the oxidation.
targetlabelH1_filepath = '../data/autolabel/Hqm9alllabelpert3/labelpert3ver2.csv'
targetlabelH2_filepath = '../data/autolabel/Hqm910000labelpert2/labelHver2.csv'

# ---- selection criteria ----------------------------------------------------
# Only molecules with exactly this many atoms are processed (fixed size; not a GCNN yet).
n_target = 16

# Label values looked up in the first and second label table respectively.
labelH1id = 21
labelH2id = 14

# ---- product geometry ------------------------------------------------------
# Multiplier applied to the coordinates of the removed hydrogen to push it far from the molecule.
factor = 5

# ---- output ----------------------------------------------------------------
save_filepath = '../data/embstodists/distance_organizedpca/16at-1225/oxidation/rxns.xyz'

Loading relevant datasets

In [None]:
# Open the local QM9 database; download=False means it is never fetched from the network.
# NOTE(review): remove_uncharacterized=True presumably drops the molecules QM9 flags as
# uncharacterized, which would shift molecule indices — confirm against the label files.
qm9_data = QM9(qm9_filepath, download=False, remove_uncharacterized=True)

# Both label tables are read the same way: space-delimited text, with the
# 'utf-8-sig' codec so that a leading byte-order mark is ignored.
_label_read_options = dict(delimiter=' ', encoding='utf-8-sig')
label1_data = np.genfromtxt(targetlabelH1_filepath, **_label_read_options)
label2_data = np.genfromtxt(targetlabelH2_filepath, **_label_read_options)


In [None]:
# Atomic number -> element symbol for the elements handled by this notebook.
_SYMBOL_BY_ATOMIC_NUMBER = {1: "H", 6: "C", 7: "N", 8: "O", 9: "F"}


def atomic_number_to_element_symbol(atomic_number):
    """Return the element symbol for ``atomic_number``.

    Only H, C, N, O and F (atomic numbers 1, 6, 7, 8, 9) are known; any other
    atomic number yields ``None``.
    """
    try:
        return _SYMBOL_BY_ATOMIC_NUMBER[atomic_number]
    except KeyError:
        # Unknown element: signal it with None rather than raising.
        return None

def atomic_xyz_string(atomic_numbers, atomic_positions):
    if len(atomic_numbers) != len(atomic_positions):
        raise ValueError("The number of atomic numbers and atomic positions should be the same.")

    element_symbols = [atomic_number_to_element_symbol(num) for num in atomic_numbers]
    num_atoms = len(atomic_numbers)
    xyz_string = f"{num_atoms}\n0.00000\n"

    for element_symbol, (x, y, z) in zip(element_symbols, atomic_positions):
        xyz_string += f"{element_symbol} {x:.5f} {y:.5f} {z:.5f}\n"

    return xyz_string

def remove_blank_lines(input_string):
    """Return ``input_string`` with empty and whitespace-only lines dropped.

    The surviving lines keep their original content (including leading and
    trailing spaces) and are joined with ``"\\n"``; the result has no trailing
    newline.
    """
    kept = []
    for candidate in input_string.splitlines():
        if candidate.strip():
            kept.append(candidate)
    return '\n'.join(kept)

In [8]:
# Build reactant/product XYZ frames for the oxidation reaction.
#
# For every QM9 molecule in the scanned index range that has exactly n_target
# atoms, exactly one hydrogen labelled labelH1id in the first label table and
# exactly two hydrogens labelled labelH2id in the second label table:
#   1. the unmodified molecule is written as the reactant frame;
#   2. a product frame is written in which the labelH1id hydrogen is moved to
#      `factor` times its original coordinates and the first labelH2id
#      hydrogen is placed next to it, so the two form an H2 unit far away
#      from the rest of the molecule.
# All frames are concatenated and written to save_filepath.
#
# The product coordinates are computed from the position array itself rather
# than re-parsed from the formatted text line: fixed-column slicing of the
# text truncates the values and reads the wrong characters whenever a sign or
# an extra digit shifts the columns.

# Offset along x between the two relocated hydrogens.
# NOTE(review): presumably the H-H bond length in the units of the QM9 positions — confirm.
h2_separation = 0.74

count = 0
frames = []

for each_molecule in range(10000, 15000):

    at, props = qm9_data.get_properties(each_molecule)

    # check the molecule has a specific size
    if len(props['_atomic_numbers']) != n_target:
        continue

    # Rows of the label table that belong to this molecule
    # (column 1 is compared against the molecule index, column 0 is the label).
    atom_indices = np.where(label1_data[:, 1] == each_molecule)

    # NOTE(review): the row indices found in the first label table are also
    # applied to the second one, which assumes both tables are row-aligned — confirm.
    labels1 = label1_data[atom_indices][:, 0]
    labels2 = label2_data[atom_indices][:, 0]

    has_single_target1 = np.count_nonzero(labels1 == labelH1id) == 1
    has_two_target2 = np.count_nonzero(labels2 == labelH2id) == 2
    if not (has_single_target1 and has_two_target2):
        continue

    print(each_molecule)
    count += 1

    atomic_numbers = props['_atomic_numbers'].detach().numpy()
    atomic_positions = props['_positions'].detach().numpy()

    # Reactant frame: the molecule as is.
    frames.append(atomic_xyz_string(atomic_numbers, atomic_positions))

    # The label tables are treated as holding one entry per hydrogen, in atom
    # order, so the k-th label belongs to the k-th hydrogen of the molecule.
    hydrogen_atoms = np.flatnonzero(atomic_numbers == 1)
    first_label_rank = int(np.flatnonzero(labels1 == labelH1id)[0])
    # Only the first of the two labelH2id hydrogens is relocated.
    second_label_rank = int(np.flatnonzero(labels2 == labelH2id)[0])

    if max(first_label_rank, second_label_rank) >= len(hydrogen_atoms):
        # Fail loudly instead of silently reusing coordinates from an earlier molecule.
        raise ValueError(
            f"Molecule {each_molecule}: hydrogen labels do not line up with "
            f"the {len(hydrogen_atoms)} hydrogen atoms of the structure."
        )

    first_atom = hydrogen_atoms[first_label_rank]
    second_atom = hydrogen_atoms[second_label_rank]

    # Product frame: push the labelH1id hydrogen away by scaling its position,
    # then put the labelH2id hydrogen h2_separation further along x.
    far_position = factor * atomic_positions[first_atom]
    product_positions = atomic_positions.copy()
    product_positions[first_atom] = far_position
    product_positions[second_atom] = far_position + np.array([h2_separation, 0.0, 0.0])

    frames.append(atomic_xyz_string(atomic_numbers, product_positions))

final = remove_blank_lines(''.join(frames))

# Open the output only once everything has been generated, so a failure in
# the loop neither truncates an existing file nor leaves a handle open.
with open(save_filepath, mode='w') as output_file:
    output_file.write(final)

print(count)



221
265
385
386
401
558
585
604
781
795
1136
1238
1250
1431
1442
1443
1480
1490
1493
1500
1523
1614
1625
1965
2474
2628
2645
2659
2965
3080
3082
3268
3269
3271
3333
3454
4089
4651
4653
4687
4699
4705
4707
4709
4749
4773
4774
4806
5097
5098
5100
5119
5120
5125
5253
5360
5361
5396
5405
5406
5437
5446
5447
5453
5469
6149
6150
6151
6266
6273
6528
6545
6875
6891
6915
6917
6918
7095
8034
8356
8381
8392
8424
8527
8530
8539
8542
8563
8569
8570
8583
8597
8626
8655
8657
8658
8671
8851
8854
8858
8868
8894
8900
8926
8949
8959
8963
8974
9033
9047
9059
9060
9263
9331
9332
9344
9345
9350
9357
9361
9362
9382
9383
9387
9395
9563
9630
9695
9748
9755
9774
9785
9818
9837
9851
9870
9883
9891
9892
139
