# Convert Kinetics Library to Training Reactions Script

Specify the kinetics library name below and run the script.  It automatically overwrites the training reaction files that need updating; commit those files afterwards.

This script only trains safely.  In other words, a training reaction is created only when exactly one match from the RMG reaction families is found.  Sometimes there are no matches from the RMG reaction families, or there are multiple matches.  Either case indicates a problem that requires manual fixing, and the script prints the affected reactions so they can be reviewed.

In [1]:
from rmgpy import settings
from rmgpy.data.rmg import RMGDatabase
from kinetics_library_to_training_tools_modified import *
import kinetics_library_to_training_tools_modified# import process_reactions
from base64 import b64encode
from IPython.display import display, HTML
import os
import re
from importlib import reload


# --- Reaction libraries -------------------------------------------------
# Libraries whose reactions will be converted; use None to load every library.
# libraries = ['PFAS_HPL']
libraries = ['PFAS_HPL_Test/1,3_sigmatropic_rearrangement/']

# --- Families -----------------------------------------------------------
# Families that receive the training reactions: either 'default' or an
# explicit list, e.g. ['R_Addition_MultipleBond'].  The selection that is
# actually used is assigned in a later cell; the list below is kept only
# as a reference.
# families = [
#     'Perfluoroalkene_Formation',
#     'CF_Radical_Formation_3',
#     'CF_Radical_Formation_2',
#     'CF_Radical_Formation_1',
#     'OF_Radical_Formation',
#     'Enol_Ether_Formation',
#     'Lactone_to_Perfluoroalkene',
#     'CO2_Elimination_From_Lactone',
#     'CO2_Elimination_From_Carboxylic_Acid',
#     'CO2_Elimination_From_Lactone_Ether',
#     'CO_Elimination_From_Lactone',
#     'CO_CF_bond_dissociation',
#     'PFAS_Hydrolysis',
#     'Lactone_Formation',
#     'CO2_Elimination_From_PFCA_Radical',
# ]
# families = ['Perfluorinated_Alkyl_Rearrangement']

# --- Behaviour switches -------------------------------------------------
# Plot kinetics comparisons?
compare_kinetics = False

# Also print library reactions that fit none of the selected families?
# With only a few families selected this produces a lot of noise.
show_all = False

# Prefer aromatic resonance structures, which reduces multiple-match cases.
filter_aromatic = True

# Emit verbose comments while averaging the rate-rule tree.
verbose_comments = True

# --- Thermo libraries ---------------------------------------------------
# Listed from most to least trusted.
thermolibs = [
    'C1_C2_Fluorine',  # Siddha's library first: Caroline used this thermo for the calcs
    'PFCA_thermo',
    'NCSU_C2_C8_PFAS',  # second most trusted: good estimates for higher-carbon PFAS
    'Fluorine',
    'primaryThermoLibrary',
    'FFCM1(-)',
    'halogens',
    'CHOF_G4',
    'CHOCl_G4',
    'CHOBr_G4',
    'CHOFCl_G4',
    'CHOFBr_G4',
    'CHOFClBr_G4',
    'DFT_QCI_thermo',
    '2-BTP_G4',
    'thermo_DFT_CCSDTF12_BAC',
    'SulfurHaynes',
]

# thermolibs = []

In [2]:
# Families that will receive training reactions in this run.
# Alternative selection: families = ['default', 'halogens']
families = [
    '1,3_sigmatropic_rearrangement',
]

In [None]:
# Optional: extend `families` with the other (not new) reaction families that were
# previously matched, scraped from the "*_matched_rxns.html" reports:
# matched_families_path = '/work/westgroup/nora/Code/projects/PFAS/AIChE_2024/organized_into_rxn_families/HPL_matches/'
# matched_htmls = [html for html in os.listdir(matched_families_path) if '_matched_rxns.html' in html]
# previously_matched_families = [re.search(r'(\S+)_matched_rxns', html).group(1) for html in matched_htmls]
# families.extend(previously_matched_families)

# NOTE: running this cell REPLACES any `families` selection made in an earlier cell
# with the full list below.  Every family appears exactly once (a duplicate
# 'Enol_Ether_Formation' entry was removed); the original order is preserved.
families = [
    'Perfluoroalkene_Formation',
    'CF_Radical_Formation_3',
    'CF_Radical_Formation_2',
    'CF_Radical_Formation_1',
    'OF_Radical_Formation',
    'Enol_Ether_Formation',
    'Lactone_to_Perfluoroalkene',
    'CO2_Elimination_From_Lactone',
    'CO2_Elimination_From_Carboxylic_Acid',
    'CO2_Elimination_From_Lactone_Ether',
    'CO_Elimination_From_Lactone',
    'CO_CF_bond_dissociation',
    'PFAS_Hydrolysis',
    'Lactone_Formation',
    'CO2_Elimination_From_PFCA_Radical',
    '1+2_Cycloaddition',
    '1,2_Insertion_CO',
    '1,2_Insertion_carbene',
    '1,3_Insertion_CO2',
    '1,3_sigmatropic_rearrangement',
    'F_Abstraction',
    'Intra_R_Add_Endocyclic',
    'R_Addition_COm',
    'R_Addition_MultipleBond',
    'R_Recombination',
    'Singlet_Carbene_Intra_Disproportionation',
    'XY_Addition_MultipleBond',
    'Perfluorinated_Alkyl_Rearrangement',
]

# These ones are missed (maybe they're fitting to a new family?); they are
# nevertheless included in the list above:
# ['CO2_Elimination_From_Lactone',
#  'CO2_Elimination_From_Lactone_Ether',
#  'CO_Elimination_From_Lactone',
#  'CO2_Elimination_From_PFCA_Radical']


## Step 1: Load RMG-database with specified libraries and families

In [3]:
# Load only the pieces of RMG-database this notebook needs: the chosen thermo
# libraries, the selected kinetics families (with their 'training'
# depositories), and the reaction libraries to convert.
database = RMGDatabase()
database.load(
    path=settings['database.directory'],
    thermo_libraries=thermolibs,  # more can be added if necessary
    kinetics_families=families,
    reaction_libraries=libraries,
    kinetics_depositories=['training'],
)

# An accurate kinetics comparison needs the existing training reactions turned
# into rules and the tree filled by averaging; auto-generated families are skipped.
if compare_kinetics:
    for family in database.kinetics.families.values():
        if family.auto_generated:
            continue
        family.add_rules_from_training(thermo_database=database.thermo)
        family.fill_rules_by_averaging_up(verbose=verbose_comments)

In [4]:
# Sanity check: display the names of the kinetics families held by the loaded database.
database.kinetics.families.keys()

dict_keys(['1,3_sigmatropic_rearrangement'])

## Step 2a: Generate library reactions from families to get proper labels

In [None]:
# Re-execute the helper module so that edits made to it take effect without restarting the kernel.
# Note: names bound earlier via `from kinetics_library_to_training_tools_modified import *` are NOT
# refreshed by reload(); only attribute access through the module object sees the new definitions.
reload(kinetics_library_to_training_tools_modified)

In [5]:
# Match every library reaction against the loaded families.
# master_dict is later indexed as {library name: {family name: [reactions]}};
# multiple_dict is handed to manual_selection() in a later cell.
# The call goes through the module object so a reload() of the helper is honored.
master_dict, multiple_dict, lib_fam_rxn_dict = (
    kinetics_library_to_training_tools_modified.process_reactions(
        database,
        libraries,
        list(database.kinetics.families),
        compare_kinetics=compare_kinetics,
        show_all=show_all,
        filter_aromatic=filter_aromatic,
    )
)

[Species(label="C2F5CFO", molecule=[Molecule(smiles="O=C(F)C(F)(F)C(F)(F)F")], molecular_weight=(166.022,'amu'))] [Species(label="CF3OCFCF2", molecule=[Molecule(smiles="FC(F)=C(F)OC(F)(F)F")], molecular_weight=(166.022,'amu'))]
getting fam rxn list...




This is the family reaction list before filtering aromatics: ['1,3_sigmatropic_rearrangement']
found a family
O=C(F)C(F)(F)C(F)(F)F <=> FC(F)=C(F)OC(F)(F)F 1,3_sigmatropic_rearrangement


[Species(label="CF(O)CF2CFO", molecule=[Molecule(smiles="O=C(F)C(F)(F)C(=O)F")], molecular_weight=(144.024,'amu'))] [Species(label="CF2CFOCFO", molecule=[Molecule(smiles="O=C(F)OC(F)=C(F)F")], molecular_weight=(144.024,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: ['1,3_sigmatropic_rearrangement']
found a family
O=C(F)C(F)(F)C(=O)F <=> O=C(F)OC(F)=C(F)F 1,3_sigmatropic_rearrangement


[Species(label="C2F5OCCF3", molecule=[Molecule(smiles="FC(F)(F)[C]OC(F)(F)C(F)(F)F"), Molecule(smiles="FC(F)(F)[C-]=[O+]C(F)(F)C(F)(F)F")], molecular_weight=(216.029,'amu'))] [Species(label="CF2OC(CF3)CF3", molecule=[Molecule(smiles="FC(F)=[O+][C-](C(F)(F)F)C(F)(F)F")], molecular_weight=(216.029,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: []
[Species(label="C3F7OCCF3", molecule=[Molecule(smiles="FC(F)(F)[C]OC(F)(F)C(F)(F)C(F)(F)F"), Molecule(smiles="FC(F)(F)[C-]=[O+]C(F)(F)C(F)(F)C(F)(F)F")], molecular_weight=(266.037,'amu'))] [Species(label="CF2OC(CF3)C2F5", molecule=[Molecule(smiles="FC(F)=[O+][C-](C(F)(F)F)C(F)(F)C(F)(F)F")], molecular_weight=(266.037,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: []
[Species(label="C2H5CHO", molecule=[Molecule(smiles="CCC=O")], molecular_weight=(58.0792,'amu'))] [Species(label="CH3OCHCH2", molecule=[Molecule(smiles="C=COC")], molecular_weight=(58.0792,'

[Species(label="CH(O)CH2CHO", molecule=[Molecule(smiles="O=CCC=O")], molecular_weight=(72.0626,'amu'))] [Species(label="CH2CHOCHO", molecule=[Molecule(smiles="C=COC=O")], molecular_weight=(72.0626,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: ['1,3_sigmatropic_rearrangement']
found a family
O=CCC=O <=> C=COC=O 1,3_sigmatropic_rearrangement


[Species(label="CH(O)CH2CH2CHO", molecule=[Molecule(smiles="O=CCCC=O")], molecular_weight=(86.0892,'amu'))] [Species(label="CH2CHOCH2CHO", molecule=[Molecule(smiles="C=COCC=O")], molecular_weight=(86.0892,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: ['1,3_sigmatropic_rearrangement']
found a family
O=CCCC=O <=> C=COCC=O 1,3_sigmatropic_rearrangement


[Species(label="C2H5OCCH3", molecule=[Molecule(smiles="C[C]OCC"), Molecule(smiles="C[C-]=[O+]CC")], molecular_weight=(72.1057,'amu'))] [Species(label="CH2OC(CH3)CH3", molecule=[Molecule(smiles="C=[O+][C-](C)C")], molecular_weight=(72.1057,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: []
[Species(label="C3H7OCCH3", molecule=[Molecule(smiles="C[C]OCCC"), Molecule(smiles="C[C-]=[O+]CCC")], molecular_weight=(86.1323,'amu'))] [Species(label="CH2OC(CH3)C2H5", molecule=[Molecule(smiles="C=[O+][C-](C)CC")], molecular_weight=(86.1323,'amu'))]
getting fam rxn list...
This is the family reaction list before filtering aromatics: []


In [None]:
# Report how many library reactions were matched to each family.
# Iterates over whatever libraries are present in master_dict instead of a
# hard-coded library name, so the cell keeps working when `libraries` changes.
for library_name, family_matches in master_dict.items():
    if len(master_dict) > 1:
        # Only label the groups when several libraries were processed.
        print(library_name)
    for family, matched_reactions in family_matches.items():
        print(family, len(matched_reactions))

## Step 2b (optional): Review and select reactions to be added

In [None]:
# Review the reactions currently selected for addition.
# prompt=False presumably suppresses the interactive selection prompt -- confirm in the helper module.
# Called through the module object (like process_reactions) so that a reload()
# of the helper module is honored; the star-imported name would stay stale.
kinetics_library_to_training_tools_modified.review_reactions(master_dict, prompt=False)

## Step 2c (optional): Manual processing for reactions with multiple matches

In [None]:
# Manually resolve library reactions that matched more than one family/template.
# Called through the module object (like process_reactions) so that a reload()
# of the helper module is honored; the star-imported name would stay stale.
kinetics_library_to_training_tools_modified.manual_selection(master_dict, multiple_dict, database)

## Step 2d: Final review of reactions to be added

In [None]:
# Final look at the reactions that will be written as training reactions.
# prompt=False presumably suppresses the interactive selection prompt -- confirm in the helper module.
# Called through the module object (like process_reactions) so that a reload()
# of the helper module is honored; the star-imported name would stay stale.
kinetics_library_to_training_tools_modified.review_reactions(master_dict, prompt=False)

## Step 3: Write the new training reactions to the database

In [None]:
# Write the selected reactions into each family's training depository.
# master_dict maps library name -> {family name -> list of reactions}.
for library_name, reaction_dict in master_dict.items():
    library = database.kinetics.libraries[library_name]

    for family_name, reaction_list in reaction_dict.items():
        print('Adding training reactions from {0} to {1}...'.format(library_name, family_name))

        family = database.kinetics.families[family_name]
        try:
            depository = family.get_training_depository()
        except Exception as err:
            # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate, and chain the original error for debugging.
            raise Exception(
                'Unable to find training depository in {0}. Check that one exists.'.format(family_name)
            ) from err

        print('Training depository previously had {} rxns. Now adding {} new rxn(s).'.format(len(depository.entries), len(reaction_list)))

        # Per-reaction metadata, kept in the same order as reaction_list.
        ref_list = []
        type_list = []
        short_list = []
        long_list = []

        for reaction in reaction_list:
            # Look up the original library entry to carry its metadata over.
            # Assumes reaction.index is still the library entry's index -- TODO confirm.
            orig_entry = library.entries[reaction.index]
            long_desc = 'Training reaction from kinetics library: {0}\nOriginal entry: {1}'.format(library_name, orig_entry.label)
            if orig_entry.long_desc:
                long_desc += '\n' + orig_entry.long_desc

            ref_list.append(orig_entry.reference)
            type_list.append(orig_entry.reference_type)
            short_list.append(orig_entry.short_desc)
            long_list.append(long_desc)

        family.save_training_reactions(
            reaction_list,
            reference=ref_list,
            reference_type=type_list,
            short_desc=short_list,
            long_desc=long_list,
        )