# Verify the configuration cell below before running all cells:

In [10]:
# Reaction family whose TS_training database is updated; used as a directory
# name when building `general_path` below.
reaction_family = 'H_Abstraction'

# CSV file with the new distance data. Used exactly as written: it is NOT
# joined onto `general_path`.
csv_path = 'distance_data.csv'

import os
# <$RMGpy>/../AutoTST/database/<reaction_family>/TS_training
# NOTE: os.path.expandvars leaves '$RMGpy' untouched when the environment
# variable is not set, so make sure it is defined before running.
general_path = os.path.join(os.path.expandvars('$RMGpy'),
                            '..', 
                            'AutoTST', 
                            'database', 
                            reaction_family, 
                            'TS_training')

# Output files, written next to the existing database files in `general_path`.
new_dict_path = os.path.join(general_path, 'updated_dictionary.txt')
new_reactions_path = os.path.join(general_path, 'updated_reactions.py')

# Passed on as the `method` of every new DistanceData object and as the
# `shortDesc` of every new database entry, respectively.
method = 'm062x/6-311+G(2df,2p)'
shortDesc = 'M06-2X/6-311+G(2df,2p) calculation via group additive TS generator.'

In [2]:
import argparse
import os
import time
import math
import numpy as np
import pylab
import scipy.stats
import matplotlib
matplotlib.rc('mathtext', fontset='stixsans', default='regular')
import re
import rmgpy
from rmgpy.quantity import constants
from rmgpy.kinetics import Arrhenius, ArrheniusEP, KineticsData
from rmgpy.data.base import getAllCombinations
from autotst.database import *
from rmgpy.species import Species
from rmgpy.data.rmg import RMGDatabase
import logging
from collections import defaultdict, OrderedDict
import pandas as pd
import itertools
import IPython
from IPython.display import display, Markdown
def mprint(s):
    """Display the string `s` in the notebook output as rendered Markdown."""
    rendered = Markdown(s)
    display(rendered)
import cPickle as pickle
# attempt at making the cells wider:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
def get_csv(path):
    """
    Read the distance-data CSV at `path` and return it as a DataFrame with one
    column per reaction.

    The first CSV column is used as the row index and must hold reaction
    labels of the form ``r1+r2_p1+p2``; the columns ``d12``, ``d13`` and
    ``d23`` must hold strictly positive values.

    Returns a DataFrame whose columns are the 0-based row positions in the CSV
    and whose rows are ``species`` (the list ``[r1, r2, p1, p2]``), ``d12``,
    ``d13`` and ``d23``.

    Raises ValueError if a label does not split into exactly two reactants and
    two products, and AssertionError if a distance is not positive.
    """
    # `DataFrame.from_csv` is deprecated (and removed in pandas 1.0);
    # `read_csv` with the first column as the index is its replacement.
    # Date parsing of the index is deliberately not requested: the index
    # holds reaction labels, never dates.
    add_df = pd.read_csv(path, index_col=0)

    results = OrderedDict()
    # iterrows() avoids transposing the whole frame once per reaction and
    # also copes with two rows sharing the same label.
    for i, (label, row) in enumerate(add_df.iterrows()):
        try:
            reactants, products = label.split('_')
            r1, r2 = reactants.split('+')
            p1, p2 = products.split('+')
        except ValueError:
            raise ValueError(
                "Cannot parse reaction label {0!r}: expected 'r1+r2_p1+p2'".format(label))

        r = OrderedDict()
        r['species'] = [r1, r2, p1, p2]
        for key in ('d12', 'd13', 'd23'):
            r[key] = row[key]
            assert r[key] > 0, '{0} must be positive for {1!r}'.format(key, label)

        results[i] = r

    return pd.DataFrame(results)

In [4]:
def get_need_to_add(csv_df, known_species):
    """
    Return the SMILES of every species in `csv_df` that has no isomorphic
    entry in `known_species`.

    csv_df: DataFrame as returned by `get_csv` (one column per reaction, with
        a 'species' row holding the four SMILES strings of that reaction).
    known_species: mapping of label -> object providing `isIsomorphic`.

    Returns a sorted list of unique SMILES strings (as produced by
    `Molecule.toSMILES()`), so the result is deterministic between runs.
    """
    need_to_add = set()

    for _, row in csv_df.T.iterrows():
        r1, r2, p1, p2 = row['species']
        molecules = [Molecule(SMILES = smiles) for smiles in (r1, r2, p1, p2)]

        for molecule in molecules:
            # Only existence matters here, so stop at the first match.
            is_known = any(known_species[label].isIsomorphic(molecule)
                           for label in known_species)
            if not is_known:
                need_to_add.add(molecule.toSMILES())

    return sorted(need_to_add)

In [5]:
import autotst.updater_methods as UPmethods

In [6]:
def _last_matching_label(molecule, known_species):
    """
    Return the label of the last entry of `known_species` (in iteration order)
    that is isomorphic to `molecule`, or None when there is no match.
    """
    matched = None
    for label in known_species:
        if known_species[label].isIsomorphic(molecule):
            # Deliberately no `break`: if several entries match, the last one wins.
            matched = label
    return matched


def update_reactions(path, csv_df, known_species, Method = '', ShortDesc = ''):
    """
    Add the reactions described by `csv_df` to the depository stored at `path`.

    path: reactions file loaded into a TransitionStateDepository.
    csv_df: DataFrame as returned by `get_csv` (one column per reaction with
        'species', 'd12', 'd13' and 'd23' rows).
    known_species: mapping of label -> object providing `isIsomorphic`; every
        species in `csv_df` must have an isomorphic entry in it.
    Method: stored as the `method` of each new DistanceData.
    ShortDesc: stored as the `shortDesc` of each new entry.

    Returns the tuple ``(r_db, old_r_db, new_r_db)``: the depository holding
    old and new reactions, the depository as loaded from `path`, and a
    depository holding only the new reactions.

    Raises AssertionError if any species is missing from `known_species`, or
    if the entry counts of the three depositories are inconsistent.
    """
    from autotst.database import TransitionStateDepository, DistanceData
    local_context = {'DistanceData': DistanceData}

    # r_db starts as a copy of the file and receives the new reactions too.
    r_db = TransitionStateDepository()
    r_db.load(path, local_context=local_context)

    # old_r_db only ever holds the reactions already present in the file.
    old_r_db = TransitionStateDepository()
    old_r_db.load(path, local_context=local_context)

    # new_r_db holds only the reactions coming from csv_df.
    new_r_db = TransitionStateDepository()

    need_to_add = set()

    # New entries are numbered after the highest existing index.
    Index = 0
    for entry in r_db.entries.values():
        if Index < entry.index:
            Index = entry.index
    Index = Index + 1

    for i, row in csv_df.T.iterrows():
        # Every reaction needs: distances, method, shortDesc, label, and reaction object.
        r1, r2, p1, p2 = row['species']
        molecules = [Molecule(SMILES = smiles) for smiles in (r1, r2, p1, p2)]
        labels = [_last_matching_label(molecule, known_species) for molecule in molecules]

        missing = [molecule.toSMILES()
                   for molecule, label in zip(molecules, labels)
                   if label is None]
        if missing:
            # Collect everything that is missing and skip this row, so the
            # assertion below reports it instead of a bare KeyError here.
            need_to_add.update(missing)
            continue

        mr1, mr2, mp1, mp2 = molecules
        lr1, lr2, lp1, lp2 = labels

        reaction = Reaction(reactants = [Species(molecule=[mr1]), Species(molecule=[mr2])],
                            products = [Species(molecule=[mp1]), Species(molecule=[mp2])],
                            degeneracy = 1,
                            duplicate = False,
                            reversible = True)

        Distances = {'d12':row['d12'], 'd13':row['d13'], 'd23':row['d23']}
        distance_data = DistanceData(distances = Distances, method = Method)

        Label = '{} + {} <=> {} + {}'.format(lr1, lr2, lp1, lp2)

        entry_index = Index + i
        entry_key = '{0:d}:{1}'.format(entry_index, Label)

        # The same entry goes into the combined depository and into the
        # new-reactions-only depository.
        for depository in (r_db, new_r_db):
            depository.loadEntry(entry_index,
                                 reactant1=None,
                                 reactant2=None,
                                 reactant3=None,
                                 product1=None,
                                 product2=None,
                                 product3=None,
                                 distances = distance_data,
                                 degeneracy=1,
                                 label = Label,
                                 duplicate=False,
                                 reversible=True,
                                 reference=None,
                                 referenceType = '',
                                 shortDesc = ShortDesc,
                                 longDesc = '',
                                 rank=None,
                                 )
            depository.entries[entry_key].item = reaction

    assert len(need_to_add) == 0, \
        'Species missing from dictionary: {}'.format(', '.join(sorted(need_to_add)))
    assert len(r_db.entries) > len(old_r_db.entries) and len(r_db.entries) > len(new_r_db.entries)
    assert len(r_db.entries) == len(old_r_db.entries) + len(new_r_db.entries)

    return r_db, old_r_db, new_r_db

In [7]:
def update_databases_from_csv(gen_path, csv_path, new_dict_path, new_reactions_path, method_str='', shortDesc_str=''):
    """
    Merge the reactions listed in a distance-data CSV into a TS_training
    database and write the updated dictionary and reactions files.

    gen_path: directory containing the existing 'dictionary.txt' and
        'reactions.py'.
    csv_path: CSV file read with `get_csv`.
    new_dict_path: where the updated species dictionary is written (only
        written when the CSV introduces new species).
    new_reactions_path: where the updated reactions file is written.
    method_str: passed to `update_reactions` as `Method`,
        e.g. 'm062x/6-311+G(2df,2p)'.
    shortDesc_str: passed to `update_reactions` as `ShortDesc`.

    Returns None. Progress is printed and logged.
    """
    # Use the directory passed in by the caller rather than the notebook
    # global `general_path`, so the `gen_path` argument actually takes effect.
    dict_path = os.path.join(gen_path, 'dictionary.txt')
    old_reactions_path = os.path.join(gen_path, 'reactions.py')

    csv_df = get_csv(csv_path)
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('New Reactions:  {}'.format(csv_df.shape[1]))

    known_species = rmgpy.data.base.Database().getSpecies(dict_path)
    need_to_add = get_need_to_add(csv_df=csv_df, known_species=known_species)

    if len(need_to_add) > 0:
        print('New Species:  {}'.format(len(need_to_add)))
        old_entries = UPmethods.rote_load_dict(dict_path)
        all_dict_entries = UPmethods.update_dictionary_entries(old_entries, need_to_add)

        if UPmethods.check_dictionary_entries(all_dict_entries):
            UPmethods.rote_save_dictionary(new_dict_path, all_dict_entries)
        else:
            # Previously silent: the file is not (re)written in this case,
            # yet it is still loaded just below.
            logging.warning('Dictionary entries failed the check; {} was not written'.format(new_dict_path))

        known_species = rmgpy.data.base.Database().getSpecies(new_dict_path)
    else:
        print("No new species found.")

    r_db, old_db, new_db = update_reactions(old_reactions_path,
                                            csv_df,
                                            known_species,
                                            Method = method_str,
                                            ShortDesc = shortDesc_str)
    print('')
    print('Old Reactions: {}'.format(len(old_db.entries)))
    print('Reactions added: {}'.format(len(new_db.entries)))
    print('Final Reactions: {}'.format(len(r_db.entries)))

    #TODO add check for duplicates method and only save when it passes
    logging.warning('No duplicate check for reactions database')
    r_db.save(new_reactions_path)
    logging.info('Reactions and their species saved to...\n{}\n...and...\n{}\n...respectively'.format(new_reactions_path, new_dict_path))
    print('')
    print('done')

In [8]:
# Run the update using the settings from the configuration cell.
update_databases_from_csv(
    gen_path=general_path,
    csv_path=csv_path,
    new_dict_path=new_dict_path,
    new_reactions_path=new_reactions_path,
    method_str=method,
    shortDesc_str=shortDesc,
)

New Reactions:  920
New Species:  985





Old Reactions: 2490
Reactions added: 920
Final Reactions: 3410

done
