In [3]:
import argparse
import os
import time
import math
import numpy as np
import pylab
import scipy.stats
import matplotlib
matplotlib.rc('mathtext', fontset='stixsans', default='regular')
import re
import rmgpy
from rmgpy.quantity import constants
from rmgpy.kinetics import Arrhenius, ArrheniusEP, KineticsData
from rmgpy.data.base import getAllCombinations
from autotst.database import *
from rmgpy.species import Species
from rmgpy.data.rmg import RMGDatabase
import logging
from collections import defaultdict, OrderedDict
import pandas as pd
import itertools
import IPython
from IPython.display import display, Markdown
def mprint(s): display(Markdown(s))
import cPickle as pickle
# attempt at making the cells wider:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [4]:
def get_csv(path):
    add_df = pd.DataFrame.from_csv(path)

    results = defaultdict(OrderedDict)
    for i, entry in enumerate(add_df.index):
        r = OrderedDict()

        label = add_df.T[entry].name
        reactants, products = label.split('_')
        r1, r2 = reactants.split('+')
        p1, p2 = products.split('+')

        #label = label.replace('+', ' + ')
        #label = label.replace('_', ' <=> ')

        #r['label'] = label

        r['species'] = [r1, r2, p1, p2]
        #print r['species']
        assert len(r['species']) == 4
        r['d12'] = add_df.T[entry]['d12']
        r['d13'] = add_df.T[entry]['d13']
        r['d23'] = add_df.T[entry]['d23']
        assert r['d12'] > 0
        assert r['d13'] > 0
        assert r['d23'] > 0
        
        results[i] = r

    return pd.DataFrame(results)

In [5]:
def get_need_to_add(csv_df, known_species):
    found_species = {}
    need_to_add = []

    for i, row in csv_df.T.iterrows():

        r1, r2, p1, p2 = row['species']

        mr1 = Molecule(SMILES = r1)
        mr2 = Molecule(SMILES = r2)
        mp1 = Molecule(SMILES = p1)
        mp2 = Molecule(SMILES = p2)

        reaction = Reaction(reactants = [mr1, mr2],
                            products = [mp1, mp2],
                            degeneracy = 1,
                            duplicate = False,
                            reversible = True)
        
        relavent_species = [mr1, mr2, mp1, mp2]
        relavent_labels = {}

        for rel_species in relavent_species:
            for label in known_species:
                known_spec = known_species[label]
                if known_spec.isIsomorphic(rel_species):
                    found_species[rel_species] = label
                    relavent_labels[rel_species] = label

            if rel_species not in found_species.keys():
                need_to_add.append(rel_species.toSMILES())
            """try:
                a = found_species[rel_species]
            except:
                need_to_add.append(rel_species.toSMILES())
                #relavent_labels[rel_species] = '****'
                #logging.warning('{} is missing from species dictionary'.format(rel_species))

            """
    need_to_add = list(set(need_to_add))
    
    return need_to_add

In [6]:
def get_unknown_species(reactions, known_species):
    """
    Expects list of auto-TST reactions and known species from a species dictionary
    
    Returns unique list of SMILES of species not in the dictionary
    """
    found_species = {}
    need_to_add = []

    for i, reaction in enumerate(reactions):
        rmg_reaction = reaction.rmg_reaction
        r1, r2 = rmg_reaction.reactants
        p1, p2 = rmg_reaction.products
        
        relavent_species = [r1, r2, p1, p2]
        relavent_labels = {}

        for rel_species in relavent_species:
            for label in known_species:
                known_spec = known_species[label]
                if known_spec.isIsomorphic(rel_species):
                    found_species[rel_species] = label
                    relavent_labels[rel_species] = label

            if rel_species not in found_species.keys():
                need_to_add.append(rel_species.toSMILES())

    need_to_add = list(set(need_to_add))
    
    return need_to_add

In [7]:
def update_dictionary_entries(old_entries, need_to_add):
    list(set(need_to_add))
    for j, species in enumerate(need_to_add):
        
        molecule = Molecule(SMILES = species)
        adjlist = molecule.toAdjacencyList()
        
        multiplicity = None
        multiplicity =  adjlist[adjlist.find("multiplicity ")+13:adjlist.find("multiplicity ")+14]
        if multiplicity is not None:
            adjlist = re.sub(r'multiplicity .*', 'multiplicity [{}]'.format(multiplicity), adjlist)

        group = rmgpy.molecule.group.Group()
        group.fromAdjacencyList(adjlist)

        atom_counts = {}
        rel_label = ''
        for atom in ['C', 'H', 'O']:
            count = species.count(atom)
            if count > 0:
                rel_label = rel_label + atom + str(count)
                
        assert rel_label != ''
        
        """
        3 Scenerios:
        No old -> no need for ID number: max_ID = -1
        Only one old -> needs to have ID of 1: max_ID = 0
        Multiple old -> needs to have a unique ID: max_ID > 0
        """
        
        new_ID = None
        max_ID = -1
        duplicate = False
        for old_label in old_entries:
            old_entry = old_entries[old_label]
            
            if group.isIsomorphic(old_entry.item):
                duplicate = True
                print '{} found to be duplicate'.format(old_entry)
                continue
            
            if rel_label not in old_label:
                continue
            
            
            
            if rel_label == old_label and max_ID == -1:
                # Atleast one with same label
                max_ID = 0
                
            if old_label.find('-') > 0:
                old_label, ID_str = old_label.split('-')
                ID = int(ID_str)
                
                if old_label == rel_label and ID > max_ID:
                    # Multiple exisitng labels
                    max_ID = ID

        if max_ID > -1:
            #All with old labels
            new_ID = max_ID + 1
            rel_label = rel_label + '-' + str(new_ID)

        if not duplicate:
            entry = Entry()
            entry.label = rel_label
            entry.item = group
            assert rel_label not in old_entries.keys()
            old_entries[rel_label] = entry
        
        
    entry_labels = [old_entries[key].label for key in old_entries]
    
    assert len(entry_labels) == len(list(set(entry_labels))), 'Non-unique labels in dictionary'
    
    
    return old_entries

In [8]:
""""def update_dictionary_entries(loud_path, need_to_add):
    
    library = rmgpy.data.kinetics.library.KineticsLibrary()
    pattern = True
    library.loadOldDictionary(load_path, pattern=pattern)

    print 'Starting Species:', len(library.entries.values()) 

    for i, entry in enumerate(library.entries.values()):
        entry.index = i
        library.entries[entry.index] = entry
        del library.entries[entry.label]
    
    print 'Starting Species:', len(entries)

    # Recycling i to help make unique indices moving forward
    i = i+2

    for j, species in enumerate(need_to_add):
        mole = Molecule(SMILES = species)

        adjlist = mole.toAdjacencyList()
        multiplicity = None
        multiplicity =  adjlist[adjlist.find("multiplicity ")+13:adjlist.find("multiplicity ")+14]
        if multiplicity is not None:
            adjlist = re.sub(r'multiplicity .*', 'multiplicity [{}]'.format(multiplicity), adjlist)

        group = rmgpy.molecule.group.Group()
        group.fromAdjacencyList(adjlist)

        atom_counts = {}
        rel_label = ''
        for atom in ['C', 'H', 'O']:
            atom_counts[atom] = species.count(atom)
            if species.count(atom) > 0:
                rel_label = rel_label + atom + str(species.count(atom))
        """
        3 Scenerios:
        No existing -> no need for ID number: max_ID = -1
        Only one existing -> needs to have ID of 1: max_ID = 0
        Multiple existing -> needs to have a unique ID: max_ID > 0
        """
        new_ID = None
        max_ID = -1
        for entry in library.entries.values():
            existing_label = entry.label

            if rel_label not in existing_label:
                continue

            if rel_label == existing_label:
                # Atleast one, but not necessarily more existing with same label
                max_ID = 0

            #print rel_label, ' : ', existing_label
            if existing_label.find('-') > 0:
                ID_str = existing_label[existing_label.find('-')+1:]
                ID = int(ID_str)
                existing_label = existing_label[:existing_label.find('-')]

                if existing_label == rel_label and ID > max_ID:
                    # Multiple exisitng labels
                    max_ID = ID

        if max_ID > -1:
            #All with existing labels
            new_ID = max_ID + 1
            rel_label = rel_label + '-' + str(new_ID)

        #print '\t', rel_label, ':', max_ID

        library.loadEntry(i+j,
                          rel_label,
                          None,
                          degeneracy=1,
                          duplicate=False,
                          reversible=True,
                          reference=None,
                          referenceType='',
                          shortDesc='',
                          longDesc='',
                          has_pdep_route=False,
                          )

        library.entries[i+j].item = group

        
    entry_labels = [entry.label for entry in library.entries.values()]
    assert len(entry_labels) == len(list(set(entry_labels))), 'Non-unique labels in dictionary'

    print 'Final Species:', len(library.entries)
    
    return library.entries.values()""""

IndentationError: unexpected indent (<ipython-input-8-58bd23532fa1>, line 38)

In [9]:
def get_updated_dictionary_entries(load_path, need_to_add):
    
    library = rmgpy.data.kinetics.library.KineticsLibrary()
    pattern = True
    library.loadOldDictionary(load_path, pattern=pattern)

    print 'Starting Species:', len(library.entries.values()) 

    for i, entry in enumerate(library.entries.values()):
        entry.index = i
        library.entries[entry.index] = entry
        del library.entries[entry.label]
    
    # Recycling i to help make unique indices moving forward
    i = i+2

    for j, species in enumerate(need_to_add):
        mole = Molecule(SMILES = species)

        adjlist = mole.toAdjacencyList()
        multiplicity = None
        multiplicity =  adjlist[adjlist.find("multiplicity ")+13:adjlist.find("multiplicity ")+14]
        if multiplicity is not None:
            adjlist = re.sub(r'multiplicity .*', 'multiplicity [{}]'.format(multiplicity), adjlist)

        group = rmgpy.molecule.group.Group()
        group.fromAdjacencyList(adjlist)

        atom_counts = {}
        rel_label = ''
        for atom in ['C', 'H', 'O']:
            atom_counts[atom] = species.count(atom)
            if species.count(atom) > 0:
                rel_label = rel_label + atom + str(species.count(atom))
        """
        3 Scenerios:
        No existing -> no need for ID number: max_ID = -1
        Only one existing -> needs to have ID of 1: max_ID = 0
        Multiple existing -> needs to have a unique ID: max_ID > 0
        """
        new_ID = None
        max_ID = -1
        for entry in library.entries.values():
            existing_label = entry.label

            if rel_label not in existing_label:
                continue

            if rel_label == existing_label:
                # Atleast one, but not necessarily more existing with same label
                max_ID = 0

            #print rel_label, ' : ', existing_label
            if existing_label.find('-') > 0:
                ID_str = existing_label[existing_label.find('-')+1:]
                ID = int(ID_str)
                existing_label = existing_label[:existing_label.find('-')]

                if existing_label == rel_label and ID > max_ID:
                    # Multiple exisitng labels
                    max_ID = ID

        if max_ID > -1:
            #All with existing labels
            new_ID = max_ID + 1
            rel_label = rel_label + '-' + str(new_ID)

        #print '\t', rel_label, ':', max_ID

        library.loadEntry(i+j,
                          rel_label,
                          None,
                          degeneracy=1,
                          duplicate=False,
                          reversible=True,
                          reference=None,
                          referenceType='',
                          shortDesc='',
                          longDesc='',
                          has_pdep_route=False,
                          )

        library.entries[i+j].item = group

        
    entry_labels = [entry.label for entry in library.entries.values()]
    assert len(entry_labels) == len(list(set(entry_labels))), 'Non-unique labels in dictionary'

    print 'Final Species:', len(library.entries)
    
    return library.entries.values()

In [10]:
def check_dictionary(dict_entries):
    entry_indices = []
    entry_adjlists = []
    entry_labels = []
    for entry in dict_entries:
        adjlist = entry.item.toAdjacencyList()
        adjlist = re.sub('\[', '', adjlist)
        adjlist = re.sub('\]', '', adjlist)
        assert entry.index not in entry_indices, 'Non-unique indices for dictionary'
        assert adjlist not in entry_adjlists, 'Non-unique adjacencies for dictionary'
        assert entry.label not in entry_labels, 'Non-unique labels for dictionary'

        entry_indices.append(entry.index)
        entry_adjlists.append(entry.item.toAdjacencyList())
        entry_labels.append(entry.label)
    
    print 'Checked dictionary'
    return True

In [11]:
def check_dictionary_entries(dict_entries):
    entry_indices = []
    entry_adjlists = []
    entry_labels = []
    for entry in dict_entries.values():
        adjlist = entry.item.toAdjacencyList()

        assert adjlist not in entry_adjlists, 'Non-unique adjacencies for dictionary'
        assert entry.label not in entry_labels, 'Non-unique labels for dictionary'

        entry_indices.append(entry.index)
        entry_adjlists.append(entry.item.toAdjacencyList())
        entry_labels.append(entry.label)
    
    print 'Checked dictionary'
    return True

In [12]:
def check_dictionary(dict_entries):
    entry_indices = []
    entry_adjlists = []
    entry_labels = []
    for entry in dict_entries:
        adjlist = entry.item.toAdjacencyList()

        assert adjlist not in entry_adjlists, 'Non-unique adjacencies for dictionary'
        assert entry.label not in entry_labels, 'Non-unique labels for dictionary'

        entry_indices.append(entry.index)
        entry_adjlists.append(entry.item.toAdjacencyList())
        entry_labels.append(entry.label)
    
    print 'Checked dictionary'
    return True

In [13]:
def generate_bracket_dict(load_path):

    library = rmgpy.data.kinetics.library.KineticsLibrary()
    pattern = True
    library.loadOldDictionary(load_path, pattern=pattern)
    entries = library.entries.values()
    
   
    bracket_path = os.path.join(load_path, '..', 'bracket_dictionary.txt')
    f = open(bracket_path, 'w')
    
    for entry in entries:
        f.write(entry.label)
        f.write('\n')
        f.write(entry.item.toAdjacencyList())
        f.write('\n')
                
    f.close()
    print 'Created "bracket dictionary" with multiplicities as lists'
    return

In [14]:
def rote_load_dict(path):
    with open(path, 'r') as f:
        entries_str = f.read().split('\n\n')

    entries = {}
    for entry_str in entries_str:
        label, adjlist = entry_str.split('\n', 1)
        if re.search('(?<=multiplicity ).*', adjlist):
            multiplicity = int(re.search('(?<=multiplicity ).*', adjlist).group(0))
            adjlist = adjlist.split('\n', 1)[1]
            adjlist = 'multiplicity [{}]\n'.format(multiplicity) + adjlist

        group = rmgpy.molecule.group.Group()
        group.fromAdjacencyList(adjlist)

        entry = Entry()
        entry.item = group
        entry.label = label
        entries[label] = entry
    
    print 'Old dictionary:', len(entries)
    
    return entries

In [15]:
def ugly_save_dictionary(path, entries):
    f = open(path, 'w')
    for entry in entries:
        multiplicity = entry.item.multiplicity
        adjlist = entry.item.toAdjacencyList()
        if multiplicity is not None:
            #adjlist = re.sub(r'multiplicity .*', 'multiplicity {}'.format(multiplicity), adjlist)
            adjlist = re.sub('\[', '', adjlist)
            adjlist = re.sub('\]', '', adjlist)
        f.write(entry.label)
        f.write('\n')
        f.write(adjlist)
        f.write('\n')
                
    f.close()
    print 'Saved new dictionary'
    return

In [16]:
def rote_save_dictionary(path, entries):
    f = open(path, 'w')
    for entry in entries.values():
        multiplicity = entry.item.multiplicity
        adjlist = entry.item.toAdjacencyList()
        if multiplicity is not None:
            adjlist = re.sub('\[', '', adjlist)
            adjlist = re.sub('\]', '', adjlist)
        f.write(entry.label)
        f.write('\n')
        f.write(adjlist)
        f.write('\n')
                
    f.close()
    print 'Saved new dictionary'
    return

In [17]:
def update_reactions(path, csv_df, known_species, Method = '', ShortDesc = ''):
    # Loading reactions database
    from autotst.database import TransitionStateDepository, DistanceData
    r_db = TransitionStateDepository()
    local_context = {'DistanceData': DistanceData}
    r_db.load(path, local_context=local_context)
    
    # Old r_db only has reactions already in reactions.py
    old_r_db = TransitionStateDepository()
    old_r_db.load(path, local_context=local_context)
    
    # New r_db will contain new reactions from the csv_df
    new_r_db = TransitionStateDepository()

    found_species = {}
    need_to_add = []

    Index = 0
    for entry in r_db.entries.values():
        if Index < entry.index:
            Index = entry.index
    Index = Index + 1

    for i, row in csv_df.T.iterrows():
        #every reaction needs: distances, method, shortDesc, label, and reaction object

        r1, r2, p1, p2 = row['species']

        mr1 = Molecule(SMILES = r1)
        mr2 = Molecule(SMILES = r2)
        mp1 = Molecule(SMILES = p1)
        mp2 = Molecule(SMILES = p2)

        reaction = Reaction(reactants = [Species(molecule=[mr1]), Species(molecule=[mr2])],
                            products = [Species(molecule=[mp1]), Species(molecule=[mp2])],
                            degeneracy = 1,
                            duplicate = False,
                            reversible = True)


        Distances = {'d12':row['d12'], 'd13':row['d13'], 'd23':row['d23']}
        distance_data = DistanceData(distances = Distances, method = Method)

        relavent_species = [mr1, mr2, mp1, mp2]
        relavent_labels = {}

        for rel_species in relavent_species:
            for label in known_species:
                known_spec = known_species[label]
                if known_spec.isIsomorphic(rel_species):
                    found_species[rel_species] = label
                    relavent_labels[rel_species] = label

            
            if rel_species not in found_species.keys():
                need_to_add.append(rel_species.toSMILES())

        lr1 = relavent_labels[mr1]
        lr2 = relavent_labels[mr2]
        lp1 = relavent_labels[mp1]
        lp2 = relavent_labels[mp2]

        Label = '{} + {} <=> {} + {}'.format(lr1, lr2, lp1, lp2)
        #print Label

        # adding new entries to r_db, r_db will contain old and new reactions
        r_db.loadEntry(Index + i,
                      reactant1=None,
                      reactant2=None,
                      reactant3=None,
                      product1=None,
                      product2=None,
                      product3=None,
                      distances = distance_data,
                      degeneracy=1,
                      label = Label,
                      duplicate=False,
                      reversible=True,
                      reference=None,
                      referenceType = '',
                      shortDesc = ShortDesc,
                      longDesc = '',
                      rank=None,
                      )

        r_db.entries['{0:d}:{1}'.format(Index + i, Label)].item = reaction

        # Adding new reactions to the new r_db as well
        new_r_db.loadEntry(Index + i,
                      reactant1=None,
                      reactant2=None,
                      reactant3=None,
                      product1=None,
                      product2=None,
                      product3=None,
                      distances = distance_data,
                      degeneracy=1,
                      label = Label,
                      duplicate=False,
                      reversible=True,
                      reference=None,
                      referenceType = '',
                      shortDesc = ShortDesc,
                      longDesc = '',
                      rank=None,
                      )

        new_r_db.entries['{0:d}:{1}'.format(Index + i, Label)].item = reaction

    need_to_add = list(set(need_to_add))
    
    assert len(need_to_add) == 0, 'Species missing from dictionary'
    assert len(r_db.entries) > len(old_r_db.entries) and len(r_db.entries) > len(new_r_db.entries)
    assert len(r_db.entries) == len(old_r_db.entries) + len(new_r_db.entries) 
    
    return r_db, old_r_db, new_r_db

In [18]:
def update_known_reactions(path, reactions, known_species, method='', shortDesc=''):
    # Loading reactions database
    from autotst.database import TransitionStateDepository, DistanceData
    r_db = TransitionStateDepository()
    local_context = {'DistanceData': DistanceData}
    r_db.load(path, local_context=local_context)
    
    # Old r_db only has reactions already in reactions.py
    old_r_db = TransitionStateDepository()
    old_r_db.load(path, local_context=local_context)
    
    # New r_db will contain new reactions from the csv_df
    new_r_db = TransitionStateDepository()

    found_species = {}
    need_to_add = []

    Index = 0
    for entry in r_db.entries.values():
        if Index < entry.index:
            Index = entry.index
    Index = Index + 1
    
    for i, reaction in enumerate(reactions):
        Distances = reaction.distance_data.distances
        distance_data = DistanceData(distances = Distances, method = method)
        
        rmg_reaction = reaction.rmg_reaction
        r1, r2 = rmg_reaction.reactants
        p1, p2 = rmg_reaction.products
        
        relavent_species = [r1, r2, p1, p2]
        relavent_labels = {}

        for rel_species in relavent_species:
            for label in known_species:
                known_spec = known_species[label]
                if known_spec.isIsomorphic(rel_species):
                    found_species[rel_species] = label
            
            if rel_species not in found_species.keys():
                need_to_add.append(rel_species.toSMILES())
                logging.warning('{} not found in species dictionary'.format(rel_species))
        
        lr1 = found_species[r1]
        lr2 = found_species[r2]
        lp1 = found_species[p1]
        lp2 = found_species[p2]

        Label = '{} + {} <=> {} + {}'.format(lr1, lr2, lp1, lp2)
        #print Label

        # adding new entries to r_db, r_db will contain old and new reactions
        r_db.loadEntry(Index + i,
                      reactant1=None,
                      reactant2=None,
                      reactant3=None,
                      product1=None,
                      product2=None,
                      product3=None,
                      distances = distance_data,
                      degeneracy=1,
                      label = Label,
                      duplicate=False,
                      reversible=True,
                      reference=None,
                      referenceType = '',
                      shortDesc = shortDesc,
                      longDesc = '',
                      rank=None,
                      )

        r_db.entries['{0:d}:{1}'.format(Index + i, Label)].item = rmg_reaction

        # Adding new reactions to the new r_db as well
        new_r_db.loadEntry(Index + i,
                      reactant1=None,
                      reactant2=None,
                      reactant3=None,
                      product1=None,
                      product2=None,
                      product3=None,
                      distances = distance_data,
                      degeneracy=1,
                      label = Label,
                      duplicate=False,
                      reversible=True,
                      reference=None,
                      referenceType = '',
                      shortDesc = shortDesc,
                      longDesc = '',
                      rank=None,
                      )

        new_r_db.entries['{0:d}:{1}'.format(Index + i, Label)].item = rmg_reaction

    need_to_add = list(set(need_to_add))
    
    assert len(need_to_add) == 0, 'Species missing from dictionary'
    assert len(r_db.entries) > len(old_r_db.entries) and len(r_db.entries) > len(new_r_db.entries)
    assert len(r_db.entries) == len(old_r_db.entries) + len(new_r_db.entries) 
    
    return r_db, old_r_db, new_r_db

In [19]:
def update_databases_from_csv():
    #csv_path = os.path.join(os.path.expandvars('$RMGpy'), "..",  'AutoTST')

    csv_path = 'distance_data.csv'
    dict_path = 'database/H_Abstraction/TS_training/dictionary.txt'
    old_style_dict_path = 'database/H_Abstraction/TS_training/old_dictionary.txt'
    new_dict_path = 'updated_dictionary.txt'
    method_str = 'm062x/6-311+G(2df,2p)'
    shortDesc_str = 'M06-2X/6-311+G(2df,2p) calculation via group additive TS generator.'
    old_reactions_path = 'database/H_Abstraction/TS_training/reactions.py'
    new_reactions_path = 'updated_reactions.py'

    csv_df = get_csv(csv_path)
    print 'New Reactions: ',csv_df.shape[1]

    known_species = rmgpy.data.base.Database().getSpecies(dict_path)
    need_to_add = get_need_to_add(csv_df=csv_df, known_species=known_species)
    print 'New Species: ', len(need_to_add)

    all_dict_entries = get_updated_dictionary_entries(old_style_dict_path, need_to_add)
    if check_dictionary(all_dict_entries):
        ugly_save_dictionary(new_dict_path, all_dict_entries)

    known_species = rmgpy.data.base.Database().getSpecies(new_dict_path)
    r_db, old_db, new_db = update_reactions(old_reactions_path,
                                            csv_df,
                                            known_species,
                                            Method = method_str,
                                            ShortDesc = shortDesc_str
                                           )
    #TODO add check for duplicates method
    #if check_reactions():
    if True:
        r_db.save(new_reactions_path)
    print
    print 'done?'
    return new_db

In [20]:
new_db = update_databases_from_csv()

New Reactions:  920
New Species:  985
Starting Species: 282
Final Species: 1267
Checked dictionary
Saved new dictionary

done?


In [21]:
def update_databases(reactions, method='', shortDesc='', reaction_family=''):
    print 'Reactions to add:', len(reactions)
    print
    if reaction_family == '':
        reaction_family = 'H_Abstraction'
        logging.warning('Defaulting to reaction family of {}'.format(reaction_family))
    
    general_path = os.path.join(os.path.expandvars('$RMGpy'), '..', 'AutoTST', 'database', reaction_family, 'TS_training')
    dict_path = os.path.join(general_path, 'dictionary.txt')
    new_dict_path = os.path.join(general_path, 'updated_dictionary.txt')
    old_reactions_path = os.path.join(general_path, 'reactions.py')
    new_reactions_path = os.path.join(general_path, 'updated_reactions.py')

    known_species = rmgpy.data.base.Database().getSpecies(dict_path)
    print 'Known Species: ', len(known_species)
    unknown_species = get_unknown_species(reactions, known_species)
    print 'New Species: ', len(unknown_species)
    print
    
    updated_known_species = []
    if len(unknown_species) > 0:
        old_dict_entries = rote_load_dict(dict_path)

        assert len(known_species) == len(old_dict_entries)

        all_dict_entries = update_dictionary_entries(old_dict_entries, unknown_species)
        print 'New Dictionary:', len(all_dict_entries)
        assert len(known_species) + len(unknown_species) == len(all_dict_entries)
        #assert len(unknown_species) == len(all_dict_entries) - len(old_dict_entries)
        
        if check_dictionary_entries(all_dict_entries):
            rote_save_dictionary(new_dict_path, all_dict_entries)
    
        updated_known_species = rmgpy.data.base.Database().getSpecies(new_dict_path)
        unk_spec = get_unknown_species(reactions, updated_known_species)
        assert len(unk_spec) == 0, '{} unknown species found after updating'.format(len(unk_spec))
    else:
        updated_known_species = known_species
    
    r_db, old_db, new_db = update_known_reactions(old_reactions_path,
                                                  reactions,
                                                  updated_known_species,
                                                  method = method,
                                                  shortDesc = shortDesc
                                                 )
    print
    print 'Old Reactions:', len(old_db.entries)
    print 'Reactions added:', len(new_db.entries)
    print 'Final Reactions:', len(r_db.entries)
    
    #TODO add check for duplicates method
    #if check_reactions_database():
    if True:
        logging.warning('No duplicate check for reactions database')
        r_db.save(new_reactions_path)
    print
    print 'done?'

In [22]:
update_databases(test_reactions, method='Method goes here', shortDesc='description goes here')



Reactions to add: 0

Known Species:  282
New Species:  0



AssertionError: 

In [23]:
a = 'C10H1O1-1'
b, c = a.split('-')
c

'1'

In [29]:
from autotst.reaction import AutoTST_Reaction
label = 'C#CC+[O]O_C#C[CH2]+OO'
reaction = AutoTST_Reaction(label, reaction_family='H_Abstraction')

test_reactions = []
count = 0
for entry in new_db.entries.values():
    reaction = None
    r1, r2 = entry.item.reactants
    p1, p2 = entry.item.products
    
    r1 = r1.molecule[0].toSMILES()
    r2 = r2.molecule[0].toSMILES()
    p1 = p1.molecule[0].toSMILES()
    p2 = p2.molecule[0].toSMILES()
    
    label = '{}+{}_{}+{}'.format(r1, r2, p1, p2)
    #print label
    if len(r2) > 12 and count < 10:
        count += 1
        #reaction = AutoTST_Reaction(label, reaction_family='H_Abstraction')
        #print repr(reaction)
        test_reactions.append(label)

"""mr1 = Molecule(SMILES = "C#CC")
mr2 = Molecule(SMILES = "[O]O")
mp1 = Molecule(SMILES = "C#C[CH2]")
mp2 = Molecule(SMILES = "OO")

reaction = Reaction(reactants = [mr1, mr2],
                            products = [mp1, mp2],
                            degeneracy = 1,
                            duplicate = False,
                            reversible = True)"""



thermo.py:839 loadLibraries INFO Loading thermodynamics library from primaryThermoLibrary.py in /home/C-Underkoffler/Code/RMG-Py/../RMG-database/input/thermo/libraries...
thermo.py:839 loadLibraries INFO Loading thermodynamics library from thermo_DFT_CCSDTF12_BAC.py in /home/C-Underkoffler/Code/RMG-Py/../RMG-database/input/thermo/libraries...
thermo.py:839 loadLibraries INFO Loading thermodynamics library from CBS_QB3_1dHR.py in /home/C-Underkoffler/Code/RMG-Py/../RMG-database/input/thermo/libraries...
thermo.py:856 loadGroups INFO Loading thermodynamics group database from /home/C-Underkoffler/Code/RMG-Py/../RMG-database/input/thermo/groups...
transport.py:294 loadGroups INFO Loading transport group database from /home/C-Underkoffler/Code/RMG-Py/../RMG-database/input/transport/groups...
database.py:167 loadFamilies INFO Loading the user-specified kinetics families from /home/C-Underkoffler/Code/RMG-Py/../RMG-database/input/kinetics/families
statmech.py:526 loadGroups INFO Loading freq

'mr1 = Molecule(SMILES = "C#CC")\nmr2 = Molecule(SMILES = "[O]O")\nmp1 = Molecule(SMILES = "C#C[CH2]")\nmp2 = Molecule(SMILES = "OO")\n\nreaction = Reaction(reactants = [mr1, mr2],\n                            products = [mp1, mp2],\n                            degeneracy = 1,\n                            duplicate = False,\n                            reversible = True)'

In [None]:
reaction.distance_data.distances
#reaction.get_reactants_and_products()
r1, r2 = reaction.reactant_mols
r1 = r1.rmg_molecule
r1
test_reaction = reaction
test_reaction

In [30]:
test_list = [ repr(reaction) for reaction in test_reactions]
test_reactions

['C+[CH2]COC(=O)C(C)C_CCOC(=O)C(C)C+[CH3]',
 'O+[CH2]C(C)CC(C)C(C)=O_CC(=O)C(C)CC(C)C+[OH]',
 'O+[CH2]C(C)OC(CC)CC_CCC(CC)OC(C)C+[OH]',
 'O+[CH2]COC(=O)C(C)C_CCOC(=O)C(C)C+[OH]',
 'O+[CH]=CC(C)=CCCC(=C)C_C=CC(C)=CCCC(=C)C+[OH]',
 'OO+[CH2]C(=C)C(=O)CC_C=C(C)C(=O)CC+[O]O',
 'OO+[CH2]C(=C)CC(C)=O_C=C(C)CC(C)=O+[O]O',
 'OO+[CH2]C(=C)CCC=C(C)C=C_C=CC(C)=CCCC(=C)C+[O]O',
 'OO+[CH2]C(=O)C(=C)C_C=C(C)C(C)=O+[O]O',
 'OO+[CH2]C(=O)C(C)(C)C_CC(=O)C(C)(C)C+[O]O']

In [None]:
print 'Composite:', len(r_db.entries)
print 'Old:', len(old_db.entries)
print 'New', len(new_db.entries)

#check_reactions(r_db, old_db, new_db)

In [None]:
checked_entries = []
print 'Total:', len(old_db.entries)

matches = 0
exact_match = []
mismatch = []
mismatches = 0
for entry in old_db.entries.values():
    #print entry.item
    for checked_entry in checked_entries:
        if entry.item.isIsomorphic(checked_entry.item):
            e_dists = set(entry.data.distances.values())
            checked_e_dists = set(checked_entry.data.distances.values())
            if e_dists == checked_e_dists:
                exact_match.append(entry)
                matches = matches + 1
                """print 'found exact match'
                print entry.item
                print entry.data
                print entry.item.duplicate
                print checked_entry.item
                print checked_entry.data
                print checked_entry.item.duplicate
                assert False"""
            elif e_dists != checked_e_dists:
                mismatch.append(entry)
                mismatches = mismatches + 1

    checked_entries.append(entry)
    
print 'Matches: ', matches
print 'Mismatches: ', mismatches 
print

In [None]:
print 'Total:', len(new_db.entries)
checked_entries = {}
matches = 0
exact_match = []
mismatch = []
mismatches = 0
for key in new_db.entries:
    entry = new_db.entries[key]
    for checked_key in checked_entries:
        checked_entry = checked_entries[checked_key]
        if entry.item.isIsomorphic(checked_entry.item, eitherDirection=False):
            
            e_dists = set(entry.data.distances.values())
            checked_e_dists = set(checked_entry.data.distances.values())
            
            if e_dists == checked_e_dists:
                exact_match.append(key)
                matches = matches + 1
                #del new_db.entries[key]
            
            elif e_dists != checked_e_dists:
                mismatches = mismatches + 1  
                print 'found mismatch'
                print entry.item
                print entry.data.distances
                print entry.item.duplicate
                print
                print checked_entry.item
                print checked_entry.data.distances
                print checked_entry.item.duplicate
                assert False
                """"if Nate criteria:
                    mismatches.append(key)
                """
                
            
    checked_entries[key] = entry
    
print 'Matches: ', matches
print 'Mismatches: ', mismatches 
print

In [None]:
for duplicate in exact_match:
    del new_db.entries[key]

print len(new_db.entries.keys())

In [None]:
new_db.entries.keys()

In [None]:
Species(molecule= [Molecule(SMILES = 'CCCC')])


# TODO Update Dictionary

In [None]:
db = rmgpy.data.base.Database()
x = this_is_mine(db)

path = os.path.join(os.getcwd(), 'database/H_Abstraction/TS_training/old_dictionary.txt')
pattern = True

x.loadOldDictionary(path, pattern=pattern)

x.database.entries.values()

In [None]:
reactants = [Molecule(SMILES="CCCC"), Molecule(SMILES="[O]O")]
products = [Molecule(SMILES="[CH2]CCC"), Molecule(SMILES="OO")]
rmg_reaction = Reaction(reactants=reactants, products=products)
rmg_reaction.degeneracy

print rmg_reaction

global_context = { '__builtins__': None }

from autotst.database import DistanceData
local_context = {"entry": Entry, 'DistanceData': DistanceData, "nan":np.nan}
r_db = TransitionStateDepository()
path = os.path.join(os.getcwd(), 'database/H_Abstraction/TS_training/reactions.py')
r_db.load(path, local_context=local_context)
#r_db.loadEntry
#r_db.entries

In [None]:
#path = os.path.join(os.path.expandvars('$RMGpy'), "..",  'AutoTST', 'database')
#path = os.path.join(path, 'H_Abstraction', 'TS_training', 'reactions.py')

path = os.path.join(os.getcwd(), 'database/H_Abstraction/TS_training/reactions.py')

r_df = get_existing_reactions(path)
r_df.shape

In [None]:
r_df.T.dropna(subset = ['label']).T
r_df.drop_duplicates()
r_df.shape

In [None]:
r_df

In [None]:
csv_df

In [None]:
r_db.s

In [None]:
print 'Found:', len(found_species)
print 'Need to add:', len(need_to_add)
print len(known_species)

In [None]:
for entry in r_db.entries.values():
    if isinstance(entry.item, Reaction):
        #Write out additional data if depository or library
        #kinetic rules would have a Group object for its reactants instead of Species
        try:
            if isinstance(entry.item.reactants[0], Species):
                # Add degeneracy if the reaction is coming from a depository or kinetics library
                #print '    degeneracy = {0:.1f},\n'.format(entry.item.degeneracy)
                x = 1
        except:
            print entry
            print entry.index
            print entry.item
            print entry.item.reactants
            break

r_db.save('test_saving')
print

In [None]:
found_species = {}
need_to_add = []
for specieseseses in csv_df.T['label']:
    for species in specieseseses:
        mole = Molecule(SMILES = species)
        
        for label in known_species:
            known_spec = known_species[label]
            if known_spec.isIsomorphic(mole):
                found_species[mole] = label
        
        try:
            a = found_species[mole]
        except:
            need_to_add.append(mole)



In [None]:
print 'Found:', len(found_species)
print 'Need to add:', len(need_to_add)

In [None]:
def trim_DF(DF):
    #print DF.shape

    #print '\t dropping NA Reactions'
    DF = DF.T.dropna(subset = ['label']).T
    
    #print '\t dropping NA Distances'
    DF = DF.T.dropna(subset = ['d12']).T
    DF = DF.T.dropna(subset = ['d13']).T
    DF = DF.T.dropna(subset = ['d23']).T
    #print DF.shape

    #print '\t dropping Column Duplicates'
    DF = DF.T.drop_duplicates().T
    #print DF.shape

    
    #print '\t dropping Reaction Duplicates'
    
    #   sorting priorities: older > index
    DF = DF.T.sort_values(['older reaction data', 'index']).T
    DF = DF.T.drop_duplicates(['label'], keep = 'first').T
    #print DF.shape

    return DF

In [None]:
print '\tOld:'
print r_df.count(axis = 1)
print
print '\tAdditional:'
print csv_df.count(axis = 1)

DF = pd.concat([r_df, csv_df], axis = 1, ignore_index = True)
print
print '\tBefore Trimming:'
print DF.count(axis = 1)
DF = trim_DF(DF)

for index in DF.T.index:
    DF[index]['index'] = index + 1

print
print '\tAfter Trimming:'
print DF.count(axis = 1)
DF

In [None]:
def get_existing_reactions(path):
    global_context = { '__builtins__': None }

    from autotst.database import DistanceData
    local_context = {'DistanceData': DistanceData, "nan":np.nan}
    r_db = TransitionStateDepository()
    r_db.load(path, local_context=local_context)

    results = defaultdict(OrderedDict)

    for i, entry in enumerate(r_db.entries.values()):
        r = OrderedDict()
        r['index'] = entry.index
        r['label'] = entry.label
        r['degeneracy'] = entry.item.degeneracy
        r['rank'] = entry.rank
        r['method'] = entry.data.method
        r['shortdesc'] = entry.shortDesc
        r['older reaction data'] = 1

        for key in entry.data.distances:
            r[key] = entry.data.distances[key]

        results[i] = r

    reactions_df = pd.DataFrame(results)
    return reactions_df

In [None]:
for index in range(DF.shape[1]-1):
        new_entry = rmgpy.data.base.Entry()
        
        new_entry.index = DF[index]['index']
        new_entry.label = DF[index]['label']
        #degeneracy = DF[index]['degeneracy']
        reactants, products = DF[index]['label'].split(' <=> ')
        print reactants
        print products
        r1, r2 = reactants.split(' + ')
        p1, p2 = products.split(' + ')
        #TODO Need to put reactanst and products into smiles
        """reactants = [Molecule(SMILES=r1), Molecule(SMILES=r2)]
        products = [Molecule(SMILES=p1), Molecule(SMILES=p2)]
        rmg_reaction = Reaction(reactants=reactants, products=products)
        degeneracy = rmg_reaction.degeneracy"""
        
        d12 = DF[index]['d12']
        d13 = DF[index]['d13']
        d23 = DF[index]['d23']
        
        print new_entry.data
        break
        new_entry.data.method = DF[index]['method']
        new_entry.data.distances = {'d12':d12, 'd13':d13, 'd23':d23}
        
        rank = DF[index]['rank']
        shortDesc = DF[index]['shortDesc']
        
        break

In [None]:
def ugly_write_new_reactions(path):
    entries = []
    for index in range(DF.shape[1]-1):
        my_list = []
        index = DF[index]['index']
        label = DF[index]['label']
        degeneracy = DF[index]['degeneracy']
        d12 = DF[index]['d12']
        d13 = DF[index]['d13']
        d23 = DF[index]['d23']
        method = DF[index]['method']
        rank = DF[index]['rank']
        shortDesc = DF[index]['shortDesc']

        if rank != np.nan:
            #Two options for when rank is not specified and when it is
            #Ugly but wrote it as list to be a little easier to understand rather than large string with formatting
            my_list = ['entry(\n\tindex = ',
                       str(index),
                       ',\n\tlabel = \"',
                       label,
                       '\",\n\tdegeneracy = ',
                       str(degeneracy),
                       ',\n\tdistances = DistanceData(\n\t\tdistances = {\'d12\': ',
                       str(d12),
                       ', \'d13\': ',
                       str(d13),
                       ', \'d23\': ',
                       str(d23),
                       '},\n\t\tmethod = \'',
                       method,
                       '\',\n\t),\n\trank = ',
                       str(rank),
                       ',\n\tshortDesc = u\"\"\"',
                       shortDesc,
                       '\"\"\",\n)'
                      ]
        else:
            my_list = ['entry(\n\tindex = ',
                       str(index),
                       ',\n\tlabel = \"',
                       label,
                       '\",\n\tdegeneracy = ',
                       str(degeneracy),
                       ',\n\tdistances = DistanceData(\n\t\tdistances = {\'d12\': ',
                       str(d12),
                       ', \'d13\': ',
                       str(d13),
                       ', \'d23\': ',
                       str(d23),
                       '},\n\t\tmethod = \'',
                       method,
                       '\',\n\t),\n\tshortDesc = u\"\"\"',
                       shortDesc,
                       '\"\"\",\n)'
                      ]

        entries.append(''.join(my_list))

    f = open(path, 'w')
    f.write('\n\n'.join(entries))
    f.close()
    return

In [None]:
path = 'updated_reactions.py'
ugly_write_new_reactions(path)

# Dictionary Update
#


In [None]:
print os.getcwd()

In [None]:
global_context = { '__builtins__': None }

path = os.path.join(os.getcwd(), 'database/H_Abstraction/TS_training/reactions.py')
path = os.path.join(os.getcwd(), 'updater_test/reactions.py')
from autotst.database import DistanceData
local_context = {'DistanceData': DistanceData, "nan":np.nan}
r_db = TransitionStateDepository()
r_db.load(path, local_context=local_context)
r_db.entries

In [None]:
blem = rmgpy.data.base.Database()
blem.entries = blem.getSpecies("./database/H_Abstraction/TS_training/dictionary.txt")
blem.entries.values()[0]

In [None]:
known_species = rmgpy.data.base.Database().getSpecies("./database/H_Abstraction/TS_training/dictionary.txt")
known_species

#print known_species.values()
m1 = Molecule(SMILES="[CH2]C(C)CO")
relavent_labels = {}
for label in known_species:
    species = known_species[label]
    if species.isIsomorphic(m1):
        relavent_labels[m1] = label
        break
relavent_labels

In [None]:
known_species = rmgpy.data.base.Database().getSpecies("./database/H_Abstraction/TS_training/dictionary.txt")
p_count = 0
s_count = 0

to_add = []
total_species = []

for reaction in csv_df.T['label']:
    #print reaction
    r, p = reaction.split(' <=> ')
    r1, r2 = r.split(' + ')
    p1, p2 = p.split(' + ')
    #print r1, r2, p1, p2
    involved_species = [Molecule(SMILES=r1),
                        Molecule(SMILES=r2),
                        Molecule(SMILES=p1),
                        Molecule(SMILES=p2)]
    
    for a in involved_species:
        c = False
        for b in total_species:
            if b.isIsomorphic(a):
                c = True
        if not c:
            to_add.append(a)
    
    for new_species in involved_species:
        matches_one = False
        for old_species in known_species.values():
            if old_species.isIsomorphic(new_species):
                matches_one = True
        
        if matches_one:
            p_count += 1
        else:
            to_add[new_species] = new_species.toAdjacencyList()
            s_count += 1
            
print p_count
print s_count

print
print len(known_species)
print len(to_add)
print len(list(set(to_add.keys())))
print
to_add.keys()

In [None]:
total_sp = []

for reaction in csv_df.T['label']:
    #print reaction
    r, p = reaction.split(' <=> ')
    r1, r2 = r.split(' + ')
    p1, p2 = p.split(' + ')
    
    sp_list = [r1, r2, p1, p2]
    print reaction
    for sp in sp_list:
        total_sp.append(sp)
        
print len(total_sp)/4
total_sp = list(set(total_sp))
print len(total_sp)

In [None]:
r1, r2 = entry.split("_")[0].split("+")
m1 = Molecule(SMILES=r1)
m2 = Molecule(SMILES=r2)
combined = Molecule.merge(m1,m2)
combined.

In [None]:
t = RMGDatabase()
t

In [None]:
blem = rmgpy.data.base.Database()
blem.getSpecies("./database/H_Abstraction/TS_training/dictionary.txt")


In [None]:
blem.entries = OrderedDict({Molecule(SMILES="CCC"): e})
blem.saveDictionary("test.py")

In [None]:
#e = rmgpy.data.base.Entry(label="CCC", item=Molecule(SMILES="CCC").toAdjacencyList())
print Molecule(SMILES="CCC").toAdjacencyList()

In [None]:
rmg_database = RMGDatabase()
database_path = os.path.join(os.path.expandvars('$RMGpy'), "..",  'RMG-database', 'input')
rmg_database.load(database_path,
                 kineticsFamilies=['H_Abstraction'],
                 transportLibraries=[],
                 reactionLibraries=[],
                 seedMechanisms=[],
                 thermoLibraries=['primaryThermoLibrary', 'thermo_DFT_CCSDTF12_BAC', 'CBS_QB3_1dHR' ],
                 solvation=False,
                 )

# TODO: Edit this so it works with multiple databases

ts_database = TransitionStates()
path = os.path.join(os.path.expandvars("$RMGpy"), "..", "AutoTST", "database", "H_Abstraction")
global_context = { '__builtins__': None }
local_context={'DistanceData': DistanceData}
family = rmg_database.kinetics.families["H_Abstraction"]
ts_database.family = family
ts_database.load(path, local_context, global_context)

In [None]:
class this_is_mine:
    def __init__(self, database):
        self.database = database
    
    def loadOldDictionary(self, path, pattern):
            """
            Parse an old-style RMG database dictionary located at `path`. An RMG
            dictionary is a list of key-value pairs of a one-line string key and a
            multi-line string value. Each record is separated by at least one empty
            line. Returns a ``dict`` object with the values converted to
            :class:`Molecule` or :class:`Group` objects depending on the
            value of `pattern`.
            """

            # The dictionary being loaded
            self.database.entries = {}
            # The current record
            record = ''

            fdict=None
            # Process the dictionary file
            try:
                fdict = open(path, 'r')
                for line in fdict:
                    line = line.strip()
                    # If at blank line, end of record has been found
                    if len(line) == 0 and len(record) > 0:
                        # Label is first line of record
                        lines = record.splitlines()
                        label = lines[0]
                        # Add record to dictionary
                        self.database.entries[label] = Entry(label=label, item=record)
                        # Clear record in preparation for next iteration
                        record = ''
                    # Otherwise append line to record (if not empty and not a comment line)
                    else:
                        line = removeCommentFromLine(line).strip()
                        if len(line) > 0:
                            record += line + '\n'
                # process the last record! (after end of for loop)
                # Label is first line of record
                if record:
                    label = record.splitlines()[0]
                    # Add record to dictionary
                    self.database.entries[label] = Entry(label=label, item=record)
            except DatabaseError, e:
                logging.exception(str(e))
                raise
            except IOError, e:
                logging.exception('Database dictionary file "' + e.filename + '" not found.')
                raise
            finally:
                if fdict: fdict.close()

            # Convert the records in the dictionary to Molecule, Group, or
            # logical objects
            try:
                for label in self.database.entries:
                    record = self.database.entries[label].item
                    lines = record.splitlines()
                    # If record is a logical node, make it into one.
                    if re.match("(?i)\s*(NOT\s)?\s*(OR|AND|UNION)\s*(\{.*\})", lines[1]):
                        self.database.entries[label].item = makeLogicNode(' '.join(lines[1:]) )
                    # Otherwise convert adjacency list to molecule or pattern
                    elif pattern:
                        self.database.entries[label].item = Group().fromAdjacencyList(record)
                    else:
                        self.database.entries[label].item = Molecule().fromAdjacencyList(record,saturateH=True)
            except InvalidAdjacencyListError, e:
                logging.error('Error while loading old-style dictionary "{0}"'.format(path))
                logging.error('Error occurred while parsing adjacency list "{0}"'.format(label))
                raise
