In [1]:
import argparse
import os
import time
import math
import numpy as np
import pylab
import scipy.stats
import matplotlib
matplotlib.rc('mathtext', fontset='stixsans', default='regular')
import re
import rmgpy
from rmgpy.quantity import constants
from rmgpy.kinetics import Arrhenius, ArrheniusEP, KineticsData
from rmgpy.data.base import getAllCombinations
from autotst.database import *
from rmgpy.species import Species
from rmgpy.data.rmg import RMGDatabase
import logging
from collections import defaultdict, OrderedDict
import pandas as pd
import itertools
import IPython
from IPython.display import display, Markdown
def mprint(s):
    """Display the string `s` as rendered Markdown in the cell output."""
    rendered = Markdown(s)
    display(rendered)
import cPickle as pickle
# attempt at making the cells wider:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
def get_csv(path):
    """
    Load reaction distance data from a CSV file.

    The first CSV column is used as the row index and must hold reaction
    labels of the form ``r1+r2_p1+p2``: two reactant SMILES joined by ``+``,
    an underscore, then two product SMILES joined by ``+``.  The file must
    also provide ``d12``, ``d13`` and ``d23`` columns holding positive
    numbers.

    Returns a DataFrame with one *column* per reaction (numbered from 0 in
    file order) and the rows ``species`` (the list ``[r1, r2, p1, p2]``),
    ``d12``, ``d13`` and ``d23``.

    Raises ValueError if a label does not split into exactly two reactants
    and two products, and AssertionError if a distance is not positive.
    """
    # DataFrame.from_csv is deprecated (and removed in pandas 1.0); read_csv
    # with index_col=0 is the documented replacement.  Date parsing of the
    # index is left off because the labels must stay plain strings.
    add_df = pd.read_csv(path, index_col=0)

    results = OrderedDict()
    # iterrows() hands back each label with its row directly, instead of
    # transposing the whole frame for every lookup.
    for i, (label, row) in enumerate(add_df.iterrows()):
        reactants, products = label.split('_')
        r1, r2 = reactants.split('+')
        p1, p2 = products.split('+')

        r = OrderedDict()
        r['species'] = [r1, r2, p1, p2]
        for key in ('d12', 'd13', 'd23'):
            r[key] = row[key]
            assert r[key] > 0, '{0} must be positive for {1}'.format(key, label)

        results[i] = r

    return pd.DataFrame(results)

In [3]:
def get_need_to_add(csv_df, known_species):
    """
    Find the species used by the CSV reactions that are missing from the
    species dictionary.

    csv_df must have one column per reaction and a ``species`` row holding
    the SMILES strings of that reaction's species (the layout returned by
    ``get_csv``).  known_species maps labels to objects that provide
    ``isIsomorphic(molecule)``.

    Returns a list of SMILES (as produced by ``toSMILES()``) for every
    species that is not isomorphic to any known species.  Each SMILES is
    listed once, in order of first appearance.
    """
    need_to_add = []
    already_listed = set()

    for _, row in csv_df.T.iterrows():
        for smiles in row['species']:
            molecule = Molecule(SMILES=smiles)

            # any() stops at the first isomorphic match.
            if any(known.isIsomorphic(molecule)
                   for known in known_species.values()):
                continue

            missing = molecule.toSMILES()
            if missing not in already_listed:
                already_listed.add(missing)
                need_to_add.append(missing)

    return need_to_add

In [4]:
def get_unknown_species(reactions, known_species):
    """
    Expects list of auto-TST reactions and known species from a species dictionary

    Each reaction must expose ``rmg_reaction`` with ``reactants`` and
    ``products`` sequences (of any length); known_species maps labels to
    objects that provide ``isIsomorphic(species)``.

    Returns unique list of SMILES of species not in the dictionary, in order
    of first appearance.
    """
    need_to_add = []
    already_listed = set()

    for reaction in reactions:
        rmg_reaction = reaction.rmg_reaction
        participants = list(rmg_reaction.reactants) + list(rmg_reaction.products)

        for candidate in participants:
            # any() stops at the first isomorphic match.
            if any(known.isIsomorphic(candidate)
                   for known in known_species.values()):
                continue

            missing = candidate.toSMILES()
            if missing not in already_listed:
                already_listed.add(missing)
                need_to_add.append(missing)

    return need_to_add

In [5]:
def update_databases_from_csv(csv_path='distance_data.csv',
                              dict_path='database/H_Abstraction/TS_training/dictionary.txt',
                              old_style_dict_path='database/H_Abstraction/TS_training/old_dictionary.txt',
                              new_dict_path='updated_dictionary.txt',
                              old_reactions_path='database/H_Abstraction/TS_training/reactions.py',
                              new_reactions_path='updated_reactions.py',
                              method_str='m062x/6-311+G(2df,2p)',
                              shortDesc_str='M06-2X/6-311+G(2df,2p) calculation via group additive TS generator.'):
    """
    Update the species dictionary and the reactions database from a CSV of
    reaction distances.

    Steps:
      1. Read the reactions from `csv_path` with get_csv.
      2. Load the species in `dict_path` and work out, with get_need_to_add,
         which CSV species are not in it yet.
      3. Build the full entry list with get_updated_dictionary_entries from
         `old_style_dict_path` plus the missing species, check it with
         check_dictionary and write it to `new_dict_path`.
      4. Reload the species from `new_dict_path` and pass them, with the CSV
         data, `method_str` and `shortDesc_str`, to update_reactions.
      5. Save the resulting reactions database to `new_reactions_path`.

    Every parameter defaults to the value that used to be hard-coded, so
    calling the function without arguments behaves as before.

    Returns the third value returned by update_reactions (presumably the
    database of newly added reactions -- confirm against update_reactions).

    Raises ValueError if check_dictionary rejects the updated entries.
    Previously the save was skipped in that case, but the function carried on
    and read `new_dict_path` anyway, i.e. a stale or missing file.
    """
    csv_df = get_csv(csv_path)
    # Single pre-formatted argument so the output is identical under both
    # Python 2 and Python 3.
    print('New Reactions:  {0}'.format(csv_df.shape[1]))

    known_species = rmgpy.data.base.Database().getSpecies(dict_path)
    need_to_add = get_need_to_add(csv_df=csv_df, known_species=known_species)
    print('New Species:  {0}'.format(len(need_to_add)))

    all_dict_entries = get_updated_dictionary_entries(old_style_dict_path, need_to_add)
    if not check_dictionary(all_dict_entries):
        raise ValueError(
            'Updated dictionary entries failed check_dictionary; '
            '{0} was not written'.format(new_dict_path))
    ugly_save_dictionary(new_dict_path, all_dict_entries)

    # Reload so the reactions are matched against the dictionary just written.
    known_species = rmgpy.data.base.Database().getSpecies(new_dict_path)
    r_db, _old_db, new_db = update_reactions(old_reactions_path,
                                             csv_df,
                                             known_species,
                                             Method=method_str,
                                             ShortDesc=shortDesc_str)

    # TODO: add a check for duplicate reactions before saving.
    r_db.save(new_reactions_path)

    print('')
    print('done?')
    return new_db