In [None]:
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pymatgen.core.structure import Molecule
from pymatgen.analysis.graphs import MoleculeGraph
from pymatgen.analysis.local_env import OpenBabelNN
import networkx as nx
import networkx.algorithms.isomorphism as iso
import rmsd

In [None]:
# Transition-state method labels and, position by position, the IRC method
# label that is paired with each of them when the index below is built.
ts_methods = ['dft1', 'nn1', 'nn0', 'nn0dft1']
irc_methods = ['dft1', 'nn1', 'nn1', 'dft1']
noise_levels = ['00']

# One row per (reaction, noise level, TS/IRC method pair).  A comprehension is
# used so that none of the iteration variables become kernel-level names.
method_pairs = list(zip(ts_methods, irc_methods))
indices = pd.MultiIndex.from_tuples(
    [
        (rxn, noise, *pair)
        for rxn in range(265)
        for noise in noise_levels
        for pair in method_pairs
    ],
    names=['rxn', 'noise', 'ts_method', 'irc_method'],
)

# Starts with no columns; they are created on first assignment.
result = pd.DataFrame(index=indices)

# Reaction ids grouped into three lists; together they cover 0..264 exactly once.
rxns = [
    # no match with training set
    [2, 5, 7, 16, 18, 21, 25, 26, 27, 29, 30, 31, 32, 34, 36, 38, 40, 41, 44, 47, 48, 49, 51, 52, 54, 55, 58, 62, 67, 68, 69, 70, 73, 74, 76, 78, 81, 82, 86, 87, 89, 93, 94, 95, 97, 98, 99, 101, 103, 105, 107, 110, 113, 114, 116, 117, 118, 122, 123, 125, 126, 132, 138, 141, 143, 145, 150, 152, 154, 157, 158, 160, 161, 162, 165, 167, 173, 174, 181, 182, 183, 187, 188, 191, 193, 194, 196, 197, 198, 209, 210, 211, 214, 218, 219, 221, 223, 226, 227, 231, 232, 234, 236, 237, 238, 240, 242, 243, 244, 248, 249, 250, 252, 254, 261, 262],
    # 1-end match with training set
    [0, 1, 3, 4, 6, 8, 9, 10, 11, 12, 13, 14, 19, 20, 23, 24, 28, 33, 35, 37, 39, 42, 45, 46, 56, 57, 59, 60, 63, 64, 65, 66, 71, 77, 79, 83, 84, 85, 88, 90, 91, 92, 96, 100, 104, 106, 108, 109, 111, 112, 115, 119, 120, 121, 124, 127, 128, 129, 131, 133, 134, 135, 137, 139, 140, 146, 148, 149, 151, 153, 155, 156, 159, 163, 164, 166, 169, 170, 171, 172, 175, 176, 177, 178, 179, 180, 184, 185, 186, 189, 190, 192, 195, 199, 200, 201, 202, 203, 204, 205, 206, 207, 212, 213, 215, 220, 222, 224, 225, 229, 235, 239, 245, 246, 247, 251, 253, 255, 256, 257, 258, 259, 260, 263, 264],
    # 2-end match with training set
    [15, 17, 22, 43, 50, 53, 61, 72, 75, 80, 102, 130, 136, 142, 144, 147, 168, 208, 216, 217, 228, 230, 233, 241],
]

for rxn in tqdm(range(265)):
    try:
        molecule = Molecule.from_file(f'Data/molecules_fromscratch_noised_renamed/{rxn:03}noise{noise}_TS_{ts_method}.xyz')
        assert rxn not in rxns[2]
    except:
        continue

    for noise in noise_levels:
        for ts_method, irc_method in zip(ts_methods, irc_methods):
            try:
                output = json.load(open(f'20230706_Quacc/outputs/{rxn:03}noise{noise}_TS_{ts_method}.json', 'r'))
                if output['nsteps'] < 1000:
                    trajectory = [np.array(json.loads(mol['atoms']['atoms_json'])['positions']['__ndarray__'][-1]).reshape(-1, 3) for mol in output['trajectory']]
                    path_len = [rmsd.kabsch_rmsd(trajectory[i], trajectory[i+1], translate=True) for i in range(len(trajectory)-1)]
                    path_disp = rmsd.kabsch_rmsd(trajectory[0], trajectory[-1], translate=True)
                    result.loc[(rxn, noise, ts_method, irc_method), 'path_len'] = sum(path_len)
                    result.loc[(rxn, noise, ts_method, irc_method), 'path_disp'] = path_disp
                    result.loc[(rxn, noise, ts_method, irc_method), 'nsteps'] = output['nsteps']
                    result.loc[(rxn, noise, ts_method, irc_method), 'nsteps_all'] = len(output['trajectory'])
                    result.loc[(rxn, noise, ts_method, irc_method), 'step_size'] = np.mean(path_len)
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'converged'
            except FileNotFoundError:
                result.loc[(rxn, noise, ts_method, irc_method), 'path_len'] = np.nan
                result.loc[(rxn, noise, ts_method, irc_method), 'path_disp'] = np.nan
                result.loc[(rxn, noise, ts_method, irc_method), 'nsteps'] = np.nan
                result.loc[(rxn, noise, ts_method, irc_method), 'nsteps_all'] = np.nan
                result.loc[(rxn, noise, ts_method, irc_method), 'step_size'] = np.nan
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'TS errored'
                result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = 'TS errored'
                continue
            
            try:
                output_1 = json.load(open(f'20230706_Quacc/outputs/{rxn:03}noise{noise}_R_{ts_method}_{irc_method}.json', 'r'))
                json_1 = json.loads(output_1['atoms']['atoms_json'])
                molecule_1 = Molecule(
                    np.array(json_1['numbers']['__ndarray__'][-1]), 
                    np.array(json_1['positions']['__ndarray__'][-1]).reshape(-1, 3),
                    )
                graph_1 = nx.Graph(MoleculeGraph.with_local_env_strategy(molecule_1, OpenBabelNN()).graph)
                nx.set_node_attributes(graph_1, {idx: idx for idx in graph_1.nodes()}, 'index')
            except FileNotFoundError:
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'IRC errored'
                result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = 'IRC errored'
                continue
            try:
                output_2 = json.load(open(f'20230706_Quacc/outputs/{rxn:03}noise{noise}_P_{ts_method}_{irc_method}.json', 'r'))
                json_2 = json.loads(output_2['atoms']['atoms_json'])
                molecule_2 = Molecule(
                    np.array(json_2['numbers']['__ndarray__'][-1]),
                    np.array(json_2['positions']['__ndarray__'][-1]).reshape(-1, 3),
                    )
                graph_2 = nx.Graph(MoleculeGraph.with_local_env_strategy(molecule_2, OpenBabelNN()).graph)
                nx.set_node_attributes(graph_2, {idx: idx for idx in graph_2.nodes()}, 'index')
            except FileNotFoundError:
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'IRC errored'
                result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = 'IRC errored'
            
            if rmsd.kabsch_rmsd(molecule_1.cart_coords, molecule_2.cart_coords, translate=True) < 0.1:
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'Conformational change'
            elif nx.is_isomorphic(graph_1, graph_2, node_match=iso.numerical_node_match('index', -1)):
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'Conformational change'
            else:
                result.loc[(rxn, noise, ts_method, irc_method), 'rxn_status'] = 'Chemical reaction'

            graph_3 = nx.Graph(np.array([line.split() for line in open(f'Data/molecules_kinbotprod_renamed/{rxn:03}_R.bond', 'r').readlines()], dtype=np.float64))
            graph_4 = nx.Graph(np.array([line.split() for line in open(f'Data/molecules_kinbotprod_renamed/{rxn:03}_P.bond', 'r').readlines()], dtype=np.float64))
            nx.set_node_attributes(graph_3, {idx: idx for idx in graph_3.nodes()}, 'index')
            nx.set_node_attributes(graph_4, {idx: idx for idx in graph_4.nodes()}, 'index')

            graph_compares = (
                nx.is_isomorphic(graph_1, graph_3, node_match=iso.numerical_node_match('index', -1)), 
                nx.is_isomorphic(graph_2, graph_4, node_match=iso.numerical_node_match('index', -1)), 
                nx.is_isomorphic(graph_1, graph_4, node_match=iso.numerical_node_match('index', -1)), 
                nx.is_isomorphic(graph_2, graph_3, node_match=iso.numerical_node_match('index', -1)),
            )
            match graph_compares:
                case (True, True, False, False): 
                    result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = '2-end match' 
                case (False, False, True, True): 
                    result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = '2-end match' 
                case (True, False, False, False) | (False, True, False, False) | (False, False, True, False) | (False, False, False, True): 
                    result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = '1-end match' 
                case (True, False, False, True) | (False, True, True, False):
                    result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = '1-end match'
                case (False, False, False, False):
                    result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = 'No match'    # TS failure / No match
                case _:
                    result.loc[(rxn, noise, ts_method, irc_method), 'endpoint_match'] = 'Unknown'
                    print(rxn, noise, ts_method, graph_compares)
           
result_pretty = result.reset_index()
count = result_pretty.groupby(['ts_method', 'noise'])['endpoint_match'].value_counts()
print(count)

In [None]:
colors = ['tab:blue', 'tab:green', 'tab:orange']
labels = ['QN Hessian\n(DFT)', 'QN Hessian\n(NewtonNet)', 'Full Hessian\n(NewtonNet)']
methods = ['dft1', 'nn1', 'nn0']

# Row mask for the '00' noise level, shared by every selection in this cell.
is_noise00 = result_pretty['noise'] == '00'


def nsteps_for(ts_method_name):
    """Return the 'nsteps' values of the noise-'00' rows for one TS method."""
    rows = (result_pretty['ts_method'] == ts_method_name) & is_noise00
    return result_pretty.loc[rows, 'nsteps'].values


# Figure 1: overlaid step-count histograms (with KDE), one per TS method.
fig_hist, ax_hist = plt.subplots(figsize=(4, 3))
for method, color in zip(methods, colors):
    sns.histplot(
        nsteps_for(method),
        discrete=True,
        kde=True,
        kde_kws={'gridsize': 1000},
        color=color,
        edgecolor=None,
        alpha=0.25,
        ax=ax_hist,
    )
ax_hist.legend(labels)
ax_hist.set_xlim(-5, 125)
ax_hist.set_xlabel('Number of steps')
ax_hist.set_ylabel('Reaction count')
fig_hist.savefig('StepDistribution.pdf', bbox_inches='tight')

# Figure 2: step counts of 'nn1' against 'nn0', row by row, with a dotted y = x guide.
fig_rel, ax_rel = plt.subplots(figsize=(4, 3))
ax_rel.scatter(nsteps_for('nn1'), nsteps_for('nn0'), color='k', marker='.')
ax_rel.plot([0, 120], [0, 120], color='black', linestyle=':')
ax_rel.set_xlim(0, 120)
ax_rel.set_ylim(0, 90)
ax_rel.set_xticks(np.arange(0, 121, 20))
ax_rel.set_yticks(np.arange(0, 91, 20))
ax_rel.set_xlabel('Number of steps (QN Hessian)')
ax_rel.set_ylabel('Number of steps (full Hessian)')
fig_rel.savefig('StepRelation.pdf', bbox_inches='tight')

In [None]:
colors = ['tab:blue', 'tab:green', 'tab:orange']
labels = ['QN Hessian (DFT)', 'QN Hessian (NewtonNet)', 'Full Hessian (NewtonNet)']
methods = ['dft1', 'nn1', 'nn0']

# Row mask for the '00' noise level, shared by every selection in this cell.
is_noise00 = result_pretty['noise'] == '00'


def noise00_values(ts_method_name, column):
    """Return one column of the noise-'00' rows for a single TS method as an array."""
    rows = (result_pretty['ts_method'] == ts_method_name) & is_noise00
    return result_pretty.loc[rows, column].values


# Keep only reactions for which every plotted method has a '2-end match'.
# The per-method arrays are combined position by position, so each must have
# 265 entries in the same reaction order.
matches = np.ones(265, dtype=bool)
for method in methods:
    matches &= (noise00_values(method, 'endpoint_match') == '2-end match')

# Three stacked axes: histograms on top and bottom; the thin middle one is
# frameless and only carries the shared y-label.
fig, axs = plt.subplots(3, 1, figsize=(4, 3), gridspec_kw={'height_ratios': [3, 1, 3]})

for method, color in zip(methods, colors):
    # NOTE(review): if 'step_size' is in angstrom (as the path-length axis
    # suggests), a factor of 10 gives units of 10 pm rather than pm — confirm
    # the factor against the axis label below.
    step_size = noise00_values(method, 'step_size') * 10
    sns.histplot(step_size[matches], bins=np.linspace(0, 2.5, 51), kde=True, color=color, edgecolor=None, alpha=0.25, ax=axs[0])
axs[0].set_xlabel('Mean step size (pm)')
axs[0].set_ylabel('')
axs[0].legend(labels, loc='upper right', bbox_to_anchor=(1, 4/3), framealpha=1)

axs[1].set_ylim(0, 20)
axs[1].tick_params(labelcolor='none', top=False, bottom=False, left=False, right=False)
axs[1].set_ylabel('Reaction count')
axs[1].set_frame_on(False)

for method, color in zip(methods, colors):
    ts_rmsd = noise00_values(method, 'path_len')
    sns.histplot(ts_rmsd[matches], bins=np.linspace(0, 5, 51), kde=True, color=color, edgecolor=None, alpha=0.25, ax=axs[2])
# The label previously contained the mojibake 'Ã…' (UTF-8 bytes of the angstrom
# sign decoded as cp1252); the escape keeps it correct regardless of file encoding.
axs[2].set_xlabel('Optimization path length (\u00c5)')
axs[2].set_ylabel('')

fig.savefig('TS_pathlen_stepsize.pdf', bbox_inches='tight')