# Scratch work for evaluation functions

In [None]:
# import initial dataset and graphs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy

import torch
import torch_geometric
from torch_geometric.utils import to_networkx

import networkx as nx

In [2]:
# Load the source SMILES table, the training graphs, and the decoded (generated) graphs.
# NOTE(review): the CSV is read from the working directory while both .pt files come
# from ./data/ -- confirm that path difference is intentional.
# NOTE(review): weights_only=False lets torch.load unpickle arbitrary Python objects;
# only use it on files from a trusted source.
initial_smiles = pd.read_csv('pesticides_augmented.csv')
initial_graphs = torch.load("./data/pesticides_graphs_augmented_v3.pt", weights_only=False)
output_novel_graphs = torch.load("./data/decoded_new_points.pt", weights_only=False)

In [3]:
# Convert the first two training graphs to undirected NetworkX graphs so they can
# be compared with nx.is_isomorphic in the next cell.
G0 = to_networkx(initial_graphs[0], to_undirected=True)
G1 = to_networkx(initial_graphs[1], to_undirected=True)

In [4]:
# Structure-only comparison: with node_match and edge_match left as None,
# node and edge attributes are not taken into account.
nx.is_isomorphic(G0, G1, node_match=None, edge_match=None)

False

In [5]:
# Inspect a decoded graph: despite the attribute name, edge_index here displays as a
# dense float matrix with a zero diagonal (see output), not a [2, E] index list.
output_novel_graphs[4].edge_index

array([[0.00000000e+00, 1.00000000e+00, 1.56275749e-01, ...,
        2.35283293e-10, 1.14111331e-10, 2.21240345e-10],
       [1.00000000e+00, 0.00000000e+00, 8.47102404e-01, ...,
        1.25843225e-10, 1.22302460e-10, 1.91695951e-10],
       [1.56275749e-01, 8.47102404e-01, 0.00000000e+00, ...,
        2.69847783e-10, 2.17005039e-10, 3.04154729e-10],
       ...,
       [2.35283293e-10, 1.25843225e-10, 2.69847783e-10, ...,
        0.00000000e+00, 2.79651607e-10, 2.05088599e-10],
       [1.14111331e-10, 1.22302460e-10, 2.17005039e-10, ...,
        2.79651607e-10, 0.00000000e+00, 2.65128614e-10],
       [2.21240345e-10, 1.91695951e-10, 3.04154729e-10, ...,
        2.05088599e-10, 2.65128614e-10, 0.00000000e+00]])

In [6]:
# Build NetworkX graphs from two decoded graphs by rounding their dense edge_index
# matrices to whole numbers and treating the result as an adjacency matrix.
# NOTE: this rebinds G0/G1, which previously held the two training graphs.
G0 = nx.from_numpy_array(np.round(output_novel_graphs[0].edge_index), create_using = nx.MultiGraph())
G1 = nx.from_numpy_array(np.round(output_novel_graphs[2].edge_index), create_using = nx.MultiGraph())

In [7]:
# Same structure-only isomorphism check, now on the two decoded graphs
# (padding rows are still present at this point).
nx.is_isomorphic(G0, G1, node_match=None, edge_match=None)

False

In [8]:
# Check the container type of the decoded edge_index (the output shows a NumPy array).
type(output_novel_graphs[0].edge_index)

numpy.ndarray

In [21]:
def unpad_graphs(generated_graphs):
    """Strip trailing zero padding from generated adjacency matrices.

    Each graph's ``edge_index`` is read as a dense, square adjacency matrix
    (not a ``[2, E]`` index list). Entries are rounded to the nearest whole
    number, and a node whose rounded column is all zero is treated as
    padding / an isolated node.

    A graph is kept only when its empty nodes form one contiguous block at
    the *end* of the matrix, so that slicing them off loses no real node.
    Graphs with an empty node in front of a non-empty one are counted as
    invalid and dropped. This is a padding/isolated-node check only; it does
    not test whether the remaining nodes form a single connected component.

    Parameters
    ----------
    generated_graphs : sequence
        Objects exposing a square 2-D NumPy array as ``edge_index``.

    Returns
    -------
    list
        Deep copies of the valid graphs whose ``edge_index`` is the rounded,
        unpadded adjacency matrix. Only ``edge_index`` is replaced; every other
        attribute of the copy is left as it was. The inputs are not modified.

    Side effects
    ------------
    Prints the percentage of graphs rejected as invalid.
    """
    valid_graphs = []
    n_invalid = 0

    for G in generated_graphs:
        adj_matrix = np.round(G.edge_index)
        dim = adj_matrix.shape[0]
        # The matrix is treated as symmetric, so column sums stand in for row sums.
        degree = adj_matrix.sum(axis=0)
        zero_indices = np.where(degree == 0)[0]

        if zero_indices.size == 0:
            # No padding at all: keep the whole matrix. (Slicing with -1 here
            # would silently drop the last real node.)
            n_real = dim
            is_valid = True
        else:
            n_real = int(zero_indices[0])
            # zero_indices is sorted and unique, so it equals
            # n_real, n_real + 1, ..., dim - 1 exactly when its length is
            # dim - n_real, i.e. the empty nodes are one trailing block.
            is_valid = zero_indices.size == dim - n_real

        if not is_valid:
            n_invalid += 1
            continue

        G_update = copy.deepcopy(G)  # copy to avoid overwriting the input graph
        G_update.edge_index = adj_matrix[0:n_real, 0:n_real]
        valid_graphs.append(G_update)

    total = len(generated_graphs)
    # Guard the empty-input case instead of dividing by zero.
    percent_invalid = 100 * n_invalid / total if total else 0.0
    print(f'Percent of unconnected (invalid) graphs: {percent_invalid}')
    return valid_graphs

In [71]:
# Remove padding from the decoded graphs and keep only the ones unpad_graphs
# accepts; the call also prints the share that was rejected.
cleaned_graphs = unpad_graphs(output_novel_graphs)

Percent of unconnected (invalid) graphs: 30.0


In [72]:
# Build a scratch list: the cleaned generated graphs plus one training graph
# rewritten into the same dense-adjacency form -- presumably as a known positive
# for the isomorphism checks below.
temp = cleaned_graphs.copy()  # shallow copy: the list is new, the graph objects are shared
holder = copy.deepcopy(initial_graphs[0])  # deep copy so the training graph itself is untouched
convert = to_networkx(holder, to_undirected=True)
convert = nx.adjacency_matrix(convert)  # sparse adjacency matrix
convert = convert.toarray()  # densify to a NumPy array, matching the generated graphs
holder.edge_index = convert
temp.append(holder)

In [73]:
# Inspect the scratch list. Per the output, the generated graphs still carry
# x with 44 rows and edge_attr with 88 rows while edge_index has been trimmed
# to 7x7 or 11x11: only edge_index is unpadded.
temp

[Data(x=[44, 3], edge_index=[7, 7], edge_attr=[88, 1], y=0.0),
 Data(x=[44, 3], edge_index=[11, 11], edge_attr=[88, 1], y=0.2222222222222222),
 Data(x=[44, 3], edge_index=[7, 7], edge_attr=[88, 1], y=0.6666666666666666),
 Data(x=[44, 3], edge_index=[11, 11], edge_attr=[88, 1], y=0.8888888888888888),
 Data(x=[44, 3], edge_index=[11, 11], edge_attr=[88, 1], y=1.3333333333333333),
 Data(x=[44, 3], edge_index=[11, 11], edge_attr=[88, 1], y=1.7777777777777777),
 Data(x=[44, 3], edge_index=[11, 11], edge_attr=[88, 1], y=2.0),
 Data(x=[11, 3], edge_index=[11, 11], edge_attr=[22, 1], y=1.492900013923645)]

In [75]:
def graph_isomorphism_within_set(generated_graphs):
    """Report how many structurally distinct graphs a generated set contains.

    Every graph's ``edge_index`` is read as a dense adjacency matrix, rounded
    to whole numbers and converted to a NetworkX ``MultiGraph``. Graphs are
    then grouped into isomorphism classes (structure only: node and edge
    attributes are ignored) and each class counts as one unique graph.

    Parameters
    ----------
    generated_graphs : sequence
        Objects exposing a square 2-D NumPy array as ``edge_index``.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Convert each graph once up front rather than once per pairwise comparison.
    nx_graphs = [
        nx.from_numpy_array(np.round(graph.edge_index), create_using=nx.MultiGraph())
        for graph in generated_graphs
    ]

    # Keep one representative per isomorphism class. Isomorphism is an
    # equivalence relation, so comparing against representatives alone is
    # enough to decide whether a graph starts a new class.
    representatives = []
    for candidate in nx_graphs:
        already_seen = any(
            nx.is_isomorphic(candidate, representative, node_match=None, edge_match=None)
            for representative in representatives
        )
        if not already_seen:
            representatives.append(candidate)

    # Kept as a float so the printed report has the same format as before.
    n_unique_graphs = float(len(representatives))

    # Guard the empty-input case instead of dividing by zero.
    total = len(nx_graphs)
    percent_unique = 100*n_unique_graphs/total if total else 0.0

    print(f'There are {n_unique_graphs} unique graphs, with an overall {percent_unique}% unique graphs generated')

In [76]:
# Count structurally distinct graphs within the scratch list (generated graphs
# plus the one appended training graph).
graph_isomorphism_within_set(temp)

There are 3.0 unique graphs, with an overall 37.5% unique graphs generated


In [66]:
def graph_isomorphism_between_sets(initial_graphs, generated_graphs):
    """Report the share of generated graphs not isomorphic to any initial graph.

    Generated graphs are read from a dense adjacency matrix stored in
    ``edge_index`` (rounded to whole numbers); initial graphs are converted with
    ``to_networkx(..., to_undirected=True)``. The comparison is structural only:
    node and edge attributes are ignored.

    Parameters
    ----------
    initial_graphs : iterable
        Reference graphs accepted by ``torch_geometric.utils.to_networkx``.
    generated_graphs : sequence
        Objects exposing a square 2-D NumPy array as ``edge_index``.

    Returns
    -------
    None
        The result is printed, not returned.
    """
    # Convert the reference set once; it does not depend on the generated graph.
    reference_graphs = [
        to_networkx(original_graph, to_undirected=True)
        for original_graph in initial_graphs
    ]

    n_seen_in_training = 0
    for new_graph in generated_graphs:
        G_new = nx.from_numpy_array(np.round(new_graph.edge_index), create_using=nx.MultiGraph())
        # any() stops at the first match, so a graph is counted at most once.
        if any(
            nx.is_isomorphic(G_new, G_old, node_match=None, edge_match=None)
            for G_old in reference_graphs
        ):
            n_seen_in_training += 1

    # Guard the empty-input case instead of dividing by zero.
    total = len(generated_graphs)
    percent_unique = 100*(total - n_seen_in_training)/total if total else 0.0
    print(f'Compared to the initial graphs used for training, generated graphs are {percent_unique}% unique')

In [77]:
# Compare the scratch list against the training set.
# NOTE: only the first two training graphs are used as the reference set here.
graph_isomorphism_between_sets(initial_graphs[0:2], temp)

Compared to the initial graphs used for training, generated graphs are 87.5% unique


In [81]:
# Inspect the last scratch entry: the training graph appended with a dense 11x11 edge_index.
temp[7]

Data(x=[11, 3], edge_index=[11, 11], edge_attr=[22, 1], y=1.492900013923645)

In [84]:
import rdkit
from rdkit.Chem.QED import properties
from rdkit import Chem

from functions.data_generation import *

In [96]:
# Rebuild a NetworkX MultiGraph from the dense adjacency matrix of the appended training graph.
G_test= nx.from_numpy_array(np.round(temp[7].edge_index), create_using = nx.MultiGraph())

In [107]:
# Write the graph to disk as an adjacency list.
# NOTE(review): nx.write_adjlist returns None, so `test` is None after this line
# (the next cell displays nothing); it is only rebound to a graph by the later
# nx.read_adjlist call.
test = nx.write_adjlist(G_test, "test.adjlist")
#mol = to_smiles(temp[7])

In [108]:
# Displays nothing: `test` holds the return value of nx.write_adjlist at this point.
test

In [109]:
# Read the adjacency list back in. The displayed result is a plain nx.Graph,
# not the MultiGraph that was written.
test = nx.read_adjlist("test.adjlist")

In [110]:
# Confirm `test` is now a graph object.
test

<networkx.classes.graph.Graph at 0x1c435cb7fb0>

In [158]:
# Turn the re-read graph back into a [2, E] edge_index on a copy of the scratch graph.
test_mol = copy.deepcopy(temp[7])
edges_converted = test.edges
# The edge view is cast to an integer array (presumably because read_adjlist yields
# string node labels by default -- confirm), then transposed from (E, 2) to (2, E).
# Each undirected edge appears once here, so E is 11 rather than the 22 directed
# entries of the original training graph.
test_mol.edge_index = torch.swapaxes(torch.tensor(np.array(edges_converted, dtype=int)), 0, 1)

In [171]:
# Dump the rebuilt graph for a side-by-side comparison with the original training graph.
# Note in the output that edge_index has 11 columns while edge_attr still has 22 rows.
print(test_mol)
print(test_mol.x)
print(test_mol.x[0, 0].item())
print(test_mol.edge_index)
print(test_mol.edge_attr)

Data(x=[11, 3], edge_index=[2, 11], edge_attr=[22, 1], y=1.492900013923645)
tensor([[ 6.,  4.,  5.],
        [ 6.,  4.,  5.],
        [ 6.,  4.,  5.],
        [ 6.,  3.,  5.],
        [ 8.,  1.,  5.],
        [ 7.,  3.,  5.],
        [17.,  1.,  5.],
        [ 6.,  3.,  5.],
        [ 8.,  1.,  5.],
        [ 7.,  3.,  5.],
        [35.,  1.,  5.]])
6.0
tensor([[ 0,  1,  1,  1,  3,  3,  9,  9,  5,  5,  7],
        [ 1,  2,  3,  9,  4,  5,  7, 10,  6,  7,  8]], dtype=torch.int32)
tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [1.],
        [2.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [1.],
        [2.],
        [1.],
        [1.],
        [1.],
        [1.]])


In [170]:
# Reference dump of the original training graph: edge_index lists each bond in
# both directions (22 columns), matching the 22 rows of edge_attr.
print(initial_graphs[0])
print(initial_graphs[0].x)
print(initial_graphs[0].edge_index)
print(initial_graphs[0].edge_attr)

Data(x=[11, 3], edge_index=[2, 22], edge_attr=[22, 1], y=1.492900013923645)
tensor([[ 6.,  4.,  5.],
        [ 6.,  4.,  5.],
        [ 6.,  4.,  5.],
        [ 6.,  3.,  5.],
        [ 8.,  1.,  5.],
        [ 7.,  3.,  5.],
        [17.,  1.,  5.],
        [ 6.,  3.,  5.],
        [ 8.,  1.,  5.],
        [ 7.,  3.,  5.],
        [35.,  1.,  5.]])
tensor([[ 0,  1,  1,  1,  1,  2,  3,  3,  3,  4,  5,  5,  5,  6,  7,  7,  7,  8,
          9,  9,  9, 10],
        [ 1,  0,  2,  3,  9,  1,  1,  4,  5,  3,  3,  6,  7,  5,  5,  8,  9,  7,
          1,  7, 10,  9]])
tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [1.],
        [2.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [1.],
        [2.],
        [1.],
        [1.],
        [1.],
        [1.]])


In [176]:
# Alias, not a copy: convert_test and test_mol refer to the same object.
convert_test = test_mol

In [185]:
# Build an RDKit molecule from convert_test and print its SMILES.
mol = Chem.RWMol()

# One atom per node: column 0 of x is used as the atomic number and column 2 as
# a key into x_map['formal_charge'].
# NOTE(review): x_map is not defined in this notebook; presumably it comes from
# `from functions.data_generation import *` -- confirm.
for i in range(convert_test.num_nodes):
    atom = Chem.Atom(int(convert_test.x[i, 0].item()))
    atom.SetFormalCharge(x_map['formal_charge'][int(convert_test.x[i, 2].item())])
    mol.AddAtom(atom)

# edge_index is (2, E); transpose so each entry is one (src, dst) pair.
edges = [tuple(i) for i in convert_test.edge_index.t().tolist()]
visited = set()  # unused while the de-duplication below stays commented out

for i in range(len(edges)):
    src, dst = edges[i]
    #if tuple(sorted(edges[i])) in visited:
    #    continue

    # NOTE(review): the bond order is looked up by position i, but edge_attr has
    # 22 rows (one per directed edge of the original graph) while `edges` has 11
    # undirected edges in adjacency-list order. Row i of edge_attr therefore need
    # not describe edges[i], so bond orders are likely assigned to the wrong bonds.
    bond_type = Chem.BondType.values[int(convert_test.edge_attr[i, 0].item())]
    mol.AddBond(src, dst, bond_type)

    #visited.add(tuple(sorted(edges[i])))

mol = mol.GetMol()

# Sanitization and stereo assignment are disabled, so the SMILES below is
# produced without RDKit's validity checks.
#Chem.SanitizeMol(mol)
#Chem.AssignStereochemistry(mol)

Chem.MolToSmiles(mol, isomericSmiles=True)

'CC1(C)C(O)N(Cl)=C(O)N1=Br'

In [152]:
# NOTE(review): the recorded output is from an earlier execution (In [152]) and shows
# two 11-element tuples, i.e. an un-transposed edge list; re-run to see the
# (src, dst) pairs produced by the molecule-building cell.
edges

[(0, 1, 1, 1, 3, 3, 9, 9, 5, 5, 7), (1, 2, 3, 9, 4, 5, 7, 10, 6, 7, 8)]