In [None]:
#Generate Pathways using ProcessedRels with Thermo data
# from google.colab import drive
# drive.mount('/content/drive')
import pandas as pd
import numpy as np
from treelib import Node, Tree
from ast import literal_eval
import time
from func_timeout import *
import signal
from contextlib import contextmanager
import time
from copy import deepcopy

In [None]:
import os
# Allow more than one copy of the OpenMP runtime to be loaded in this process.
# The variable is spelled in upper case: environment variable names are
# case-sensitive outside Windows, so a lower-case key is not picked up there.
os.environ['KMP_DUPLICATE_LIB_OK']='TRUE'

In [None]:
class Molecule: #payload stored on each tree node
    """Data attached to a tree node: the molecule that node stands for.

    The only attribute is ``path``, the path traced to that molecule. It is
    stored exactly as passed in (no copy is taken).
    """

    def __init__(self, path):
        self.path = path

In [None]:
def split(word):
    """Return the items of ``word`` (its characters, for a string) as a list."""
    return list(word)

In [None]:
def tree_copier(tree, identifier_factor):
    """Build a copy of ``tree`` whose node identifiers are shifted by ``identifier_factor``.

    Every node keeps its tag and its ``data`` object (the data is shared with
    the original tree, not duplicated); its identifier and its parent's
    identifier are both increased by ``identifier_factor`` so that nodes of the
    copy can be told apart from nodes of the tree they were copied from.

    Parameters
    ----------
    tree : Tree
        Tree to copy. Node identifiers must support ``+ identifier_factor``.
    identifier_factor : int
        Offset added to every node identifier in the copy.

    Returns
    -------
    Tree
        The newly built tree.
    """
    tree_copy = Tree()
    for node in tree.all_nodes(): #iterates over all nodes of the tree to be copied
        new_identifier = node.identifier + identifier_factor
        # ``parent`` gives None for the root. Testing for that explicitly (rather
        # than wrapping the lookup in a bare ``except``) lets exceptions raised
        # asynchronously, such as the TimeoutError from the SIGALRM handler,
        # propagate instead of being mistaken for "this node has no parent".
        parent_node = tree.parent(node.identifier)
        if parent_node is None:
            tree_copy.create_node(node.tag, new_identifier, data=node.data)
        else:
            tree_copy.create_node(node.tag, new_identifier, parent=(parent_node.identifier + identifier_factor), data=node.data)
    return tree_copy

In [None]:
def index_finder(Product, rels, path):
    """Return the indexes of the reactions that may be used to make ``Product``.

    A reaction qualifies when ``Product`` appears in its 'Products' entry and
    none of its reagents (looked up with ``precursor_finder``) is already in
    ``path``.

    Parameters
    ----------
    Product : str
        The active molecule.
    rels :
        Reaction table with 'Index', 'Products' and 'Reagents' columns, read by
        position ``0 .. len(rels['Index']) - 1``. 'Products' entries are
        strings holding a Python literal (parsed with ``literal_eval``).
    path :
        Molecules already on the path traced to the active molecule.

    Returns
    -------
    list
        Reaction indexes in table order. Each table row contributes at most one
        entry, even if its product list names ``Product`` more than once;
        listing it repeatedly would only make the caller explore the same
        reaction several times.
    """
    indexes = []
    for i in range(len(rels['Index'])):
        if Product in literal_eval(rels['Products'][i]):
            indexes.append(rels['Index'][i]) #identifies reactions in the rels data which have the active molecule as a product
    valid_indexes = []
    for index in indexes:
        precursors = precursor_finder(index, rels) #identifies the reagents for each possible reaction which generates the active molecule
        #discards reactions which use reagent molecules already in the path traced for the active molecule
        if not any(precursor in path for precursor in precursors):
            valid_indexes.append(index)
    return valid_indexes #returns all valid reactions which generate the active molecule

def precursor_finder(index, rels):
    """Collect the reagents of the reaction(s) whose 'Index' equals ``index``.

    Every row of ``rels`` is checked; the 'Reagents' entry of each matching row
    is parsed with ``literal_eval`` and its items are gathered, in row order,
    into a single flat list. An index that matches no row gives an empty list.
    """
    row_count = len(rels['Index'])
    matching_rows = [row for row in range(row_count) if rels['Index'][row] == index]
    reagents = []
    for row in matching_rows:
        reagents.extend(literal_eval(rels['Reagents'][row])) #reagents of the reaction stored on this row
    return reagents

In [None]:
def find_component(node, rels):
    """Find the reactions, and their reagents, that generate the molecule on ``node``.

    ``node.tag`` is taken as the active molecule and ``node.data.path`` as the
    path traced to it. Returns ``(indexes, precursors)``: the valid reaction
    indexes from ``index_finder`` and, in the same order, one reagent list per
    reaction from ``precursor_finder``.
    """
    reaction_ids = index_finder(node.tag, rels, node.data.path)
    reagent_lists = [precursor_finder(reaction_id, rels) for reaction_id in reaction_ids] #reagents for every valid reaction
    return (reaction_ids, reagent_lists)

In [None]:
@contextmanager
def timeout(time):
    """Context manager that abandons its body once ``time`` seconds have passed.

    A SIGALRM is scheduled with ``signal.alarm(time)``; its handler
    (``raise_timeout``) raises TimeoutError inside the body, and that
    TimeoutError is swallowed here, so execution simply continues after the
    ``with`` block. Any TimeoutError raised by the body for another reason is
    swallowed in the same way.

    On exit the pending alarm is cancelled and the SIGALRM handler that was in
    place beforehand is reinstated, so a leftover alarm cannot fire later and
    other users of SIGALRM are not left with the signal ignored.

    Note: the parameter name shadows the ``time`` module inside this function.
    Relies on SIGALRM, so it only works where the ``signal`` module provides it
    and when called from the main thread.
    """
    # Register a function to raise a TimeoutError on the signal, keeping hold
    # of the handler it replaces.
    previous_handler = signal.signal(signal.SIGALRM, raise_timeout)
    try:
        # Schedule the signal to be sent after ``time``.
        signal.alarm(time)
        yield
    except TimeoutError:
        pass
    finally:
        # Ignore the signal first so an alarm landing during clean-up cannot
        # raise from here, then cancel whatever is still pending.
        signal.signal(signal.SIGALRM, signal.SIG_IGN)
        signal.alarm(0)
        # ``signal.signal`` reports None for a handler that was not installed
        # from Python; that cannot be reinstalled, so SIG_IGN is left in place.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
def raise_timeout(signum, frame):
    """Signal handler: raise TimeoutError. Both handler arguments are ignored."""
    raise TimeoutError()

In [None]:
def map_tree(Smiles, network, rels, time_limit):
    """Trace reaction pathways back from ``Smiles`` to the base molecules of ``network``.

    A tree is grown from the target molecule. Every leaf that is not a base
    molecule ("active" node) is expanded with the reagents of a reaction that
    produces it (see ``find_component``). When several reactions produce the
    same molecule the tree is duplicated with ``tree_copier``, one copy per
    alternative reaction, so each entry of ``all_trees`` is one candidate
    pathway. A tree is complete once it has no active nodes. A tree is
    discarded (replaced by the string 'NaN') when an active node has no valid
    reaction, or when it is already at least as deep as the shallowest complete
    tree found so far.

    Parameters
    ----------
    Smiles : str
        Target molecule; becomes the tag of the root node.
    network : str
        Key selecting the base-molecule set from the table below. An unknown
        name raises KeyError.
    rels :
        Reaction table. 'Index' and 'Energy Change' are read here by position;
        the helper functions read 'Products' and 'Reagents'.
    time_limit :
        Passed to ``timeout``; the search is abandoned when it expires.

    Returns
    -------
    tuple
        ``(shortest_pathway, min_length, min_reactions, min_pathway_energy)``.
        ``shortest_pathway`` is the shallowest complete tree, or the string
        'NaN' if none was completed; ``min_length`` is its depth (1e10 if
        none); ``min_reactions`` lists its reaction indexes; and
        ``min_pathway_energy`` is the sum of their energy changes, each rounded
        to 2 decimals. The energy sum lives inside the timed block, so when the
        search is interrupted by the timeout it stays at its initial value 0
        even if a complete tree had already been found.
    """
    #Choose the set of G0 molecules for the network of interest
    mol_dict = {"FormoseAmm":['N', 'C=O', 'C(CO)=O', 'O'], "Formose":['C=O', 'C(CO)=O', 'O'], "GlucoseAmm":['N', 'O', 'C(C(C(C(C(CO)O)O)O)O)=O'],
                "Glucose":['C(C(C(C(C(CO)O)O)O)O)=O', 'O'], "PyruvicAcid":['C(C(C)=O)(O)=O', 'O'], "UreyMiller":['C=O', 'N', 'C#N', 'C#CC#N', 'O'],
                "Maillard":['NCC(=O)O', 'O=CC(O)C(O)C(O)C(O)C(O)', 'O'], "Test":['D', 'E']}
    base_molecules = mol_dict[network]

    min_length = 1e10
    min_reactions = []
    min_pathway_energy = 0
    shortest_pathway = 'NaN'

    all_trees = []
    tree_statuses = [] #per tree: False = still growing, True = complete, 'NaN' = discarded
    tree1 = Tree()
    tree1.create_node(Smiles, 0, data=Molecule([Smiles])) #sets the base node of the tree to the molecule we want to identify pathways for, with the tag set to its Smiles and the data set to an instance of the Molecule class
    all_trees.append(tree1)
    tree_statuses.append(False)
    reactions = [[]] #reactions[i] holds the reaction indexes used so far by all_trees[i]
    complete = False

    with timeout(time_limit):
        while complete == False:
            print(len(all_trees))
            for i in range(len(all_trees)): #trees appended during this pass are picked up on the next pass of the outer while loop
                if tree_statuses[i] == False: #iterates over all incomplete trees
                    finished = False
                    current_nodes = all_trees[i].all_nodes() #collects all nodes of the current tree
                    node_counter = len(current_nodes)-1 #identifies index of last node
                    while finished == False:
                        nodes = all_trees[i].all_nodes()
                        active_nodes = []
                        nodes_with_children = []
                        for k in range(len(nodes)):
                            # ``parent`` gives None for the root. It is tested explicitly instead of
                            # using a bare ``except`` so that the TimeoutError raised by the alarm
                            # handler cannot be swallowed here and the time limit lost.
                            parent_node = all_trees[i].parent(nodes[k].identifier)
                            if parent_node is not None:
                                nodes_with_children.append(parent_node.identifier) #identifies nodes with children
                        for k in range(len(nodes)):
                            tag = nodes[k].tag
                            if nodes[k].identifier not in nodes_with_children and tag not in base_molecules:
                                active_nodes.append(nodes[k]) #identifies 'active' molecules which have not had reactions to them identified and are not in the base set of the network
                        if active_nodes == []:
                            finished = True #if there are no active molecules, the tree is complete
                            break
                        else:
                            if all_trees[i].depth() >= min_length:
                                all_trees[i] = 'NaN'
                                tree_statuses[i] = 'NaN'
                                finished = True #Kill any incomplete tree that is already equal in length to the shortest complete tree
                                break
                            else:
                                for z in range(len(active_nodes)): #iterates over active molecules of the tree
                                    indexes, precursors = find_component(active_nodes[z], rels) #identifies precursors for an active molecule (i.e. molecules which react to form the active molecule)
                                    if len(precursors) == 0:
                                        all_trees[i] = 'NaN'
                                        tree_statuses[i] = 'NaN'
                                        finished = True #if an active molecule has no precursors, a complete pathway cannot be traced
                                        break
                                    else:
                                        product = active_nodes[z].identifier #identifies the index of the active molecule
                                        dummy = (active_nodes[z].data).path #identifies the path for the active molecule
                                        place = []
                                        num_trees = len(all_trees)
                                        for m in range(len(dummy)):
                                            place.append(dummy[m]) #creates a list containing all molecules in the path for the active molecule
                                        if len(precursors) > 1:
                                            for p in range(1, len(precursors)): #iterating over all valid reactions except for the first
                                                identifier_factor = num_trees*1000
                                                tree = tree_copier(all_trees[i], identifier_factor) #creates a new tree copying the active one, with an identifier factor to distinguish its nodes from all other trees
                                                tree_statuses.append(False)
                                                dummy_node_counter = identifier_factor + node_counter
                                                for q in range(len(precursors[p])):
                                                    dummy_node_counter += 1
                                                    tree.create_node(precursors[p][q], dummy_node_counter, parent=(product+identifier_factor), data=Molecule(place + [precursors[p][q]])) #creates nodes in the copied tree for each reagent of the valid reaction
                                                all_trees.append(tree)
                                                reactions.append(reactions[i] + [indexes[p]]) #creates a new reaction index list for the copied tree with all the reactions of the copied tree, plus the index for the valid reaction
                                                num_trees+=1
                                        for n in range(len(precursors[0])): #modifying the current tree for the first valid reaction generating the active molecule, iterating over its reagents
                                            node_counter +=1
                                            all_trees[i].create_node(precursors[0][n], node_counter, parent=product, data=Molecule(place + [precursors[0][n]])) #creates a new node with a tag given by a reagent of the first valid reaction, its parent being the active molecule and its path being that of the active molecule plus itself
                                        reactions[i].append(indexes[0]) #adds the first valid reaction to the list of the reactions for the current tree
                    if all_trees[i] != 'NaN':
                        if all_trees[i].depth() >= min_length:
                            tree_statuses[i] = 'NaN'
                            all_trees[i] = 'NaN'
                        else:
                            tree_statuses[i] = True
                            min_length = all_trees[i].depth()
                            min_reactions = reactions[i]
                            shortest_pathway = all_trees[i]

            if tree_statuses.count(False) == 0:
                complete = True

        # Kept inside the timed block on purpose: it is skipped when the search
        # is cut short, which leaves min_pathway_energy at 0.
        EnergyChanges = []
        ReactionIDs = []
        for i in range(len(rels['Index'])):
            EnergyChanges.append(rels['Energy Change'][i])
            ReactionIDs.append(rels['Index'][i]) #creates lists containing the reaction IDs and energy changes for all reactions of the rels file
        for i in range(len(min_reactions)):
            min_pathway_energy += EnergyChanges[ReactionIDs.index(min_reactions[i])].round(2) #identifies the sum of the energy changes associated with all reactions of a complete tree
    return shortest_pathway, min_length, min_reactions, min_pathway_energy

In [None]:
def generations_counter(input_data, num_generations):
    """Count matches per generation and accumulate the counts.

    Parameters
    ----------
    input_data :
        Matches table; only ``input_data['Generation']`` is read, by position
        ``0 .. len - 1``. Each entry is either an integer generation number or
        a string ending in the generation number (e.g. 'G3', 'G12').
    num_generations : int
        Highest generation number of the network; the returned arrays have
        ``num_generations + 1`` entries.

    Returns
    -------
    tuple of numpy.ndarray
        ``(gen_data, final_value_data)`` where ``gen_data[i] == i`` and
        ``final_value_data[i]`` is the number of matches whose generation is
        ``<= i``. Both are float arrays.

    Raises
    ------
    ValueError
        If a string entry does not end in a digit.
    IndexError
        If a generation number exceeds ``num_generations``.
    """
    gen_data = np.arange(num_generations + 1, dtype=float) #sets the generation numbers for the network
    value_data = np.zeros(num_generations + 1)

    generations = input_data['Generation']
    for i in range(len(generations)):
        entry = generations[i]
        if isinstance(entry, str):
            # Take the whole run of trailing digits, not just the last
            # character, so that generation 10 and above are parsed correctly.
            digits = entry[len(entry.rstrip('0123456789')):]
            if not digits:
                raise ValueError(f"Generation label {entry!r} does not end in a number")
            value = int(digits)
        else:
            value = entry
        value_data[value] += 1 #identifies the number of matches of each generation

    final_value_data = np.cumsum(value_data) #identifies the cumulative number of matches up to the end of each generation (including all prior generations)

    return (gen_data, final_value_data)

In [None]:
def pathway_finder(matches_file, network, NoDegen, max_generation, num_generations, TimeLimit, outdir, mol_names=None, output_filename=None, write=True):
    """Run ``map_tree`` for every match up to ``max_generation`` and collect the results.

    Relies on two module-level variables that must be defined before the call:
    ``no_degen_rels_dir`` (used when ``NoDegen`` is truthy) and
    ``degen_rels_dir`` (used otherwise), the directories holding the reaction
    tables.

    Parameters
    ----------
    matches_file : str
        Tab-separated matches file. The columns 'Generation', 'NetworkSmiles',
        'INCHIKEY' and 'AnalogueSmiles' are used.
    network : str
        Network name; part of the reaction-table and output file names and
        passed on to ``map_tree``.
    NoDegen : bool
        Selects which reaction table is loaded (see above).
    max_generation : int
        The first ``final_value_data[max_generation]`` rows of the matches file
        are processed, i.e. the cumulative number of matches up to that
        generation. NOTE(review): this presumes the file is ordered by
        generation; confirm against the data.
    num_generations : int
        Passed to ``generations_counter``.
    TimeLimit :
        Time limit handed to ``map_tree`` for each match.
    outdir : str
        Output directory; created if it does not exist.
    mol_names : sequence, optional
        If given, ``mol_names[i]`` is written as the name of match ``i``.
    output_filename : str, optional
        Prefix of the output file names. It is formatted into the name as-is,
        so leaving it as None yields names starting with 'None'.
    write : bool
        When true, a text report and a TSV table are written to ``outdir``.

    Returns
    -------
    pandas.DataFrame
        The processed matches indexed by 'INCHIKEY', without 'AnalogueSmiles',
        with 'MinimumPathwayLength' and 'MinimumPathwayEnergy' added. Both are
        the string 'NaN' for matches without a result.

    Notes
    -----
    ``map_tree`` leaves the pathway energy at 0 when it finds nothing or runs
    out of time, so an energy of exactly 0 is read as "no result". A pathway
    whose energy genuinely sums to 0 is therefore still reported as a failure.
    """
    matches_data = pd.read_csv(matches_file, sep='\t')
    gen_data, final_value_data = generations_counter(matches_data, num_generations) #identifies the index of the last match of each generation in the matches data
    if NoDegen:
        rels_data = pd.read_csv(f'{no_degen_rels_dir}/NoDegenSpontaneous{network}G3RelsWithThermo.tsv', sep='\t')
    else:
        rels_data = pd.read_csv(f'{degen_rels_dir}/Spontaneous{network}G3RelsWithThermo.tsv', sep='\t')
    num_matches = int(final_value_data[max_generation])
    output_df = deepcopy(matches_data.iloc[:num_matches])
    min_pathway_lengths = []
    min_pathway_energies = []
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    f = None
    if write:
        filepath = f'{outdir}/{output_filename}{network}ShortestPathways.txt'
        f = open(filepath, 'w')
    try:
        for i in range(num_matches): #iterates through all matches up to the end of the max generation set in the function input
            print('---------------------')
            print(f"Match {i}")
            shortest_tree, min_length, min_reactions, min_pathway_energy = map_tree(matches_data['NetworkSmiles'][i], network, rels_data, TimeLimit)
            # Compare the value itself: truncating with int() first would also
            # discard every genuine pathway whose energy lies between -1 and 1.
            if min_pathway_energy == 0:
                min_pathway_energies.append('NaN')
                min_pathway_lengths.append('NaN')
                if shortest_tree == 'NaN':
                    fail_reason = 'No spontaneous pathways'
                else:
                    shortest_tree = 'NaN' #to make it clear the true shortest pathway has not been traced within the time limit
                    fail_reason = 'Out of time'
            else:
                min_pathway_energies.append(min_pathway_energy)
                min_pathway_lengths.append(min_length)
            if write:
                if mol_names is not None:
                    f.write(f'Molecule: {mol_names[i]}\n')
                f.write(f"Network Smiles: {matches_data['NetworkSmiles'][i]}\n")
                f.write(f"Generation: {matches_data['Generation'][i]}\n")
                f.write(f"INCHIKEY: {matches_data['INCHIKEY'][i]}\n")
                f.write(f"Analogue Smiles: {matches_data['AnalogueSmiles'][i]}\n")
                if shortest_tree != 'NaN':
                    f.write(f'Shortest Pathway \n')
                    # The tree writes itself to the report by path, so the handle
                    # is closed around that call and reopened for appending.
                    f.close()
                    shortest_tree.save2file(filepath)
                    f = open(filepath, 'a+')
                    f.write(f'Pathway Length: {min_length}\n')
                    f.write(f'Energy Change: {min_pathway_energy}\n')
                    f.write(f'Reaction IDs: {min_reactions}\n')
                else:
                    f.write(f'{fail_reason}\n')
                f.write('---------------------\n')
    finally:
        # Close the report even if a match fails part-way through.
        if f is not None and not f.closed:
            f.close()
    output_df['MinimumPathwayLength'] = min_pathway_lengths
    output_df['MinimumPathwayEnergy'] = min_pathway_energies
    del output_df['AnalogueSmiles']
    output_df = output_df.set_index('INCHIKEY')
    if write:
        output_df.to_csv(f'{outdir}/{output_filename}{network}ShortestPathways.tsv', sep='\t')
    return output_df

In [None]:
#  %%time
#  network = 'GlucoseAmm'
#  max_generation = 2
#  degen_rels_dir = '/content/drive/MyDrive/BMSIS /MinimalDirectory/ProcessedData/SpontaneousRelsWithThermoFiles'
#  no_degen_rels_dir = '/content/drive/MyDrive/BMSIS /TestData'
#  matches_dir = '/content/drive/MyDrive/BMSIS /MinimalDirectory/ProcessedData/MatchesFiles'
#  output_dir = '/content/drive/MyDrive/BMSIS /MinimalDirectory/ProcessedData/ShortestPathways'
#  df = pathway_finder(f'{matches_dir}/{network}Matches.tsv', network=network, NoDegen=True,
#                      max_generation=max_generation, num_generations=5, TimeLimit=300, outdir=output_dir,
#                      output_filename=f'G{max_generation}')