In [109]:
import networkx as nx
import matplotlib.pyplot as plt
import os
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd

In [18]:
def mol2network(n2n, file_dir='.', file_name='_', draw_network=False):
    """
    
    :param n2n: node to neighbors
    :return: a network
    """
    g = nx.Graph()
    for i in n2n.keys():
        edges = [(i, j) for j in n2n[i]]
        g.add_edges_from(edges)
    if draw_network:
        draw_graph(g, file_dir, file_name)
    return g

def draw_graph(g, file_dir, file_name):
    """
    draw molecular graph
    :param g: molecular graph
    :param file_dir: where to save figure
    :param file_name: file name
    :return:
    """
    nx.draw(g, with_labels=True, font_weight='bold')
    plt.savefig(os.path.join(file_dir, file_name + '.png'), dpi=300)
    plt.close()
    
def mol_with_atom_index(mol):
    atoms = mol.GetNumAtoms()
    for idx in range(atoms):
        mol.GetAtomWithIdx(idx).SetProp('molAtomMapNumber', str(mol.GetAtomWithIdx(idx).GetIdx()))
    return mol

In [14]:
def basic_test():
#     SMILES = 'C#CCN(CC#C)C(=O)c1cc2ccccc2cc1OC(F)F'
#     mol = Chem.MolFromSmiles(SMILES)
    id2smiles = {1: 'C#C', 2: 'CC', 3: 'CN', 4: 'CN', 5: 'CC', 6: 'C#C', 7: 'CN',
                 8: 'C=O', 9: 'CC', 10: 'CO', 11: 'CO', 12: 'CF', 13: 'CF',
                 14: 'C1=CCCC=C1', 15: 'C1=CC=CC=C1', 16: 'N', 17: 'C', 18: 'C'}
    n2n = {1: [2], 2: [1, 3], 3: [2, 16], 4: [5, 16], 5: [4, 6], 6: [5],
           7: [16, 17], 8: [17], 9: [14, 17], 10: [11, 14], 11: [10, 18], 12: [18],
           13: [18], 14: [9, 10, 15], 15: [14], 16: [3, 4, 7], 17: [7, 8, 9], 18: [11, 12, 13]}
    g = mol2network(n2n)
    # mol_path = get_mol_path(n2n, g)

#     print('>>> SMILES: ', SMILES)
    print('    n2n: ', n2n)
    draw_graph(g, file_dir='.', file_name='test')

In [15]:
basic_test()

    n2n:  {1: [2], 2: [1, 3], 3: [2, 16], 4: [5, 16], 5: [4, 6], 6: [5], 7: [16, 17], 8: [17], 9: [14, 17], 10: [11, 14], 11: [10, 18], 12: [18], 13: [18], 14: [9, 10, 15], 15: [14], 16: [3, 4, 7], 17: [7, 8, 9], 18: [11, 12, 13]}


In [19]:
counter = 1
SMILES = 'C#CCN(CC#C)C(=O)c1cc2ccccc2cc1OC(F)F'
mol = Chem.MolFromSmiles(SMILES)
Draw.MolToFile(mol, os.path.join('figure', 'mol_structure_{}.png'.format(counter)))
mol_with_inx = mol_with_atom_index(mol)
Draw.MolToFile(mol_with_inx,
               os.path.join('figure',  'mol_with_inx_{}.png'.format(counter)))

In [20]:
n2n = {1: [2], 2: [1, 3], 3: [2, 16], 4: [5, 16], 5: [4, 6], 6: [5],
       7: [16, 17], 8: [17], 9: [14, 17], 10: [11, 14], 11: [10, 18], 12: [18],
       13: [18], 14: [9, 10, 15], 15: [14], 16: [3, 4, 7], 17: [7, 8, 9], 18: [11, 12, 13]}
g = mol2network(n2n)

In [33]:
g.nodes.items

<bound method Mapping.items of NodeView((1, 2, 3, 16, 4, 5, 6, 7, 17, 8, 9, 14, 10, 11, 18, 12, 13, 15))>

In [265]:
g2 = nx.contracted_nodes(g, 17, 8, self_loops=False)

In [35]:
draw_graph(g2, file_dir='.', file_name='test_merge17_8')

In [123]:
g2.nodes(data=True)

NodeDataView({1: {}, 2: {}, 3: {}, 16: {}, 4: {}, 5: {}, 6: {}, 7: {}, 17: {'contraction': {8: {}}}, 9: {}, 14: {}, 10: {}, 11: {}, 18: {}, 12: {}, 13: {}, 15: {}})

In [267]:
set(n for n in g2.neighbors(17))

{7, 9}

In [266]:
g2.degree(17)

2

In [44]:
list(g2.nodes)

[1, 2, 3, 16, 4, 5, 6, 7, 17, 9, 14, 10, 11, 18, 12, 13, 15]

In [49]:
g2.nodes.get(17)

{'contraction': {8: {}}}

In [59]:
[i for i in nx.all_neighbors(g2, 1)]

[2]

In [61]:
nx.is_forest(g2)

False

In [63]:
# Return oriented tree constructed from a depth-first-search from source
# https://pelegm-networkx.readthedocs.io/en/latest/reference/algorithms.traversal.html
g3 = nx.dfs_tree(g2)

In [64]:
draw_graph(g3, file_dir='.', file_name='test_merge_dfs_tree')

In [65]:
nx.is_tree(g3)

True

In [67]:
g3.neighbors(16)

<dict_keyiterator at 0x7f0667e48598>

In [69]:
dir(g3)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adj',
 '_node',
 '_pred',
 '_succ',
 'add_edge',
 'add_edges_from',
 'add_node',
 'add_nodes_from',
 'add_weighted_edges_from',
 'adj',
 'adjacency',
 'adjlist_inner_dict_factory',
 'adjlist_outer_dict_factory',
 'clear',
 'copy',
 'degree',
 'edge_attr_dict_factory',
 'edge_subgraph',
 'edges',
 'get_edge_data',
 'graph',
 'graph_attr_dict_factory',
 'has_edge',
 'has_node',
 'has_predecessor',
 'has_successor',
 'in_degree',
 'in_edges',
 'is_directed',
 'is_multigraph',
 'name',
 'nbunch_iter',
 'neighbors',
 'node_attr_dict_factory',
 'node_dict_factory',
 'n

In [74]:
dict(g3.out_degree)

{1: 1,
 2: 1,
 3: 1,
 16: 2,
 4: 1,
 5: 1,
 6: 0,
 7: 1,
 17: 1,
 9: 1,
 14: 2,
 10: 1,
 11: 1,
 18: 2,
 12: 0,
 13: 0,
 15: 0}

In [77]:
g2.nodes(data=True)

NodeDataView({1: {}, 2: {}, 3: {}, 16: {}, 4: {}, 5: {}, 6: {}, 7: {}, 17: {'contraction': {8: {}}}, 9: {}, 14: {}, 10: {}, 11: {}, 18: {}, 12: {}, 13: {}, 15: {}})

In [79]:
g4 = nx.contracted_nodes(g3, 14, 15)

In [80]:
g4.nodes(data=True)

NodeDataView({1: {}, 2: {}, 3: {}, 16: {}, 4: {}, 5: {}, 6: {}, 7: {}, 17: {}, 9: {}, 14: {'contraction': {15: {}}}, 10: {}, 11: {}, 18: {}, 12: {}, 13: {}})

In [84]:
for node in g4.nodes(data=True):
    print(node)

(1, {})
(2, {})
(3, {})
(16, {})
(4, {})
(5, {})
(6, {})
(7, {})
(17, {})
(9, {})
(14, {'contraction': {15: {}}})
(10, {})
(11, {})
(18, {})
(12, {})
(13, {})


In [93]:
g4.nodes.get(14)

{'contraction': {15: {}}}

In [104]:
list(g4.successors(14)), list(g4.predecessors(14)), list(g4.successors(18))

([10, 14], [9, 14], [12, 13])

In [108]:
list(g4.neighbors(14)), list(g2.neighbors(14))

([10, 14], [9, 10, 15])

In [107]:
print(dict(g4.in_degree()))

{1: 0, 2: 1, 3: 1, 16: 1, 4: 1, 5: 1, 6: 1, 7: 1, 17: 1, 9: 1, 14: 2, 10: 1, 11: 1, 18: 1, 12: 1, 13: 1}


In [112]:
g.degree?

[0;31mType:[0m        property
[0;31mString form:[0m <property object at 0x7f0698429728>
[0;31mDocstring:[0m  
A DegreeView for the Graph as G.degree or G.degree().

The node degree is the number of edges adjacent to the node.
The weighted node degree is the sum of the edge weights for
edges incident to that node.

This object provides an iterator for (node, degree) as well as
lookup for the degree for a single node.

Parameters
----------
nbunch : single node, container, or all nodes (default= all nodes)
    The view will only report edges incident to these nodes.

weight : string or None, optional (default=None)
   The name of an edge attribute that holds the numerical value used
   as a weight.  If None, then each edge has weight 1.
   The degree is the sum of the edge weights adjacent to the node.

Returns
-------
If a single node is requested
deg : int
    Degree of the node

OR if multiple nodes are requested
nd_view : A DegreeView object capable of iterating (node, degree)

In [111]:
print(dict(g.degree))

{1: 1, 2: 2, 3: 2, 16: 3, 4: 2, 5: 2, 6: 1, 7: 2, 17: 3, 8: 1, 9: 2, 14: 3, 10: 2, 11: 2, 18: 3, 12: 1, 13: 1, 15: 1}


In [115]:
[k for k,v in dict(g.degree).items() if v == 1]

[1, 6, 8, 12, 13, 15]

In [231]:
list(g.neighbors(14))

[9, 10, 15]

In [131]:
g.nodes[1]['smiles'] = 'sds'

In [134]:
g.nodes[1].get('smiles', '')

'C#C'

In [135]:
g.name = 'ss'

In [137]:
dir(g)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_adj',
 '_node',
 'add_edge',
 'add_edges_from',
 'add_node',
 'add_nodes_from',
 'add_weighted_edges_from',
 'adj',
 'adjacency',
 'adjlist_inner_dict_factory',
 'adjlist_outer_dict_factory',
 'clear',
 'copy',
 'degree',
 'edge_attr_dict_factory',
 'edge_subgraph',
 'edges',
 'get_edge_data',
 'graph',
 'graph_attr_dict_factory',
 'has_edge',
 'has_node',
 'is_directed',
 'is_multigraph',
 'name',
 'nbunch_iter',
 'neighbors',
 'node_attr_dict_factory',
 'node_dict_factory',
 'nodes',
 'number_of_edges',
 'number_of_nodes',
 'order',
 'remove_edge',
 'remove_edg

In [178]:
g.copy?

[0;31mSignature:[0m [0mg[0m[0;34m.[0m[0mcopy[0m[0;34m([0m[0mas_view[0m[0;34m=[0m[0;32mFalse[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Returns a copy of the graph.

The copy method by default returns an independent shallow copy
of the graph and attributes. That is, if an attribute is a
container, that container is shared by the original an the copy.
Use Python's `copy.deepcopy` for new containers.

If `as_view` is True then a view is returned instead of a copy.

Notes
-----
All copies reproduce the graph structure, but data attributes
may be handled in different ways. There are four types of copies
of a graph that people might want.

Deepcopy -- A "deepcopy" copies the graph structure as well as
all data attributes and any objects they might contain.
The entire graph object is new so that changes in the copy
do not affect the original object. (see Python's copy.deepcopy)

Data Reference (Shallow) -- For a shallow copy the graph structure
is copied bu

### test refragment

In [288]:
fragment2frequency = pd.read_csv('demo_data_refragment/step1_frag2num.csv', index_col=0)
fragment2frequency.head()

Unnamed: 0_level_0,count,frequency
fragment,Unnamed: 1_level_1,Unnamed: 2_level_1
CC,5338219,0.251859
CN,4079814,0.192487
C,2128532,0.100425
C=O,1856788,0.087604
CO,1551646,0.073207


In [213]:
fragment2frequency.loc['CF', :]

count        491767.000000
frequency         0.023202
Name: CF, dtype: float64

In [215]:
def sanitize(mol):
    try:
        smiles = get_smiles(mol)
        mol = get_mol(smiles)
    except Exception as e:
        return None
    return mol

def get_smiles(mol):
    """
    mol obj -> SMILES
    :param mol:
    :return:
    """
    return Chem.MolToSmiles(mol, kekuleSmiles=True)

def get_mol(smiles):
    """
    SMILES -> mol obj
    :param smiles:
    :return:
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    Chem.Kekulize(mol)
    return mol

In [359]:
class Refragment(object):
    def __init__(self, g, f2f, smiles, test=False):
        """
        g: moelcular graph created by networkx with SMILES and mol_inx for each node
        f2f: fragment2frequency, a dataframe with fragment(SMILES) as index, count/frequency
        smiles: the SMILES of the whold molecule
        mol_inx means the index of each atom in the whole molecule, it's a unique id for each atom
        """
        self.smiles = smiles
        self.g = g
        self.f2f = f2f
        self.test = test

    def get_node_by_degree(self, d=1):
        """
        # The node degree is the number of edges adjacent to the node
        degree equals to 1 means all leaves on the end of graph
        :param d: degree, 1/2
        :return: return all nodes with specific degree
        """
        node_degree = dict(self.g.degree)
        return [k for k, v in node_degree.items() if v == d]
    
    def get_degree_by_node(self, node_id):
        node_degree = dict(self.g.degree)
        return node_degree[node_id]

    def get_neighbors(self, node_id):
        neigs = list(self.g.neighbors(node_id))
        return neigs

    def check_if_merge(self, node1_id, node2_id):
        """
        check if need to merge these two nodes depend on the frequency of each node
        """
        mean_freq = self.get_mean_frequency()
        if self.test:
            print('    >check if merge: node1 {}, node2 {}'.format(node1_id, node2_id))
        node1_freq = self.get_freq(node1_id)
        node2_freq = self.get_freq(node2_id)
        if self.test:
            print('    >node1_freq: {}, node2_freq: {}'.format(node1_freq, node2_freq))
        if (node1_freq >= mean_freq) and (node2_freq >= mean_freq):
            return True
        return False

    def get_freq(self, node_id):
        """
        get frequency by node id
        """
        smiles = self.get_node_attr(node_id, 'smiles')
        if self.test:
            print('    >node id is: {}, smiles is: {}'.format(node_id, smiles))
        return self.f2f.loc[smiles, 'frequency']

    def get_node_attr(self, node_id, attr):
        """
        get node attribute by node id
        attr: smiles/mol_inx
        """
        if self.test:
            print('    >node id is: ', node_id)
            print(type(self.g.nodes[node_id]), self.g.nodes(data=True))
        return self.g.nodes[node_id].get(attr, '')

    # def set_node_attr()

    def get_mean_frequency(self, min_count=3):
        """
        mean of the frequency for all fragments which count >= min_count
        """
        mean_freq = self.f2f.loc[self.f2f['count'] >= min_count, 'frequency'].mean()
        return mean_freq

    def _merge_smiles(self, node1_id, node2_id):
        node1_inx_cluster = self.get_node_attr(node1_id, 'mol_inx')
        node2_inx_cluster = self.get_node_attr(node2_id, 'mol_inx')
        if self.test:
            print('    >The mol_inx of node {} is {}'.format(node1_id, node1_inx_cluster))
            print('    >The mol_inx of node {} is {}'.format(node2_id, node2_inx_cluster))
        inx_cluster = set(node1_inx_cluster) | set(node2_inx_cluster)
        merged_smiles = self._get_smiles_by_inx(inx_cluster)
        return {'merged_smiles': merged_smiles, 'merged_inx': inx_cluster}

    def merge_two_nodes(self, left_id, right_id):
        """
        remove right node to left node, and right_id will be delete;
        merge SMILES of these two nodes;
        add new fragment to self.f2f;
        update count and frequency in self.f2f
        """
        if self.check_if_merge(left_id, right_id):
            raw_smiles_left = self.g.nodes[left_id]['smiles']
            raw_smiles_right = self.g.nodes[right_id]['smiles']
            g2 = nx.contracted_nodes(self.g, left_id, right_id, self_loops=False)
            merged_result = self._merge_smiles(left_id, right_id)
            merged_smiles = merged_result['merged_smiles']
            g2.nodes[left_id]['smiles'] = merged_smiles
            g2.nodes[left_id]['mol_inx'] = list(merged_result['merged_inx'])
            if self.test:
                print('    >Merged result is: {}'.format(merged_result))
                print('    >New network: {}'.format(g2.nodes(data=True)))

            if merged_smiles not in self.f2f.index:
                self.f2f.loc[merged_smiles, 'count'] = 0
            self.f2f.loc[merged_smiles, 'count'] += 1
            self.f2f.loc[raw_smiles_left, 'count'] -= 1
            self.f2f.loc[raw_smiles_right, 'count'] -= 1
            self.f2f['frequency'] = self.f2f['count'] / self.f2f['count'].sum()
            self.g = g2.copy()

    def _get_mol(self):
        """
        SMILES -> mol obj
        :param smiles:
        :return:
        """
        mol = Chem.MolFromSmiles(self.smiles)
        if mol is None:
            return None
        Chem.Kekulize(mol)
        return mol

    def _get_smiles_by_inx(self, inx_cluster):
        """
        get a subset smiles in the whole molecule by inx_cluster
        :param inx_cluster: a set of atom index in molecule, at least contains two elements
        :return:
        """
        mol = self._get_mol()
        if self.test:
            print('    >atom index cluster: {}'.format(inx_cluster))
        smiles = Chem.MolFragmentToSmiles(mol, inx_cluster, kekuleSmiles=True)
        new_mol = Chem.MolFromSmiles(smiles, sanitize=False)
        # new_mol = copy_edit_mol(new_mol).GetMol()
        new_mol = sanitize(new_mol)  # We assume this is not None
        return get_smiles(new_mol)

    def update(self):
        """
        main part of this class
        find all leaves (only have one neighbor) and merge with their neighbor if needed
        """
        for d in range(1,3):
            # d is 1 or 2
            if self.test:
                print('---------------------------------degree {}--------------------------'.format(d))
            nodes = self.get_node_by_degree(d=d)  # a list of node id
            for node in nodes:
                if node in list(self.g.nodes):
                    neighbors = self.get_neighbors(node)  # a list of node id
                    if self.test:
                        print()
                        print('## Current node is: {}'.format(node))
                        print('  >>> Neighbors of this node are : {}'.format(','.join([str(i) for i in neighbors])))
                    for neighbor in neighbors:
                        # neighbor may be deleted on this process, so need to check if it exists
                        if d == 1: # degree = 1, only leaves
                            if self.test:
                                print('  >>> Start to check if {} and {} can be merged...'.format(neighbor, node))
                            if (neighbor in list(self.g.nodes)) and self.check_if_merge(neighbor, node):
                                if self.test:
                                    print('  >>> Start to merge {} to {}...'.format(node, neighbor))
                                self.merge_two_nodes(left_id=neighbor, right_id=node)
                        if d == 2:
                            if self.get_degree_by_node(neighbor)==2:  # degree = 2, only merge with the neighbor which degree is 2
                                if self.test:
                                    print('    >the degree of neighbor {} is {}'.format(neighbor, self.get_degree_by_node(neighbor)))
                                    print('  >>> Start to check if {} and {} can be merged...'.format(neighbor, node))
                                if (neighbor in list(self.g.nodes)) and self.check_if_merge(neighbor, node):
                                    if self.test:
                                        print('  >>> Start to merge {} to {}...'.format(neighbor, node))
                                    self.merge_two_nodes(left_id=node, right_id=neighbor)

        n2n = {n: list(self.g.neighbors(n)) for n in list(self.g.nodes())}  # node 2 neighbors, {id: [], ... }
        id2smiles = nx.get_node_attributes(self.g, 'smiles')
        id2mol_inx = nx.get_node_attributes(self.g, 'mol_inx')
        return {'n2n': n2n, 'id2smiles': id2smiles, 'f2f': self.f2f, 'id2mol_inx': id2mol_inx}

In [360]:
SMILES = 'Cc1nc2c([nH]1)c(=O)n(C)c(=O)n2CC1CC=CCC1'

n2n = {"1": [7], "2": [8], "3": [8], "4": [8], "5": [6, 8], "6": [5, 9], "7": [1, 8], "8": [2, 3, 4, 5, 7], "9": [6]}

id2smiles = {"1": "CC", "2": "C=O", "3": "CN", "4": "C=O", "5": "CN", "6": "CC", "7": "C1=C[NH]C=N1", "8": "C1=CNCNC1", "9": "C1=CCCCC1"}

id2mol_inx = {"1": [0, 1], "2": [6, 7], "3": [8, 9], "4": [10, 11], "5": [12, 13], "6": [13, 14], "7": [1, 2, 3, 4, 5], "8": [6, 8, 10, 12, 3, 4], "9": [15, 16, 17, 18, 19, 14]}

id2smile_attr = {int(k): {'smiles': v} for k,v in id2smiles.items()}
id2mol_inx_attr = {int(k): {'mol_inx': v} for k,v in id2mol_inx.items()}
n2n = {int(i): j for i,j in n2n.items()}

g = mol2network(n2n)
nx.set_node_attributes(g, id2smile_attr)
nx.set_node_attributes(g, id2mol_inx_attr)

In [361]:
g.nodes[1].get('smiles', '')

'CC'

In [362]:
refragment = Refragment(g=g, f2f=fragment2frequency, smiles=SMILES, test=True)
refragment_result = refragment.update()
refragment_result

---------------------------------degree 1--------------------------

## Current node is: 1
  >>> Neighbors of this node are : 7
  >>> Start to check if 7 and 1 can be merged...
    >check if merge: node1 7, node2 1
    >node id is:  7
<class 'dict'> [(1, {'smiles': 'CC', 'mol_inx': [0, 1]}), (7, {'smiles': 'C1=C[NH]C=N1', 'mol_inx': [1, 2, 3, 4, 5]}), (2, {'smiles': 'C=O', 'mol_inx': [6, 7]}), (8, {'smiles': 'C1=CNCNC1', 'mol_inx': [6, 8, 10, 12, 3, 4]}), (3, {'smiles': 'CN', 'mol_inx': [8, 9]}), (4, {'smiles': 'C=O', 'mol_inx': [10, 11]}), (5, {'smiles': 'CN', 'mol_inx': [12, 13]}), (6, {'smiles': 'CC', 'mol_inx': [13, 14]}), (9, {'smiles': 'C1=CCCCC1', 'mol_inx': [15, 16, 17, 18, 19, 14]})]
    >node id is: 7, smiles is: C1=C[NH]C=N1
    >node id is:  1
<class 'dict'> [(1, {'smiles': 'CC', 'mol_inx': [0, 1]}), (7, {'smiles': 'C1=C[NH]C=N1', 'mol_inx': [1, 2, 3, 4, 5]}), (2, {'smiles': 'C=O', 'mol_inx': [6, 7]}), (8, {'smiles': 'C1=CNCNC1', 'mol_inx': [6, 8, 10, 12, 3, 4]}), (3, {'smi

{'n2n': {1: [7],
  7: [1, 8],
  2: [8],
  8: [7, 2, 3, 4, 5],
  3: [8],
  4: [8],
  5: [8, 9],
  9: [5]},
 'id2smiles': {1: 'CC',
  7: 'C1=C[NH]C=N1',
  2: 'C=O',
  8: 'C1=CNCNC1',
  3: 'CN',
  4: 'C=O',
  5: 'CCN',
  9: 'C1=CCCCC1'},
 'f2f':                         count     frequency
 fragment                                   
 CC                  5338208.0  2.518584e-01
 CN                  4079802.0  1.924864e-01
 C                   2128530.0  1.004247e-01
 C=O                 1856787.0  8.760382e-02
 CO                  1551644.0  7.320707e-02
 ...                       ...           ...
 C1C2CC3CC(CC1O3)N2        1.0  4.718033e-08
 FCF                       1.0  4.718033e-08
 CCN                      11.0  5.189836e-07
 NC=O                      1.0  4.718033e-08
 COC                       1.0  4.718033e-08
 
 [537 rows x 2 columns],
 'id2mol_inx': {1: [0, 1],
  7: [1, 2, 3, 4, 5],
  2: [6, 7],
  8: [6, 8, 10, 12, 3, 4],
  3: [8, 9],
  4: [10, 11],
  5: [12, 13, 14],
  9: [15, 

In [303]:
fragment2frequency.loc['C1=C[NH]C=N1', :]

count        57913.000000
frequency        0.002732
Name: C1=C[NH]C=N1, dtype: float64

In [310]:
fragment2frequency.loc['CC', :]

count        5.338217e+06
frequency    2.518587e-01
Name: CC, dtype: float64

In [270]:
g_new.nodes(data=True)   # merge didn't successful and also need to update mol_inx in merged node

NodeDataView({1: {'smiles': 'C#C', 'mol_inx': [0, 1]}, 3: {'smiles': 'CCN', 'mol_inx': [1, 2, 3], 'contraction': {2: {'smiles': 'CC', 'mol_inx': [1, 2]}}}, 16: {'smiles': 'N', 'mol_inx': [3]}, 5: {'smiles': 'CCN', 'mol_inx': [3, 4, 5], 'contraction': {4: {'smiles': 'CN', 'mol_inx': [3, 4]}}}, 6: {'smiles': 'C#C', 'mol_inx': [5, 6]}, 17: {'smiles': 'NC=O', 'mol_inx': [8, 3, 7], 'contraction': {8: {'smiles': 'C=O', 'mol_inx': [7, 8]}, 7: {'smiles': 'CN', 'mol_inx': [3, 7]}}}, 9: {'smiles': 'CC', 'mol_inx': [7, 9]}, 14: {'smiles': 'C1=CCCC=C1', 'mol_inx': [9, 18, 17, 16, 11, 10]}, 11: {'smiles': 'COC', 'mol_inx': [18, 19, 20], 'contraction': {10: {'smiles': 'CO', 'mol_inx': [18, 19]}}}, 18: {'smiles': 'FCF', 'mol_inx': [20, 21, 22], 'contraction': {12: {'smiles': 'CF', 'mol_inx': [20, 21]}, 13: {'smiles': 'CF', 'mol_inx': [20, 22]}}}, 15: {'smiles': 'C1=CC=CC=C1', 'mol_inx': [12, 13, 14, 15, 16, 11]}})

In [282]:
nx.get_node_attributes(g_new, 'smiles')

{1: 'C#C',
 3: 'CCN',
 16: 'N',
 5: 'CCN',
 6: 'C#C',
 17: 'NC=O',
 9: 'CC',
 14: 'C1=CCCC=C1',
 11: 'COC',
 18: 'FCF',
 15: 'C1=CC=CC=C1'}

In [276]:
list(g_new.nodes())

[1, 3, 16, 5, 6, 17, 9, 14, 11, 18, 15]

In [278]:
n2n_new = {n: list(g_new.neighbors(n)) for n in list(g_new.nodes())}
n2n_new

{1: [3],
 3: [1, 16],
 16: [3, 5, 17],
 5: [16, 6],
 6: [5],
 17: [16, 9],
 9: [17, 14],
 14: [9, 15, 11],
 11: [14, 18],
 18: [11],
 15: [14]}

In [256]:
dict(g_new.degree)[17]

4

In [254]:
set(g_new.neighbors(17))

{7, 9, 17}

In [302]:
draw_graph(g, './figure/', 'refragment_debug.png')

In [203]:
f2f_new.head()

Unnamed: 0_level_0,count,frequency
fragment,Unnamed: 1_level_1,Unnamed: 2_level_1
CC,5338219.0,0.251858
CN,4079814.0,0.192487
C,2128532.0,0.100425
C=O,1856798.0,0.087604
CO,1551646.0,0.073207


In [209]:
f2f_new.loc['C1=CCCC=C1',:]

count        29003.000000
frequency        0.001368
Name: C1=CCCC=C1, dtype: float64

In [222]:
f2f_new.loc['FCF',:]

count        1.000000e+00
frequency    4.718030e-08
Name: FCF, dtype: float64

In [246]:
fragment2frequency.loc['C=O',:]

count        1.856788e+06
frequency    8.760389e-02
Name: C=O, dtype: float64

In [229]:
17 in list(g_new.nodes)

True

In [291]:
g.name = 'sd'
g.name

'sd'

In [297]:
f2f_new.loc['CC']

count        5.338204e+06
frequency    2.518586e-01
Name: CC, dtype: float64