# Step-by-step Add All Edges Algorithm

This notebook will illustrate the use of numpy vector based methods to speed up the process of identifying and adding edges to a graph when using distance between nodes (for which the nodes are numbers)

In [1]:
from pathlib import Path
from multiprocessing import Pool
from typing import Generator
import sys

from tqdm import tqdm
from docopt import docopt
import numpy as np
from pandas import read_csv, concat, DataFrame
import networkx as nx

In [30]:
args = {
    'insertion_dir': Path('/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/2020_SB-insertions/'),
    'output': Path('/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/2020_SB-graphs/'),
    
    # 'insertion_dir': Path("/research/labs/immunology/rogerslm/m277102/projects/NetCIS/2020_SB/all_files-insertions/"),
    # 'output': Path("/research/labs/immunology/rogerslm/m277102/projects/NetCIS/2020_SB/all_files-graphs"),

    'verbose': 1,
    'jobs': 8,
    'threshold': 50000,
}

In [31]:
# prepare output
out_dir_case = args['output'] / "case"
out_dir_case.mkdir(parents=True, exist_ok=True)
out_dir_control = args['output'] / "control"
out_dir_control.mkdir(parents=True, exist_ok=True)

In [35]:
# get all files in data dir, load each file as pandas.DataFrame, and add meta data based on the file name
insert_list = []
for file in args["insertion_dir"].iterdir():
    tmp_df = read_csv(file, sep="\t")
    tumor_model, sample_id, tissue_type = file.name.split("-")
    tmp_df["tumor_model"] = tumor_model
    tmp_df["sample_id"] = sample_id
    tmp_df["tissue"] = tissue_type.split(".")[0]  # RT/LT/S
    insert_list.append(tmp_df)
inserts_df = concat(insert_list, ignore_index=True)
display(inserts_df)

Unnamed: 0,chr,pos,strand,ref_length,query_length,read_length,mapping_quality,read_name,TA_location,read_first_last,ref_first_last,tpn_promoter_orient,library,tumor_model,sample_id,tissue
0,chr2,17857638,-,110,148,148,42,K00275:54:HHHJTBBXX:8:2123:25185:28850,none,TGGTAATACG-CTGGTCTCTT,AAGAGACCAG-AGAATGGATG,+,IRL,B16,1_1,RT
1,chr2,17857638,-,110,148,148,42,K00275:54:HHHJTBBXX:8:2120:7456:22854,none,TGGTAATACG-CTGGTCTCTT,AAGAGACCAG-AGAATGGATG,+,IRL,B16,1_1,RT
2,chr11,26361587,+,100,148,148,41,K00275:54:HHHJTBBXX:8:1121:10541:11741,none,GTTTCACCCG-GTTTTTCAAC,ATGTCAGATT-TTCGTTTTTC,-,IRL,B16,1_1,RT
3,chr19,17668823,-,86,147,147,28,K00275:54:HHHJTBBXX:8:1110:7994:47454,none,TCGCCGGATT-AAGTCTCTGC,CCTTCACATC-CtTGACATTT,+,IRL,B16,1_1,RT
4,chr1,24917063,+,61,122,122,28,K00275:54:HHHJTBBXX:8:2111:19015:6660,none,GCTTGGTAAT-TACCGTAGCA,GCTTGGTAAT-GCCATGAGAT,+,IRR,B16,1_1,RT
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
593,chr18,70501609,+,123,123,123,39,K00275:54:HHHJTBBXX:8:2228:19796:1930,none,GGGCATTGCT-TGGATGTCTT,GGGCATTGCT-TGGATGTCTT,+,IRR,B16,1_1,S
594,chr19,32636552,+,120,120,120,44,K00275:54:HHHJTBBXX:8:1216:30553:45502,none,ATCTGTCTTC-ATTCATTACC,ATCTGTCTTC-ATTCATTACC,+,IRR,B16,1_1,S
595,chrX,18488703,+,49,121,121,22,K00275:54:HHHJTBBXX:8:1215:31964:33985,none,GAGGGCCCTC-TACTCTGGAC,GAGGGCCCTC-CTCTTACTAT,+,IRR,B16,1_1,S
596,chrX,97213468,-,46,122,122,22,K00275:54:HHHJTBBXX:8:1206:26098:32297,first,TAGCATTGAG-TGGACGAATC,ATTTTGATTT-CTCAATGCTA,-,IRR,B16,1_1,S


In [36]:
# separate data into case/controls
insert_case = inserts_df[inserts_df["tissue"] == "S"]
insert_control = inserts_df[inserts_df["tissue"] != "S"]

# get all chromosomes to separate further the case/controls dataframes
chrom_list = np.unique(inserts_df["chr"].to_numpy())

## Construct graph (network) of insertions

In [185]:
def add_nodes(insertion_df):
    def add_node(insert):
        node = insert.pos
        attr = {
            "position": insert.pos,
            "chrom": insert.chr,
            "counts": insert.counts,
            "counts_irr": insert.counts_irr,
            "counts_irl": insert.counts_irl,
            "counts_trp_orient_pos": insert.counts_trp_orient_pos,
            "counts_trp_orient_neg": insert.counts_trp_orient_neg,
        }
        return (node, attr)
    return [ add_node(x) for x in insertion_df.itertuples() ]

def find_edges(ordered_nodes, threshold):
    # nodes are inherently ordered as they are added in the graph,
    # however, the ordering doens't have to numerically make sense for this function
    
    
    # remove the transposon orientation from the end of the node name
    tmp_order = [ int(x.split("|")[0]) for x in ordered_nodes ]
    # check if this changes the number of unique nodes.
    # If we have + and - at the same location, this assert will fail.
    # This isn't a bad thing but I want to know when it is happening
    assert len(np.unique(ordered_nodes)) == len(np.unique(tmp_order))
    
    # cast the nodes into a numpy array that can be used to broadcast into a symmetric matrix of distances
    nodes = np.array(tmp_order).reshape(-1, 1)
    dist_nodes = np.abs(nodes - nodes.T)  # symmetric 2d array
    
    # cis nodes are those that are under the threshold
    cis_nodes = dist_nodes <= threshold  # symmetric 2d array
    
    # get the indices of the lower left triangle of the symmetric matrix.
    # edges_ind is a tuple of two array. The same index location in both arrays is used 
    # to index a single value from the symmetric matrix. This results in two very long 
    # arrays that will index all the values of the lower left triangle of the matrix
    edges_ind = np.tril_indices_from(cis_nodes, k=-1) # tuple of two 1d arrays
    
    # keep nodes that are under the threshold
    keep_nodes = cis_nodes[edges_ind]  # 1d array
    
    # set up the nodes to be a numpy array for easy indexing
    ordered_nodes = np.array(ordered_nodes)  # 1d array
    
    # get the actual node names for the lower left triangle via as the column
    nodes1 = ordered_nodes[edges_ind[1][keep_nodes]]  # 1d array
    # the rows
    nodes2 = ordered_nodes[edges_ind[0][keep_nodes]]  # 1d array
    # and edge weights (TODO: which can be modified for a differnt weighting method, maybe 1 / log10(x) instead?)
    nodes_dist = 1 / dist_nodes[edges_ind][keep_nodes]  # 1d array
    # combine the nodes and weights into an iterable that can be passed wholly into the graph
    # an edge is defined as the first node, the second node, and then a dict of attributes, such as weight
    edges_to_add = [ (x, y, {"weight": z}) for x, y, z in zip(nodes1, nodes2, nodes_dist) ]
    return edges_to_add

def create_graph(chrom_df: DataFrame, threshold, save_file, verbose=0) -> None:
    G = nx.Graph()
    
    # prepare the insertions by grouping them together
    # find the total count of insertions and the counts per sequencing library (IRR/IRL)
    insert_cols = ['chr', 'pos', 'tpn_promoter_orient', 'library']
    tmp = chrom_df.groupby(by=insert_cols, as_index=False, dropna=False)['read_name'].count()
    tmp['count'] = tmp.pop('read_name')
    count_irr = np.where(tmp['library'] == 'IRR', tmp['count'], 0)
    count_irl = np.where(tmp['library'] == 'IRL', tmp['count'], 0)
    tmp.insert(5, "count_irr", count_irr)
    tmp.insert(6, "count_irl", count_irl)
    
    # group insertions without the sequencing library. 
    # As long as the transposon orientation, chromosome, and position are the same, 
    # then it does not matter which library the insertion came from
    node_cols = ['chr', 'pos', 'tpn_promoter_orient']
    insertion_nodes = tmp.groupby(by=node_cols, as_index=False, dropna=False).sum(numeric_only=True)
    insertion_nodes['read_names'] = chrom_df.groupby(by=node_cols, dropna=False, group_keys=False)['read_name'].apply(list).reset_index(drop=True)
    
    # TODO: need to get count of tpn_promoter_orient at each position. Then the nodes can be simply positions
    
    # TODO: for some reason there are few insertions that occur both in IRR and IRL. 
    # Why is that and does this change with the new preprocessing scripts?
    # both_libs = insertion_nodes[ (insertion_nodes['count_irl'] != 0) & (insertion_nodes['count_irr'] != 0) ]
    
    # add nodes and edges to graph
    G.add_nodes_from(add_nodes(insertion_nodes))
    G.add_edges_from(find_edges(G.nodes(), threshold))
    
    return G

def graph_properties(G):
    print(f"number of nodes: {G.number_of_nodes()}")
    print(f"number of edges: {G.number_of_edges()}")
    num_inserts = 0
    for node in G.nodes:
        num_inserts += G.nodes[node]['counts']
    print(f"number of insertions: {num_inserts}")

In [123]:
# test run of toy-data
G = create_graph(insert_control[insert_control['chr'] == "chr6"] , 50000, out_dir_case / "chr1.graphml")
graph_properties(G)

AssertionError: 

# Breakdown of find_edges()

## prepare input data

In [186]:
def create_graph_generator(chrom_list, insert_case, insert_control, threshold, case_dir, control_dir) -> Generator[tuple, None, None]:
    for chrom in chrom_list:
        print(chrom)
        insert_case_chrom = insert_case[insert_case['chr'] == chrom]    
        insert_control_chrom = insert_control[insert_control['chr'] == chrom]
        case_file = case_dir / f"{chrom}.graphml"
        control_file = control_dir / f"{chrom}.graphml"
        yield ( insert_case_chrom, insert_control_chrom, threshold, case_file, control_file )

iter_gen = create_graph_generator(chrom_list, insert_case, insert_control, args['threshold'], out_dir_case, out_dir_control)
for _ in range(15):
    next(iter_gen)
insert_case_chrom, insert_control_chrom, threshold, case_file, control_file = next(iter_gen)


chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr3
chr4
chr5
chr6


## make graph with nodes

In [190]:
chrom_df = insert_control_chrom.copy()



G = nx.Graph()

# get counts for library and tpn orientation
chrom_df.insert(4, "counts_irr", np.where(chrom_df['library'] == 'IRR', 1, 0))
chrom_df.insert(5, "counts_irl", np.where(chrom_df['library'] == 'IRL', 1, 0))
chrom_df.insert(6, "counts_trp_orient_pos", np.where(chrom_df['tpn_promoter_orient'] == '+', 1, 0))
chrom_df.insert(7, "counts_trp_orient_neg", np.where(chrom_df['tpn_promoter_orient'] == '-', 1, 0))
insertion_nodes = chrom_df.groupby(by=['chr', 'pos'], as_index=False, dropna=False)[["counts_irr", "counts_irl", "counts_trp_orient_pos", "counts_trp_orient_neg"]].sum()
insertion_nodes.insert(2, "counts", chrom_df.groupby(by=['chr', 'pos'], as_index=False, dropna=False)['read_name'].count().pop('read_name'))
display(insertion_nodes)


# TODO: for some reason there are few insertions that occur both in IRR and IRL. 
# Why is that and does this change with the new preprocessing scripts?
# both_libs = insertion_nodes[ (insertion_nodes['count_irl'] != 0) & (insertion_nodes['count_irr'] != 0) ]


# add nodes and edges to graph
G.add_nodes_from(add_nodes(insertion_nodes))

Unnamed: 0,chr,pos,counts,counts_irr,counts_irl,counts_trp_orient_pos,counts_trp_orient_neg
0,chr6,47788799,1,1,0,0,1
1,chr6,47788840,1,1,0,0,1
2,chr6,47788891,1,1,0,0,1
3,chr6,53233181,1,1,0,0,1
4,chr6,53233182,1,1,0,0,1
...,...,...,...,...,...,...,...
76,chr6,86910875,1,1,0,1,0
77,chr6,86910879,1,1,0,0,1
78,chr6,86910884,2,2,0,0,2
79,chr6,133573821,1,1,0,1,0


## find all edges 

In [126]:
# nodes are inherently ordered as they are added in the graph,
# however, the ordering doens't have to numerically make sense for this function
ordered_nodes = G.nodes()
ordered_nodes

NodeView(('47788799|-', '47788840|-', '47788891|-', '53233181|-', '53233182|-', '53233183|-', '53233184|-', '53233191|-', '53233197|-', '53233203|-', '53233205|-', '53233206|-', '53233207|-', '53233209|-', '53233210|-', '53233215|-', '53233216|-', '53233218|-', '53233220|-', '53233222|+', '53233222|-', '53233223|-', '53233225|-', '53233226|+', '53233230|-', '53233231|-', '53233232|-', '53233234|-', '53233235|-', '53233236|-', '53233240|-', '53233241|-', '53233242|+', '53233243|+', '53233244|-', '53233252|-', '53233254|-', '53233255|-', '53233256|-', '53233257|-', '53233259|-', '53233260|-', '53233262|-', '53233263|-', '53233265|-', '53233266|-', '53233267|-', '53233272|-', '53233274|-', '53233276|+', '53233276|-', '53233278|+', '53233285|+', '53233288|+', '86910668|+', '86910791|-', '86910794|-', '86910809|-', '86910827|+', '86910831|-', '86910835|-', '86910839|-', '86910840|-', '86910842|-', '86910845|+', '86910845|-', '86910849|-', '86910853|-', '86910856|-', '86910858|-', '86910860|

In [127]:
# remove the transposon orientation from the end of the node name
tmp_order = [ int(x.split("|")[0]) for x in ordered_nodes ]
tmp_order

[47788799,
 47788840,
 47788891,
 53233181,
 53233182,
 53233183,
 53233184,
 53233191,
 53233197,
 53233203,
 53233205,
 53233206,
 53233207,
 53233209,
 53233210,
 53233215,
 53233216,
 53233218,
 53233220,
 53233222,
 53233222,
 53233223,
 53233225,
 53233226,
 53233230,
 53233231,
 53233232,
 53233234,
 53233235,
 53233236,
 53233240,
 53233241,
 53233242,
 53233243,
 53233244,
 53233252,
 53233254,
 53233255,
 53233256,
 53233257,
 53233259,
 53233260,
 53233262,
 53233263,
 53233265,
 53233266,
 53233267,
 53233272,
 53233274,
 53233276,
 53233276,
 53233278,
 53233285,
 53233288,
 86910668,
 86910791,
 86910794,
 86910809,
 86910827,
 86910831,
 86910835,
 86910839,
 86910840,
 86910842,
 86910845,
 86910845,
 86910849,
 86910853,
 86910856,
 86910858,
 86910860,
 86910861,
 86910862,
 86910863,
 86910867,
 86910868,
 86910869,
 86910870,
 86910874,
 86910875,
 86910879,
 86910884,
 133573821,
 133744646]

In [128]:
# check if this changes the number of unique nodes.
# If we have + and - at the same location, this assert will fail.
# This isn't a bad thing but I want to know when it is happening
assert len(np.unique(ordered_nodes)) == len(np.unique(tmp_order))

AssertionError: 

In [114]:
# cast the nodes into a numpy array that can be used to broadcast into a symmetric matrix of distances
nodes = np.array(tmp_order).reshape(-1, 1)
# find absolute difference between all pairwise nodes
dist_nodes = np.abs(nodes - nodes.T)  # symmetric 2d array
dist_nodes

array([[        0,  15365519,  18285690,  50390311,  54866502,  75260569,
         94221039,  94221125,  94221132, 137548273],
       [ 15365519,         0,   2920171,  35024792,  39500983,  59895050,
         78855520,  78855606,  78855613, 122182754],
       [ 18285690,   2920171,         0,  32104621,  36580812,  56974879,
         75935349,  75935435,  75935442, 119262583],
       [ 50390311,  35024792,  32104621,         0,   4476191,  24870258,
         43830728,  43830814,  43830821,  87157962],
       [ 54866502,  39500983,  36580812,   4476191,         0,  20394067,
         39354537,  39354623,  39354630,  82681771],
       [ 75260569,  59895050,  56974879,  24870258,  20394067,         0,
         18960470,  18960556,  18960563,  62287704],
       [ 94221039,  78855520,  75935349,  43830728,  39354537,  18960470,
                0,        86,        93,  43327234],
       [ 94221125,  78855606,  75935435,  43830814,  39354623,  18960556,
               86,         0,        

In [115]:
# cis nodes are those that are under the threshold
cis_nodes = dist_nodes <= threshold  # symmetric 2d array
cis_nodes

array([[ True, False, False, False, False, False, False, False, False,
        False],
       [False,  True, False, False, False, False, False, False, False,
        False],
       [False, False,  True, False, False, False, False, False, False,
        False],
       [False, False, False,  True, False, False, False, False, False,
        False],
       [False, False, False, False,  True, False, False, False, False,
        False],
       [False, False, False, False, False,  True, False, False, False,
        False],
       [False, False, False, False, False, False,  True,  True,  True,
        False],
       [False, False, False, False, False, False,  True,  True,  True,
        False],
       [False, False, False, False, False, False,  True,  True,  True,
        False],
       [False, False, False, False, False, False, False, False, False,
         True]])

In [116]:
# get the indices of the lower left triangle of the symmetric matrix.
# edges_ind is a tuple of two array. The same index location in both arrays is used 
# to index a single value from the symmetric matrix. This results in two very long 
# arrays that will index all the values of the lower left triangle of the matrix
edges_ind = np.tril_indices_from(cis_nodes, k=-1) # tuple of two 1d arrays
edges_ind

(array([1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 7,
        7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
        9]),
 array([0, 0, 1, 0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 5, 0,
        1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        8]))

In [117]:
# keep nodes that are under the threshold
keep_nodes = cis_nodes[edges_ind]  # 1d array
keep_nodes

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False, False, False, False])

In [118]:
# set up the nodes to be a numpy array for easy indexing
ordered_nodes = np.array(G.nodes())  # 1d array
ordered_nodes

array(['17857615|-', '33223134|+', '36143305|-', '68247926|-',
       '72724117|+', '93118184|-', '112078654|-', '112078740|-',
       '112078747|+', '155405888|+'], dtype='<U11')

In [119]:
# get the actual node names for the lower left triangle via as the column
nodes1 = ordered_nodes[edges_ind[1][keep_nodes]]  # 1d array
nodes1

array(['112078654|-', '112078654|-', '112078740|-'], dtype='<U11')

In [120]:
# the rows
nodes2 = ordered_nodes[edges_ind[0][keep_nodes]]  # 1d array
nodes2

array(['112078740|-', '112078747|+', '112078747|+'], dtype='<U11')

In [121]:
# and edge weights (TODO: which can be modified for a differnt weighting method, maybe 1 / log10(x) instead?)
nodes_dist = 1 / dist_nodes[edges_ind][keep_nodes]  # 1d array
nodes_dist

array([0.01162791, 0.01075269, 0.14285714])

In [122]:
# combine the nodes and weights into an iterable that can be passed wholly into the graph
# an edge is defined as the first node, the second node, and then a dict of attributes, such as weight
edges_to_add = [ (x, y, {"weight": z}) for x, y, z in zip(nodes1, nodes2, nodes_dist) ]
edges_to_add

[('112078654|-', '112078740|-', {'weight': 0.011627906976744186}),
 ('112078654|-', '112078747|+', {'weight': 0.010752688172043012}),
 ('112078740|-', '112078747|+', {'weight': 0.14285714285714285})]