# Step-by-step Add All Edges Algorithm

This notebook illustrates how vectorized NumPy operations speed up identifying and adding edges to a graph when edges are defined by the distance between nodes (the nodes themselves are numbers, here insertion positions on a chromosome).

In [1]:
from pathlib import Path
from multiprocessing import Pool
from typing import Generator
import pickle
import sys

from tqdm import tqdm
from docopt import docopt
import numpy as np
import pandas as pd
from pandas import read_csv, concat, DataFrame
import networkx as nx

In [2]:
# Make the local NetCIS checkout importable from this notebook.
# NOTE(review): absolute, user-specific path — adjust when running on another machine.
module_path = "/home/fisch872/mat/projects/Laura-SB-Analysis/NetCIS/"
# guard so that re-running this cell does not keep appending duplicate sys.path entries
if module_path not in sys.path:
    sys.path.append(module_path)
from netcis import cis_networks as cn

In [3]:
# Notebook-wide settings; the whole dict is also handed to the netcis helpers below.
#
# Data locations used previously, kept for reference:
#   'insertion_dir': Path('/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/2020_SB-insertions/'),
#   'output': Path('/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/2020_SB-graphs/'),
#
#   'insertion_dir': Path("/research/labs/immunology/rogerslm/m277102/projects/NetCIS/2020_SB/all_files-insertions/"),
#   'output': Path("/research/labs/immunology/rogerslm/m277102/projects/NetCIS/2020_SB/all_files-graphs"),
args = {
    "output_prefix": "/home/fisch872/mat/projects/Laura-SB-Analysis/2020_SB-output/GRCm39/results",
    "verbose": 1,
    "njobs": 22,         # requested number of worker processes
    "threshold": 50000,  # presumably the max distance at which two insertions share an edge — confirm in netcis
}
# the input and output directories both hang off the common prefix
args.update(
    insertion_dir=Path(f"{args['output_prefix']}-insertions"),
    output=Path(f"{args['output_prefix']}-graphs"),
)

In [4]:
# Read every file in the insertion directory as a tab-separated table and stack
# them into a single DataFrame with a fresh 0..n-1 index.
# sorted(): Path.iterdir() yields entries in arbitrary order, so sorting keeps the
# row order of inserts_df (and everything derived from it) reproducible between runs.
insert_list = [
    read_csv(file, sep="\t") for file in sorted(args["insertion_dir"].iterdir())
]
inserts_df = concat(insert_list, ignore_index=True)

# np.unique returns the chromosome names sorted; Series.unique keeps first-seen order
chrom_list = np.unique(inserts_df["chr"].to_numpy())
treatment_list = inserts_df["treatment"].unique()
print(chrom_list)
print(treatment_list)

['chr1' 'chr10' 'chr11' 'chr12' 'chr13' 'chr14' 'chr15' 'chr16' 'chr17'
 'chr18' 'chr19' 'chr2' 'chr3' 'chr4' 'chr5' 'chr6' 'chr7' 'chr8' 'chr9'
 'chrM' 'chrX' 'chrY']
['LT' 'RT' 'S']


In [5]:
# Show the combined insertion table, then the number of distinct sample IDs in it.
display(inserts_df)
len(inserts_df["sampleID"].unique())

Unnamed: 0,chr,pos,strand,ref_length,query_length,read_length,mapping_quality,read_name,TA_location,read_first_last,ref_first_last,tpn_promoter_orient,library,treatment,sampleID
0,chr1,53601100,+,135,147,147,14,K00274:76:HHHM2BBXX:8:2128:16224:28446,none,GATAAATTTG-AGGTCCTAGA,GGAGcAGTTG-AGGTCCTAGA,-,IRL,LT,32_2
1,chr1,68392291,+,70,71,71,42,K00274:76:HHHM2BBXX:8:1108:29193:25984,first,TATATATAGA-GCCTCATTTT,TATATATAGA-TGCCTCATTT,+,IRR,LT,32_2
2,chr1,68392291,+,71,71,71,42,K00274:76:HHHM2BBXX:8:1227:5974:17685,first,TATATATAGA-GCCTCATTTC,TATATATAGA-GCCTCATTTC,+,IRR,LT,32_2
3,chr1,68392291,+,68,68,68,28,K00274:76:HHHM2BBXX:8:2108:24870:26265,first,TATATATAGA-TCTGCCTCAT,TATATATAGA-TCTGCCTCAT,+,IRR,LT,32_2
4,chr1,68392291,+,67,67,67,28,K00274:76:HHHM2BBXX:8:2202:3133:47084,first,TATATATAGA-TTCTGCCTCA,TATATATAGA-TTCTGCCTCA,+,IRR,LT,32_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242346,chr6,53233280,-,43,148,148,22,K00275:54:HHHJTBBXX:8:1124:3407:34477,none,GATTAAATGT-CTGCTTGAAA,ATTAAGATAT-AATTCCTGAC,-,IRR,LT,456_4
242347,chr6,53233280,-,43,72,72,22,K00275:54:HHHJTBBXX:8:1221:22445:26986,none,AGGATTAAAT-GCGGAGCCCT,ATTAAGATAT-AATTCCTGAC,-,IRR,LT,456_4
242348,chr6,86910729,+,102,146,146,41,K00274:188:HW2KTBBXX:5:2125:20557:21852,none,TTTTGTAAAC-CACCAGTGCT,TAACTAACAA-CACCAGTGCT,+,IRR,RT,46_2
242349,chr6,86910822,-,108,148,148,41,K00274:188:HW2KTBBXX:5:1101:6948:19619,none,GTGGATTAAA-ATAGTGAGTC,ACCAGTGCTG-ACAATTCCTG,+,IRL,RT,46_2


132

In [6]:
# total unique samples across all treatments
total_samples = inserts_df["sampleID"].unique().shape[0]
metadata = {"total": total_samples}

# don't allow more jobs than there are chromosomes
# (this does not depend on the treatment, so it is decided once, before the loop)
jobs = args["njobs"]
num_chr = len(chrom_list)
if num_chr < jobs:
    print(f"Reducing number of jobs from {jobs} to {num_chr}, since there are only {num_chr} chromosomes present.")
    jobs = num_chr

for treatment in treatment_list:
    print(treatment)
    # prepare output: one sub-directory per treatment
    out_dir = args['output'] / treatment
    out_dir.mkdir(parents=True, exist_ok=True)

    treatment_df = inserts_df[inserts_df["treatment"] == treatment]

    # number of distinct samples that contribute insertions to this treatment
    treatment_samples = treatment_df["sampleID"].unique().shape[0]
    metadata[treatment] = treatment_samples

    # construct CIS network per chromosome for treatment insertion
    iter_gen = cn.create_graph_generator(chrom_list, treatment_df, out_dir, args)
    with Pool(jobs) as p:
        # The progress bar wraps the *results*: a bar around the input generator only
        # counts tasks as they are handed to the pool, not as they finish.
        finished = p.imap_unordered(cn.create_graph_helper, iter_gen)
        for _ in tqdm(finished, total=num_chr):
            pass
        p.close()

# save sample numbers as meta data for network analysis
# (one row per metadata key: "total" plus one row per treatment)
samples, counts = zip(*metadata.items())
meta_df = pd.DataFrame({"samples": samples, "counts": counts})
meta_df.to_csv(args['output'].parent / "samples_with_insertions.csv", index=False)


100%|██████████| 22/22 [00:00<00:00, 60.50it/s]
100%|██████████| 22/22 [00:00<00:00, 51.75it/s]
100%|██████████| 22/22 [00:00<00:00, 39.99it/s]


KeyError: 'sampleID'

# Breakdown of find_edges()

## make graph with nodes

In [7]:
# Walk through graph construction for one treatment ("S") on one chromosome ("chr1").
treatment_chrom_dir = args['output'] / "S"
treatment_df = inserts_df[inserts_df["treatment"] == "S"]
# explicit copy: columns are inserted into chrom_df below, and that mutation should
# never reach (or warn about) the frame it was sliced from
chrom_df = treatment_df[treatment_df['chr'] == "chr1"].copy()
# take the settings from the shared configuration instead of repeating the literals
threshold = args["threshold"]
verbose = args["verbose"]

# def create_graph(chrom_df: DataFrame, save_dir, threshold=50000, verbose=0) -> None:
G = nx.Graph()

# get counts for library and tpn orientation:
# per-read 0/1 indicator columns, inserted at positions 4..7 so that summing them
# per insertion site below yields the counts
indicator_masks = {
    "counts_irr": chrom_df['library'] == 'IRR',
    "counts_irl": chrom_df['library'] == 'IRL',
    "counts_trp_orient_pos": chrom_df['tpn_promoter_orient'] == '+',
    "counts_trp_orient_neg": chrom_df['tpn_promoter_orient'] == '-',
}
for position, (column, mask) in enumerate(indicator_masks.items(), start=4):
    chrom_df.insert(position, column, np.where(mask, 1, 0))
cols = list(indicator_masks)

# find read counts at each insertion site
# (sort=False keeps sites in first-seen order; dropna=False keeps groups with NaN keys)
tmp_group = chrom_df.groupby(by=['chr', 'pos'], sort=False, as_index=False, dropna=False)
insertion_nodes_df = tmp_group[cols].sum()
insertion_nodes_df.insert(2, "counts", tmp_group['read_name'].count()['read_name'])

# add in info about which samples are in each insertion site
# (the same grouping is reused rather than rebuilt)
tmp_samples = tmp_group["sampleID"].apply(lambda x: x.unique())
# both inserts target position 7, so "sample_IDs" ends up in front of "n_samples"
insertion_nodes_df.insert(7, "n_samples", tmp_samples["sampleID"].apply(len))
insertion_nodes_df.insert(7, "sample_IDs", tmp_samples["sampleID"])

display(insertion_nodes_df)



Unnamed: 0,chr,pos,counts,counts_irr,counts_irl,counts_trp_orient_pos,counts_trp_orient_neg,sample_IDs,n_samples
0,chr1,19071377,7,7,0,7,0,[521_2],1
1,chr1,47830960,19,19,0,19,0,[521_2],1
2,chr1,91139017,4,4,0,4,0,[521_2],1
3,chr1,97098822,1,1,0,1,0,[521_2],1
4,chr1,102741246,4,4,0,0,4,[521_2],1
...,...,...,...,...,...,...,...,...,...
2653,chr1,172898694,46,0,46,46,0,[22_3],1
2654,chr1,172898696,1,0,1,1,0,[22_3],1
2655,chr1,191673991,1,1,0,0,1,[22_3],1
2656,chr1,17245274,6,0,6,6,0,[485_2],1


In [None]:
# sample IDs per insertion site as plain nested Python lists
[list(sample_ids) for sample_ids in tmp_samples["sampleID"]]

In [35]:
# inspect the per-read frame: the four count indicator columns now sit right after ref_length
chrom_df

Unnamed: 0,chr,pos,strand,ref_length,counts_irr,counts_irl,counts_trp_orient_pos,counts_trp_orient_neg,query_length,read_length,mapping_quality,read_name,TA_location,read_first_last,ref_first_last,tpn_promoter_orient,library,treatment,sampleID
1673,chr1,19071377,+,64,1,0,1,0,69,69,18,K00275:54:HHHJTBBXX:8:1108:30107:44517,first,TATAGGGATC-ACTTTGTGTC,GGATCCCTGG-ACTTTGTGTC,+,IRR,S,521_2
1674,chr1,19071377,+,62,1,0,1,0,67,67,18,K00275:54:HHHJTBBXX:8:1214:26382:3987,first,TATAGGGATC-AAACTTTGTG,GGATCCCTGG-AAACTTTGTG,+,IRR,S,521_2
1675,chr1,19071377,+,66,1,0,1,0,71,71,25,K00275:54:HHHJTBBXX:8:1223:22039:37572,first,TATAGGGATC-TTTGTGTCTG,GGATCCCTGG-TTTGTGTCTG,+,IRR,S,521_2
1676,chr1,19071377,+,64,1,0,1,0,69,69,17,K00275:54:HHHJTBBXX:8:2121:27367:22924,first,TATAGGGATC-ACTTTGTGTC,GGATCCCTGG-ACTTTGTGTC,+,IRR,S,521_2
1677,chr1,19071377,+,63,1,0,1,0,68,68,25,K00275:54:HHHJTBBXX:8:2127:19887:21043,first,TATAGGGATC-AACTTTGTGT,GGATCCCTGG-AACTTTGTGT,+,IRR,S,521_2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234445,chr1,52561297,-,53,0,1,1,0,75,75,22,K00275:54:HHHJTBBXX:7:1213:18254:26406,first,TATAGGGATC-AAGCGGAGCC,CCTTTAAACT-TTTTGGATCC,+,IRL,S,485_2
234446,chr1,52561297,-,53,0,1,1,0,75,75,22,K00275:54:HHHJTBBXX:7:1213:18538:26934,first,TATAGGGATC-AAGCGGAGCC,CCTTTAAACT-TTTTGGATCC,+,IRL,S,485_2
234447,chr1,52561297,-,53,0,1,1,0,71,71,24,K00275:54:HHHJTBBXX:7:1227:17117:9104,first,TATAGGGATC-CCTTAAGCGG,CCTTTAAACT-TTTTGGATCC,+,IRL,S,485_2
234448,chr1,52561297,-,53,0,1,1,0,76,76,22,K00275:54:HHHJTBBXX:7:2120:18122:11196,first,TATAGGGATC-AGCGGAGCCC,CCTTTAAACT-TTTTGGATCC,+,IRL,S,485_2


In [None]:
# TODO: for some reason a few insertion sites have reads from both the IRR and the IRL library.
# Why is that, and does this change with the new preprocessing scripts?
# both_libs = insertion_nodes_df[ (insertion_nodes_df['counts_irl'] != 0) & (insertion_nodes_df['counts_irr'] != 0) ]

In [None]:
# add the insertion sites as nodes of the graph (edges are added further below)
# NOTE(review): cn.add_nodes lives in netcis.cis_networks; presumably it yields
# (position, attribute-dict) pairs built from insertion_nodes_df — confirm there.
G.add_nodes_from(cn.add_nodes(insertion_nodes_df))

## find all edges 

In [22]:
# nodes are inherently ordered as they are added in the graph,
# however, the ordering doesn't have to make numerical sense for this function:
# every pair of nodes is compared below, regardless of position in the array
ordered_nodes = np.array(G.nodes())
ordered_nodes

array([ 19071377,  47830960,  91139017, ..., 191673991,  17245274,
        52561297])

In [23]:
# Turn the node vector into a single column; subtracting its transpose broadcasts
# to an n x n matrix holding the absolute distance between every pair of nodes.
nodes = ordered_nodes[:, np.newaxis]
dist_nodes = np.absolute(nodes - nodes.T)  # symmetric 2d array

In [24]:
# Boolean mask of node pairs whose distance is within the threshold.
# (The diagonal is trivially True; it is excluded when the lower triangle is taken.)
cis_nodes = np.less_equal(dist_nodes, threshold)  # symmetric 2d array

In [25]:
# Indices of the strictly lower-left triangle of the symmetric matrix (k=-1 leaves
# out the diagonal). edges_ind is a tuple of two arrays, row indices and column
# indices; the same position in both arrays addresses a single matrix entry.
# Together the two (very long) arrays address every entry of the lower triangle,
# i.e. every unordered pair of distinct nodes exactly once.
edges_ind = np.tril_indices(cis_nodes.shape[-2], k=-1, m=cis_nodes.shape[-1])  # tuple of two 1d arrays

In [26]:
# One boolean per node pair of the lower triangle: True when the pair is within
# the threshold and therefore becomes an edge.
keep_nodes = cis_nodes[edges_ind[0], edges_ind[1]]  # 1d array

In [28]:
# Translate the kept (row, column) index pairs back into actual node names.
# first endpoint: the column index of each kept pair
nodes1 = ordered_nodes.take(edges_ind[1][keep_nodes])  # 1d array

# second endpoint: the row index of each kept pair
nodes2 = ordered_nodes.take(edges_ind[0][keep_nodes])  # 1d array

# edge weight is the inverse distance between the two endpoints
# (TODO: which can be modified for a different weighting method, maybe 1 / log10(x) instead?)
nodes_dist = np.divide(1, dist_nodes[edges_ind][keep_nodes])  # 1d array

In [29]:
# Bundle endpoints and weights into networkx edge triples: (first node, second node,
# attribute dict). The whole list is then handed to the graph in a single call.
edges_to_add = [
    (source, target, {"weight": weight})
    for source, target, weight in zip(nodes1, nodes2, nodes_dist)
]
G.add_edges_from(edges_to_add)

In [32]:
# summarise the finished graph; the recorded output reports node, edge, insert and subgraph counts
cn.graph_properties(G)

{'nodes': 2658, 'edges': 4510, 'num_inserts': 14843, 'num_subgraphs': 970}