In [1]:
import pickle
from pathlib import Path
from multiprocessing import Pool

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style
import networkx as nx
from scipy.stats import chisquare, binomtest, ranksums, mannwhitneyu, skewtest, kurtosistest
from tqdm import tqdm

from IPython.display import display

In [2]:
# Put the parent of the current working directory on sys.path so that the
# local `netcis` package can be imported from this notebook.
import os
import sys

dir1 = os.path.dirname(os.path.abspath(''))
if dir1 not in sys.path:
    sys.path.append(dir1)

from importlib import reload
from netcis import network_analysis as na

# Re-import the module so edits made to it since the kernel started are used.
# Kept as the last expression so the cell still shows the loaded module path.
reload(na)

<module 'netcis.network_analysis' from '/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/netcis/network_analysis.py'>

In [3]:
# Run configuration. The whole dict is handed to na.chrom_analysis for every
# chromosome, so everything the workers need has to live in here.
args = dict(
    # prefix shared by the "-graphs/" input directory and the "-analysis" output directory
    output_prefix="/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2020_SB-output/GRCm39/results",
    # directory holding one TA-site file per chromosome
    ta_dir=Path("/project/cs-myers/MathewF/software/bowtie2-2.4.5/indexes/GRCm39_TAs/"),
    # tab-separated marker annotation report
    gene_annot=Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/MRK_List2.rpt"),
    # NOTE(review): ta_error / pval_threshold / verbose are only consumed inside
    # netcis.network_analysis -- confirm their exact meaning there
    ta_error=5,
    pval_threshold=0.05,
    verbose=1,
    # labels of the two groups being compared; `case` is also the sub-directory
    # of graph_dir that is listed to discover chromosomes
    case="LT",
    control="S",
    # number of worker processes used for the per-chromosome analysis
    npara=21,
)
args["graph_dir"] = Path(f'{args["output_prefix"]}-graphs/')

# Directory that receives every table written by this notebook.
output = Path(f'{args["output_prefix"]}-analysis')
output.mkdir(exist_ok=True)

# Convenience aliases for the settings used directly in later cells.
ta_dir, gene_annot, ta_error = args["ta_dir"], args["gene_annot"], args["ta_error"]
pval_threshold, verbose = args["pval_threshold"], args["verbose"]
case, control = args["case"], args["control"]

In [96]:
# Marker annotation: keep only markers that have a genome start coordinate,
# drop the "Status" column, add a "chr"-prefixed chromosome name and order the
# rows by that name.
annot_df = (
    pd.read_csv(gene_annot, sep="\t")
    .dropna(subset=["genome coordinate start"])
    .drop(columns="Status")
    .assign(chrom=lambda markers: markers["Chr"].apply(lambda chr_name: f"chr{chr_name}"))
    .sort_values(["chrom"])
    .reset_index(drop=True)
)
# TODO: what about the strand in annot_df?

# TA-site files keyed by the part of the file name before the first "."
bed_files = {
    bed_path.name.split(".")[0]: bed_path
    for bed_path in args["ta_dir"].iterdir()
}

# Chromosomes to analyse = entries found in the case graph directory.
chroms = sorted(entry.name for entry in (args["graph_dir"] / case).iterdir())
print(chroms)

['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrM', 'chrX', 'chrY']


In [106]:
# Pick up any edits to the analysis module before dispatching the work.
reload(na)

# One work item per chromosome:
# (chromosome name, annotation rows on it, its TA-site file, run configuration)
iter_args = [
    (
        chrom,
        annot_df[annot_df["chrom"] == chrom],
        bed_files[chrom],
        args,
    )
    for chrom in chroms
]

# Results are collected in completion order, not in `chroms` order.
with Pool(args["npara"]) as pool:
    res_dict_list = list(pool.imap_unordered(na.chrom_analysis, iter_args))


chrM	no sig. genomic features found
chr19	sig. genomic features: 332/19099
chrY	no sig. genomic features found
chr18	sig. genomic features: 201/19494
chr17	sig. genomic features: 493/27635
chr16	sig. genomic features: 332/21683
chr15	sig. genomic features: 532/25463
chr14	sig. genomic features: 423/25866
chr11	sig. genomic features: 510/43002
chr8	sig. genomic features: 645/31828
chr12	sig. genomic features: 392/26305
chr13	sig. genomic features: 506/27803
chr7	sig. genomic features: 415/41139
chr10	sig. genomic features: 909/31345
chr6	sig. genomic features: 644/36077
chr5	sig. genomic features: 822/41290
chr3	sig. genomic features: 412/32767
chrX	sig. genomic features: 352/17078
chr9	sig. genomic features: 1355/35025
chr2	sig. genomic features: 885/48037
chr4	sig. genomic features: 1671/38690
chr1	sig. genomic features: 1962/42646


In [107]:
# Split the per-chromosome result dicts into one list of tables per result type.
ta_list = [chrom_res["ta"] for chrom_res in res_dict_list]
overall_list = [chrom_res["overall"] for chrom_res in res_dict_list]
sig_list = [chrom_res["sig"] for chrom_res in res_dict_list]
genomic_features_list = [chrom_res["genomic_features"] for chrom_res in res_dict_list]
graphs_stats = [chrom_res["graph_stats"] for chrom_res in res_dict_list]

# Stack the chromosomes into one table per result type.
TA_df = pd.concat(ta_list, ignore_index=True)
overall_df = pd.concat(overall_list, ignore_index=True)
sig_df = pd.concat(sig_list, ignore_index=True)
genomic_features_df = pd.concat(genomic_features_list, ignore_index=True)
graph_stats_df = pd.concat(graphs_stats, ignore_index=True)

# Persist every combined table as a tab-separated file under `output`.
for table, file_name in (
    (TA_df, "TA.tsv"),
    (overall_df, "overall.tsv"),
    (sig_df, "sig.tsv"),
    (genomic_features_df, "genomic_features.tsv"),
    (graph_stats_df, "graph_stats.tsv"),
):
    table.to_csv(output / file_name, sep="\t", index=False)

# To skip the analysis cell, the saved tables can be loaded back instead:
# TA_df = pd.read_csv(output / "TA.tsv", sep="\t")
# overall_df = pd.read_csv(output / "overall.tsv", sep="\t")
# sig_df = pd.read_csv(output / "sig.tsv", sep="\t")
# genomic_features_df = pd.read_csv(output / "genomic_features.tsv", sep="\t")
# graph_stats_df = pd.read_csv(output / "graph_stats.tsv", sep="\t")

# Sanity check: number of fully duplicated rows in each table, printed in the
# order TA, overall, sig, genomic_features, graph_stats.
for table in (TA_df, overall_df, sig_df, genomic_features_df, graph_stats_df):
    print(table.duplicated().sum())

0
0
0
6
0


In [111]:
# TODO: validate the gene candidates in a gene-centric way and save that new,
# more conservative list.

# Candidate genes = protein-coding gene markers among the reported genomic features.
genes_tmp = genomic_features_df[
    genomic_features_df["marker_type"].eq("Gene")
    & genomic_features_df["marker_feature_type"].eq("protein coding gene")
]

# Marker symbols collected per type_index (not used further in this cell).
genes_only = (
    genes_tmp
    .groupby(["type_index"])
    .agg(list)["marker_symbol"]
    .reset_index()
)

# Unique symbols in sorted order, written out as a one-column table.
candidate_genes = pd.DataFrame(
    {"marker_symbol": sorted(genes_tmp["marker_symbol"].unique())}
)
candidate_genes.to_csv(output / "candidate_genes.tsv", sep="\t", index=False)

display(candidate_genes)


Unnamed: 0,marker_symbol
0,1700024G13Rik
1,1700029F12Rik
2,1700031L13Rik
3,1700057G04Rik
4,2310002L09Rik
...,...
604,Zfp989
605,Zkscan1
606,Zmpste24
607,Zscan21


In [161]:
graph_dir = args["graph_dir"]
new_genes = []

# Drill into one candidate gene: show its annotation rows, then the matching
# significance results, per-position TA rows and subgraph statistics.
# Only the first candidate equal to "Aak1" is inspected.
for gene in candidate_genes["marker_symbol"].values:
    if gene == "Aak1":
        tmp_annot = genomic_features_df[genomic_features_df["marker_symbol"] == gene]
        display(tmp_annot)

        # significance rows whose target index and chromosome both occur in the annotation rows
        index_match = sig_df["target_index"].isin(tmp_annot["type_index"])
        chrom_match = sig_df["chrom"].isin(tmp_annot["chrom"])
        tmp_sig = sig_df[index_match & chrom_match].sort_values(["target_index"])
        display(tmp_sig)

        # per-position rows, selected separately for every annotation row
        # (same subgraph index, chromosome and group label)
        ta_list = [
            TA_df[
                TA_df["target_index"].isin([feature.type_index])
                & TA_df["chrom"].isin([feature.chrom])
                & TA_df["target"].isin([feature.type_name])
            ]
            for feature in tmp_annot.itertuples()
        ]
        tmp_ta = pd.concat(ta_list, ignore_index=True).sort_values(
            ["target_index", "reference_index", "target", "pos"]
        )
        display(tmp_ta)

        # subgraph summary rows, selected the same way
        stats_list = [
            graph_stats_df[
                graph_stats_df["subgraph"].isin([feature.type_index])
                & graph_stats_df["chrom"].isin([feature.chrom])
                & graph_stats_df["type"].isin([feature.type_name])
            ]
            for feature in tmp_annot.itertuples()
        ]
        tmp_g_stats = pd.concat(stats_list, ignore_index=True).sort_values(["subgraph"])
        display(tmp_g_stats)

        break

Unnamed: 0,type,type_name,type_index,chrom,marker_symbol,marker_name,marker_type,marker_feature_type,marker_annot_index,genome coordinate start,genome coordinate end,genome coordinate expander
9822,target,LT,1,chr6,Aak1,AP2 associated kinase 1,Gene,protein coding gene,483237,86826499.0,86980205.0,50000
10522,target,S,1,chr6,Aak1,AP2 associated kinase 1,Gene,protein coding gene,483237,86826499.0,86980205.0,50000


Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,total_TA,TA_sig,sig_ratio,target,reference,chrom
202,1,1.0,86910504,86911206,86886566,86910899,1.39784e-09,2.278325e-09,108,53,0.490741,LT,S,chr6
213,1,0.0,86886566,86910899,86910504,86911206,1.39784e-09,2.278325e-09,108,53,0.490741,S,LT,chr6


Unnamed: 0,pos,target_count,reference_count,reference_index,target_index,target_binom_pval,target_binom_sig,LFC,p_target_binom_pval,p_target_binom_sig,target,reference,chrom
2,86910504,1,0,0.0,1,1.000000e+00,False,1.000000,1.000000e+00,False,LT,S,chr6
3,86910670,1,0,0.0,1,1.000000e+00,False,1.000000,1.000000e+00,False,LT,S,chr6
4,86910714,1,0,0.0,1,1.000000e+00,False,1.000000,1.000000e+00,False,LT,S,chr6
6,86910723,1,0,0.0,1,1.000000e+00,False,1.000000,1.000000e+00,False,LT,S,chr6
7,86910728,1,0,0.0,1,1.000000e+00,False,1.000000,1.000000e+00,False,LT,S,chr6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
211,86910894,1,6,1.0,1,1.250000e-01,False,-1.807355,1.796875e-01,False,S,LT,chr6
212,86910895,5,139,1.0,1,4.472363e-35,True,-4.544321,2.837819e-34,True,S,LT,chr6
213,86910897,9,50,1.0,1,5.264832e-08,True,-2.350497,9.624425e-08,True,S,LT,chr6
214,86910899,12,63,1.0,1,1.693839e-09,True,-2.299560,3.023635e-09,True,S,LT,chr6


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
0,LT,chr6,1,101,5050,5160,86910504,86911206,702,101,19,25
1,S,chr6,1,77,2926,845,86886566,86910899,24333,77,881,19


In [11]:
# TODO: use list of Laura genes and see why that genomic region did not show up
top10genes = ["Aak1", "Ehhadh", "Macrod2", "Ckb", "Rnf214", "Sprr1b", "Cpd", "Rpl48-ps1", "Son", "Eif3b"]
top10annot = annot_df[annot_df["Marker Symbol"].isin(top10genes)]
for row in top10annot.itertuples():
    chrom = f"chr{row.Chr}"
    case_chrom_df = case_df[case_df["chrom"] == chrom]
    control_chrom_df = control_df[control_df["chrom"] == chrom]
    res1 = int(row._4 - 50000) <= case_chrom_df["max_pos"]
    res2 = int(row._5 + 50000) >= case_chrom_df["min_pos"]
    res3 = int(row._4 - 50000) <= control_chrom_df["max_pos"]
    res4 = int(row._5 + 50000) >= control_chrom_df["min_pos"]

    print(row._7, (res1 & res2).sum(), (res3 & res4).sum())
    if (res1 & res2).sum() != 0:
        display(case_chrom_df.loc[res1 & res2])
    if (res3 & res4).sum() != 0:
        display(control_chrom_df.loc[res3 & res4])


Rpl48-ps1 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
919,case,chr10,292,1,0,4,57474423,57474423,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
1089,control,chr10,119,1,0,1,57513025,57513025,0,1,0,0


Cpd 0 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
1478,control,chr11,142,1,0,1,76722877,76722877,0,1,0,0


Ckb 0 0
Ehhadh 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2364,case,chr16,1,62,1891,790,21600392,21614302,13910,62,602,32


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2993,control,chr16,1,26,325,89,21614194,21614302,108,26,7,14


Son 0 0
Macrod2 2 4


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3245,case,chr2,83,2,1,2,141177054,141177560,506,2,0,0
3449,case,chr2,287,1,0,1,141377758,141377758,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4020,control,chr2,96,2,1,34,140313463,140313466,3,2,0,0
4141,control,chr2,217,1,0,1,140190257,140190257,0,1,0,0
4178,control,chr2,254,1,0,1,140580324,140580324,0,1,0,0
4210,control,chr2,286,1,0,15,140753859,140753859,0,1,0,0


Sprr1b 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3581,case,chr3,7,6,15,438,92347697,92347714,17,6,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4538,control,chr3,145,1,0,41,92347698,92347698,0,1,0,0


Eif3b 0 0
Aak1 2 2


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4779,case,chr6,1,115,6555,10437,86910504,86911206,702,115,19,26
4825,case,chr6,47,3,3,16,86858016,86858023,7,3,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
6001,control,chr6,1,77,2926,845,86886566,86910899,24333,77,881,19
6211,control,chr6,211,1,0,55,86976664,86976664,0,1,0,0


Rnf214 1 0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
5794,case,chr9,5,9,36,774,45852161,45852171,10,9,0,0


In [12]:
# What about breaking up large CISs? It appears they are too large and span multiple genes

# Or do we now check each individual gene that was found to further refine our search?

# What's stopping me from testing literally every gene, with a +/-50 kb flanking region, for insertions using a ranksums test?
# We don't want this to be gene-centric; however, it's not a bad way to validate the candidate gene list.


In [None]:
# TODO: for each gene with an extended promoter region, compare the case vs. control insertions
# using a ranksums test as well as a binomial test
# What does the candidate gene list look like now?


# TODO: I need to go through my code again. Go step by step for each function in another notebook
# double check this code and cis_networks.py


# TODO: rerun without mapq thresholding in preprocess_reads and preprocess_insertions


# TODO: should we be using normalized read counts? If so, at what stage would the normalization occur?
# The variation would come from read depth, so we would have to normalize for it BEFORE preprocess_reads.py
# What is done with RNA-seq? How can we normalize for read depth while still keeping count data?

In [None]:
# volcano plot of p-values and LFC for each CIS if wanted


In [None]:
# output list of pCIS (now can be called CIS) that are sig.
# per CIS, output list of TAs that are sig.


In [None]:
# take identified candidate CIS/genes and check for co-occurrence with all other CIS/genes
