In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style
import networkx as nx
from scipy.stats import chisquare, binomtest, ranksums, mannwhitneyu, skewtest, kurtosistest

from IPython.display import display

In [2]:
import sys, os
dir1 = os.path.dirname(os.path.abspath(''))
if not dir1 in sys.path: sys.path.append(dir1)
from netcis import network_analysis as na

from importlib import reload
reload(na)

<module 'netcis.network_analysis' from '/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/netcis/network_analysis.py'>

In [3]:
graph_dir = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/output/2020_SB-no_threshold-graphs/")
ta_dir = Path("/project/cs-myers/MathewF/software/bowtie2-2.4.5/indexes/GRCm39_TAs/")
gene_annot = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/MRK_List2.rpt")
output_prefix = "/project/cs-myers/MathewF/projects/Laura-SB-Analysis/output/2020_SB-no_threshold"
output = Path(output_prefix + "-analysis")
output.mkdir(exist_ok=True)
ta_error = 5
pval_threshold = 0.05
verbose = 0

In [4]:
annot_df = pd.read_csv(gene_annot, sep="\t")
annot_df = annot_df[pd.notna(annot_df["genome coordinate start"])].drop("Status", axis=1)
annot_df["chrom"] = annot_df["Chr"].apply(lambda x: f"chr{x}")
annot_df = annot_df.sort_values(["chrom"]).reset_index(drop=True)

In [5]:
bed_files = { file.name.split(".")[0]: pd.read_csv(file, sep="\t", header=None) for file in ta_dir.iterdir() }

In [6]:
case_subgraph_dict = na.get_subgraphs(graph_dir, "case")
case_df = na.get_subgraph_stats(case_subgraph_dict, "case", bed_files, ta_error)
case_df.sort_values(["chrom", "subgraph", "nodes"]).to_csv(output / "case_df.tsv", sep="\t", index=False)

control_subgraph_dict = na.get_subgraphs(graph_dir, "control")
control_df = na.get_subgraph_stats(control_subgraph_dict, "control", bed_files, ta_error)
control_df.sort_values(["chrom", "subgraph", "nodes"]).to_csv(output / "control_df.tsv", sep="\t", index=False)

In [44]:
reload(na)
chroms = case_df["chrom"].sort_values().unique()
all_features_list = []
# TODO: parallelize this?
for chrom in chroms:
    # if chrom != "chrM":
    #     continue
    print(chrom)
    
    # get chromosome subsets for case and control
    case_chrom_df = case_df[case_df["chrom"] == chrom]
    control_chrom_df = control_df[control_df["chrom"] == chrom]
    case_chrom_subgraphs = case_subgraph_dict[chrom]
    control_chrom_subgraphs = control_subgraph_dict[chrom]
    annot_chrom_df = annot_df[annot_df["chrom"] == chrom]
    
    # cases as the target
    case_overlaps = na.pcis_overlaps(case_chrom_df, control_chrom_df)
    if not case_overlaps:  # if empty
        case_genes = None
    else:
        case_TA_df, case_overall_df = na.compare_pcis(case_overlaps, case_chrom_subgraphs, control_chrom_subgraphs)
        case_sig_df = na.pcis_to_cis(case_overall_df, pval_threshold)
        if len(case_sig_df) != 0:
            case_genes = na.cis_annotate(case_sig_df, annot_chrom_df)
            case_genes["class"] = "case"
        else:
            case_genes = None
    
    # controls as the target
    control_overlaps = na.pcis_overlaps(control_chrom_df, case_chrom_df)
    if not control_overlaps:  # if empty
        control_genes = None
    else:
        control_TA_df, control_overall_df = na.compare_pcis(control_overlaps, control_chrom_subgraphs, case_chrom_subgraphs)
        control_sig_df = na.pcis_to_cis(control_overall_df, pval_threshold)
        if len(control_sig_df) != 0:
            control_genes = na.cis_annotate(control_sig_df, annot_chrom_df)
            control_genes["class"] = "control"
        else:
            control_genes = None
    
    if case_genes is not None and control_genes is not None:
        both_genes = pd.concat([case_genes, control_genes], ignore_index=True)
    elif case_genes is not None:
        both_genes = case_genes
    elif control_genes is not None:
        both_genes = control_genes
    else:  # both are none
        print("no significant genomic features found")
        continue
    
    both_genes["chrom"] = chrom
    all_features_list.append(both_genes)
    if verbose > 0:
        # print(len(both_genes))
        print(len(both_genes["marker_symbol"].unique()))

    # TODO: are there too many repeated genes? Does this make sense that they would be repeated?
    # it appears that sometimes there are multiple CIS in a gene, because the CIS range is quite small
    print(f"""\tsig. genomic features: {both_genes["marker_symbol"].unique().shape[0]}/{annot_chrom_df["Marker Symbol"].unique().shape[0]}""")

# get all genomic features
all_features_df = pd.concat(all_features_list, ignore_index=True)
# all_genes_df.to_csv(output / "all_genes.csv", index=False)

chr1
	sig. genomic features: 749/42646
chr10
	sig. genomic features: 219/31345
chr11
	sig. genomic features: 235/43002
chr12
	sig. genomic features: 83/26305
chr13
	sig. genomic features: 154/27803
chr14
	sig. genomic features: 64/25866
chr15
	sig. genomic features: 181/25463
chr16
	sig. genomic features: 110/21683
chr17
	sig. genomic features: 186/27635
chr18
	sig. genomic features: 57/19494
chr19
	sig. genomic features: 81/19099
chr2
	sig. genomic features: 356/48037
chr3
	sig. genomic features: 147/32767
chr4
	sig. genomic features: 663/38690
chr5
	sig. genomic features: 261/41290
chr6
	sig. genomic features: 233/36077
chr7
	sig. genomic features: 189/41139
chr8
	sig. genomic features: 200/31828
chr9
	sig. genomic features: 341/35025
chrM
chrX
	sig. genomic features: 100/17078
chrY


In [8]:
display(all_features_df)

Unnamed: 0,type,type_index,marker_symbol,marker_name,marker_type,marker_feature_type,marker_annot_index,class,chrom
0,target,2.0,Feml5,"femur length 5, 5 week",QTL,QTL,755,case,chr1
1,target,2.0,Fearet1,fear retrieval 1,QTL,QTL,967,case,chr1
2,target,2.0,Rr12852,regulatory region 12852,Other Genome Feature,CTCF binding site,1037,case,chr1
3,target,2.0,Gcsfis,G-CSF induced splenomegaly,QTL,QTL,1693,case,chr1
4,target,2.0,Eila1,ethanol induced locomotor activity 1,QTL,QTL,3147,case,chr1
...,...,...,...,...,...,...,...,...,...
20514,target,130.0,Femd9,"femur midshaft diameter 9, 10 week",QTL,QTL,632357,control,chrX
20515,target,130.0,Femd12,"femur midshaft diameter 12, 16 week",QTL,QTL,632358,control,chrX
20516,target,130.0,Fnld,faint-lined,Gene,heritable phenotypic marker,632402,control,chrX
20517,target,130.0,Gct6,granulosa cell tumorigenesis 6,QTL,QTL,632436,control,chrX


In [9]:
gene_tmp = all_features_df[(all_features_df["marker_type"] == "Gene") & (all_features_df["marker_feature_type"] == "protein coding gene")]
display(gene_tmp)

Unnamed: 0,type,type_index,marker_symbol,marker_name,marker_type,marker_feature_type,marker_annot_index,class,chrom
45,target,7.0,Dnah7a,"dynein, axonemal, heavy chain 7A",Gene,protein coding gene,2192,case,chr1
175,target,15.0,Fam135a,"family with sequence similarity 135, member A",Gene,protein coding gene,726,case,chr1
436,target,64.0,Cdh7,"cadherin 7, type 2",Gene,protein coding gene,37082,case,chr1
596,target,109.0,Myo1b,myosin IB,Gene,protein coding gene,21759,case,chr1
718,target,153.0,Ncoa2,nuclear receptor coactivator 2,Gene,protein coding gene,20423,case,chr1
...,...,...,...,...,...,...,...,...,...
20386,target,24.0,Ppef1,protein phosphatase with EF hand calcium-bindi...,Gene,protein coding gene,622696,control,chrX
20395,target,30.0,Il1rapl2,interleukin 1 receptor accessory protein-like 2,Gene,protein coding gene,621712,control,chrX
20416,target,32.0,Akap14,A kinase anchor protein 14,Gene,protein coding gene,616573,control,chrX
20447,target,64.0,Tex11,testis expressed gene 11,Gene,protein coding gene,620727,control,chrX


In [10]:
gene_only = gene_tmp.groupby(["type_index", "class"]).agg(list)["marker_symbol"].reset_index()
display(gene_only)

Unnamed: 0,type_index,class,marker_symbol
0,0.0,case,"[Sfi1, Prxl2c, Vapa, Gm10801, Gm10800, Rab5if,..."
1,0.0,control,"[Sp140l2, Gm53567, Prxl2c, Vapa, Gm10801, Gm10..."
2,1.0,case,"[Adamts20, Aak1]"
3,1.0,control,"[Sfi1, Ppard, Trpm3, Aak1, Zfp954, Zfp773]"
4,2.0,case,"[Chpt1, Mybpc1, Hs6st3, Zfpm2, Tspan18, Barx2]"
...,...,...,...
169,351.0,control,[Kif17]
170,357.0,case,[Astn2]
171,371.0,control,[Zcchc7]
172,446.0,case,[Zcchc7]


In [11]:
uniq_count = gene_tmp["marker_symbol"].unique().shape[0]
print(f"unique genomic features: {uniq_count}")

candidate_genes = gene_tmp["marker_symbol"].unique()
[ x for x in sorted(candidate_genes) ]

unique genomic features: 298


['2310002L09Rik',
 '4930402F06Rik',
 '4932414N04Rik',
 'A830018L16Rik',
 'Aadacl4fm4',
 'Aak1',
 'Abcb1b',
 'Acot12',
 'Acvr2a',
 'Adamts20',
 'Adarb2',
 'Aff3',
 'Agbl4',
 'Agps',
 'Aida',
 'Aifm1',
 'Akap14',
 'Aldh1l1',
 'Alk',
 'Ankrd45',
 'Aox4',
 'Apba2',
 'Arfgef1',
 'Arid4b',
 'Ark2n',
 'Arsg',
 'Asic2',
 'Asph',
 'Astn2',
 'Atad1',
 'Atic',
 'BC147527',
 'Bach2',
 'Barx2',
 'Caap1',
 'Cab39',
 'Cacnb2',
 'Cadm1',
 'Camta1',
 'Capn1',
 'Cask',
 'Cc2d2a',
 'Ccdc12',
 'Ccdc80',
 'Cd109',
 'Cdh7',
 'Cdk5rap2',
 'Celsr1',
 'Cfhr3',
 'Chat',
 'Chl1',
 'Chpt1',
 'Chrna7',
 'Cimip2c',
 'Cngb3',
 'Cntn4',
 'Cntnap2',
 'Cntnap3',
 'Coro2b',
 'Cracd',
 'Csnk1g1',
 'Cul4a',
 'D7Ertd443e',
 'Dab1',
 'Dach1',
 'Dcc',
 'Ddx27',
 'Ddx51',
 'Dgka',
 'Dip2c',
 'Dnah7a',
 'Dock10',
 'Dop1a',
 'Drd3',
 'Dsel',
 'Elmo1',
 'En2',
 'Ep400',
 'Etl4',
 'Ext1',
 'Fam135a',
 'Fam172a',
 'Fam210a',
 'Fbxl13',
 'Fgfr4',
 'Fkbp15',
 'Fn1',
 'Frmd3',
 'Frmd4a',
 'Galnt10',
 'Gcnt2',
 'Gfod2',
 'Gfral',
 'Gg

In [12]:
# What about breaking up large CISs? It appears they are too large and span multiple genes

# Or do now we check each individual gene that was found to further refine our search?

# What's stopping me from testing literally every gene with +/-50 kb region of insertions with a ranksums test?
# because we don't want this to be gene centric, however, it's not a bad way to validate the candidate gene list


In [13]:
# TODO: use list of Laura genes and see why that genomic region did not show up
top10genes = ["Aak1", "Ehhadh", "Macrod2", "Ckb", "Rnf214", "Sprr1b", "Cpd", "Rpl48-ps1", "Son", "Eif3b"]
top10annot = annot_df[annot_df["Marker Symbol"].isin(top10genes)]
for row in top10annot.itertuples():
    chrom = f"chr{row.Chr}"
    case_chrom_df = case_df[case_df["chrom"] == chrom]
    control_chrom_df = control_df[control_df["chrom"] == chrom]
    res1 = int(row._4 - 50000) <= case_chrom_df["max_pos"]
    res2 = int(row._5 + 50000) >= case_chrom_df["min_pos"]
    res3 = int(row._4 - 50000) <= control_chrom_df["max_pos"]
    res4 = int(row._5 + 50000) >= control_chrom_df["min_pos"]

    print(row._7, (res1 & res2).sum(), (res3 & res4).sum())
    if (res1 & res2).sum() != 0:
        display(case_chrom_df.loc[res1 & res2])
    if (res3 & res4).sum() != 0:
        display(control_chrom_df.loc[res3 & res4])


Rpl48-ps1 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3081,case,chr10,292,1,0,4,57474423,57474423,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3544,control,chr10,119,1,0,1,57513025,57513025,0,1,0,0


Cpd 0 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4359,control,chr11,142,1,0,1,76722877,76722877,0,1,0,0


Ckb 0 0
Ehhadh 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
351,case,chr16,1,62,1891,790,21600392,21614302,13910,62,602,32


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
410,control,chr16,1,26,325,89,21614194,21614302,108,26,7,14


Son 0 0
Macrod2 2 4


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
725,case,chr2,83,2,1,2,141177054,141177560,506,2,0,0
929,case,chr2,287,1,0,1,141377758,141377758,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
963,control,chr2,96,2,1,34,140313463,140313466,3,2,0,0
1084,control,chr2,217,1,0,1,140190257,140190257,0,1,0,0
1121,control,chr2,254,1,0,1,140580324,140580324,0,1,0,0
1153,control,chr2,286,1,0,15,140753859,140753859,0,1,0,0


Sprr1b 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2218,case,chr3,7,6,15,438,92347697,92347714,17,6,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2850,control,chr3,145,1,0,41,92347698,92347698,0,1,0,0


Eif3b 0 0
Aak1 2 2


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
5793,case,chr6,1,115,6555,10437,86910504,86911206,702,115,19,26
5839,case,chr6,47,3,3,16,86858016,86858023,7,3,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
7469,control,chr6,1,77,2926,845,86886566,86910899,24333,77,881,19
7679,control,chr6,211,1,0,55,86976664,86976664,0,1,0,0


Rnf214 1 0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3108,case,chr9,5,9,36,774,45852161,45852171,10,9,0,0


In [None]:
# TODO: check for each gene with an extended proomoter the case vs controls insertions
# check with ranksums test as well as binomial test
# What does the candidate gene list look like now?


# TODO: I need to go through my code again. Go step by step for each function in another notebook
# double check this code and cis_networks.py


# TODO: rerun without mapq thresholding in preprocess_reads and preprocess_insertions


# TODO: should we be using normalized read counts? idk, what stage would this occur at
# the variation would come from read depth, so we would have to normalize based on this BEFORE preprocess_reads.py
# What is done with RNAseq? How to normalize on read depth while still keeping count data

In [None]:
# volcano plot of p-values and LFC for each CIS if wanted


In [None]:
# output list of pCIS (now can be called CIS) that are sig.
# per CIS, output list of TAs that are sig.


In [None]:
# take identified candidate CIS/genes and check for co-occurrence with all other CIS/genes
