In [1]:
import pickle
from pathlib import Path
from multiprocessing import Pool

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style
import networkx as nx
from scipy.stats import chisquare, binomtest, ranksums, mannwhitneyu, skewtest, kurtosistest
from tqdm import tqdm

from IPython.display import display

In [2]:
# Put the parent of the current working directory on sys.path so the
# in-development `netcis` package can be imported without installing it.
import os
import sys
from importlib import reload

dir1 = os.path.dirname(os.path.abspath(""))
if dir1 not in sys.path:
    sys.path.append(dir1)

from netcis import network_analysis as na

# Re-import the module so on-disk edits are picked up without a kernel restart.
# Kept as the last expression so the cell still echoes the loaded module.
reload(na)

<module 'netcis.network_analysis' from '/research/labs/immunology/rogerslm/m277102/projects/NetCIS/netcis/network_analysis.py'>

In [3]:
# Run configuration. The dict is also bundled, as-is, into every
# per-chromosome task that is sent to na.chrom_analysis.
args = {
    "output_prefix": "/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2020_SB-output/GRCm39/results",
    "ta_dir": Path("/project/cs-myers/MathewF/software/bowtie2-2.4.5/indexes/GRCm39_TAs/"),
    "gene_annot": Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/MRK_List2.rpt"),
    "ta_error": 5,
    "pval_threshold": 0.05,
    "verbose": 1,
    "case": "LT",
    "control": "S",
    "npara": 21,
}

# Both working directories are derived from the shared output prefix.
args["graph_dir"] = Path(f"{args['output_prefix']}-graphs/")
output = Path(f"{args['output_prefix']}-analysis")
# Only the final path component is created; the parent directory must already exist.
output.mkdir(exist_ok=True)

# Short aliases for the settings that the notebook reads directly.
ta_dir, gene_annot, ta_error, pval_threshold, verbose, case, control = (
    args[key]
    for key in ("ta_dir", "gene_annot", "ta_error", "pval_threshold", "verbose", "case", "control")
)

In [96]:
# Gene annotation table: keep only markers that have a start coordinate,
# drop the "Status" column, and add a "chr"-prefixed chromosome name.
annot_df = (
    pd.read_csv(gene_annot, sep="\t")
    .dropna(subset=["genome coordinate start"])
    .drop(columns="Status")
    .assign(chrom=lambda frame: frame["Chr"].map("chr{}".format))
    .sort_values(["chrom"])
    .reset_index(drop=True)
)
# TODO: what about the strand in annot_df?

# Map each file in the TA directory by the part of its name before the first ".".
bed_files = {}
for bed_path in args["ta_dir"].iterdir():
    bed_files[bed_path.name.partition(".")[0]] = bed_path

# Chromosomes to analyse = entry names found in the case graph directory.
case_graph_dir = args["graph_dir"] / case
chroms = sorted(entry.name for entry in case_graph_dir.iterdir())
print(chroms)

['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrM', 'chrX', 'chrY']


In [106]:
reload(na)

# Build one task per chromosome: the chromosome name, that chromosome's slice
# of the annotation table, its TA bed file, and the shared run configuration.
# iter_args = tqdm([ (chrom, annot_df[annot_df["chrom"] == chrom], bed_files[chrom], args) for chrom in chroms ])
iter_args = []
for chrom in chroms:
    chrom_annot = annot_df[annot_df["chrom"] == chrom]
    iter_args.append((chrom, chrom_annot, bed_files[chrom], args))

# Fan the tasks out over a process pool. Results come back in completion
# order, not in the order of `chroms`.
with Pool(args["npara"]) as p:
    res_dict_list = list(p.imap_unordered(na.chrom_analysis, iter_args))


chrM	no sig. genomic features found
chr19	sig. genomic features: 332/19099
chrY	no sig. genomic features found
chr18	sig. genomic features: 201/19494
chr17	sig. genomic features: 493/27635
chr16	sig. genomic features: 332/21683
chr15	sig. genomic features: 532/25463
chr14	sig. genomic features: 423/25866
chr11	sig. genomic features: 510/43002
chr8	sig. genomic features: 645/31828
chr12	sig. genomic features: 392/26305
chr13	sig. genomic features: 506/27803
chr7	sig. genomic features: 415/41139
chr10	sig. genomic features: 909/31345
chr6	sig. genomic features: 644/36077
chr5	sig. genomic features: 822/41290
chr3	sig. genomic features: 412/32767
chrX	sig. genomic features: 352/17078
chr9	sig. genomic features: 1355/35025
chr2	sig. genomic features: 885/48037
chr4	sig. genomic features: 1671/38690
chr1	sig. genomic features: 1962/42646


In [107]:
# save data
# Split the per-chromosome result dicts into one list of tables per kind.
ta_list = [res["ta"] for res in res_dict_list]
overall_list = [res["overall"] for res in res_dict_list]
sig_list = [res["sig"] for res in res_dict_list]
genomic_features_list = [res["genomic_features"] for res in res_dict_list]
graphs_stats = [res["graph_stats"] for res in res_dict_list]


def _combine_and_save(frames, filename):
    """Concatenate per-chromosome tables, write them as TSV under `output`, and return the result."""
    combined = pd.concat(frames, ignore_index=True)
    combined.to_csv(output / filename, sep="\t", index=False)
    return combined


TA_df = _combine_and_save(ta_list, "TA.tsv")
overall_df = _combine_and_save(overall_list, "overall.tsv")
sig_df = _combine_and_save(sig_list, "sig.tsv")
genomic_features_df = _combine_and_save(genomic_features_list, "genomic_features.tsv")
graph_stats_df = _combine_and_save(graphs_stats, "graph_stats.tsv")

# Sanity check: count fully duplicated rows in each combined table.
# NOTE(review): this runs after the files are written, so any duplicates
# reported here are also present in the saved TSVs — confirm whether they
# should be dropped before saving.
for combined in (TA_df, overall_df, sig_df, genomic_features_df, graph_stats_df):
    print(combined.duplicated().sum())

0
0
0
6
0


In [32]:
# load the saved data
# Directory holding the TSVs of a finished run. Note that this rebinds `output`,
# so later writes in the notebook go to this directory as well.
output = Path("/research/labs/immunology/rogerslm/m277102/projects/2023_SB/output/GRCm39/results-analysis/CAR-NoCAR")  # mforge
# output = Path("/research/labs/immunology/rogerslm/m277102/projects/2023_SB/output/GRCm39/results-analysis/ACF-SCF")  # mforge

# Read the five tab-separated result tables, in the same order as the names on the left.
TA_df, overall_df, sig_df, genomic_features_df, graph_stats_df = (
    pd.read_csv(output / filename, sep="\t")
    for filename in ("TA.tsv", "overall.tsv", "sig.tsv", "genomic_features.tsv", "graph_stats.tsv")
)

In [33]:
# Display the table loaded from sig.tsv (rendered as the cell's output value).
sig_df

Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,total_TA,TA_sig,sig_ratio,target,reference,chrom
0,11,,160691822,160723888,,,,,2,2,1.0,NoCAR,CAR,chr1
1,0,,22940549,22940566,,,,,8,5,0.625,CAR,NoCAR,chr17
2,1,,13559289,13559426,,,,,7,2,0.285714,CAR,NoCAR,chr17
3,2,,13264396,13264507,,,,,4,2,0.5,CAR,NoCAR,chr17
4,0,,8522181,8522193,,,,,3,2,0.666667,CAR,NoCAR,chr3
5,22,,36178970,36183366,,,,,2,2,1.0,CAR,NoCAR,chr4
6,2,,69827057,69827062,,,,,6,3,0.5,NoCAR,CAR,chr4
7,2,,103846357,103846381,,,,,4,3,0.75,NoCAR,CAR,chr5
8,16,,20795708,20795725,,,,,2,2,1.0,CAR,NoCAR,chr9
9,10,,116805992,116805997,,,,,3,2,0.666667,NoCAR,CAR,chr9


In [34]:
# Rows of the overall table whose rank-sum p-value is at most 0.05
# (rows with a missing p-value are excluded by the comparison).
overall_df.loc[overall_df["ranksums"].le(0.05)]

Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,total_TA,TA_sig,sig_ratio,target,reference,chrom
0,0,8.0,70680277,70680281,70680277.0,70680288.0,7.731194e-13,0.01693516,5,2,0.4,CAR,NoCAR,chr1
140,8,0.0,70680277,70680288,70680277.0,70680281.0,7.731194e-13,0.01693516,5,2,0.4,NoCAR,CAR,chr1
405,1,3.0,70611744,70611771,70609217.0,70611759.0,1.582623e-14,1.390546e-07,5,1,0.2,CAR,NoCAR,chr12
411,7,2.0,108139687,108139687,108139643.0,108139689.0,1.08569e-16,2.325316e-07,3,2,0.666667,CAR,NoCAR,chr12
444,2,0.0,108139643,108139689,108139687.0,108139687.0,1.08569e-16,2.325316e-07,3,2,0.666667,NoCAR,CAR,chr12
445,3,0.0,70609217,70611759,70611744.0,70611771.0,1.582623e-14,1.390546e-07,5,1,0.2,NoCAR,CAR,chr12
801,3,0.0,32177545,32177573,32177569.0,32177569.0,0.002425846,0.005545667,4,1,0.25,CAR,NoCAR,chr17
851,17,3.0,32177569,32177569,32177545.0,32177573.0,0.002425846,0.005545667,4,1,0.25,NoCAR,CAR,chr17
912,0,0.0,32765893,32765932,32765913.0,32765933.0,1.5167189999999998e-20,8.771627e-12,11,6,0.545455,CAR,NoCAR,chr19
931,0,0.0,32765913,32765933,32765893.0,32765932.0,1.5167189999999998e-20,8.771627e-12,11,6,0.545455,NoCAR,CAR,chr19


In [35]:
# TODO: 9/7/23 get Laura list of genes with p-values and the associated gff3 file
# get candidate gene list: markers of type "Gene" that are protein coding
is_gene = genomic_features_df["marker_type"] == "Gene"
is_protein_coding = genomic_features_df["marker_feature_type"] == "protein coding gene"
genes_tmp = genomic_features_df[is_gene & is_protein_coding]

# Gene symbols collected into one list per type_index.
# NOTE(review): not referenced again in the visible cells, and the grouping key
# is type_index alone (no chrom / type_name) — confirm this is intended.
genes_only = genes_tmp.groupby(["type_index"])["marker_symbol"].agg(list).reset_index()

# Unique symbols, alphabetically sorted, written out as a one-column TSV.
candidate_genes = pd.DataFrame({"marker_symbol": sorted(genes_tmp["marker_symbol"].unique())})
candidate_genes.to_csv(output / "candidate_genes.tsv", sep="\t", index=False)

# display(candidate_genes)
# Plain one-per-line listing so the symbols can be copied out directly.
for symbol in candidate_genes["marker_symbol"]:
    print(symbol)
# validate gene candidates in a gene-centric way

# save this new conservative list



1700016H13Rik
Aff1
Angptl6
Dnmt1
Eif3g
Gpr31b
Lingo2
Ppan
Rbms3
Rc3h1
Shfl
Sod2
Tcp10b
Tcp10c


In [36]:
def _rows_for_features(frame, features, index_col, type_col):
    """Select the rows of ``frame`` that belong to a subgraph listed in ``features``.

    Each row of ``features`` names a subgraph by the triple
    (``type_index``, ``chrom``, ``type_name``). A row of ``frame`` is selected
    when its (``index_col``, ``chrom``, ``type_col``) values equal one of those
    triples. The three fields are compared together, one feature row at a
    time, so an index taken from one feature row can never be paired with the
    chromosome or type of another.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to filter; must have ``index_col``, ``"chrom"`` and ``type_col``.
    features : pandas.DataFrame
        Feature rows with ``type_index``, ``chrom`` and ``type_name`` columns.
    index_col, type_col : str
        Names of the columns in ``frame`` that correspond to ``type_index``
        and ``type_name``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``frame`` in their original order, each at most
        once. Empty when ``features`` is empty or nothing matches.
    """
    mask = pd.Series(False, index=frame.index)
    for feat in features.itertuples():
        mask |= (
            (frame[index_col] == feat.type_index)
            & (frame["chrom"] == feat.chrom)
            & (frame[type_col] == feat.type_name)
        )
    return frame[mask]


# For every candidate gene, collect the rows of sig_df whose subgraph the gene
# was annotated to, and tag those rows with the gene symbol.
new_genes = []
for gene in candidate_genes["marker_symbol"].values:
    # All annotation rows for this gene (one per subgraph it was assigned to).
    tmp_annot = genomic_features_df[genomic_features_df["marker_symbol"] == gene]
    # display(tmp_annot)

    # sig_df is matched on the same (index, chrom, type) triple as the TA and
    # graph-stats tables below, so equal indices on another chromosome or in
    # the other target group are not attributed to this gene.
    tmp_sig = _rows_for_features(sig_df, tmp_annot, "target_index", "target").sort_values(["target_index"])
    # display(tmp_sig)

    # The next two tables are not used further in this cell; they are kept for
    # interactive inspection via the commented display calls.
    tmp_ta = _rows_for_features(TA_df, tmp_annot, "target_index", "target").sort_values(
        ["target_index", "reference_index", "target", "pos"]
    )
    # display(tmp_ta)

    tmp_g_stats = _rows_for_features(graph_stats_df, tmp_annot, "subgraph", "type").sort_values(["subgraph"])
    # display(tmp_g_stats)

    # A scalar is broadcast to every row, so this works for zero, one, or
    # several matching rows; assign() also returns a new frame instead of
    # writing into a slice of sig_df.
    tmp_sig = tmp_sig.assign(gene=gene)
    new_genes.append(tmp_sig)


In [37]:
# Stack the per-gene tables collected in new_genes into one frame with a fresh 0..n-1 index.
df = pd.concat(new_genes, ignore_index=True)
# Render the combined table as the cell's rich output.
display(df)

Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,total_TA,TA_sig,sig_ratio,target,reference,chrom,gene
0,2,,103846357,103846381,,,,,4,3,0.75,NoCAR,CAR,chr5,1700016H13Rik
1,2,,103846357,103846381,,,,,4,3,0.75,NoCAR,CAR,chr5,Aff1
2,16,,20795708,20795725,,,,,2,2,1.0,CAR,NoCAR,chr9,Angptl6
3,16,,20795708,20795725,,,,,2,2,1.0,CAR,NoCAR,chr9,Dnmt1
4,16,,20795708,20795725,,,,,2,2,1.0,CAR,NoCAR,chr9,Eif3g
5,2,,13264396,13264507,,,,,4,2,0.5,CAR,NoCAR,chr17,Gpr31b
6,22,,36178970,36183366,,,,,2,2,1.0,CAR,NoCAR,chr4,Lingo2
7,16,,20795708,20795725,,,,,2,2,1.0,CAR,NoCAR,chr9,Ppan
8,10,,116805992,116805997,,,,,3,2,0.666667,NoCAR,CAR,chr9,Rbms3
9,11,,160691822,160723888,,,,,2,2,1.0,NoCAR,CAR,chr1,Rc3h1


In [None]:
# add pval to gene list and save for laura

In [12]:
# What about breaking up large CISs? It appears they are too large and span multiple genes

# Or do we now check each individual gene that was found to further refine our search?

# What's stopping me from testing literally every gene with +/-50 kb region of insertions with a ranksums test?
# because we don't want this to be gene centric, however, it's not a bad way to validate the candidate gene list


In [None]:
# TODO: for each gene with an extended promoter, check the case vs. control insertions
# check with ranksums test as well as binomial test
# What does the candidate gene list look like now?


# TODO: I need to go through my code again. Go step by step for each function in another notebook
# double check this code and cis_networks.py


# TODO: rerun without mapq thresholding in preprocess_reads and preprocess_insertions


# TODO: should we be using normalized read counts? idk, what stage would this occur at
# the variation would come from read depth, so we would have to normalize based on this BEFORE preprocess_reads.py
# What is done with RNAseq? How to normalize on read depth while still keeping count data

In [None]:
# volcano plot of p-values and LFC for each CIS if wanted


In [None]:
# output list of pCIS (now can be called CIS) that are sig.
# per CIS, output list of TAs that are sig.


In [None]:
# take identified candidate CIS/genes and check for co-occurrence with all other CIS/genes
