In [2]:
import pickle
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style
import networkx as nx
from scipy.stats import chisquare, binomtest, ranksums, mannwhitneyu, skewtest, kurtosistest

from IPython.display import display

In [13]:
import sys, os
dir1 = os.path.dirname(os.path.abspath(''))
if not dir1 in sys.path: sys.path.append(dir1)
from netcis import network_analysis as na

from importlib import reload
reload(na)

<module 'netcis.network_analysis' from '/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/netcis/network_analysis.py'>

In [18]:
def convert_mapq(x):
    """integer to float 0-1"""
    return np.power(10, x / (-10))

def inverse_mapq(x):
    """float 0-1 to integer"""
    return -10 * np.log10(x)

convert_mapq(0)

1.0

In [16]:
output_prefix = "/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2023_SB-output/GRCm39/results"
graph_dir = Path(output_prefix + "-graphs/")
output = Path(output_prefix + "-analysis")
output.mkdir(exist_ok=True)
ta_dir = Path("/project/cs-myers/MathewF/software/bowtie2-2.4.5/indexes/GRCm39_TAs/")
gene_annot = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/MRK_List2.rpt")
ta_error = 5
pval_threshold = 0.05
verbose = 0
case = "CAR"
control = "NoCAR"

In [11]:
annot_df = pd.read_csv(gene_annot, sep="\t")
annot_df = annot_df[pd.notna(annot_df["genome coordinate start"])].drop("Status", axis=1)
annot_df["chrom"] = annot_df["Chr"].apply(lambda x: f"chr{x}")
annot_df = annot_df.sort_values(["chrom"]).reset_index(drop=True)

In [5]:
bed_files = { file.name.split(".")[0]: pd.read_csv(file, sep="\t", header=None) for file in ta_dir.iterdir() }
# 1m 42s

In [6]:
# TODO: is it na.get_subgraphs that makes this take so long?
# is it possible to do this in cis_networks.py and then load it in at this point?
# maybe all of this needs to be run in cis_networks per chromosome???

# case_subgraph_dict = na.get_subgraphs(graph_dir, "case")
# case_df = na.get_subgraph_stats(case_subgraph_dict, "case", bed_files, ta_error)
# case_df.sort_values(["chrom", "subgraph", "nodes"]).to_csv(output / "case_df.tsv", sep="\t", index=False)
# case_df = pd.read_csv(output / "case_df.tsv", sep="\t")

# control_subgraph_dict = na.get_subgraphs(graph_dir, "control")
# control_df = na.get_subgraph_stats(control_subgraph_dict, "control", bed_files, ta_error)
# control_df.sort_values(["chrom", "subgraph", "nodes"]).to_csv(output / "control_df.tsv", sep="\t", index=False)
# control_df = pd.read_csv(output / "control_df.tsv", sep="\t")
# 5m 46s

In [19]:
reload(na)
chroms = sorted([ chrom.name for chrom in (graph_dir / case).iterdir() ])

TA_list = []
overall_list = []
all_features_list = []
# TODO: parallelize this?
for chrom in chroms:
    # if chrom != "chrM":
    #     continue
    print(chrom)
    
    # get chromosome subsets for annotation file, TA bed file, cases, and controls
    annot_chrom_df = annot_df[annot_df["chrom"] == chrom]
    
    chrom_bed_file = bed_files[chrom]
    
    with open(graph_dir / case / chrom / "subgraphs.pickle", 'rb') as f:
        case_chrom_subgraphs = pickle.load(f)
    case_chrom_df = na.get_subgraph_stats(case_chrom_subgraphs, case, chrom, chrom_bed_file, ta_error)
    
    with open(graph_dir / control / chrom / "subgraphs.pickle", 'rb') as f:
        control_chrom_subgraphs = pickle.load(f)
    control_chrom_df = na.get_subgraph_stats(control_chrom_subgraphs, control, chrom, chrom_bed_file, ta_error)

    
    # cases as the target
    case_overlaps = na.pcis_overlaps(case_chrom_df, control_chrom_df)
    if not case_overlaps:  # if empty
        case_genes = None
    else:
        case_TA_df, case_overall_df = na.compare_pcis(case_overlaps, case_chrom_subgraphs, control_chrom_subgraphs)
        case_TA_df["class"] = "case"
        case_overall_df["class"] = "case"
        case_TA_df["chrom"] = chrom
        case_overall_df["chrom"] = chrom
        case_sig_df = na.pcis_to_cis(case_overall_df, pval_threshold)
        if len(case_sig_df) != 0:
            case_genes = na.cis_annotate(case_sig_df, annot_chrom_df)
            case_genes["class"] = "case"            
        else:
            case_genes = None
    
    # controls as the target
    control_overlaps = na.pcis_overlaps(control_chrom_df, case_chrom_df)
    if not control_overlaps:  # if empty
        control_genes = None
    else:
        control_TA_df, control_overall_df = na.compare_pcis(control_overlaps, control_chrom_subgraphs, case_chrom_subgraphs)
        control_TA_df["class"] = "control"
        control_overall_df["class"] = "control"
        control_TA_df["chrom"] = chrom
        control_overall_df["chrom"] = chrom
        control_sig_df = na.pcis_to_cis(control_overall_df, pval_threshold)
        if len(control_sig_df) != 0:
            control_genes = na.cis_annotate(control_sig_df, annot_chrom_df)
            control_genes["class"] = "control"
        else:
            control_genes = None
    
    if case_genes is not None and control_genes is not None:
        both_genes = pd.concat([case_genes, control_genes], ignore_index=True)
    elif case_genes is not None:
        both_genes = case_genes
    elif control_genes is not None:
        both_genes = control_genes
    else:  # both are none
        print("\tno significant genomic features found")
        continue
    
    TA_list.append(pd.concat([case_TA_df, control_TA_df], ignore_index=True))
    overall_list.append(pd.concat([case_overall_df, control_overall_df], ignore_index=True))
    both_genes["chrom"] = chrom
    all_features_list.append(both_genes)
    if verbose > 0:
        # print(len(both_genes))
        print(len(both_genes["marker_symbol"].unique()))

    # TODO: are there too many repeated genes? Does this make sense that they would be repeated?
    # it appears that sometimes there are multiple CIS in a gene, because the CIS range is quite small
    print(f"""\tsig. genomic features: {both_genes["marker_symbol"].unique().shape[0]}/{annot_chrom_df["Marker Symbol"].unique().shape[0]}""")

# 7m 45s

chr1
	sig. genomic features: 76/42646
chr1.graphml


IndexError: list index out of range

In [10]:
# save data

# all_TA_df = pd.concat(TA_list, ignore_index=True)
# all_TA_df.to_csv(output / "all_TA.tsv", sep="\t", index=False)
all_TA_df = pd.read_csv(output / "all_TA.tsv", sep="\t")

# all_overall_df = pd.concat(overall_list, ignore_index=True)
# all_overall_df.to_csv(output / "all_overall.tsv", sep="\t", index=False)
all_overall_df = pd.read_csv(output / "all_overall.tsv", sep="\t")

# all_features_df = pd.concat(all_features_list, ignore_index=True)
# all_features_df.to_csv(output / "all_features.tsv", sep="\t", index=False)
all_features_df = pd.read_csv(output / "all_features.tsv", sep="\t")


# TODO: 
# get candidate gene list 
genes_tmp = all_features_df[(all_features_df["marker_type"] == "Gene") & (all_features_df["marker_feature_type"] == "protein coding gene")]
genes_only = genes_tmp.groupby(["type_index", "class"]).agg(list)["marker_symbol"].reset_index()
candidate_genes = pd.DataFrame({"marker_symbol": [ x for x in sorted(genes_tmp["marker_symbol"].unique()) ]})
candidate_genes.to_csv(output / "candidate_genes.tsv", sep="\t", index=False)

# validate gene candidates in a gene-centric way

# save this new conservative list

# expand gene feature boundaries for annotations and repeat above



In [11]:
# TODO: use list of Laura genes and see why that genomic region did not show up
top10genes = ["Aak1", "Ehhadh", "Macrod2", "Ckb", "Rnf214", "Sprr1b", "Cpd", "Rpl48-ps1", "Son", "Eif3b"]
top10annot = annot_df[annot_df["Marker Symbol"].isin(top10genes)]
for row in top10annot.itertuples():
    chrom = f"chr{row.Chr}"
    case_chrom_df = case_df[case_df["chrom"] == chrom]
    control_chrom_df = control_df[control_df["chrom"] == chrom]
    res1 = int(row._4 - 50000) <= case_chrom_df["max_pos"]
    res2 = int(row._5 + 50000) >= case_chrom_df["min_pos"]
    res3 = int(row._4 - 50000) <= control_chrom_df["max_pos"]
    res4 = int(row._5 + 50000) >= control_chrom_df["min_pos"]

    print(row._7, (res1 & res2).sum(), (res3 & res4).sum())
    if (res1 & res2).sum() != 0:
        display(case_chrom_df.loc[res1 & res2])
    if (res3 & res4).sum() != 0:
        display(control_chrom_df.loc[res3 & res4])


Rpl48-ps1 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
919,case,chr10,292,1,0,4,57474423,57474423,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
1089,control,chr10,119,1,0,1,57513025,57513025,0,1,0,0


Cpd 0 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
1478,control,chr11,142,1,0,1,76722877,76722877,0,1,0,0


Ckb 0 0
Ehhadh 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2364,case,chr16,1,62,1891,790,21600392,21614302,13910,62,602,32


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2993,control,chr16,1,26,325,89,21614194,21614302,108,26,7,14


Son 0 0
Macrod2 2 4


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3245,case,chr2,83,2,1,2,141177054,141177560,506,2,0,0
3449,case,chr2,287,1,0,1,141377758,141377758,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4020,control,chr2,96,2,1,34,140313463,140313466,3,2,0,0
4141,control,chr2,217,1,0,1,140190257,140190257,0,1,0,0
4178,control,chr2,254,1,0,1,140580324,140580324,0,1,0,0
4210,control,chr2,286,1,0,15,140753859,140753859,0,1,0,0


Sprr1b 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3581,case,chr3,7,6,15,438,92347697,92347714,17,6,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4538,control,chr3,145,1,0,41,92347698,92347698,0,1,0,0


Eif3b 0 0
Aak1 2 2


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
4779,case,chr6,1,115,6555,10437,86910504,86911206,702,115,19,26
4825,case,chr6,47,3,3,16,86858016,86858023,7,3,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
6001,control,chr6,1,77,2926,845,86886566,86910899,24333,77,881,19
6211,control,chr6,211,1,0,55,86976664,86976664,0,1,0,0


Rnf214 1 0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
5794,case,chr9,5,9,36,774,45852161,45852171,10,9,0,0


In [12]:
# What about breaking up large CISs? It appears they are too large and span multiple genes

# Or do now we check each individual gene that was found to further refine our search?

# What's stopping me from testing literally every gene with +/-50 kb region of insertions with a ranksums test?
# because we don't want this to be gene centric, however, it's not a bad way to validate the candidate gene list


In [None]:
# TODO: check for each gene with an extended proomoter the case vs controls insertions
# check with ranksums test as well as binomial test
# What does the candidate gene list look like now?


# TODO: I need to go through my code again. Go step by step for each function in another notebook
# double check this code and cis_networks.py


# TODO: rerun without mapq thresholding in preprocess_reads and preprocess_insertions


# TODO: should we be using normalized read counts? idk, what stage would this occur at
# the variation would come from read depth, so we would have to normalize based on this BEFORE preprocess_reads.py
# What is done with RNAseq? How to normalize on read depth while still keeping count data

In [None]:
# volcano plot of p-values and LFC for each CIS if wanted


In [None]:
# output list of pCIS (now can be called CIS) that are sig.
# per CIS, output list of TAs that are sig.


In [None]:
# take identified candidate CIS/genes and check for co-occurrence with all other CIS/genes
