In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style
import networkx as nx
from scipy.stats import chisquare, binomtest, ranksums, mannwhitneyu, skewtest, kurtosistest

from IPython.display import display
Path.cwd()

PosixPath('/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/notebooks')

In [2]:
def get_subgraphs(graph_dir, graph_type):
    subgraph_dict = {}
    for graph in (graph_dir / graph_type).iterdir():
        chrom = graph.name.split(".")[0]
        G = nx.read_graphml(graph)
        subgraphs_by_nodes = sorted(nx.connected_components(G), key=len, reverse=True)
        subgraph_dict[chrom] = [ G.subgraph(x) for x in subgraphs_by_nodes ]
    return subgraph_dict

def graph_properties(G, verbose=0):
    nodes = G.number_of_nodes()
    edges = G.number_of_edges()
    num_inserts = sum([ G.nodes[node]['counts'] for node in G.nodes ])
    subgraphs_by_nodes = sorted(nx.connected_components(G), key=len, reverse=True)
    num_subgraphs = len(subgraphs_by_nodes)
    if verbose:
        print(f"number of nodes: {nodes}")
        print(f"number of edges: {edges}")
        print(f"number of insertions: {num_inserts}")
        print(f"number of subgraphs (pCIS) {num_subgraphs}")
    return {"nodes": nodes, "edges": edges, "num_inserts": num_inserts, "num_subgraphs": num_subgraphs}

def subgraph_properties(G, verbose=0):
    nodes = G.number_of_nodes()
    edges = G.number_of_edges()
    num_inserts = sum([ G.nodes[node]['counts'] for node in G.nodes ])
    tmp_pos = sorted([ G.nodes[node]["position"] for node in G.nodes ])
    min_pos = min(tmp_pos)
    max_pos = max(tmp_pos)
    range_pos = max_pos - min_pos
    if verbose:
        print(f"number of nodes: {nodes}")
        print(f"number of edges: {edges}")
        print(f"number of insertions: {num_inserts}")
        print(f"min position: {min_pos}")
        print(f"max position: {max_pos}")
        print(f"range: {range_pos}")
    # TODO: FIXME: total nodes and range are not inclusive. see chr7
    # 7/10/23 - what does this mean...?
    return {"nodes": nodes, "edges": edges, "num_inserts": num_inserts, "min_pos": min_pos, "max_pos": max_pos, "range": range_pos}

def subgraph_TA_sites(G, bed, ta_error, verbose=0):
    num_insert_sites = G.number_of_nodes()
    
    tmp_pos = sorted([ G.nodes[node]["position"] for node in G.nodes ])
    ta_sites = bed[(bed[1] > min(tmp_pos)) & (bed[2] < max(tmp_pos))]
    num_ta_sites = len(ta_sites)
    
    arr1 = np.array(tmp_pos).reshape(-1, 1)
    arr2 = ta_sites[1].to_numpy().reshape(-1, 1)
    arr3 = ta_sites[1].to_numpy().reshape(-1, 1)
    ta_inserts = (arr1 >= (arr2.T - ta_error)) & (arr1 <= (arr3.T + ta_error))
    num_ta_insert_sites = ta_inserts.any(axis=1).sum()
    
    if verbose:
        print(f"number of insertions in subgraph: {num_insert_sites}")
        print(f"number of TA sites in subgraph: {num_ta_sites}")
        print(f"number of insertions within a TA site (+/- {ta_error} bp): {num_ta_insert_sites}")
        
    return {"num_insert_sites": num_insert_sites, "num_ta_sites": num_ta_sites, "num_ta_insert_sites": num_ta_insert_sites}

def get_subgraph_stats(subgraph_chroms, graph_type, bed_files, ta_error):
    subgraph_df_list = []
    for chrom, subgraphs in subgraph_chroms.items():
        for i, subgraph in enumerate(subgraphs):
            sg_meta = {"type": graph_type, "chrom": chrom, "subgraph": i}
            sg_prop = subgraph_properties(subgraph)
            sg_ta = subgraph_TA_sites(subgraph, bed_files[chrom], ta_error)
            sg_df = pd.DataFrame((sg_meta | sg_prop | sg_ta))
            subgraph_df_list.append(sg_df)
    return pd.concat(subgraph_df_list, ignore_index=True)  

# def get_graph(graph_dir, graph_type, bed_files, ta_error):
#     case_subgraphs = {}
#     case_df_list = []
#
#     for graph in (graph_dir / graph_type).iterdir():
#         chrom = graph.name.split(".")[0]
#         G = nx.read_graphml(graph)
#         subgraphs_by_nodes = sorted(nx.connected_components(G), key=len, reverse=True)
#         subgraph_list = []
#         for i, x in enumerate(subgraphs_by_nodes):
#             sg_meta = {"type": graph_type, "chrom": chrom, "subgraph": i}
#             subgraph = G.subgraph(x)
#             subgraph_list.append(subgraph)
#             sg_prop = subgraph_properties(subgraph)
#             sg_ta = subgraph_TA_sites(subgraph, bed_files[chrom], ta_error)
#             sg_df = pd.DataFrame((sg_meta | sg_prop | sg_ta), index=[i])
#             case_df_list.append(sg_df)
#         case_subgraphs[chrom] = subgraph_list
#     case_df = pd.concat(case_df_list, ignore_index=True)
#     return case_df, case_subgraphs
    
def pcis_overlaps(target_df, reference_df):
    # for all pCISs in target_df, find any overlap with reference_df (could be more than one or none)
    
    # start with each target subgraph, look through all reference subgraphs and record any overlap in the positions
    # key: index of target subgraph, value: list of reference subgraph indexes that overlap within the range of target subgraph
    overlap_dict = {}
    for tar_sg in target_df.itertuples():
        overlap_list = []
        for ref_sg in reference_df.itertuples():
            if (tar_sg.min_pos <= ref_sg.max_pos) and (tar_sg.max_pos >= ref_sg.min_pos):
                overlap_list.append(ref_sg.subgraph)
        overlap_dict[tar_sg.subgraph] = overlap_list
    # TODO: would it be easier/more efficient to compute this by whole dataframe?
    # this is the fully coded out logic, but now can I make it better and improve my pandas skills?
    # I can use chrom_overlaps as the ground truth as well.
    
    return overlap_dict

def compare_pcis(target_overlaps, target_subgraphs, reference_subgraphs):
    # do case-control comparison of TAs between overlapping subgraphs (pCIS):
    # - Match TA to TA site per subgraph
    # - calculate log fold changes
    # - use binomtest for significance of each TA
    # - Then use overall statistic for independent sample test between target and all references
    # which can be used for the final determination if the pCIS is now a CIS
    TA_df_list = []
    overall_df_list = []

    for tar_ind, ref_inds in target_overlaps.items():
        tar_G = target_subgraphs[tar_ind]
        tar_pos = [ tar_G.nodes[node]['position'] for node in tar_G.nodes ]
        tmp_tar = pd.DataFrame([ {"target": tar_G.nodes[node]['counts']} for node in tar_G.nodes ], index=tar_pos)        
        
        if len(ref_inds) == 0:
            tmp = tmp_tar
            tmp["reference"] = 0
            tmp["reference_index"] = None
        else:
            tmp_ref_list = []
            for ref_ind in ref_inds:
                ref_G = reference_subgraphs[ref_ind]
                ref_pos = [ ref_G.nodes[node]['position'] for node in ref_G.nodes ]
                ref_cols = [ {"reference": ref_G.nodes[node]['counts'], "reference_index": ref_ind} for node in ref_G.nodes ]
                tmp_ref_list.append(pd.DataFrame(ref_cols, index=ref_pos))
                
            # get union of all insertion sites and make it into a df
            tmp_ref = pd.concat(tmp_ref_list)
            tmp = tmp_tar.join(tmp_ref, how="outer")
    
        tmp = tmp.reset_index(drop=False).rename(columns={"index": "pos"})
        tmp["target_index"] = tar_ind
        tmp["target"] = tmp["target"].fillna(0).astype(int)
        tmp["reference"] = tmp["reference"].fillna(0).astype(int)
        
        # get stats per TA site (only count is used)
        tmp["target_binom_pval"] = tmp.apply(lambda x: binomtest(int(x["target"]), int(x["target"] + x["reference"])).pvalue, axis=1)
        tmp["target_binom_sig"] = tmp["target_binom_pval"] < 0.05
        tmp["LFC"] = tmp.apply(lambda x: np.log2((x["target"]+1) / (x["reference"]+1)), axis=1)
        # used pseudo count of 1 for log fold change, and so I wanted to show the difference in binomial test and significance with this
        tmp["p_target_binom_pval"] = tmp.apply(lambda x: binomtest(x["target"] + 1, x["target"] + x["reference"] + 2).pvalue, axis=1)
        tmp["p_target_binom_sig"] = tmp["p_target_binom_pval"] < 0.05
        
        # overall test stat for independence. use genomic positions. Total samples are each position times counts.
        # ex.) pos: 1001 and count: 3 is [1001, 1001, 1001]
        target_overall = []
        reference_overall = []
        for row in tmp.itertuples():
            for pos_tmp in [row.pos] * row.target:
                target_overall.append(pos_tmp)
            for pos_tmp in [row.pos] * row.reference:
                reference_overall.append(pos_tmp)

        mwu = mannwhitneyu(target_overall, reference_overall).pvalue if len(reference_overall) != 0 else np.nan
        rs = ranksums(target_overall, reference_overall).pvalue if len(reference_overall) != 0 else np.nan
        # case_skewtest = skewtest(target_overall).pvalue if len(target_overall) >= 8 else np.nan
        # case_kurtosistest = kurtosistest(target_overall).pvalue if len(target_overall) >= 20 else np.nan
        # control_skewtest = skewtest(reference_overall).pvalue if len(reference_overall) >= 8 else np.nan
        # control_kurtosistest = kurtosistest(reference_overall).pvalue if len(reference_overall) >= 20 else np.nan
        total_TA = len(tmp)
        TA_sig = tmp["target_binom_sig"].sum()
                            
        # Other stats: Kurtosis, skewness, etc. # "stat type": ["statistic", "pvalue"],
        tmp2 = pd.DataFrame({
            "target_index": [tar_ind],
            # "reference_index": [tmp["reference_index"].values[0]],
            "target_pos_min": [min(tar_pos)],
            "target_pos_max": [max(tar_pos)],
            # "reference_pos_min": [min(ref_pos)] if len(ref_inds) != 0 else [None],
            # "reference_pos_max": [max(ref_pos)] if len(ref_inds) != 0 else [None],
            "mannwhitneyu": [mwu],
            "ranksums": [rs], 
            #  "case-skewtest": [case_skewtest],
            #  "case-kurtosistest": [case_kurtosistest],
            #  "control-skewtest": [control_skewtest],
            #  "control-kurtosistest": [control_kurtosistest],
            "total_TA_sites": [total_TA],
            "sig_TA_sites": [TA_sig],
            })

        TA_df_list.append(tmp)
        overall_df_list.append(tmp2)

    TA_df = pd.concat(TA_df_list, ignore_index=True)
    overall_df = pd.concat(overall_df_list, ignore_index=True)
    overall_df["sig_ratio"] = overall_df["sig_TA_sites"] / overall_df["total_TA_sites"]
    return TA_df, overall_df

def pcis_to_cis(overall_df, threshold):
    # find pcis with significant pvalue that is less than the given threshold
    sig_df = overall_df[ (overall_df["mannwhitneyu"] < threshold) & (overall_df["ranksums"] < threshold) ]
    # test stat below threshold OR ratio that is not 0 and there are more than 1 sig tA
    nan_df =  overall_df[ pd.isna(overall_df["mannwhitneyu"]) & pd.isna(overall_df["ranksums"]) ]
    nan_sig_df = nan_df[ (nan_df["sig_ratio"] != 0) & (nan_df["sig_TA_sites"] > 1) ]
    all_sig_df = pd.concat([sig_df, nan_sig_df])
    return all_sig_df

def cis_annotate(target_sig_df, annotated_df):
    # get genes markers to each CIS
    gene_list = []
    for row in target_sig_df.itertuples():
        sub_gene_list = []
        for gene in annotated_df.itertuples():
            try:
                sub3 = row.reference_pos_min <= gene._5
                sub4 = row.reference_pos_max >= gene._4
            except:
                sub3 = None
                sub4 = None
            # TODO: just use the whole row from the annotation
            if (row.target_pos_min <= gene._5) and (row.target_pos_max >= gene._4):
                sub_gene_list.append(pd.DataFrame({
                    "type": ["target"],
                    "type_index": [int(row.target_index)],
                    "marker_pos_start": [gene._4],
                    "marker_pos_end": [gene._5],
                    # "marker_strand": [gene.strand],
                    "marker_symbol": [gene._7],
                    "marker_name": [gene._8],
                    "marker_type": [gene._9],
                    "marker_feature_type": [gene._10],
                    "marker_annot_index": [gene.Index],
                    }))
            elif sub3 and sub4:
                sub_gene_list.append(pd.DataFrame({
                    "type": ["reference"],
                    "type_index": [int(row.reference_index)],
                    "marker_pos_start": [gene._4],
                    "marker_pos_end": [gene._5],
                    # "marker_strand": [gene.strand],
                    "marker_symbol": [gene._7],
                    "marker_name": [gene._8],
                    "marker_type": [gene._9],
                    "marker_feature_type": [gene._10],
                    "marker_annot_index": [gene.Index],
                    }))
                
        # it is possible that there were no annotations found
        if len(sub_gene_list) == 0:
            sub_gene_list.append(pd.DataFrame({
                "type": ["target"],
                "type_index": [int(row.target_index)],
                "marker_pos_start": [None],
                "marker_pos_end": [None],
                # "marker_strand": [None],
                "marker_symbol": [None],
                "marker_name": [None],
                "marker_type": [None],
                "marker_feature_type": [None],
                "marker_annot_index": [None],
                }))
            
        gene_list.extend(sub_gene_list)
    return pd.concat(gene_list, ignore_index=True)
                
def volcano_plot(data, lfc, pval, threshold=0.05):
    thres = np.log10(threshold) * -1
    data[pval] = np.log10(data[pval]) * -1
    g = (
        so.Plot(data, x=lfc, y=pval, pointsize=pval)
        # .add(so.Dots(), so.Jitter(1))
        .add(so.Dots(color="grey"), so.Jitter(1), data=data.query(f"{pval} < {thres}"))
        .add(so.Dots(color="blue"), so.Jitter(1), data=data.query(f"{pval} >= {thres}"))
        .scale(y="log")
    )
    return g

In [3]:
graph_dir = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/output/2020_SB-graphs/")
ta_dir = Path("/project/cs-myers/MathewF/software/bowtie2-2.4.5/indexes/GRCm39_TAs/")
gene_annot = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/NetCIS/toy-data/MRK_List2.rpt")
output_prefix = "/project/cs-myers/MathewF/projects/Laura-SB-Analysis/output/2020_SB"
output = Path(output_prefix + "-analysis")
output.mkdir(exist_ok=True)
ta_error = 5
pval_threshold = 0.05
verbose = 0

In [4]:
annot_df = pd.read_csv(gene_annot, sep="\t")
annot_df = annot_df[pd.notna(annot_df["genome coordinate start"])].drop("Status", axis=1)
annot_df["chrom"] = annot_df["Chr"].apply(lambda x: f"chr{x}")
annot_df = annot_df.sort_values(["chrom"]).reset_index(drop=True)

In [13]:
# annot_df[annot_df["Marker Symbol"] == "Sfi1"]

Unnamed: 0,MGI Accession ID,Chr,cM Position,genome coordinate start,genome coordinate end,strand,Marker Symbol,Marker Name,Marker Type,Feature Type,Marker Synonyms (pipe-separated),chrom
74139,MGI:1926137,11,2.19,3081850.0,3143463.0,-,Sfi1,"Sfi1 homolog, spindle assembly associated (yeast)",Gene,protein coding gene,2310047I15Rik,chr11


In [None]:
bed_files = { file.name.split(".")[0]: pd.read_csv(file, sep="\t", header=None) for file in ta_dir.iterdir() }

In [7]:
case_subgraph_dict = get_subgraphs(graph_dir, "case")
case_df = get_subgraph_stats(case_subgraph_dict, "case", bed_files, "ta_error")

control_subgraph_dict = get_subgraphs(graph_dir, "control")
control_df = get_subgraph_stats(control_subgraph_dict, "control", bed_files, "ta_error")

UFuncTypeError: ufunc 'subtract' did not contain a loop with signature matching types (dtype('int64'), dtype('<U8')) -> None

In [None]:
chroms = case_df["chrom"].sort_values().unique()
all_sig_annot_list = []
all_sg_annot_list = []
# TODO: parallelize this?
for chrom in chroms:
    # if chrom != "chr6":
    #     continue
    print(chrom)
    
    # get chromosome subsets for case and control
    case_chrom_df = case_df[case_df["chrom"] == chrom]
    control_chrom_df = control_df[control_df["chrom"] == chrom]
    case_chrom_subgraphs = case_subgraph_dict[chrom]
    control_chrom_subgraphs = control_subgraph_dict[chrom]
    annot_chrom_df = annot_df[annot_df["chrom"] == chrom]
    
    # cases as the target
    case_overlaps = pcis_overlaps(case_chrom_df, control_chrom_df)
    case_TA_df, case_overall_df = compare_pcis(case_overlaps, case_chrom_subgraphs, control_chrom_subgraphs)
    case_sig_df = pcis_to_cis(case_overall_df, pval_threshold)
    if len(case_sig_df) != 0:
        case_sig_annot = cis_annotate(case_sig_df, annot_chrom_df)
        case_sig_annot["class"] = "case"
        # keep only genes annotated to each significant subgraph
        case_sg_annot = case_sig_df.join(case_sig_annot[case_sig_annot["marker_type"] == "Gene"].groupby("type_index").agg(list)["marker_symbol"])
        case_sg_annot["class"] = "case"
    else:
        case_sig_annot = None
    
    # controls as the target
    control_overlaps = pcis_overlaps(control_chrom_df, case_chrom_df)
    control_TA_df, control_overall_df = compare_pcis(control_overlaps, control_chrom_subgraphs, case_chrom_subgraphs)
    control_sig_df = pcis_to_cis(control_overall_df, pval_threshold)
    if len(control_sig_df) != 0:
        control_sig_annot = cis_annotate(control_sig_df, annot_chrom_df)
        control_sig_annot["class"] = "control"
        control_sg_annot = control_sig_df.join(control_sig_annot[control_sig_annot["marker_type"] == "Gene"].groupby("type_index").agg(list)["marker_symbol"])
        control_sg_annot["class"] = "control"
    else:
        control_annot = None
    
    if case_sig_annot is not None and control_sig_annot is not None:
        both_sig_annot = pd.concat([case_sig_annot, control_sig_annot], ignore_index=True)
        both_sg_annot = pd.concat([case_sg_annot, control_sg_annot], ignore_index=True)
    elif case_sig_annot is not None:
        both_sig_annot = case_sig_annot
        both_sg_annot = case_sg_annot
    elif control_sig_annot is not None:
        both_sig_annot = control_sig_annot
        both_sg_annot = control_sg_annot
    else:
        continue
    
    if case_sig_annot is not None or control_sig_annot is not None:
        both_sig_annot["chrom"] = chrom
        all_sig_annot_list.append(both_sig_annot)
        both_sg_annot["chrom"] = chrom
        all_sg_annot_list.append(both_sg_annot)
        if verbose > 0:
            print(len(both_sig_annot["marker_symbol"].unique()))

    # TODO: are there too many repeated genes? Does this make sense that they would be repeated?
    # it appears that sometimes there are multiple CIS in a gene, because the CIS range is quite small
    print(f"""\tsig. genomic features: {both_sig_annot["marker_symbol"].unique().shape[0]}/{annot_chrom_df["Marker Symbol"].unique().shape[0]}""")


all_sig_annot_df = pd.concat(all_sig_annot_list, ignore_index=True)
all_sg_annot_df = pd.concat(all_sg_annot_list, ignore_index=True)

In [17]:
# This by default is filtered for only genes by default
display(all_sg_annot_df)

Unnamed: 0,target_index,target_pos_min,target_pos_max,mannwhitneyu,ranksums,total_TA_sites,sig_TA_sites,sig_ratio,marker_symbol,class,chrom
0,4,22891694,22978601,2.319250e-02,2.471693e-02,14,8,0.571429,[Gm7759],case,chr1
1,5,29559897,29629191,1.860003e-02,1.756369e-02,14,0,0.000000,[slck],case,chr1
2,6,26556902,26697890,4.746838e-15,1.785891e-14,17,4,0.235294,"[slck, Gm31136, Gm24064]",case,chr1
3,10,102741240,102765559,2.742386e-02,2.830225e-02,13,1,0.076923,,case,chr1
4,59,6047345,6163765,4.096985e-08,6.809029e-07,7,2,0.285714,,case,chr1
...,...,...,...,...,...,...,...,...,...,...,...
532,37,99977058,100032516,,,3,2,0.666667,"[Stol, Tex11]",case,chrX
533,45,57714231,57729530,,,3,2,0.666667,,case,chrX
534,66,47592563,47599471,,,2,2,1.000000,"[Aifm1, Fnld]",case,chrX
535,17,73543988,73543996,,,2,2,1.000000,[Stol],control,chrX


In [18]:
# these are duplicated sometimes between cases and controls
# but that's ok cause we just want the unique list of genes that we should investigate further
display(all_sig_annot_df)

# What about just looking at only genes?
gene_tmp = all_sig_annot_df[all_sig_annot_df["marker_type"] == "Gene"]

gene_only = gene_tmp.groupby(["type_index", "class"]).agg(list)["marker_symbol"].reset_index()
display(gene_only)

uniq_count = gene_tmp["marker_symbol"].unique().shape[0]
print(f"unique genomic features: {uniq_count}")

candidate_genes = gene_tmp["marker_symbol"].unique()
print(candidate_genes)

Unnamed: 0,type,type_index,marker_pos_start,marker_pos_end,marker_symbol,marker_name,marker_type,marker_feature_type,marker_annot_index,class,chrom
0,target,4,13400224.0,40239160.0,Ftms4,fat mass 4,QTL,QTL,424,case,chr1
1,target,4,12916920.0,26517685.0,Fecq3,fecundity QTL 3,QTL,QTL,879,case,chr1
2,target,4,3267623.0,32550581.0,Ebas2,epidermolysis bullosa acquisita severity 2,QTL,QTL,3915,case,chr1
3,target,4,22969802.0,22971000.0,Rr33131,regulatory region 33131,Other Genome Feature,enhancer,21658,case,chr1
4,target,4,22927202.0,22927799.0,Rr33130,regulatory region 33130,Other Genome Feature,enhancer,21669,case,chr1
...,...,...,...,...,...,...,...,...,...,...,...
14027,target,17,10289113.0,91798267.0,Opfdc9,"open field, distance traveled, center 9",QTL,QTL,621954,control,chrY
14028,target,17,61343606.0,89843606.0,Mlh1fc3,MLH1 foci count 3,QTL,QTL,622131,control,chrY
14029,target,17,7226295.0,150107038.0,Dbts2,diabetes 2,QTL,QTL,631602,control,chrY
14030,target,17,54045360.0,148782996.0,Ferq1,genetic fertility QTL 1,QTL,QTL,632342,control,chrY


Unnamed: 0,type_index,class,marker_symbol
0,0,case,"[nmf172, nmf347, Gn, Gm55427, Hlb290, rsl, Mda..."
1,0,control,"[rsl, Mdac, Prxl2c, clf2, Vapa, Hdlk, Hdp1, Sn..."
2,1,case,"[Wif1, Gm48410, Micu2, Gm24794, Map3k4, Gm3162..."
3,1,control,"[slck, Hycc2, Ndufb3, Bswt, Chpt1, Mybpc1, Gn,..."
4,2,case,"[Gm47343, Gn, Sfi1, Gm33862, 9530037G02Rik, Gm..."
...,...,...,...
168,335,control,"[Ps, Zcchc7, vsd]"
169,352,case,"[Dbf, Gm816]"
170,425,case,[Dbf]
171,430,case,[Tli1]


unique genomic features: 447
['Gm7759' 'slck' 'Gm31136' 'Gm24064' 'Rp1' 'Gm37483' 'Sagg' 'Gm29260'
 'Smg7' 'Ncf2' 'Fam135a' 'Gm37580' 'Tli1' 'Nek7' 'Ncoa2' 'Dbf' 'Gm816'
 'Gm22966' 'Gm54870' 'Gm54871' 'Dock10' 'dde' 'Plcd4' 'Zfp142' 'Sp110'
 'Pbx1' 'Arfgef1' 'Cab39' 'Lmbrd1' 'Spag16' 'Gm26245' '4930533P14Rik'
 'Dsel' 'Klhl20' 'Ankrd45' 'Tgfb2' 'Gpr39' 'Gm31344' 'Vwc2l' 'Gm38376'
 'Hycc2' 'Ndufb3' 'Bswt' 'Aida' 'Gm22681' 'A830018L16Rik' 'Gm37142'
 'Vwa3b' 'Gm53552' 'Aff3' 'Aox4' 'Gm29346' 'Gn' 'Tmtc3' 'Xiaf3' 'Dgka'
 'nmf172' 'nmf347' 'Wif1' 'Gm48410' 'Gm47343' 'Pcdh15' 'Hsp90b1' 'Gm15344'
 'fsq' 'Otogl' 'Igf1' 'Chpt1' 'Mybpc1' 'Timp3' 'Syn3' 'Gm55427' 'Hlb290'
 'Sfi1' 'Gm33862' '9530037G02Rik' 'Gm12735' 'Gm11399' 'Arsg' 'Bolt'
 'Gm11762' 'Tanc2' 'nmf148' 'Alo1' 'nmf65' 'Asic2' 'Galnt10' 'Nlrp1a'
 'nmf420' 'Ism2' 'Scrd2' 'Mta1' 'Eccp' 'Gm48480' 'Gm48479' 'Gm32828'
 'Itgb8' 'rsl' 'Mdac' 'Prxl2c' 'clf2' 'Gm41050' 'Elmo1' 'Hecw1' 'Hcn1'
 '4933413L06Rik' 'Nnt' 'Adarb2' 'Gcnt2' 'Cntnap3' 'Gm

In [None]:
# What about breaking up large CISs? It appears they are too large and span multiple genes

# Or do now we check each individual gene that was found to further refine our search?

# What's stopping me from testing literally every gene with +/-50 kb region of insertions with a ranksums test?
# because we don't want this to be gene centric, however, it's not a bad way to validate the candidate gene list


In [62]:
# TODO: use list of Laura genes and see why that genomic region did not show up
top10genes = ["Aak1", "Ehhadh", "Macrod2", "Ckb", "Rnf214", "Sprr1b", "Cpd", "Rpl48-ps1", "Son", "Eif3b"]
top10annot = annot_df[annot_df["Marker Symbol"].isin(top10genes)]
for row in top10annot.itertuples():
    chrom = f"chr{row.Chr}"
    case_chrom_df = case_df[case_df["chrom"] == chrom]
    control_chrom_df = control_df[control_df["chrom"] == chrom]
    res1 = int(row._4 - 50000) <= case_chrom_df["max_pos"]
    res2 = int(row._5 + 50000) >= case_chrom_df["min_pos"]
    res3 = int(row._4 - 50000) <= control_chrom_df["max_pos"]
    res4 = int(row._5 + 50000) >= control_chrom_df["min_pos"]

    print(row._7, (res1 & res2).sum(), (res3 & res4).sum())
    if (res1 & res2).sum() != 0:
        display(case_chrom_df.loc[res1 & res2])
    if (res3 & res4).sum() != 0:
        display(control_chrom_df.loc[res3 & res4])


Rpl48-ps1 1 0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
914,case,chr10,66,1,0,1,57513025,57513025,0,1,0,0


Cpd 0 0
Ckb 0 0
Ehhadh 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
1960,case,chr16,1,26,325,89,21614194,21614302,108,26,7,14


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
1629,control,chr16,1,62,1891,790,21600392,21614302,13910,62,602,32


Son 0 0
Macrod2 2 0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2552,case,chr2,63,2,1,33,140313463,140313466,3,2,0,0
2648,case,chr2,159,1,0,15,140753859,140753859,0,1,0,0


Sprr1b 1 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2848,case,chr3,83,1,0,41,92347698,92347698,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
2484,control,chr3,7,6,15,438,92347697,92347714,17,6,0,0


Eif3b 0 0
Aak1 2 2


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3771,case,chr6,1,75,2775,838,86910715,86910899,184,75,4,19
3885,case,chr6,115,1,0,55,86976664,86976664,0,1,0,0


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3295,control,chr6,1,114,6441,10434,86910504,86910899,395,114,8,26
3329,control,chr6,35,3,3,16,86858016,86858023,7,3,0,0


Rnf214 0 1


Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,num_insert_sites,num_ta_sites,num_ta_insert_sites
3965,control,chr9,4,9,36,774,45852161,45852171,10,9,0,0


In [None]:
# TODO: check for each gene with an extended proomoter the case vs controls insertions
# check with ranksums test as well as binomial test
# What does the candidate gene list look like now?


# TODO: I need to go through my code again. Go step by step for each function in another notebook
# double check this code and cis_networks.py


# TODO: rerun without mapq thresholding in preprocess_reads and preprocess_insertions


# TODO: should we be using normalized read counts? idk, what stage would this occur at
# the variation would come from read depth, so we would have to normalize based on this BEFORE preprocess_reads.py
# What is done with RNAseq? How to normalize on read depth while still keeping count data

In [15]:
# volcano plot of p-values and LFC for each CIS if wanted


In [None]:
# output list of pCIS (now can be called CIS) that are sig.
# per CIS, output list of TAs that are sig.


In [None]:
# take identified candidate CIS/genes and check for co-occurrence with all other CIS/genes
