In [1]:
import pickle
from pathlib import Path
from multiprocessing import Pool

import pandas as pd
import numpy as np
import seaborn.objects as so
from seaborn import axes_style
import networkx as nx
from scipy.stats import binomtest, ranksums, mannwhitneyu, wilcoxon, skewtest, kurtosistest
from tqdm import tqdm

from IPython.display import display

In [2]:
import sys, os
dir1 = os.path.dirname(os.path.abspath(''))
if not dir1 in sys.path: sys.path.append(dir1)
from netcis import network_analysis as na
from importlib import reload
# reload(na)

In [3]:
refdata = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2023-SB-screen/ref_data/GRCm39")

args = {
    "output_prefix": "/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2023-SB-screen/output/GRCm39/results", 
    "ta_dir": refdata / "ta_files",
    "gene_annot": refdata / "MRK_List2.rpt",
    "ta_error": 5,
    "pval_threshold": 0.05,
    "verbose": 1,
    "case": "CAR",  # CAR ACF LT RT
    "control": "NoCAR",  # NoCAR SCF S S
    "npara": 21,
}

args["graph_dir"] = Path(args["output_prefix"] + "-graphs/")

output = Path(args["output_prefix"] + "-analysis")
output.mkdir(exist_ok=True)

ta_dir = args["ta_dir"]
gene_annot = args["gene_annot"]
ta_error = args["ta_error"]
pval_threshold = args["pval_threshold"]
verbose = args["verbose"]
case = args["case"]
control = args["control"]

output_res = output / f"{case}-{control}"
output_res.mkdir(exist_ok=True)

In [4]:
annot_df = pd.read_csv(gene_annot, sep="\t")
annot_df = annot_df[pd.notna(annot_df["genome coordinate start"])].drop("Status", axis=1)
annot_df["chrom"] = annot_df["Chr"].apply(lambda x: f"chr{x}")
annot_df = annot_df.sort_values(["chrom"]).reset_index(drop=True)
# TODO: what about the strand in annot_df?

bed_files = {file.name.split(".")[0]: file for file in args["ta_dir"].iterdir()}

chroms = sorted([ chrom.name for chrom in (args["graph_dir"] / case).iterdir() ])
print(chroms)
print(len(chroms))

['chr1', 'chr10', 'chr11', 'chr12', 'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18', 'chr19', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6', 'chr7', 'chr8', 'chr9', 'chrX', 'chrY']
21


In [5]:
reload(na)
# get chromosome subsets for annotation file and TA bed file
# iter_args = tqdm([ (chrom, annot_df[annot_df["chrom"] == chrom], bed_files[chrom], args) for chrom in chroms ])
iter_args = [ (chrom, annot_df[annot_df["chrom"] == chrom], bed_files[chrom], args) for chrom in chroms ]
with Pool(args["npara"]) as p:
    res_dict_list = [ x for x in p.imap_unordered(na.chrom_analysis, iter_args) ]
# res_dict_list = na.chrom_analysis(iter_args[-2])


# join chromosomes results together
ta_list = []
overall_list = []
sig_list = []
genomic_features_list = []
graphs_stats = []
for res_dict in res_dict_list:
    ta_list.append(res_dict["ta"])
    overall_list.append(res_dict["overall"])
    sig_list.append(res_dict["sig"])
    genomic_features_list.append(res_dict["genomic_features"])
    graphs_stats.append(res_dict["graph_stats"])

TA_df = pd.concat(ta_list, ignore_index=True)
overall_df = pd.concat(overall_list, ignore_index=True)
sig_df = pd.concat(sig_list, ignore_index=True)
genomic_features_df = pd.concat(genomic_features_list, ignore_index=True)
graph_stats_df = pd.concat(graphs_stats, ignore_index=True)

chrY	sig. genomic features: 6/2258
chr19	sig. genomic features: 185/19111
chr18	sig. genomic features: 228/19513
chr17	sig. genomic features: 423/27642
chr16	sig. genomic features: 258/21700
chr14	sig. genomic features: 300/25888
chr15	sig. genomic features: 373/25472
chr12	sig. genomic features: 409/26317
chr10	sig. genomic features: 324/31354
chr11	sig. genomic features: 509/43017
chr13	sig. genomic features: 497/27823
chr8	sig. genomic features: 456/31843
chrX	sig. genomic features: 275/17094
chr3	sig. genomic features: 390/32790
chr7	sig. genomic features: 475/41167
chr5	sig. genomic features: 658/41312
chr6	sig. genomic features: 532/36110
chr2	sig. genomic features: 847/48061
chr9	sig. genomic features: 1295/35050
chr1	sig. genomic features: 1064/42676
chr4	sig. genomic features: 2202/38706


In [21]:
# remove duplicated entries
def check_dups(gene_df, case, control):
    if len(gene_df) == 1:
        return gene_df
    
    case_tmp = gene_df[gene_df["target"] == case]
    cont_tmp = gene_df[gene_df["target"] == control]

    return_list = []
    remove_list = []
    for _, row in case_tmp.iterrows():
        t1 = cont_tmp["reference_index"].values == row["target_index"]
        t2 = cont_tmp["reference_IS_count"].values == row["target_IS_count"]
        if any(t1) and any(t2) and np.array_equal(t1, t2):
            return_list.append(row.to_frame().T)
            remove_list.append(cont_tmp[t1].index)
            
    if remove_list:
        remove_list = remove_list[0] if len(remove_list) == 1 else [ x[0] for x in remove_list]
        new_cont_tmp = cont_tmp.drop(index=remove_list)
        for _, row in new_cont_tmp.iterrows():
            t1 = case_tmp["target_index"].values == row["reference_index"]
            t2 = case_tmp["target_IS_count"].values == row["reference_IS_count"]
            if any(t1) and any(t2) and np.array_equal(t1, t2):
                return_list.append(row.to_frame().T)
            
    if not return_list:
        return None
    else:
        return pd.concat(return_list)
            
# get candidate gene list 
genes_tmp = genomic_features_df[(genomic_features_df["marker_type"] == "Gene") & (genomic_features_df["marker_feature_type"] == "protein coding gene")]
# genes_only = genes_tmp.groupby(["type_index"]).agg(list)["marker_symbol"].reset_index()
many_genes = pd.DataFrame({"marker_symbol": [ x for x in sorted(genes_tmp["marker_symbol"].unique()) ]})

new_genes = []
for i, gene in enumerate(many_genes["marker_symbol"].values):
    tmp_annot = genomic_features_df[genomic_features_df["marker_symbol"] == gene]
    # display(tmp_annot)
    
    tmp_sig = sig_df[sig_df["target_index"].isin(tmp_annot["type_index"]) & sig_df["chrom"].isin(tmp_annot["chrom"])].sort_values(["target_index"])
    tmp_sig["gene"] = gene
    tmp_gene = check_dups(tmp_sig, case, control)
    if tmp_gene is not None:
        new_genes.append(tmp_gene)
        # display(tmp_gene)
    
        # ta_list = []
        # for ta_subset in tmp_gene.itertuples():
        #     tmp = TA_df[TA_df["target_index"].isin([ta_subset.target_index]) & TA_df["chrom"].isin([ta_subset.chrom]) & TA_df["target"].isin([ta_subset.target])]
        #     ta_list.append(tmp)
        # tmp_ta = pd.concat(ta_list, ignore_index=True).sort_values(["target_index", "reference_index", "target", "pos"])
        # print(pearsonr(tmp_ta["target_count"].values, tmp_ta["reference_count"].values))
        # display(tmp_ta)
    
    # stats_list = []
    # for ta_subset in tmp_annot.itertuples():
    #     tmp = graph_stats_df[graph_stats_df["subgraph"].isin([ta_subset.type_index]) & graph_stats_df["chrom"].isin([ta_subset.chrom]) & graph_stats_df["type"].isin([ta_subset.type_name])]
    #     stats_list.append(tmp)
    # tmp_g_stats = pd.concat(stats_list, ignore_index=True).sort_values(["subgraph"])
    # # display(tmp_g_stats)

    

candidate_genes = pd.concat(new_genes, ignore_index=True)
a = candidate_genes["target_IS_count"][candidate_genes["target"] == case].rename(case)
b = candidate_genes["reference_IS_count"][candidate_genes["reference"] == case].rename(case)
case_read_counts = pd.concat([a, b]).sort_index()
c = candidate_genes["target_IS_count"][candidate_genes["target"] == control].rename(control)
d = candidate_genes["reference_IS_count"][candidate_genes["reference"] == control].rename(control)
control_read_counts = pd.concat([c, d]).sort_index()

output_genes = candidate_genes.groupby(["gene"])[["mannwhitneyu", "ranksums", "binomial"]].mean().reset_index()

print(len(candidate_genes), len(output_genes))
display(candidate_genes)
output_genes.columns = ["gene", "avg-mannwhitneyu", "avg-ranksums",  "avg-binomial"]
output_genes[case] = case_read_counts
output_genes[control] = control_read_counts





# save data
TA_df.to_csv(output_res / "TA.tsv", sep="\t", index=False)
overall_df.to_csv(output_res / "overall.tsv", sep="\t", index=False)
sig_df.to_csv(output_res / "sig.tsv", sep="\t", index=False)
genomic_features_df.to_csv(output_res / "genomic_features.tsv", sep="\t", index=False)
graph_stats_df.to_csv(output_res / "graph_stats.tsv", sep="\t", index=False)
candidate_genes.to_csv(output_res / "candidate_genes.tsv", sep="\t", index=False)
output_genes.to_csv(output_res / "cleaned_output_genes.tsv", sep="\t", index=False)

simple_summary = {}
for treatment in graph_stats_df["type"].unique():
    treatment_df = graph_stats_df[graph_stats_df["type"] == treatment]
    simple_summary[treatment] = {"pCIS_count": len(treatment_df),
                                 "insertion_site_count": treatment_df["nodes"].sum(),
                                 "reads_count": treatment_df["num_inserts"].sum(),
                                 }
df = pd.DataFrame(simple_summary)
df.to_csv(args['graph_dir'].parent / "simple_summary.csv")
df

381 380


Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,binomial,target_num_samples,reference_num_samples,total_IS,sig_IS,target_IS_count,reference_IS_count,sig_ratio,target,reference,chrom,gene
0,2,,103846357,103846381,,,0.021071,0.020921,0.0,3,,4,3,169,0,0.75,NoCAR,CAR,chr5,1700016H13Rik
1,125,,138825098,138825098,,,1.0,0.317311,0.03125,1,,1,1,6,0,1.0,CAR,NoCAR,chr1,2310009B15Rik
2,38,,27307799,27307799,,,1.0,0.317311,0.000122,2,,1,1,14,0,1.0,CAR,NoCAR,chr7,2310022A10Rik
3,6,,50676643,50676646,,,0.059346,0.049535,0.0,2,,3,1,84,0,0.333333,CAR,NoCAR,chr9,2310030G06Rik
4,35,,145654994,145654994,,,1.0,0.317311,0.000015,1,,1,1,17,0,1.0,NoCAR,CAR,chr3,2410004B18Rik
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
376,116,,147053363,147053363,,,1.0,0.317311,0.03125,1,,1,1,6,0,1.0,CAR,NoCAR,chr4,Zfp989
377,139,,146675191,146675191,,,1.0,0.317311,0.000977,1,,1,1,11,0,1.0,CAR,NoCAR,chr4,Zfp993
378,6,,25507102,25507102,,,1.0,0.317311,0.000977,8,,1,1,11,0,1.0,CAR,NoCAR,chr14,Zmiz1
379,9,0.0,126941561,126941597,126941571,126941601,0.325364,0.306685,0.0,4,6,7,3,48,142,0.428571,CAR,NoCAR,chr4,Zmym1


Unnamed: 0,CAR,NoCAR
pCIS_count,1220,1128
insertion_site_count,1663,1529
reads_count,22334,15628


In [19]:
graph_stats_df

Unnamed: 0,type,chrom,subgraph,nodes,edges,num_inserts,min_pos,max_pos,range,sample_IDs,num_unique_samples,num_insert_sites,num_ta_sites,num_ta_insert_sites
0,CAR,chrY,0,1,0,1,38017098,38017098,0,[4_4_8],1,1,0,0
1,CAR,chrY,1,1,0,1,12772988,12772988,0,[4_4_14],1,1,0,0
2,CAR,chrY,2,1,0,1,17371977,17371977,0,[4_4_14],1,1,0,0
3,CAR,chrY,3,1,0,1,61879714,61879714,0,[4_4_14],1,1,0,0
4,CAR,chrY,4,1,0,1,62423308,62423308,0,[4_4_14],1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2343,NoCAR,chr4,182,1,0,6,94580864,94580864,0,[4_4_12],1,1,0,0
2344,NoCAR,chr4,183,1,0,2,101363952,101363952,0,[4_4_12],1,1,0,0
2345,NoCAR,chr4,184,1,0,5,122337858,122337858,0,[4_4_12],1,1,0,0
2346,NoCAR,chr4,185,1,0,1,41142724,41142724,0,[3_20_17],1,1,0,0


In [18]:
pCIS_counts = {}
for treatment in graph_stats_df["type"].unique():
    total = len(graph_stats_df[graph_stats_df["type"] == treatment])
    pCIS_counts[treatment] = total
treatment, count = zip(*pCIS_counts.items())
df = pd.DataFrame({"treatment": treatment, "count": count})
df.to_csv(args['graph_dir'].parent / "pCIS_counts.csv", index=False)
df

CAR
NoCAR


Unnamed: 0,treatment,count
0,CAR,1220
1,NoCAR,1128


In [73]:

display(sig_df.sort_values("binomial"))

Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,binomial,target_num_samples,reference_num_samples,total_IS,sig_IS,target_IS_count,reference_IS_count,sig_ratio,target,reference,chrom
430,7,0.0,54037701,54037707,54037705,54037705,0.026519,0.020921,0.00,12,4,4,4,2518,4,1.000000,CAR,NoCAR,chr4
360,72,15.0,79335535,79335535,79301992,79335535,0.414216,0.245278,0.00,1,5,2,1,1,1254,0.500000,NoCAR,CAR,chr9
556,130,7.0,54037705,54037705,54037701,54037707,0.026519,0.020921,0.00,4,12,4,4,4,2518,1.000000,NoCAR,CAR,chr4
260,3,46.0,42652132,42652134,42652132,42652132,0.184039,0.126630,0.00,7,3,3,2,1597,5,0.666667,NoCAR,CAR,chr6
312,15,0.0,79301992,79335535,79335535,79335535,0.414216,0.245278,0.00,5,1,2,1,1254,1,0.500000,CAR,NoCAR,chr9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,2,,59015917,59037723,,,0.046854,0.049535,0.25,2,,3,0,3,0,0.000000,NoCAR,CAR,chr14
56,1,,3051513,3053483,,,0.046854,0.049535,0.25,3,,3,0,3,0,0.000000,NoCAR,CAR,chr14
509,21,,104343449,104376596,,,0.046854,0.049535,0.25,2,,3,0,3,0,0.000000,NoCAR,CAR,chr4
342,8,,87365491,87399417,,,0.046854,0.049535,0.25,3,,3,0,3,0,0.000000,NoCAR,CAR,chr9


In [74]:
# load the saved data
# output_res = Path("/research/labs/immunology/rogerslm/m277102/projects/2023_SB/output/GRCm39/results-analysis/CAR-NoCAR")  # mforge
# output_res = Path("/research/labs/immunology/rogerslm/m277102/projects/2023_SB/output/GRCm39/results-analysis/ACF-SCF")  # mforge
# output_res = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2023-SB-screen/output/GRCm39/results-analysis/CAR-NoCAR")  # cse
# output_res = Path("/project/cs-myers/MathewF/projects/Laura-SB-Analysis/2023-SB-screen/output/GRCm39/results-analysis/ACF-SCF")  # cse


TA_df = pd.read_csv(output_res / "TA.tsv", sep="\t")
overall_df = pd.read_csv(output_res / "overall.tsv", sep="\t")
sig_df = pd.read_csv(output_res / "sig.tsv", sep="\t")
genomic_features_df = pd.read_csv(output_res / "genomic_features.tsv", sep="\t")
graph_stats_df = pd.read_csv(output_res / "graph_stats.tsv", sep="\t")

In [10]:
# assert len(candidate_genes) == len(output_genes)
# TODO: 10/17/23
# this assertion should not happen (or does it...?). Somehow there are genes that have gotten through the duplicate removal process
arr, counts = np.unique(candidate_genes["gene"].to_numpy(), return_counts=True)
candidate_genes[candidate_genes["gene"].isin(arr[counts == 2])]

Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,wilcoxon,binomial,total_IS,sig_IS,target_IS_count,reference_IS_count,sig_ratio,target,reference,chrom,gene
76,2,4.0,40034220,40034221,40034220.0,40034220.0,0.414216,0.245278,,0.0,2,1,189,1,0.5,CAR,NoCAR,chr19,Cyp2c37
77,4,2.0,9362961,9362961,9362961.0,9362961.0,1.0,0.317311,,0.011719,1,1,10,1,1.0,CAR,NoCAR,chr19,Cyp2c37
78,2,4.0,40034220,40034221,40034220.0,40034220.0,0.414216,0.245278,,0.0,2,1,189,1,0.5,CAR,NoCAR,chr19,Cyp2c50
79,4,2.0,9362961,9362961,9362961.0,9362961.0,1.0,0.317311,,0.011719,1,1,10,1,1.0,CAR,NoCAR,chr19,Cyp2c50
80,2,4.0,40034220,40034221,40034220.0,40034220.0,0.414216,0.245278,,0.0,2,1,189,1,0.5,CAR,NoCAR,chr19,Cyp2c54
81,4,2.0,9362961,9362961,9362961.0,9362961.0,1.0,0.317311,,0.011719,1,1,10,1,1.0,CAR,NoCAR,chr19,Cyp2c54
83,81,22.0,103793432,103793432,103793432.0,103816367.0,0.184039,0.12663,,0.0,3,2,3,504,0.666667,CAR,NoCAR,chr4,Dab1
84,152,127.0,104512600,104512600,104512600.0,104512600.0,1.0,0.317311,,0.0,1,1,49,3,1.0,CAR,NoCAR,chr4,Dab1


# validate gene candidates in a gene-centric way

test for significance in different ways
1. pCIS-case range
2. pCIS-control range
3. union of pCIS's
4. intersection of pCIS's
5. gene range (if able)
6. gene range +50kb from both ends (if able)

What statistical test should be used? For comparing the distance of two probability distributons, maybe look into Kullback–Leibler divergence?

evaluate overall significance at different levels/amounts. What does the overall list look like if all tests must be significant?

save this new conservative list


In [43]:
def get_pCIS_insertions(row, pcIS_type, ta_df, range_min=None, range_max=None): 
    # pcIS_type can be target or reference
    tmp1 = ta_df[ta_df["target"] == row["target"]]
    tmp2 = tmp1[tmp1[f"target_index"] == row[f"target_index"]]
    tmp3 = tmp2[tmp2["chrom"] == row["chrom"]]
    
    if (range_min == None) and (range_max == None):
        TAs = tmp3[(tmp3["pos"] >= row[f"{pcIS_type}_pos_min"]) & (tmp3["pos"] <= row[f"{pcIS_type}_pos_max"])]
        return TAs
    elif (range_min != None) and (range_max != None):
        TAs = tmp3[(tmp3["pos"] >= range_min) & (tmp3["pos"] <= range_max)]
        return TAs
    else:
        print(f"Nope, can't do that in get_pCIS_insertions: range_min={range_min}, range_max={range_max}")

def get_pCIS_stats(tmp, prepend):
    if not len(tmp):
        mwu, rs, wc, binom = np.NAN, np.NAN, np.NAN, np.NAN
    else:
        mwu = mannwhitneyu(tmp["target_count"], tmp["reference_count"]).pvalue 
        rs = ranksums(tmp["target_count"], tmp["reference_count"]).pvalue
        # wc = wilcoxon(tmp["target_count"], tmp["reference_count"]).pvalue if (tmp["target_count"] - tmp["reference_count"]).sum() != 0 and len(tmp) >= 10 else np.nan 
        # binom = binomtest(tmp["target_count"].sum(), tmp["target_count"].sum() + tmp["reference_count"].sum(), 0.5).pvalue
        
    out_df = pd.DataFrame({
        f"{prepend}-mannwhitneyu": [mwu],  # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.mannwhitneyu.html#scipy-stats-mannwhitneyu
        f"{prepend}-ranksums": [rs],       # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ranksums.html
        # f"{prepend}-wilcoxon": [wc],       # https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.wilcoxon.html#scipy.stats.wilcoxon
        # f"{prepend}-binomial": [binom],
        })
    
    return out_df


# for each gene (or pCIS???)
new_gene_candidates = []
for i, row in candidate_genes.iterrows():
    # if pd.isna(row["reference_index"]):
    #     continue
    # if i != 21:
    #     continue
    
    ### test for significance in the 6 ways using the 4 stat methods
    
    # target pCIS range
    target_TAs = get_pCIS_insertions(row, "target", TA_df)
    target_out = get_pCIS_stats(target_TAs, row["target"])

    
    # check ref range, union, and intersection if there is a ref index
    if pd.notna(row["reference_index"]):
        
        # reference pCIS range if possible
        ref_TAs = get_pCIS_insertions(row, "reference", TA_df)
        ref_out = get_pCIS_stats(ref_TAs, row["reference"])

        # union of pCIS range if possible
        union_TAs = get_pCIS_insertions(row, "target", TA_df,
                                        min(row["target_pos_min"], row["reference_pos_min"]),
                                        max(row["target_pos_max"], row["reference_pos_max"]),
                                        )
        union_out = get_pCIS_stats(union_TAs, "union")
        
        # intersection of pCIS range if possible
        intersect_TAs = get_pCIS_insertions(row, "target", TA_df,
                                        max(row["target_pos_min"], row["reference_pos_min"]),
                                        min(row["target_pos_max"], row["reference_pos_max"]),
                                        )
        intersect_out = get_pCIS_stats(intersect_TAs, "intersect")
        
    else:
        ref_out = get_pCIS_stats([], row["reference"])
        union_out = get_pCIS_stats([], "union")
        intersect_out = get_pCIS_stats([], "intersect")

        
    # gene range (TODO: check for this for all significant results, not just the ones near genes)
    gene_row = annot_df[annot_df["Marker Symbol"] == row["gene"]]
    gene_start = gene_row["genome coordinate start"].tolist()[0]
    gene_end = gene_row["genome coordinate end"].tolist()[0]
    gene_TAs = TA_df[(TA_df["chrom"] == row["chrom"]) & (TA_df["pos"] >= gene_start) & (TA_df["pos"] <= gene_end)]
    gene_TAs = gene_TAs.drop_duplicates(subset=["pos"], keep='first')
    genes_out = get_pCIS_stats(gene_TAs, "gene")
        
    # gene range +/- 50 kb
    gene_extended_TAs = TA_df[(TA_df["chrom"] == row["chrom"]) & (TA_df["pos"] >= (gene_start - 50000)) & (TA_df["pos"] <= (gene_end + 50000))]
    gene_extended_TAs = gene_extended_TAs.drop_duplicates(subset=["pos"], keep='first')
    genes_extended_out = get_pCIS_stats(gene_extended_TAs, "gene_extended")

    # save as new list
    row_out = pd.concat([target_out, ref_out, union_out, intersect_out, genes_out, genes_extended_out], axis=1)
    new_gene_candidates.append(pd.concat([row.to_frame().T.reset_index(drop=True), row_out], axis=1))
    
# view output
new_gene_candidates_df = pd.concat(new_gene_candidates, ignore_index=True)
display(new_gene_candidates_df)

# save new list
new_gene_candidates_df.to_csv(output_res / "validated_candidate_genes.tsv", sep="\t", index=False)


Unnamed: 0,target_index,reference_index,target_pos_min,target_pos_max,reference_pos_min,reference_pos_max,mannwhitneyu,ranksums,wilcoxon,binomial,...,union-binomial,intersect-mannwhitneyu,intersect-ranksums,intersect-binomial,gene-mannwhitneyu,gene-ranksums,gene-binomial,gene_extended-mannwhitneyu,gene_extended-ranksums,gene_extended-binomial
0,2,,103846357,103846381,,,0.021071,0.020921,,0.0,...,,,,,,,,0.021071,0.020921,2.672765e-51
1,58,,138825098,138825098,,,1.0,0.317311,,0.03125,...,,,,,,,,1.000000,0.317311,3.125000e-02
2,54,,27307799,27307799,,,1.0,0.317311,,0.000122,...,,,,,,,,1.000000,0.317311,1.220703e-04
3,6,,50676643,50676646,,,0.059346,0.049535,,0.0,...,,,,,,,,0.059346,0.049535,1.033976e-25
4,10,,145654994,145654994,,,1.0,0.317311,,0.000015,...,,,,,,,,1.000000,0.317311,1.525879e-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
363,29,,147755841,147755844,,,0.220671,0.121335,,0.000244,...,,,,,,,,0.063603,0.049535,6.103516e-05
364,174,,145394123,145394123,,,1.0,0.317311,,0.007812,...,,,,,,,,1.000000,0.317311,7.812500e-03
365,196,,146675191,146675191,,,1.0,0.317311,,0.000977,...,,,,,,,,1.000000,0.317311,9.765625e-04
366,8,0.0,126941561,126941597,126941571.0,126941601.0,0.325364,0.306685,,0.0,...,5.513028e-12,0.883853,0.77283,0.837556,0.325364,0.306685,5.513028e-12,0.325364,0.306685,5.513028e-12


break

In [12]:
# What about breaking up large CISs? It appears they are too large and span multiple genes

# Or do now we check each individual gene that was found to further refine our search?

# What's stopping me from testing literally every gene with +/-50 kb region of insertions with a ranksums test?
# because we don't want this to be gene centric, however, it's not a bad way to validate the candidate gene list


In [None]:
# TODO: check for each gene with an extended proomoter the case vs controls insertions
# check with ranksums test as well as binomial test
# What does the candidate gene list look like now?


# TODO: I need to go through my code again. Go step by step for each function in another notebook
# double check this code and cis_networks.py


# TODO: rerun without mapq thresholding in preprocess_reads and preprocess_insertions


# TODO: should we be using normalized read counts? idk, what stage would this occur at
# the variation would come from read depth, so we would have to normalize based on this BEFORE preprocess_reads.py
# What is done with RNAseq? How to normalize on read depth while still keeping count data

In [None]:
# volcano plot of p-values and LFC for each CIS if wanted


In [None]:
# output list of pCIS (now can be called CIS) that are sig.
# per CIS, output list of TAs that are sig.


In [None]:
# take identified candidate CIS/genes and check for co-occurrence with all other CIS/genes
