In [None]:
!which python

In [None]:
%load_ext autoreload
%autoreload 2
from IPython.core.interactiveshell import InteractiveShell

In [None]:
# basic packages
import os
import re
import sys
import datetime
from typing import List, Dict, Tuple, Optional, Any
from itertools import combinations, product
from pathlib import Path
import glob
#import yaml
import tqdm
import multiprocessing as mp

In [None]:
# data science
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [None]:
# bioinformatics
import pandas as pd
from Bio.Seq import MutableSeq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from bintools.utils.utils import get_yaml_config
from bintools.phylobayes import gtr_parser

In [None]:
# Absolute path of the parent of the current working directory, as a plain string.
# NOTE(review): the `bintools` imports in the previous cell run before this path is
# appended to sys.path; if `bintools` is only reachable through ROOT_dir, a fresh
# kernel would fail there - confirm.
ROOT_dir = str(Path(os.path.abspath("..")))
if not (ROOT_dir in sys.path):
    sys.path.append(ROOT_dir)

In [None]:
# Parse the YAML configuration once (it was previously parsed twice) and read the
# gene identifier lists of the simulation and empirical analyses from it.
_configs = get_yaml_config(ROOT_dir+"/configs/configs.yaml")
list_of_geneID_simu: List[str] = _configs["simulation"]["geneID"]
list_of_geneID_emp: List[str] = _configs["empirical"]["geneID"]

In [None]:
def sign(x):
    """Return the percentage of entries of ``x`` that are >= 0.95.

    The count of entries at or above 0.95 is divided by ``x.shape[0]`` and
    multiplied by 100.
    """
    n_above = np.sum(x >= 0.95)
    n_total = x.shape[0]
    return n_above / n_total * 100
     
def prop(x):
    """Return the sum of ``x`` divided by its length (``x.shape[0]``).

    For a 0/1 indicator vector this is the fraction of ones.
    """
    total = np.sum(x)
    return total / x.shape[0]

def tran(x):
    """Binarise ``x``: 0 when ``x <= 1``, otherwise 1."""
    return 0 if x <= 1 else 1


def concat(input_dir:str, pattern:str):
    """Read every tab-separated file matching ``input_dir + pattern`` and stack them.

    Parameters
    ----------
    input_dir : str
        Directory prefix; it is concatenated to ``pattern`` as-is, so it must
        end with a path separator.
    pattern : str
        Glob pattern of the files to read.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all matching files, with a fresh integer index.

    Raises
    ------
    AssertionError
        If no file matches the pattern.
    """
    # glob returns matches in an arbitrary, file-system dependent order; sorting
    # makes the row order of the result reproducible between runs and machines.
    files: List[str] = sorted(glob.glob(input_dir + pattern))
    assert len(files) > 0, "no file matches %s" % (input_dir + pattern)
    list_of_df: List[pd.DataFrame] = [pd.read_csv(f, sep="\t") for f in files]
    return pd.concat(list_of_df, axis=0, ignore_index=True)

## Simulation GTRG4

In [None]:
# Stack every table matching the pattern below, found under the
# simulation/gtrg4 statistics directory, into one frame.
pattern: str = "*-A_suffdistatmap_DINT.tsv"
input_dir: str = ROOT_dir+"/outputs/simulation/gtrg4/stats_gtrg4/"
df: pd.DataFrame = concat(input_dir, pattern)

In [None]:
# Sanity check on the number of rows loaded.
n_rows_loaded = df.shape[0]
assert n_rows_loaded == 10000

In [None]:
# Row-wise ratio of the "CG>TG|CA" column to the "CG" column.
df["lambda_CpG"] = df["CG>TG|CA"] / df["CG"]
dict_of_count = {}
k = 0
for geneID in set(df.geneID):
    print(".",end="")  # one progress dot per gene
    in_gene = df["geneID"]==geneID
    for repID in range(0,10):
        in_draw = in_gene & (df["draw"]==repID)
        for mcmcID in set(df.mcmcID):
            in_sample = in_draw & (df["mcmcID"]==mcmcID)
            # First matching row of each type for this (gene, draw, mcmc sample).
            v_post: float = df.loc[in_sample & (df["type"]=="post"), "lambda_CpG"].to_numpy()[0]
            v_pred: float = df.loc[in_sample & (df["type"]=="pred"), "lambda_CpG"].to_numpy()[0]
            # test is 1 when the "post" value strictly exceeds the "pred" value.
            record = {
                "geneID": geneID,
                "repID": repID,
                "mcmcID": mcmcID,
                "test" : 1 if v_post > v_pred else 0
            }
            dict_of_count[k] = record
            k += 1

In [None]:
# Per (geneID, repID): mean of the 0/1 test (column "prop") and number of samples
# (column "count"); the same table is written to disk and kept as df_test.
df_test: pd.DataFrame = (
    pd.DataFrame.from_dict(data=dict_of_count, orient="index")
    .groupby(by=["geneID","repID"])["test"]
    .agg([prop,"count"])
    .reset_index()
)
df_test.to_csv(ROOT_dir+"/reports/simulation_gtrg4_lambda_CpG.tsv", sep="\t")


In [None]:
df_test.loc[df_test["geneID"]=="CSRP2BP"]

In [None]:
# For every (gene, replicate), flag whether the proportion stored in df_test["prop"]
# exceeds each of the thresholds 0.95 / 0.90 / 0.80 / 0.70.
dict_of_prop = {}
k = 0
for geneID in set(df_test.geneID):
    print(".",end="")
    for repID in range(0,10):
        prop_v: float = df_test.loc[(df_test["geneID"]==geneID)&(df_test["repID"]==repID), "prop"].to_numpy()[0]
        dict_of_prop[k] = {
            "test 95": 1 if prop_v > 0.95 else 0,
            "test 90": 1 if prop_v > 0.90 else 0,
            "test 80": 1 if prop_v > 0.80 else 0,
            "test 70": 1 if prop_v > 0.70 else 0,
            "geneID": geneID,
            "repID": repID,
        }
        k+=1
df_test_count:pd.DataFrame = pd.DataFrame.from_dict(data=dict_of_prop, orient="index")
# Number of flagged replicates and total number of replicates per threshold.
# The string "sum" replaces the np.sum callable: passing NumPy callables to
# .agg is deprecated in recent pandas; the result and its "sum" label are unchanged.
df_test_count[["test 95","test 90", "test 80","test 70"]].agg(["sum", "count"]).reset_index()

## Simulation GTR

In [None]:
# Stack every table matching the pattern below, found under the
# simulation/gtr statistics directory, into one frame.
pattern: str = "*-A_suffdistatmap_DINT.tsv"
input_dir: str = ROOT_dir+"/outputs/simulation/gtr/stats_gtrg4/"
df: pd.DataFrame = concat(input_dir, pattern)

In [None]:
# Row-wise ratio of the "CG>TG|CA" column to the "CG" column.
df["lambda_CpG"] = df["CG>TG|CA"] / df["CG"]
dict_of_count = {}
k = 0
for geneID in set(df.geneID):
    print(".",end="")  # one progress dot per gene
    in_gene = df["geneID"]==geneID
    for repID in range(0,10):
        in_draw = in_gene & (df["draw"]==repID)
        for mcmcID in set(df.mcmcID):
            in_sample = in_draw & (df["mcmcID"]==mcmcID)
            # First matching row of each type for this (gene, draw, mcmc sample).
            v_post: float = df.loc[in_sample & (df["type"]=="post"), "lambda_CpG"].to_numpy()[0]
            v_pred: float = df.loc[in_sample & (df["type"]=="pred"), "lambda_CpG"].to_numpy()[0]
            # test is 1 when the "post" value strictly exceeds the "pred" value.
            record = {
                "geneID": geneID,
                "repID": repID,
                "mcmcID": mcmcID,
                "test" : 1 if v_post > v_pred else 0
            }
            dict_of_count[k] = record
            k += 1

In [None]:
# Per (geneID, repID): mean of the 0/1 test ("prop") and number of samples ("count").
# The file is written with geneID/repID as index; df_test keeps them as columns.
df_test_grouped = (
    pd.DataFrame.from_dict(data=dict_of_count, orient="index")
    .groupby(by=["geneID","repID"])["test"]
    .agg([prop,"count"])
)
df_test_grouped.to_csv(ROOT_dir+"/reports/simulation_gtr_lambda_CpG.tsv", sep="\t")
df_test: pd.DataFrame = df_test_grouped.reset_index()


In [None]:
df_test.loc[df_test["geneID"]=="CSRP2BP"]

In [None]:
# For every (gene, replicate), flag whether df_test["prop"] exceeds each threshold.
dict_of_prop = {}
k = 0
for geneID in set(df_test.geneID):
    print(".",end="")
    gene_rows = df_test["geneID"]==geneID
    for repID in range(0,10):
        prop_v: float = df_test.loc[gene_rows & (df_test["repID"]==repID), "prop"].to_numpy()[0]
        flags = {
            "test 95": int(prop_v > 0.95),
            "test 90": int(prop_v > 0.90),
            "test 80": int(prop_v > 0.80),
            "test 70": int(prop_v > 0.70),
            "geneID": geneID,
            "repID": repID,
        }
        dict_of_prop[k] = flags
        k += 1

In [None]:
# One row per entry of dict_of_prop; the frame is displayed as the cell output.
df_test_count: pd.DataFrame = pd.DataFrame.from_dict(dict_of_prop, orient="index")
df_test_count

In [None]:
# Number of flagged replicates ("sum") and total number of replicates ("count") per
# threshold. The string "sum" replaces the np.sum callable, whose use inside .agg is
# deprecated in recent pandas; the result and its row label are unchanged.
df_test_count[["test 95","test 90", "test 80","test 70"]].agg(["sum", "count"]).reset_index()

## Simulations M0GTR

In [None]:
# Stack every table matching the pattern below, found under the
# simulation/m0gtr statistics directory, then check the row count.
pattern: str = "*-GTRG4-?.?-?-?-A_suffdistatmap_DINT.tsv"
input_dir: str = ROOT_dir+"/outputs/simulation/m0gtr/stats_gtrg4/"
df: pd.DataFrame = concat(input_dir, pattern)
n_rows_loaded = df.shape[0]
print(n_rows_loaded)
assert n_rows_loaded == 60000

In [None]:
# Row-wise ratio of the "CG>TG|CA" column to the "CG" column.
df["lambda_CpG"] = df["CG>TG|CA"] / df["CG"]
dict_of_count = {}
k = 0
for geneID in set(df.geneID):
    in_gene = df["geneID"]==geneID
    for omega in [0.2, 1]:
        in_omega = in_gene & (df["omega"]==omega)
        for CpG in [1,4, 8]:
            print(".",end="")  # one progress dot per (gene, omega, CpG)
            in_cpg = in_omega & (df["CpG"]==CpG)
            for repID in range(0,10):
                in_draw = in_cpg & (df["draw"]==repID)
                for mcmcID in set(df.mcmcID):
                    in_sample = in_draw & (df["mcmcID"]==mcmcID)
                    # First matching row of each type for this parameter combination.
                    v_post: float = df.loc[in_sample & (df["type"]=="post"), "lambda_CpG"].to_numpy()[0]
                    v_pred: float = df.loc[in_sample & (df["type"]=="pred"), "lambda_CpG"].to_numpy()[0]
                    record = {
                        "geneID": geneID,
                        "draw": repID,
                        "omega":omega,
                        "CpG":CpG,
                        "mcmcID": mcmcID,
                        "test" : 1 if v_post > v_pred else 0
                    }
                    dict_of_count[k] = record
                    k += 1

In [None]:
# Per (geneID, omega, CpG, draw): mean of the 0/1 test ("prop") and sample count;
# the same table is written to disk, kept as df_test and displayed.
df_test: pd.DataFrame = (
    pd.DataFrame.from_dict(data=dict_of_count, orient="index")
    .groupby(by=["geneID","omega","CpG","draw"])["test"]
    .agg([prop,"count"])
    .reset_index()
)
df_test.to_csv(ROOT_dir+"/reports/m0gtr_lambda_CpG.tsv", sep="\t")
df_test

In [None]:
df_test

In [None]:
# For every (gene, omega, CpG, draw), flag whether df_test["prop"] exceeds each threshold.
dict_of_prop = {}
k = 0
for geneID in set(df_test.geneID):
    gene_rows = df_test["geneID"]==geneID
    for omega in [0.2,1.0]:
        omega_rows = gene_rows & (df_test["omega"]==omega)
        for CpG in [1,4,8]:
            print(".",end="")
            cpg_rows = omega_rows & (df_test["CpG"]==CpG)
            for repID in range(0,10):
                prop_v: float = df_test.loc[cpg_rows & (df_test["draw"]==repID), "prop"].to_numpy()[0]
                flags = {
                    "test 95": int(prop_v > 0.95),
                    "test 90": int(prop_v > 0.90),
                    "test 80": int(prop_v > 0.80),
                    "test 70": int(prop_v > 0.70),
                    "geneID": geneID,
                    "draw": repID,
                    "omega":omega,
                    "CpG":CpG,
                }
                dict_of_prop[k] = flags
                k += 1

In [None]:
# One row per entry of dict_of_prop; the frame is displayed as the cell output.
df_test_count: pd.DataFrame = pd.DataFrame.from_dict(dict_of_prop, orient="index")
df_test_count

In [None]:
# Per (omega, CpG): number of flagged replicates ("sum") and number of replicates
# ("count") for each threshold. The string "sum" replaces the np.sum callable, whose
# use inside .agg is deprecated in recent pandas; the result is unchanged.
df_test_count.groupby(by=["omega","CpG"])[["test 95","test 90", "test 80","test 70"]].agg(["sum","count"]).reset_index()

## Simulations M0GTR 2x, 5x, 10x TBL

In [None]:
# Stack every table matching the pattern below, found under the
# simulation/m0gtr statistics directory, and show the resulting shape.
pattern: str = "*-GTRG4-0.2-?-*-?-A_suffdistatmap_DINT.tsv"
input_dir: str = ROOT_dir+"/outputs/simulation/m0gtr/stats_gtrg4/"
df: pd.DataFrame = concat(input_dir, pattern)
df.shape

In [None]:
# Row-wise ratio of the "CG>TG|CA" column to the "CG" column.
df["lambda_CpG"] = df["CG>TG|CA"] / df["CG"]
dict_of_count = {}
k = 0
for geneID in set(df.geneID):
    in_gene = df["geneID"]==geneID
    for omega in [0.2]:
        in_omega = in_gene & (df["omega"]==omega)
        for CpG in [1,4,8]:
            in_cpg = in_omega & (df["CpG"]==CpG)
            for tbl in [2,5,10]:
                print(".",end="")  # one progress dot per (gene, omega, CpG, tbl)
                in_tbl = in_cpg & (df["tbl"]==tbl)
                for repID in range(0,10):
                    in_draw = in_tbl & (df["draw"]==repID)
                    for mcmcID in set(df.mcmcID):
                        in_sample = in_draw & (df["mcmcID"]==mcmcID)
                        # First matching row of each type for this parameter combination.
                        v_post: float = df.loc[in_sample & (df["type"]=="post"), "lambda_CpG"].to_numpy()[0]
                        v_pred: float = df.loc[in_sample & (df["type"]=="pred"), "lambda_CpG"].to_numpy()[0]
                        record = {
                            "geneID": geneID,
                            "draw": repID,
                            "omega":omega,
                            "CpG":CpG,
                            "tbl":tbl,
                            "mcmcID": mcmcID,
                            "test" : 1 if v_post > v_pred else 0
                        }
                        dict_of_count[k] = record
                        k += 1

In [None]:
# Per (geneID, omega, CpG, tbl, draw): mean of the 0/1 test ("prop") and sample
# count; the same table is written to disk and kept as df_test.
df_test: pd.DataFrame = (
    pd.DataFrame.from_dict(data=dict_of_count, orient="index")
    .groupby(by=["geneID","omega","CpG","tbl","draw"])["test"]
    .agg([prop,"count"])
    .reset_index()
)
df_test.to_csv(ROOT_dir+"/reports/m0gtr_tbl_lambda_CpG.tsv", sep="\t")

In [None]:
df_test

In [None]:
# For every (gene, omega, CpG, tbl, draw), flag whether df_test["prop"] exceeds each threshold.
dict_of_prop = {}
k = 0
for geneID in set(df.geneID):
    gene_rows = df_test["geneID"]==geneID
    for omega in [0.2]:
        omega_rows = gene_rows & (df_test["omega"]==omega)
        for CpG in [1,4,8]:
            cpg_rows = omega_rows & (df_test["CpG"]==CpG)
            for tbl in [2,5,10]:
                print(".",end="")
                tbl_rows = cpg_rows & (df_test["tbl"]==tbl)
                for repID in range(0,10):
                    prop_v: float = df_test.loc[tbl_rows & (df_test["draw"]==repID), "prop"].to_numpy()[0]
                    flags = {
                        "test 95": int(prop_v > 0.95),
                        "test 90": int(prop_v > 0.90),
                        "test 80": int(prop_v > 0.80),
                        "test 70": int(prop_v > 0.70),
                        "geneID": geneID,
                        "draw": repID,
                        "omega":omega,
                        "CpG":CpG,
                        "tbl":tbl
                    }
                    dict_of_prop[k] = flags
                    k += 1

In [None]:
# One row per entry of dict_of_prop; the frame is displayed as the cell output.
df_test_count: pd.DataFrame = pd.DataFrame.from_dict(dict_of_prop, orient="index")
df_test_count

In [None]:
# Per (omega, CpG, tbl): number of flagged replicates ("sum") and number of
# replicates ("count") for each threshold. The string "sum" replaces the np.sum
# callable, whose use inside .agg is deprecated in recent pandas.
df_test_count.groupby(by=["omega","CpG","tbl"])[["test 95","test 90", "test 80","test 70"]].agg(["sum","count"]).reset_index()

## Empirical GTR+G

In [None]:
# Stack every table matching the pattern below, found under the
# empirical stats_gtrg4 directory, into one frame.
pattern: str = "*-A_suffdistatmap_DINT.tsv"
input_dir: str = ROOT_dir+"/outputs/empirical/stats_gtrg4/"
df: pd.DataFrame = concat(input_dir, pattern)

In [None]:
# Row-wise ratio of the "CG>TG|CA" column to the "CG" column.
df["lambda_CpG"] = df["CG>TG|CA"] / df["CG"]
dict_of_count = {}
k = 0
repID = "A"  # constant replicate label stored with every record
for geneID in set(df.geneID):
    print(".",end="")  # one progress dot per gene
    in_gene = df["geneID"]==geneID
    for mcmcID in set(df.mcmcID):
        in_sample = in_gene & (df["mcmcID"]==mcmcID)
        # First matching row of each type for this (gene, mcmc sample).
        v_post: float = df.loc[in_sample & (df["type"]=="post"), "lambda_CpG"].to_numpy()[0]
        v_pred: float = df.loc[in_sample & (df["type"]=="pred"), "lambda_CpG"].to_numpy()[0]
        record = {
            "geneID": geneID,
            "repID": repID,
            "mcmcID": mcmcID,
            "test" : 1 if v_post > v_pred else 0
        }
        dict_of_count[k] = record
        k += 1

In [None]:
# df_test keeps one row per (gene, mcmc sample); the per-gene summary ("prop" and
# "count") is written to disk and displayed as the cell output.
df_test: pd.DataFrame = pd.DataFrame.from_dict(data=dict_of_count, orient="index")
df_test_per_gene = df_test.groupby(by=["geneID"])["test"].agg([prop,"count"])
df_test_per_gene.to_csv(ROOT_dir+"/reports/empirical_lambda_CpG.tsv", sep="\t")
df_test_per_gene

In [None]:
# Fraction of genes whose per-gene "prop" is >= 0.95, rounded to 2 decimals.
gene_prop = df_test.groupby(by=["geneID"])["test"].agg([prop])
np.round(np.sum(gene_prop >= 0.95) / gene_prop.shape[0],2)

In [None]:
# Fraction of genes whose per-gene "prop" is >= 0.90, rounded to 2 decimals.
gene_prop = df_test.groupby(by=["geneID"])["test"].agg([prop])
np.round(np.sum(gene_prop >= 0.90) / gene_prop.shape[0],2)

In [None]:
# Fraction of genes whose per-gene "prop" is >= 0.99, rounded to 2 decimals.
gene_prop = df_test.groupby(by=["geneID"])["test"].agg([prop])
np.round(np.sum(gene_prop >= 0.99) / gene_prop.shape[0],2)

## Empirical CATGTR+G

In [None]:
# Stack every table matching the pattern below, found under the
# empirical stats_catgtrg4 directory, into one frame.
pattern: str = "*-A_suffdistatmap_DINT.tsv"
input_dir: str = ROOT_dir+"/outputs/empirical/stats_catgtrg4/"
df: pd.DataFrame = concat(input_dir, pattern)

In [None]:
# Row-wise ratio of the "CG>TG|CA" column to the "CG" column.
df["lambda_CpG"] = df["CG>TG|CA"] / df["CG"]
dict_of_count = {}
k = 0
repID = "A"  # constant replicate label stored with every record
for geneID in set(df.geneID):
    print(".",end="")  # one progress dot per gene
    in_gene = df["geneID"]==geneID
    for mcmcID in set(df.mcmcID):
        in_sample = in_gene & (df["mcmcID"]==mcmcID)
        # First matching row of each type for this (gene, mcmc sample).
        v_post: float = df.loc[in_sample & (df["type"]=="post"), "lambda_CpG"].to_numpy()[0]
        v_pred: float = df.loc[in_sample & (df["type"]=="pred"), "lambda_CpG"].to_numpy()[0]
        record = {
            "geneID": geneID,
            "repID": repID,
            "mcmcID": mcmcID,
            "test" : 1 if v_post > v_pred else 0
        }
        dict_of_count[k] = record
        k += 1

In [None]:
# Per gene: mean of the 0/1 test ("prop") and number of samples ("count").
df_test: pd.DataFrame = (
    pd.DataFrame.from_dict(data=dict_of_count, orient="index")
    .groupby(by=["geneID"])["test"]
    .agg([prop,"count"])
    .reset_index()
)
# Bug fix: this table used to be written to "/reports/m0gtr_tbl_lambda_CpG.tsv",
# i.e. over the report produced by the M0GTR branch-length section of this notebook.
# It now gets its own file, named after the empirical GTR+G report
# ("/reports/empirical_lambda_CpG.tsv").
df_test.to_csv(ROOT_dir+"/reports/empirical_catgtrg4_lambda_CpG.tsv", sep="\t")

In [None]:
# For every gene, flag whether df_test["prop"] exceeds each threshold.
# `repID` is read from the enclosing notebook namespace (set in an earlier cell).
dict_of_prop = {}
k = 0
for geneID in set(df.geneID):
    gene_rows = df_test["geneID"]==geneID
    prop_v: float = df_test.loc[gene_rows, "prop"].to_numpy()[0]
    flags = {
        "test 95": int(prop_v > 0.95),
        "test 90": int(prop_v > 0.90),
        "test 80": int(prop_v > 0.80),
        "test 70": int(prop_v > 0.70),
        "geneID": geneID,
        "draw": repID,
    }
    dict_of_prop[k] = flags
    k += 1

In [None]:
# One row per entry of dict_of_prop; the frame is displayed as the cell output.
df_test_count: pd.DataFrame = pd.DataFrame.from_dict(dict_of_prop, orient="index")
df_test_count

In [None]:
# Number of flagged genes ("sum") and total number of genes ("count") per threshold.
# The string "sum" replaces the np.sum callable, whose use inside .agg is deprecated
# in recent pandas; the result and its row label are unchanged.
df_test_count[["test 95","test 90", "test 80","test 70"]].agg(["sum","count"]).reset_index()

In [None]:
100/137

In [None]:
106/137

## PPRED 

### Empirical

In [None]:
# Read the observed-statistics table of every empirical gene and stack them.
list_of_df_obs = []
for geneID in list_of_geneID_emp:
    try:
        list_of_df_obs += [pd.read_csv(ROOT_dir+"/outputs/empirical/ppred_test/stats/"+geneID+"-1_0-OBSERVED.tsv",sep="\t",index_col=0)]
    except Exception as e:
        # Bug fix: the message used to format the undefined name `f`, so any read
        # failure surfaced as a NameError instead of naming the offending gene.
        print("something wrong with %s"% geneID)
df_obs_concat = pd.concat(list_of_df_obs)
# A gene skipped above makes this row-count check fail.
assert df_obs_concat.shape[0] == 137

In [None]:
# Write the observed statistics, rounded to 3 decimals, as a tab-separated report.
df_obs_rounded = df_obs_concat.round(3)
df_obs_rounded.to_csv(ROOT_dir + "/reports/XpY_empirical.csv", sep="\t")

In [None]:
from pandas.plotting import scatter_matrix


In [None]:
# The 16 column names "ApA", "ApC", ..., "TpT", then their pairwise scatter plots.
c: List[str] = ["p".join(pair) for pair in product("ACGT", repeat=2)]
scatter_matrix(df_obs_concat[c], alpha=0.2, figsize=(6, 6), diagonal="kde")


#### GTR+G

In [None]:
# Read the "-GTRG4-A_ppred.tsv" table of every empirical gene; a gene whose file
# cannot be read is reported and skipped.
list_of_df_ppred = []
for geneID in list_of_geneID_emp:
    try:
        gene_ppred = pd.read_csv(ROOT_dir+"/outputs/empirical/ppred_test/stats/"+geneID+"-GTRG4-A_ppred.tsv",sep="\t",index_col=0)
        list_of_df_ppred.append(gene_ppred)
    except Exception as e:
        print("something wrong with %s"% geneID)
df_ppred_concat = pd.concat(list_of_df_ppred)

In [None]:
# Every one of the 137 genes must be present in the stacked table.
genes_loaded = set(df_ppred_concat["geneID"])
assert len(genes_loaded) == 137

In [None]:
# For each gene and each of the 16 "XpY" columns: fraction of df_ppred_concat values
# strictly above the first df_obs_concat value ("_test"), plus mean, std and the
# observed value itself.
dict_of_stats = {}
k=0
for geneID in list(set(df_ppred_concat.geneID)):
    gene_stats = {"geneID": geneID}
    dict_of_stats[k] = gene_stats
    ppred_gene = df_ppred_concat.loc[(df_ppred_concat["geneID"]==geneID)]
    obs_gene = df_obs_concat.loc[df_obs_concat["geneID"]==geneID]
    for x, y in product("ACGT", repeat=2):
        XpY = x + "p" + y
        XpY_ppred: np.array = ppred_gene[XpY].to_numpy()
        XpY_obs: float = obs_gene[XpY].to_numpy()[0]
        gene_stats[XpY+"_test"] = np.sum(XpY_ppred > XpY_obs)/len(XpY_ppred)
        gene_stats[XpY+"_mean"] = np.mean(XpY_ppred)
        gene_stats[XpY+"_std"] = np.std(XpY_ppred)
        gene_stats[XpY+"_obs"] = XpY_obs
    k += 1

In [None]:
# Percentage of genes whose "TpA_test" value is >= 0.95.
df_stats = pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
sign(df_stats[["TpA_test"]])

In [None]:
# Percentage of genes with "CpG_test" >= 0.95, written as a tab-separated report.
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")[["CpG_test"]]
    .agg([sign])
    .sort_values(by=["sign"],axis=1)
    .round(3)
    .to_csv(ROOT_dir + "/reports/CpG_test_empirical_GTRG.csv", sep="\t")
)

In [None]:
# Same percentage for all 16 "XpY_test" columns, sorted by value, written to disk.
XpY_test_columns = [x+"p"+y+"_test" for x, y in product("ACGT", repeat=2)]
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")[XpY_test_columns]
    .agg([sign])
    .sort_values(by=["sign"],axis=1)
    .round(3)
    .to_csv(ROOT_dir + "/reports/XpY_test_empirical_GTRG.csv", sep="\t")
)

In [None]:
# Per-gene "XpY_test" values for all 16 columns, written to disk unaggregated.
XpY_test_columns = [x+"p"+y+"_test" for x, y in product("ACGT", repeat=2)]
df_stats = pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
df_stats[["geneID"]+XpY_test_columns].to_csv(ROOT_dir + "/reports/XpY_test_empirical_all_GTRG.csv", sep="\t")

In [None]:
# Two-panel figure for gene MEP1A: histogram of the df_ppred_concat values of a
# column (weighted so that bar heights sum to 1) with the df_obs_concat value drawn
# as a vertical line. Panel A uses column "CpG", panel B column "ApT".
fig, axes = plt.subplots(1,2,sharex=True, sharey="row", figsize=(6,4))
axes = axes.ravel()
list_of_subplots = ["A","B"]
for k, dinuc in enumerate(["CpG", "ApT"]):
    obs = df_obs_concat.loc[df_obs_concat["geneID"]== "MEP1A",[dinuc]].values[0][0]
    pred = df_ppred_concat.loc[(df_ppred_concat["geneID"]=="MEP1A"),[dinuc]].values.reshape(1,-1)[0]
    weights_pred = np.ones_like(pred) / pred.shape[0]
    # Bug fix: the percentage was divided by obs.size, which is 1 because obs is a
    # scalar, so the value was a count times 100; the denominator is now the number
    # of values in pred.
    v_ratio = str(round(np.sum(obs > pred) / pred.size * 100, 2))
    bins = np.histogram(
                        np.hstack([pred]), bins=20
                    )[1]
    axes[k].set_title(list_of_subplots[k], loc="left")
    axes[k].set_xlabel(dinuc + " frequency")
    # NOTE(review): the label only becomes visible if a legend is drawn; none is
    # requested here, as in the original cell.
    axes[k].hist(
        [pred],
        bins=bins,
        color="blue",
        alpha=0.5,
        stacked=False,
        weights=[weights_pred],
        label=["% " + v_ratio],
    )
    axes[k].axvline(obs, color="black")
# Only the first panel carries the y-axis label (the y axis is shared across the row).
axes[0].set_ylabel("Density")

fig.savefig(ROOT_dir + "/reports/figure1.pdf",dpi=300)

#### M0GTR

In [None]:
# Read the "-M0GTR-A_ppred.tsv" table of every empirical gene; a gene whose file
# cannot be read is reported and skipped.
list_of_df_ppred = []
for geneID in list_of_geneID_emp:
    try:
        gene_ppred = pd.read_csv(ROOT_dir+"/outputs/empirical/ppred_test/stats/"+geneID+"-M0GTR-A_ppred.tsv",sep="\t",index_col=0)
        list_of_df_ppred.append(gene_ppred)
    except Exception as e:
        print("something wrong with %s"% geneID)
df_ppred_concat = pd.concat(list_of_df_ppred)

In [None]:
# Every one of the 137 genes must be present in the stacked table.
genes_loaded = set(df_ppred_concat["geneID"])
assert len(genes_loaded) == 137

In [None]:
# For each gene and each of the 16 "XpY" columns: fraction of df_ppred_concat values
# strictly above the first df_obs_concat value ("_test"), plus mean, std and the
# observed value itself.
dict_of_stats = {}
k=0
for geneID in list(set(df_ppred_concat.geneID)):
    gene_stats = {"geneID": geneID}
    dict_of_stats[k] = gene_stats
    ppred_gene = df_ppred_concat.loc[(df_ppred_concat["geneID"]==geneID)]
    obs_gene = df_obs_concat.loc[df_obs_concat["geneID"]==geneID]
    for x, y in product("ACGT", repeat=2):
        XpY = x + "p" + y
        XpY_ppred: np.array = ppred_gene[XpY].to_numpy()
        XpY_obs: float = obs_gene[XpY].to_numpy()[0]
        gene_stats[XpY+"_test"] = np.sum(XpY_ppred > XpY_obs)/len(XpY_ppred)
        gene_stats[XpY+"_mean"] = np.mean(XpY_ppred)
        gene_stats[XpY+"_std"] = np.std(XpY_ppred)
        gene_stats[XpY+"_obs"] = XpY_obs
    k += 1

In [None]:
# Mean of every "XpY_test" column across genes, sorted by value, written to disk.
# The string "mean" replaces the np.mean callable, whose use inside .agg is
# deprecated in recent pandas; the result and its "mean" row label are unchanged.
# NOTE(review): this cell sits in the M0GTR section but the file is named
# "XpY_test_GTRG.csv" - confirm the intended file name.
XpY_test_columns = [x+"p"+y+"_test" for x, y in product(["A", "C", "G", "T"], ["A", "C", "G", "T"])]
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")[XpY_test_columns]
    .agg(["mean"])
    .sort_values(by=["mean"],axis=1)
    .round(3)
    .to_csv(ROOT_dir + "/reports/XpY_test_GTRG.csv", sep="\t")
)

### Simulation

#### GTR+G

In [None]:
# Read the "-OBSERVED.tsv" table of every (gene, replicate 0..9) and stack them.
list_of_df_obs = []
k = 0
for geneID, repID in product(list_of_geneID_simu, range(0,10)):
    cur_df = pd.read_csv(ROOT_dir+"/outputs/simulation/gtrg4/ppred_test/stats/"+str(geneID)+"-GTRG4-A-"+str(repID)+"-OBSERVED.tsv",sep="\t", index_col=0)
    list_of_df_obs.append(cur_df)

df_obs_concat = pd.concat(list_of_df_obs, ignore_index=True)

##### GTR+G

In [None]:
# Read the "-GTRG4-A_ppred.tsv" table of every (gene, replicate 0..9) and stack them.
list_of_df_ppred = []
k = 0
for geneID, repID in product(list_of_geneID_simu, range(0,10)):
    cur_df = pd.read_csv(ROOT_dir+"/outputs/simulation/gtrg4/ppred_test/stats/"+str(geneID)+"-GTRG4-A-"+str(repID)+"-GTRG4-A_ppred.tsv",sep="\t", index_col=0)
    list_of_df_ppred.append(cur_df)
df_ppred_concat = pd.concat(list_of_df_ppred, ignore_index=True)

In [None]:
# Percentage of rows of dict_of_stats with "CpG_test" >= 0.95, and the row count.
# NOTE(review): dict_of_stats is not rebuilt anywhere in this simulation section, so
# on a top-to-bottom run it still holds the statistics computed in the empirical
# M0GTR section; the df_obs_concat / df_ppred_concat frames loaded just above are
# not used. A statistics loop over those frames appears to be missing - confirm.
pd.DataFrame.from_dict(data=dict_of_stats,orient="index")[["CpG_test"]].agg([sign, "count"]).round(2)#.to_csv(ROOT_dir + "/reports/CpG_test_simu_GTRG_GTRG.csv", sep="\t")

#### M0GTR

In [None]:
# Read the "-A-1_0-OBSERVED.tsv" table of every parameter combination and replicate,
# tag each with its tbl value, and stack them.
list_of_df_obs = []
k = 0
for geneID, omega, CpG, TpA, tbl in product(list_of_geneID_simu, [0.2,1], [1,4,8], [1,], [1,10]):
    print(".",end="")  # one progress dot per parameter combination
    for repID in range(0,10):
        cur_df = pd.read_csv(ROOT_dir+"/outputs/simulation/m0gtr/ppred_test/stats/"+str(geneID)+"-M0GTR-"+str(omega)+"-"+str(CpG)+"-"+str(TpA)+"-"+str(tbl)+"-"+str(repID)+"-A-1_0-OBSERVED.tsv",sep="\t", index_col=0)
        cur_df["tbl"] = [tbl]*cur_df.shape[0]
        list_of_df_obs.append(cur_df)

df_obs_concat = pd.concat(list_of_df_obs, ignore_index=True)

#####  GTR+G 

In [None]:
# Read the "-GTRG4-...-A_ppred.tsv" table of every parameter combination and
# replicate, tag each with its tbl value, and stack them.
list_of_df_ppred = []
k = 0
for geneID, omega, CpG, TpA, tbl in product(list_of_geneID_simu, [0.2,1], [1,4,8], [1], [1]):
    print(".",end="")  # one progress dot per parameter combination
    for repID in range(0,10):
        cur_df = pd.read_csv(ROOT_dir+"/outputs/simulation/m0gtr/ppred_test/stats/"+str(geneID)+"-GTRG4-"+str(omega)+"-"+str(CpG)+"-"+str(TpA)+"-"+str(tbl)+"-"+str(repID)+"-A_ppred.tsv",sep="\t", index_col=0)
        cur_df["tbl"] = [tbl]*cur_df.shape[0]
        list_of_df_ppred.append(cur_df)
df_ppred_concat = pd.concat(list_of_df_ppred, ignore_index=True)

In [None]:
# For each (gene, omega, CpG, TpA, tbl, draw) and each of the 16 "XpY" columns:
# fraction of df_ppred_concat values strictly above the first matching
# df_obs_concat value ("_test"), plus mean, std and the observed value.
dict_of_stats = {}
k=0
for geneID, omega, CpG, TpA, tbl, drawID in product(list_of_geneID_simu, [0.2,1], [1,4,8], [1,], [1], range(0,10)):
    combo_stats = {
        "geneID": geneID,
        "CpG": CpG,
        "TpA": TpA,
        "tbl": tbl,
        "omega":omega,
        "drawID":drawID
    }
    dict_of_stats[k] = combo_stats
    # Rows of each frame belonging to this parameter combination.
    ppred_rows = df_ppred_concat.loc[(df_ppred_concat["tbl"]==tbl)&(df_ppred_concat["geneID"]==geneID)&(df_ppred_concat["omega"]==omega)&(df_ppred_concat["CpGf"]==CpG)&(df_ppred_concat["TpAf"]==TpA)&(df_ppred_concat["draw"]==drawID)]
    obs_rows = df_obs_concat.loc[(df_obs_concat["tbl"]==tbl)&(df_obs_concat["geneID"]==geneID)&(df_obs_concat["omega"]==omega)&(df_obs_concat["CpGf"]==CpG)&(df_obs_concat["TpAf"]==TpA)&(df_obs_concat["draw"]==drawID)]
    for x, y in product("ACGT", repeat=2):
        XpY = x + "p" + y
        XpY_ppred: np.array = ppred_rows[XpY].to_numpy()
        XpY_obs: float = obs_rows[XpY].to_numpy()[0]
        combo_stats[XpY+"_test"] = np.sum(XpY_ppred > XpY_obs)/len(XpY_ppred)
        combo_stats[XpY+"_mean"] = np.mean(XpY_ppred)
        combo_stats[XpY+"_std"] = np.std(XpY_ppred)
        combo_stats[XpY+"_obs"] = XpY_obs
    k += 1

In [None]:
# Per (CpG, TpA, tbl, omega): percentage of rows with "CpG_test" >= 0.95 and row count.
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
    .groupby(by=["CpG","TpA","tbl","omega"])[["CpG_test"]]
    .agg([sign, "count"])
    .round(2)
    .to_csv(ROOT_dir + "/reports/CpG_test_simu_M0GTR_GTRG.csv", sep="\t")
)

In [None]:
# Per (CpG, TpA, tbl, omega): percentage of rows >= 0.95 for all 16 "XpY_test" columns.
XpY_test_columns = [x+"p"+y+"_test" for x, y in product("ACGT", repeat=2)]
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
    .groupby(by=["CpG","TpA","tbl","omega"])[XpY_test_columns]
    .agg([sign])
    .round(2)
    .to_csv(ROOT_dir + "/reports/XpY_test_simu_M0GTR_GTRG.csv", sep="\t")
)

##### M0GTR 

In [None]:
# Read the "-M0GTR-...-A_ppred.tsv" table of every parameter combination and
# replicate, tag each with its tbl value, and stack them.
list_of_df_ppred = []
k = 0
for geneID, omega, CpG, TpA, tbl in product(list_of_geneID_simu, [0.2,1], [1,4,8], [1], [1,]):
    print(".",end="")  # one progress dot per parameter combination
    for repID in range(0,10):
        cur_df = pd.read_csv(ROOT_dir+"/outputs/simulation/m0gtr/ppred_test/stats/"+str(geneID)+"-M0GTR-"+str(omega)+"-"+str(CpG)+"-"+str(TpA)+"-"+str(tbl)+"-"+str(repID)+"-A_ppred.tsv",sep="\t", index_col=0)
        cur_df["tbl"] = [tbl]*cur_df.shape[0]
        list_of_df_ppred.append(cur_df)
df_ppred_concat = pd.concat(list_of_df_ppred, ignore_index=True)


In [None]:
# For each (gene, omega, CpG, TpA, tbl, draw) and each of the 16 "XpY" columns:
# fraction of df_ppred_concat values strictly above the first matching
# df_obs_concat value ("_test"), plus mean, std and the observed value.
dict_of_stats = {}
k=0
for geneID, omega, CpG, TpA, tbl, drawID in product(list_of_geneID_simu, [0.2,1], [1,4,8], [1,], [1,], range(0,10)):
    combo_stats = {
        "geneID": geneID,
        "CpG": CpG,
        "TpA": TpA,
        "tbl": tbl,
        "omega":omega,
        "drawID":drawID
    }
    dict_of_stats[k] = combo_stats
    # Rows of each frame belonging to this parameter combination.
    ppred_rows = df_ppred_concat.loc[(df_ppred_concat["tbl"]==tbl)&(df_ppred_concat["geneID"]==geneID)&(df_ppred_concat["omega"]==omega)&(df_ppred_concat["CpGf"]==CpG)&(df_ppred_concat["TpAf"]==TpA)&(df_ppred_concat["draw"]==drawID)]
    obs_rows = df_obs_concat.loc[(df_obs_concat["tbl"]==tbl)&(df_obs_concat["geneID"]==geneID)&(df_obs_concat["omega"]==omega)&(df_obs_concat["CpGf"]==CpG)&(df_obs_concat["TpAf"]==TpA)&(df_obs_concat["draw"]==drawID)]
    for x, y in product("ACGT", repeat=2):
        XpY = x + "p" + y
        XpY_ppred: np.array = ppred_rows[XpY].to_numpy()
        XpY_obs: float = obs_rows[XpY].to_numpy()[0]
        combo_stats[XpY+"_test"] = np.sum(XpY_ppred > XpY_obs)/len(XpY_ppred)
        combo_stats[XpY+"_mean"] = np.mean(XpY_ppred)
        combo_stats[XpY+"_std"] = np.std(XpY_ppred)
        combo_stats[XpY+"_obs"] = XpY_obs
    k += 1

In [None]:
# Per (CpG, TpA, tbl, omega): percentage of rows with "CpG_test" >= 0.95 and row count.
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
    .groupby(by=["CpG","TpA","tbl","omega"])[["CpG_test"]]
    .agg([sign, "count"])
    .round(3)
    .to_csv(ROOT_dir + "/reports/CpG_test_simu_M0GTR_M0GTR.csv", sep="\t")
)

In [None]:
# Per (CpG, TpA, tbl, omega): percentage of rows >= 0.95 for all 16 "XpY_test" columns.
XpY_test_columns = [x+"p"+y+"_test" for x, y in product("ACGT", repeat=2)]
(
    pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
    .groupby(by=["CpG","TpA","tbl","omega"])[XpY_test_columns]
    .agg([sign])
    .to_csv(ROOT_dir + "/reports/XpY_test_simu_M0GTR_M0GTR.csv", sep="\t")
)

## Mappings


### Empirical


In [None]:
def recover_data(input_dir, pattern)-> pd.DataFrame:
    """Read every tab-separated file matching ``input_dir + pattern`` and stack them.

    Each file contributes its rows plus a ``geneID`` column holding the part of the
    file name that precedes the first ``-``.

    Parameters
    ----------
    input_dir : str
        Directory prefix; concatenated to ``pattern`` as-is.
    pattern : str
        Glob pattern of the files to read.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all files with a fresh integer index (the
        previous ``List[pd.DataFrame]`` annotation did not match the returned value).

    Raises
    ------
    ValueError
        If no file matches the pattern.
    """
    # Sorted so that the row order does not depend on the file system's listing order.
    list_of_files: List[str] = sorted(glob.glob(input_dir + pattern))
    if not list_of_files:
        raise ValueError("no file matches %s" % (input_dir + pattern))
    list_of_df: List[pd.DataFrame] = []
    for f in list_of_files:
        # os.path.basename handles the platform's path separator, unlike split("/").
        GENEID = os.path.basename(f).split("-")[0]
        df: pd.DataFrame = pd.read_csv(f, sep="\t")
        df["geneID"] = GENEID
        list_of_df.append(df)
    return pd.concat(list_of_df,ignore_index=True)

#### GTR+G

In [None]:
# Stack every "*-A.TsCpGRate" file of the empirical pbmpi_gtrg4 directory.
pattern = "*-A.TsCpGRate"
input_dir = ROOT_dir + "/outputs/empirical/pbmpi_gtrg4/"
df_concat_gtrg4 = recover_data(input_dir, pattern)

In [None]:
# CpGRate = (CG>TG + CG>CA) / CG and TpARate = (TA>CA + TA>TG) / TA, row by row.
df_concat_gtrg4["CpGRate"] = (df_concat_gtrg4["CG>TG"]+df_concat_gtrg4["CG>CA"])/df_concat_gtrg4["CG"]
df_concat_gtrg4["TpARate"] = (df_concat_gtrg4["TA>CA"]+df_concat_gtrg4["TA>TG"])/df_concat_gtrg4["TA"]
# Mean and standard deviation per (geneID, type). The columns are selected before
# aggregating so that no other column is aggregated, and the strings "mean"/"std"
# replace the np.mean/np.std callables, whose use inside .agg is deprecated in recent
# pandas ("std" keeps the pandas sample standard deviation used so far).
df_concat_gtrg4.groupby(["geneID","type"])[["CpGRate","TpARate","CG>TG","CG>CA","CG","TA>CA","TA>TG","TA"]].agg(["mean","std"]).round(2)

In [None]:
# Walk through df_concat_gtrg4 two rows at a time and compare the rates of the
# first row of each pair ("post") with those of the second ("pred").
# NOTE(review): this assumes the rows strictly alternate post/pred for the same
# mcmcID and geneID - nothing here checks the "type" column; confirm. With an odd
# number of rows the inner next(rowiter) raises StopIteration.
dict_of_stats = {}
k = 0
rowiter = iter(df_concat_gtrg4.iterrows())
# iterrows yields (index, row) tuples, hence the [1] below to reach the row itself.
while ((row_post := next(rowiter, None)) is not None):
    row_pred = next(rowiter)
    dict_of_stats[k] = {
        "CpGRate_post": row_post[1]["CpGRate"],
        "CpGRate_pred": row_pred[1]["CpGRate"],
        # True when the first row's rate is strictly larger than the second row's.
        "CpGRate_comp": row_post[1]["CpGRate"]>row_pred[1]["CpGRate"],
        "TpARate_post": row_post[1]["TpARate"], 
        "TpARate_pred": row_pred[1]["TpARate"],
        "TpARate_comp": row_post[1]["TpARate"]>row_pred[1]["TpARate"],
        # Identifiers are taken from the first row of the pair only.
        "mcmcID": row_post[1]["mcmcID"],
        "geneID": row_post[1]["geneID"],
    }
    k+=1

In [None]:
# Per gene: fraction of pairs for which the comparison flags are True.
# The string "mean" replaces the np.mean callable, whose use inside .agg is
# deprecated in recent pandas; the result is unchanged.
df_TsCpGRate = pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
df_comp = (
    df_TsCpGRate.groupby(["geneID",])[["CpGRate_comp","TpARate_comp"]]
    .agg(["mean"])
    .droplevel(level=1,axis=1)  # drop the "mean" level, keep the flag names
    .reset_index()
)

In [None]:
df_comp["TpARate_comp"].hist()

In [None]:
# Write the per-gene comparison table, ordered by gene, as a tab-separated report.
df_comp_sorted = df_comp.sort_values(by=["geneID"])
df_comp_sorted.to_csv(ROOT_dir + "/reports/map_test_gtrg4.csv",sep="\t")

In [None]:
# Number of genes with CpGRate_comp > 0.95, divided by the hard-coded gene count 137.
n_genes_above = np.sum(df_comp["CpGRate_comp"] > 0.95)
n_genes_above / 137

In [None]:
# For every empirical gene: share of its df_comp rows with CpGRate_comp > 0.95,
# stored under "pvalue"; the resulting table is displayed.
dict_of_stats_ = {}
k=0
for geneID in list_of_geneID_emp:
    test = df_comp.loc[(df_comp["geneID"]== geneID),"CpGRate_comp"]
    n_above = np.sum(test > 0.95)
    CpGRate_pvalue = n_above/test.shape[0]
    dict_of_stats_[k] = {"geneID": geneID, "pvalue": CpGRate_pvalue}
    k += 1
pd.DataFrame.from_dict(data=dict_of_stats_,orient="index")

#### M0GTR

In [None]:
# Stack every "*-A.TsCpGRate" file of the empirical pbmpi_m0gtr directory.
pattern = "*-A.TsCpGRate"
input_dir = ROOT_dir + "/outputs/empirical/pbmpi_m0gtr/"
df_concat_m0gtr = recover_data(input_dir, pattern)

In [None]:
df_concat_m0gtr.groupby(["geneID","type"]).agg(["count"])

In [None]:
# CpGRate = NSubSynTsCpG23 / TcodonNCG and TpARate = NSubSynTsTpA23 / TcodonNTA, row by row.
df_concat_m0gtr["CpGRate"] = df_concat_m0gtr["NSubSynTsCpG23"]/df_concat_m0gtr["TcodonNCG"]
df_concat_m0gtr["TpARate"] = df_concat_m0gtr["NSubSynTsTpA23"]/df_concat_m0gtr["TcodonNTA"]
# Mean per (geneID, type). The columns are selected before aggregating so that no
# other column is aggregated, and the string "mean" replaces the np.mean callable,
# whose use inside .agg is deprecated in recent pandas.
# NOTE(review): "NNSynSyb" is kept exactly as written; it must match a column of the
# input files - confirm the spelling.
df_concat_m0gtr.groupby(["geneID","type"])[["TcodonNCG","NSubSynTsCpG23","TcodonNTA","NSubSynTsTpA23","CpGRate","TpARate","NSub","NNSynSyb","NSynSub"]].agg(["mean",]).round(2)

In [None]:
# Walk through df_concat_m0gtr two rows at a time and compare the rates of the
# first row of each pair ("post") with those of the second ("pred").
# NOTE(review): this assumes the rows strictly alternate post/pred for the same
# mcmcID and geneID - nothing here checks the "type" column; confirm. With an odd
# number of rows the inner next(rowiter) raises StopIteration.
dict_of_stats = {}
k = 0
rowiter = iter(df_concat_m0gtr.iterrows())
# iterrows yields (index, row) tuples, hence the [1] below to reach the row itself.
while ((row_post := next(rowiter, None)) is not None):
    row_pred = next(rowiter)
    dict_of_stats[k] = {
        "CpGRate_post": row_post[1]["CpGRate"],
        "CpGRate_pred": row_pred[1]["CpGRate"],
        # True when the first row's rate is strictly larger than the second row's.
        "CpGRate_comp": row_post[1]["CpGRate"]>row_pred[1]["CpGRate"],
        "TpARate_post": row_post[1]["TpARate"], 
        "TpARate_pred": row_pred[1]["TpARate"],
        "TpARate_comp": row_post[1]["TpARate"]>row_pred[1]["TpARate"],
        # Identifiers are taken from the first row of the pair only.
        "mcmcID": row_post[1]["mcmcID"],
        "geneID": row_post[1]["geneID"],
    }
    k+=1

In [None]:
# Per gene: fraction of pairs for which the comparison flags are True.
# The string "mean" replaces the np.mean callable, whose use inside .agg is
# deprecated in recent pandas; the result is unchanged.
df_TsCpGRate = pd.DataFrame.from_dict(data=dict_of_stats,orient="index")
df_comp = (
    df_TsCpGRate.groupby(["geneID",])[["CpGRate_comp","TpARate_comp"]]
    .agg(["mean"])
    .droplevel(level=1,axis=1)  # drop the "mean" level, keep the flag names
    .reset_index()
)

In [None]:
df_comp["CpGRate_comp"].hist()

In [None]:
# Write the per-gene comparison table, ordered by gene, as a tab-separated report.
df_comp_sorted = df_comp.sort_values(by=["geneID"])
df_comp_sorted.to_csv(ROOT_dir + "/reports/map_test_m0gtr.csv",sep="\t")

In [None]:
np.sum(df_comp["CpGRate_comp"] > 0.95)

In [None]:
# Number of genes with CpGRate_comp > 0.95, divided by the hard-coded gene count
# 137 and rounded to 2 decimals.
n_genes_above = np.sum(df_comp["CpGRate_comp"] > 0.95)
(n_genes_above / 137).round(2)

In [None]:
# For every empirical gene: share of its df_comp rows with CpGRate_comp > 0.95,
# stored under "pvalue"; the resulting table is displayed.
dict_of_stats_ = {}
k=0
for geneID in list_of_geneID_emp:
    test = df_comp.loc[(df_comp["geneID"]== geneID),"CpGRate_comp"]
    n_above = np.sum(test > 0.95)
    CpGRate_pvalue = n_above/test.shape[0]
    dict_of_stats_[k] = {"geneID": geneID, "pvalue": CpGRate_pvalue}
    k += 1
pd.DataFrame.from_dict(data=dict_of_stats_,orient="index")

### Simulation


In [None]:
def recover_data(input_dir, pattern) -> pd.DataFrame:
    """Load all matching tab-separated tables and tag rows with the settings encoded in the file name.

    File names are expected to look like ``WDR91-M0GTR-0.2-1-1-1-0-A.TsCpGRate``.
    After splitting the base name on ``-`` the fields are used as follows:
    0 -> ``geneID`` (str), 2 -> ``omega`` (float), 3 -> ``CpG`` (float),
    4 -> ``TpA`` (float), 5 -> ``tbl`` (float), 6 -> ``drawID`` (int).

    Parameters
    ----------
    input_dir : str
        Directory prefix; it is concatenated with ``pattern`` as-is, so it must
        end with a path separator.
    pattern : str
        Glob pattern appended to ``input_dir``.

    Returns
    -------
    pd.DataFrame
        Row-wise concatenation of all tables (fresh 0..n-1 index) with the six
        extra columns listed above. Files are read in sorted path order so the
        row order is reproducible between runs.

    Raises
    ------
    ValueError
        If no file matches, or a numeric file-name field cannot be parsed.
    IndexError
        If a file name has fewer than seven ``-``-separated fields.
    """
    # glob gives no ordering guarantee; sort for a deterministic result.
    list_of_files: List[str] = sorted(glob.glob(input_dir + pattern))
    if not list_of_files:
        raise ValueError(f"no file matches {input_dir + pattern!r}")

    list_of_df: List[pd.DataFrame] = []
    for f in list_of_files:
        # Split the base name once instead of re-splitting the full path for every field.
        fields = os.path.basename(f).split("-")
        GENEID = fields[0]
        OMEGA = float(fields[2])
        CPG = float(fields[3])
        TPA = float(fields[4])
        TBL = float(fields[5])
        DRAWID = int(fields[6])

        df: pd.DataFrame = pd.read_csv(f, sep="\t")
        # Scalars are broadcast to every row of the table.
        df["geneID"] = GENEID
        df["omega"] = OMEGA
        df["CpG"] = CPG
        df["TpA"] = TPA
        df["tbl"] = TBL
        df["drawID"] = DRAWID
        list_of_df.append(df)
    return pd.concat(list_of_df, ignore_index=True)

#### GTR+G

In [None]:
# Load the TsCpGRate tables written to the pbmpi_gtrg4 output directory.
input_dir = ROOT_dir + "/outputs/simulation/m0gtr/pbmpi_gtrg4/"
pattern = "*-A.TsCpGRate"
df_concat_gtrg4 = recover_data(input_dir=input_dir, pattern=pattern)

# Keep only the TpA == 1 rows. .copy() makes the result an independent frame, so the
# column assignments done later do not act on a slice (SettingWithCopyWarning).
# Further restrictions used during exploration: CpG == 1, tbl == 1.
df_concat_gtrg4 = df_concat_gtrg4.loc[df_concat_gtrg4["TpA"] == 1].copy()

In [None]:
# Distinct gene identifiers present in the loaded tables.
set(df_concat_gtrg4["geneID"])

In [None]:
# Rates are column ratios: the two substitution counts of a dinucleotide summed,
# divided by the count column of that dinucleotide.
df_concat_gtrg4["CpGRate"] = (df_concat_gtrg4["CG>TG"] + df_concat_gtrg4["CG>CA"]) / df_concat_gtrg4["CG"]
df_concat_gtrg4["TpARate"] = (df_concat_gtrg4["TA>CA"] + df_concat_gtrg4["TA>TG"]) / df_concat_gtrg4["TA"]

summary_columns = ["CpGRate", "TpARate", "CG>TG", "CG>CA", "CG", "TA>CA", "TA>TG", "TA"]
condition_keys = ["geneID", "drawID", "omega", "CpG", "TpA", "tbl", "type"]

# Select the columns before aggregating so only the reported columns are summarised.
# The strings "mean" / "std" keep the header labels and the pandas (ddof=1) standard
# deviation that np.mean / np.std were mapped to.
df_concat_gtrg4.groupby(condition_keys)[summary_columns].agg(["mean", "std"]).round(2)

In [None]:
# Non-null counts of every column per gene / condition / type, written as a tab-separated report.
count_keys = ["geneID", "omega", "CpG", "TpA", "tbl", "type"]
counts_gtrg4 = df_concat_gtrg4.groupby(count_keys).agg(["count"])
counts_gtrg4.to_csv(ROOT_dir + "/reports/ALLO.csv", sep="\t")

In [None]:
# Pair every even-positioned row ("post") with the row right after it ("pred") and
# compare their rates. Vectorised replacement for a row-by-row iterrows() loop.
# assumes rows strictly alternate post / pred in df_concat_gtrg4 — TODO confirm against the "type" column
n_rows = df_concat_gtrg4.shape[0]
if n_rows % 2 != 0:
    # The former loop died with a bare StopIteration here; fail with an explicit message instead.
    raise ValueError(
        f"df_concat_gtrg4 has {n_rows} rows; an even number is required to form post/pred pairs"
    )

rows_post = df_concat_gtrg4.iloc[0::2].reset_index(drop=True)
rows_pred = df_concat_gtrg4.iloc[1::2].reset_index(drop=True)

df_pairs = pd.DataFrame({
    "CpGRate_post": rows_post["CpGRate"],
    "CpGRate_pred": rows_pred["CpGRate"],
    "CpGRate_comp": rows_post["CpGRate"] > rows_pred["CpGRate"],
    "TpARate_post": rows_post["TpARate"],
    "TpARate_pred": rows_pred["TpARate"],
    "TpARate_comp": rows_post["TpARate"] > rows_pred["TpARate"],
    # identifiers and simulation settings are taken from the first row of each pair
    "mcmcID": rows_post["mcmcID"],
    "geneID": rows_post["geneID"],
    "omega": rows_post["omega"],
    "CpG": rows_post["CpG"],
    "TpA": rows_post["TpA"],
    "tbl": rows_post["tbl"],
    "drawID": rows_post["drawID"],
})

# Later cells expect {pair number: {column: value}}.
dict_of_stats = df_pairs.to_dict(orient="index")
k = len(dict_of_stats)

In [None]:
# One row per post/pred pair.
df_TsCpGRate = pd.DataFrame.from_dict(dict_of_stats, orient="index")

# Per (gene, draw, condition): fraction of pairs in which "post" exceeds "pred".
replicate_keys = ["geneID", "drawID", "omega", "CpG", "TpA", "tbl"]
df_comp = (
    df_TsCpGRate
    .groupby(replicate_keys)[["CpGRate_comp", "TpARate_comp"]]
    .mean()
    .reset_index()
)

In [None]:
# Persist the comparison table, ordered by gene and simulation settings, as a tab-separated report.
sort_keys = ["geneID", "omega", "CpG", "TpA", "tbl"]
report_path = ROOT_dir + "/reports/map_test_m0gtr_gtrg4.csv"
df_comp.sort_values(by=sort_keys).to_csv(report_path, sep="\t")

In [None]:
# Per simulation condition: `sign` (defined earlier in the notebook) gives the percentage of
# CpGRate_comp values >= 0.95, and "count" the number of values in the group.
df_comp.groupby(by=["CpG","TpA","omega","tbl"])[["CpGRate_comp"]].agg([sign,"count"])

In [None]:
# For every gene and simulation condition: share of df_comp rows (one per draw) whose
# CpGRate_comp is strictly above 0.95.
# NOTE(review): this uses "> 0.95" whereas the `sign` helper uses ">= 0.95" — confirm which is intended.
dict_of_stats_ = {}
k = 0
# product() visits the combinations in the same order as the equivalent nested loops
# (geneID outermost, tbl innermost). Other values explored for tbl: 2, 5.
for geneID, omega, CpG, TpA, tbl in product(list_of_geneID_simu, [0.2, 1], [1, 4, 8], [1], [1, 10]):
    in_condition = (
        (df_comp["geneID"] == geneID)
        & (df_comp["CpG"] == CpG)
        & (df_comp["TpA"] == TpA)
        & (df_comp["tbl"] == tbl)
        & (df_comp["omega"] == omega)
    )
    test = df_comp.loc[in_condition, "CpGRate_comp"]
    # A combination absent from df_comp gives 0 / 0, i.e. NaN.
    CpGRate_pvalue = np.sum(test > 0.95) / test.shape[0]
    dict_of_stats_[k] = {
        "geneID": geneID,
        "CpG": CpG,
        "TpA": TpA,
        "tbl": tbl,
        "omega": omega,
        "pvalue": CpGRate_pvalue,
    }
    k += 1

# Average only "pvalue": the frame also holds the string column geneID, which cannot be averaged.
pd.DataFrame.from_dict(data=dict_of_stats_, orient="index")\
    .groupby(by=["omega", "CpG", "TpA", "tbl"])[["pvalue"]].agg(["mean"])

#### M0GTR

In [None]:
# Load the M0GTR TsCpGRate tables written to the pbmpi_m0gtr_ output directory.
input_dir = ROOT_dir + "/outputs/simulation/m0gtr/pbmpi_m0gtr_/"
pattern = "*-M0GTR-*A.TsCpGRate"
df_concat_m0gtr = recover_data(input_dir=input_dir, pattern=pattern)

# Keep only the TpA == 1 rows. .copy() makes the result an independent frame, so the
# column assignments below do not act on a slice (SettingWithCopyWarning).
df_concat_m0gtr = df_concat_m0gtr.loc[df_concat_m0gtr["TpA"] == 1].copy()

# Rates are plain column ratios: substitution count divided by the matching context count.
df_concat_m0gtr["CpGRate"] = df_concat_m0gtr["NSubSynTsCpG23"] / df_concat_m0gtr["TcodonNCG"]
df_concat_m0gtr["TpARate"] = df_concat_m0gtr["NSubSynTsTpA23"] / df_concat_m0gtr["TcodonNTA"]

# NOTE(review): "NNSynSyb" looks like a typo, but it has to match the header of the input files — confirm.
summary_columns = [
    "TcodonNCG", "NSubSynTsCpG23",
    "TcodonNTA", "NSubSynTsTpA23",
    "CpGRate", "TpARate",
    "NSub", "NNSynSyb", "NSynSub",
]

# Selecting the reported columns before aggregating makes dropping the string column
# geneID unnecessary and avoids averaging columns that are discarded afterwards.
df_concat_m0gtr.groupby(["omega", "CpG", "TpA", "tbl", "type"])[summary_columns].agg(["mean"]).round(2)

In [None]:
# Non-null counts of every column per simulation condition and row type (completeness check).
condition_keys = ["omega", "CpG", "TpA", "tbl", "type"]
df_concat_m0gtr.groupby(condition_keys).agg(["count"])

In [None]:
# Pair every even-positioned row ("post") with the row right after it ("pred") and
# compare their rates. Vectorised replacement for a row-by-row iterrows() loop.
# assumes rows strictly alternate post / pred in df_concat_m0gtr — TODO confirm against the "type" column
n_rows = df_concat_m0gtr.shape[0]
n_pairs = n_rows // 2
if n_rows % 2 != 0:
    # Best effort, as before: report the problem and carry on — but without recording an
    # entry for the unpaired row (the old loop paired it with the previous "pred" row).
    # An odd count probably means one input table is incomplete, in which case pairs
    # located after that table may be misaligned as well.
    print(f"odd number of rows ({n_rows}): unpaired trailing row {df_concat_m0gtr.index[-1]} ignored")

rows_post = df_concat_m0gtr.iloc[0:2 * n_pairs:2].reset_index(drop=True)
rows_pred = df_concat_m0gtr.iloc[1:2 * n_pairs:2].reset_index(drop=True)

df_pairs = pd.DataFrame({
    "CpGRate_post": rows_post["CpGRate"],
    "CpGRate_pred": rows_pred["CpGRate"],
    "CpGRate_comp": rows_post["CpGRate"] > rows_pred["CpGRate"],
    "TpARate_post": rows_post["TpARate"],
    "TpARate_pred": rows_pred["TpARate"],
    "TpARate_comp": rows_post["TpARate"] > rows_pred["TpARate"],
    # identifiers and simulation settings are taken from the first row of each pair
    "mcmcID": rows_post["mcmcID"],
    "geneID": rows_post["geneID"],
    "omega": rows_post["omega"],
    "CpG": rows_post["CpG"],
    "TpA": rows_post["TpA"],
    "tbl": rows_post["tbl"],
    "drawID": rows_post["drawID"],
})

# Later cells expect {pair number: {column: value}}.
dict_of_stats = df_pairs.to_dict(orient="index")
k = len(dict_of_stats)

In [None]:
# Mean of the two comparison flags per draw and simulation condition (genes pooled),
# written as a tab-separated report with a two-level column header.
df_pairs_all = pd.DataFrame.from_dict(dict_of_stats, orient="index")
comp_by_draw = df_pairs_all.groupby(["drawID", "omega", "CpG", "TpA", "tbl"])[["CpGRate_comp", "TpARate_comp"]]
comp_by_draw.agg([np.mean]).to_csv(ROOT_dir + "/reports/CpGRate_comp.csv", sep="\t")

In [None]:
# Per (gene, draw, condition): fraction of pairs in which "post" exceeds "pred".
replicate_keys = ["geneID", "drawID", "omega", "CpG", "TpA", "tbl"]
df_comp = (
    pd.DataFrame.from_dict(dict_of_stats, orient="index")
    .groupby(replicate_keys)[["CpGRate_comp", "TpARate_comp"]]
    .mean()
    .reset_index()
)

In [None]:
# Display df_comp ordered by gene and simulation settings (df_comp itself is not modified).
df_comp.sort_values(by=["geneID","omega","CpG","TpA","tbl"])

In [None]:
# For every gene and simulation condition: share of df_comp rows whose CpGRate_comp is
# strictly above 0.95. The loop variables stay defined afterwards (a later cell reads them).
dict_of_stats_ = {}
k = 0
# Same visiting order as nested loops with geneID outermost and tbl innermost.
# Other values explored for tbl: 2, 5.
for geneID, omega, CpG, TpA, tbl in product(list_of_geneID_simu, [0.2, 1], [1, 4, 8], [1], [1, 10]):
    in_condition = (
        (df_comp["geneID"] == geneID)
        & (df_comp["CpG"] == CpG)
        & (df_comp["TpA"] == TpA)
        & (df_comp["tbl"] == tbl)
        & (df_comp["omega"] == omega)
    )
    test = df_comp.loc[in_condition, "CpGRate_comp"]
    n_above = np.sum(test > 0.95)
    CpGRate_pvalue = n_above / test.shape[0]
    dict_of_stats_[k] = dict(
        geneID=geneID,
        CpG=CpG,
        TpA=TpA,
        tbl=tbl,
        omega=omega,
        pvalue=CpGRate_pvalue,
    )
    k += 1
                        


In [None]:
# Per simulation condition: `sign` (defined earlier in the notebook) gives the percentage of
# CpGRate_comp values >= 0.95, and "count" the number of values in the group.
df_comp.groupby(by=["CpG","TpA","omega","tbl"])[["CpGRate_comp"]].agg([sign,"count"])

In [None]:
                        
# NOTE(review): relies on geneID / CpG / TpA / tbl / omega still holding the values left by the
# last iteration of the loop cell above, and selects an empty column list ([]), so only the
# index of the matching rows is shown. Looks like a leftover debugging cell — confirm or remove.
df_comp.loc[(df_comp["geneID"]== geneID)&(df_comp["CpG"]== CpG)&(df_comp["TpA"]== TpA)&(df_comp["tbl"]== tbl)&(df_comp["omega"]== omega),[]]

In [None]:
# Mean "pvalue" per simulation condition. Only "pvalue" is averaged: the frame also holds
# the string column geneID, which cannot be averaged.
pd.DataFrame.from_dict(data=dict_of_stats_, orient="index")\
    .groupby(by=["CpG", "omega", "TpA", "tbl"])[["pvalue"]].mean()