# Load Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data science
import math
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.multitest import multipletests as mt

# Plots
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

# Working with dates
from datetime import date,datetime
import dateutil

# Looping  progress
from tqdm.notebook import tqdm

# Reg expressions
import re

# Pretty table printing
import tabulate

# ***REMOVED*** Snippets Require these
import os
import subprocess

import glob

# Misc libraries
from IPython.display import display, HTML
#from IPython.core.display import display, HTML

# Seaborn defaults for every plot in this notebook:
# a wide (11.7 x 8.27 inch) figure and fonts scaled up by 1.5x ...
sns.set(font_scale=1.5, rc={"figure.figsize": (11.7, 8.27)})
# ... drawn on the plain "white" style (no grid, white background).
sns.set_style("white")

# Raise the pandas display limits so that entire dataframes are shown
# (rows, columns, and full cell contents) instead of being truncated.
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)
pd.set_option("display.max_colwidth", None)

# Print the pandas version in use so we can confirm that we are running
# a version the notebook is known to work with.
print(f"Pandas version: {pd.__version__}")

In [None]:
# Root directory holding the project's data and results.
# Defaults to the original location; set the PATHOGEN_NCD_HOME environment
# variable to point the notebook at a different root without editing the code.
HOME_DIR = os.environ.get("PATHOGEN_NCD_HOME", "/data/pathogen_ncd")

# Collect all empirical results

The empirical p-values (one for each disease-antibody pair) are spread across separate result files, which we need to combine into a single file before we can analyze them further.

```bash

# Combine every empirical p-value result file (*.tsv) into one TSV that has
# a single header row at the top.
# NOTE(review): ${HOME_DIR} is expanded by the shell, so it presumably has to be
# set in the shell environment to the same root used by the notebook — confirm.
# NOTE(review): all outputs are written to the current working directory, while
# the notebook later reads all_icd_emp_p_results.tsv from the emp_calcs
# directory — presumably this is run from (or the result is moved) there; confirm.

# Push all the empirical p-value results for all ICDs into a single file
cat "${HOME_DIR}/results/perm_p_sims/emp_calcs/"*.tsv > all_icd_emp_p_results.tsv

# Grab what will be the header line
# (the first line containing 'Unparsed_Disease' is saved to a file named `header`)
grep 'Unparsed_Disease' all_icd_emp_p_results.tsv  | head -n 1 > header

# Remove extraneous header lines in our final file
# (drops every line containing 'Unparsed_Disease'; sponge collects all of its
# input before writing, which lets the file be rewritten in place)
grep -v 'Unparsed_Disease' all_icd_emp_p_results.tsv  | sponge  all_icd_emp_p_results.tsv

# Add header back to the top of our now cleaned empirical p-value results file containing an
# empirical p-value for each disease-antibody pair.
cat header all_icd_emp_p_results.tsv | sponge all_icd_emp_p_results.tsv

```

# Examining empirical p-values

In [None]:
# Read the analytical results table and give the organism / antigen
# columns the short names ('org', 'anti') used in the rest of the notebook.
res_dir = f'{HOME_DIR}/results'
res = (
    pd.read_csv(f'{res_dir}/tri_mod_results_01_17_2023.csv')
    .rename(columns={'organism': 'org', 'Antigen': 'anti'})
)

# Every distinct [org, anti] combination found in the analytical results.
org_ab_ls = res[['org', 'anti']].drop_duplicates().values.tolist()

In [None]:
# Read the combined, tab-separated empirical p-value results.
emp_dir = f'{res_dir}/perm_p_sims/emp_calcs'
emp_res_path = f'{emp_dir}/all_icd_emp_p_results.tsv'
emp_res = pd.read_csv(emp_res_path, sep='\t')

## Make sure we have empirical p-values for each disease-antibody pair

In [None]:
# The two tables do not have the same number of rows.
# Analytical results: 23,257 rows
print(res.shape[0])

# Empirical results: 23,122 rows
print(emp_res.shape[0])

In [None]:
# Build one "icd_org_anti" key per row of each table so the two sets of
# disease-antibody pairs can be compared directly.
res_str = set(res['icd'] + '_' + res['org'] + '_' + res['anti'])
emp_str = set(emp_res['icd'] + '_' + emp_res['org'] + '_' + emp_res['anti'])

only_in_emp = emp_str - res_str
only_in_res = res_str - emp_str

# Pairs that have an empirical p-value but no analytical result.
print(only_in_emp)

# Pairs that have an analytical result but no empirical p-value.
# O80, O81, O82 are missing because these are the controls for the O codes,
# so they are expected to be left out.
print(list(only_in_res))

# Number of pairs without an empirical p-value: 135
print(len(only_in_res))

# Difference in row counts between the two tables: 135
print(len(res) - len(emp_res))

# So the O controls make up all of the "missing" results

## Write the file out

In [None]:
# Save the empirical results as a tab-separated file, leaving out the index column.
emp_out_path = f'{res_dir}/emp_results_01_17_2023.tsv'
emp_res.to_csv(emp_out_path, sep='\t', index=False)