# Load Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data science
import math
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.multitest import multipletests as mt

# Plots
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

# Working with dates
from datetime import date,datetime
import dateutil

# Looping  progress
from tqdm.notebook import tqdm

# Reg expressions
import re

# Reading/Writing Excel files with Pandas
import xlrd

# Pretty table printing
import tabulate

import os
import subprocess

# Misc libraries
from IPython.display import display, HTML
#from IPython.core.display import display, HTML

# Seaborn defaults: figure size, then font scaling, then the "white" style.
sns.set(rc = {'figure.figsize': (11.7, 8.27)})
sns.set(font_scale = 1.5)
sns.set_style("white")

# Lift the pandas display limits so whole dataframes are shown.
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.max_colwidth', None)

# Report the versions of the packages we depend on, so we can confirm
# we are running the versions we need.
print(f"Pandas version: {pd.__version__}")
print(f"xlrd version: {xlrd.__version__}")

# Prep Environment

In [None]:
# Widen the notebook container to 90% so the grey side bars go away.
wide_container_css = "<style>.container { width:90% !important; }</style>"
display(HTML(wide_container_css))

# Relative paths used after this point are resolved from this directory.
os.chdir('../../../../results/')

# Cov summary

In [None]:
# Most ICDs have 45 Ab tests, but some have 44. This happens when
# H. pylori's CagA Ab, which was only measured for half of the
# people, dropped us below our UKB threshold of 17 for that pair.
# All A and B diseases have a max of 43 because we didn't test either HIV
# Ab on them (expected negatives), and HIV (B24) only has 2 Ab tests.

In [None]:
# Load the 'Results' sheet of supplemental dataset 2.
supp_dataset_path = '../manuscript/latest/supplemental_datasets/supplemental_dataset_2.xlsx'
res = pd.read_excel(supp_dataset_path, sheet_name = 'Results')

In [None]:
# Translate the human-friendly spreadsheet labels back to the short,
# code-friendly names used throughout the analysis.

# Column headers.
human_to_computer_column_dict = {
    'Disease' : 'disease',
    'ICD10' : 'icd',
    'Organism' : 'org',
    'Antibody' : 'anti',
    'Pair is Associated' : 'pair_is_associated',
    'Standard Level' : 'std_lev',
    'Replication Status' : 'rep_stat',
    'UKB adj p' : 'ukb_per_dis_bh_fdr_corr_nom_p',
    'TNX adj p' : 'tnx_per_dis_bh_fdr_corr_p',
    'UKB OR' : 'ukb_OR',
    'TNX OR' : 'tnx_OR',
    'UKB CI' : 'ukb_anti_CI',
    'TNX CI' : 'tnx_CI',
    'UKB nCase' : 'ukb_nCase',
    'UKB nControl' : 'ukb_nControl',
    'TNX nCase' : 'tnx_nCase',
    'TNX nControl' : 'tnx_nControl',
}
res = res.rename(columns = human_to_computer_column_dict)

# Organism names.
human_to_computer_org_dict = {
    'BKV': 'bkv',
    'C. trachomatis': 'c_trach',
    'CMV': 'cmv',
    'EBV': 'ebv',
    'H. pylori': 'h_pylor',
    'HBV': 'hbv',
    'HCV': 'hcv',
    'HHV-6': 'hhv_6',
    'HHV-7': 'hhv_7',
    'HIV': 'hiv',
    'HPV-16': 'hpv_16',
    'HPV-18': 'hpv_18',
    'HSV-1': 'hsv_1',
    'HSV-2': 'hsv_2',
    'HTLV-1': 'htlv',
    'JCV': 'jcv',
    'KSHV': 'kshv',
    'MCV': 'mcv',
    'T. gondii': 't_gond',
    'VZV': 'vzv',
}

# Antibody names.
# NOTE(review): 'gE / gI' maps to 'gE_gl' (lower-case L) - confirm this is
# the spelling used by the UKB results file.
human_to_computer_anti_dict = {
    'K8.1' : 'K8_1',
    'gE / gI' : 'gE_gl',
    'pp 52' : 'pp52',
    'pp 28' : 'pp28',
    '2mgG' : 'IgG',
    'momp A' : 'momp_A',
    'pp150 Nter' : 'pp150_Nter',
    '1gG' : 'IgG',
    'p101 k' : 'p101_k',
    'tarp-D F2' : 'tarp-D_F2',
    'momp D' : 'momp_D',
    'tarp-D F1' : 'tarp-D_F1',
    'VCA p18' : 'VCA_p18',
}

# Standard-level labels.
human_to_computer_std_lev_dict = {
    'Unknown' : 'unk',
    'Exp. Negative' : 'exp_neg',
}

# Replication-status labels.
human_to_computer_rep_dict = {
    'Did not attempt' : 'did_not_attempt',
    'Replicated' : 'replicated',
    'Failed Replication' : 'did_not',
    'Could not attempt' : 'could_not',
}

# Apply each value mapping to its column; values without an entry are kept.
for col_name, value_map in (('org', human_to_computer_org_dict),
                            ('anti', human_to_computer_anti_dict),
                            ('std_lev', human_to_computer_std_lev_dict),
                            ('rep_stat', human_to_computer_rep_dict)):
    res.loc[:, col_name] = res.loc[:, col_name].replace(value_map)

In [None]:
# Derive helper columns from the ICD code:
#   icd_cat      - first character of the code
#   icd_site     - last two characters, as an integer
#   icd_site_str - last two characters, left-padded with '0' to width 2
#
# Columns are assigned directly (res[col] = ...) rather than through
# res.loc[:, col] = ...: the .loc form writes into the existing column in
# place on recent pandas, which would leave 'icd_site' as an object column
# even after astype(int).
res['icd_cat'] = res['icd'].str[0]
res['icd_site'] = res['icd'].str[-2:].astype(int)
res['icd_site_str'] = res['icd'].str[-2:].str.rjust(width = 2, fillchar = '0')

## Collect the data

In [None]:
# Load the latest UKB results (tab-separated) and take an independent
# copy of the supplemental results to work on.
ukb_results_path = 'emp_results_01_17_2023_case_filt.tsv'
ukb_res = pd.read_csv(ukb_results_path, sep = '\t')
org_lev_res = res.copy()

In [None]:
# Bring the UKB organism labels in line with the tags used by org_lev_res.

# Labels containing a non-breaking space (\xa0) are normalised first.
nbsp_org_fixes = {'MCV\xa0' : 'MCV',
                  'H.\xa0pylori' : 'H.pylori'}

# Organism label -> short tag (some organisms appear with and without a space).
org_to_tag_dict = {
    'HSV1'           : 'hsv_1',
    'HSV2'           : 'hsv_2',
    'VZV'            : 'vzv',
    'EBV'            : 'ebv',
    'CMV'            : 'cmv',
    'HHV-6'          : 'hhv_6',
    'HHV-7'          : 'hhv_7',
    'KSHV/HHV-8'     : 'kshv',
    'HBV'            : 'hbv',
    'HCV'            : 'hcv',
    'T. gondii'      : 't_gond',
    'T.gondii'       : 't_gond',
    'HTLV-1'         : 'htlv',
    'BKV'            : 'bkv',
    'JCV'            : 'jcv',
    'MCV'            : 'mcv',
    'HPV-16'         : 'hpv_16',
    'HPV-18'         : 'hpv_18',
    'C. trachomatis' : 'c_trach',
    'C.trachomatis'  : 'c_trach',
    'H.pylori'       : 'h_pylor',
    'HIV'            : 'hiv',
}

ukb_res['org'] = ukb_res['org'].replace(nbsp_org_fixes).replace(org_to_tag_dict)

In [None]:
# For each key column, print the values found only in org_lev_res and
# then the values found only in ukb_res.
for heading, key_col in (("Orgs:", 'org'),
                         ("Antibodies:", 'anti'),
                         ("Diseases:", 'icd')):
    org_lev_values = set(org_lev_res[key_col].unique().tolist())
    ukb_values = set(ukb_res[key_col].unique().tolist())

    print(heading)
    print(org_lev_values - ukb_values)
    print(ukb_values - org_lev_values)

## Start Processing

In [None]:
# Flag tests whose per-disease BH-FDR adjusted nominal p is below the
# threshold (missing p-values compare as False).
P_THRESH = 0.3
ukb_res['bh_nom_p_sig'] = ukb_res['per_dis_bh_fdr_corr_nom_p'] < P_THRESH
org_lev_res['bh_nom_p_sig'] = org_lev_res['ukb_per_dis_bh_fdr_corr_nom_p'] < P_THRESH

# Split each table by whether its covariate column is populated.
ukb_cov_missing = ukb_res['cov_adj_for'].isnull()
ukb_has_covs = ukb_res.loc[~ukb_cov_missing, :]
ukb_no_covs = ukb_res.loc[ukb_cov_missing, :]

org_cov_missing = org_lev_res['ukb_covs'].isnull()
org_has_covs = org_lev_res.loc[~org_cov_missing, :]
org_no_covs = org_lev_res.loc[org_cov_missing, :]

In [None]:
# --- UKB antibody-level tests ---------------------------------------------
# Row counts recorded from a previous run:
#   all tests: 19,289 | with covariates: 14,521 | without covariates: 4,768
print(len(ukb_res))
print(len(ukb_has_covs))
print(len(ukb_no_covs))

# One covariate per cell: split the comma-separated list into columns,
# then stack every column into a single 'value' column.
all_ukb_covs = ukb_has_covs['cov_adj_for'].str.split(', ', expand = True).melt()

# Counts recorded from a previous run:
# None            85091
# age              7642
# sex              6808
# bmi              4511
# tdi_quant        2719
# tobac            2545
# num_in_house     2127
# ethnic           1503
# alc              1259
# num_sex_part     1081
# same_sex          882
ukb_cov_counts = all_ukb_covs['value'].value_counts(dropna = False)
print(ukb_cov_counts.sort_values(ascending = False))


# --- Organism-level results -------------------------------------------------
# Row counts recorded from a previous run:
#   all tests: 8,616 | with covariates: 6,296 | without covariates: 2,320
print(len(org_lev_res))
print(len(org_has_covs))
print(len(org_no_covs))

all_org_covs = org_has_covs['ukb_covs'].str.split(', ', expand = True).melt()

# Counts recorded from a previous run:
# None            37357
# age              3058
# sex              2770
# bmi              2106
# tdi_quant        1066
# tobac            1028
# num_in_house      995
# ethnic            671
# num_sex_part      490
# alc               478
# same_sex          349
org_cov_counts = all_org_covs['value'].value_counts(dropna = False)
print(org_cov_counts.sort_values(ascending = False))

In [None]:
# Build the (initially empty) per-test covariate table: the identifying
# columns copied from ukb_res plus one NaN-filled column per statistic that
# the processing loop fills in later.
id_cols = ['Disease', 'icd', 'org', 'anti', 'per_dis_bh_fdr_corr_nom_p']
cov_res = ukb_res.loc[:, id_cols].copy(deep = True)

# Order covs by most commonly used to least.
# The "missing" entry may surface as None or NaN (or not at all) depending
# on the pandas version, so filter it out instead of calling
# list.remove(None), which raises ValueError when None is not present.
cov_counts = all_ukb_covs['value'].value_counts(dropna = False).sort_values(ascending = False)
cov_ls = [cov for cov in cov_counts.index.tolist() if pd.notna(cov)]

cov_stats = ['p', 'or', 'ci_low', 'ci_hi']

# Summary columns first, then for every covariate a level column followed by
# its p / or / ci_low / ci_hi columns (UKB covariates, then TNX covariates).
fin_cov_cols = ['org_mod', 'tnx_bh_fdr', 'ukb_or', 'tnx_or', 'ukb_is_risk', 'tnx_is_risk', 'ukb_no_covs', 'tnx_no_covs']
for curr_cov in cov_ls:
    fin_cov_cols.append(f"ukb_{curr_cov}")
    fin_cov_cols.extend(f"ukb_{curr_cov}_{curr_stat}" for curr_stat in cov_stats)

tnx_cov_ls = ['age', 'sex', 'ethnic']
for curr_cov in tnx_cov_ls:
    fin_cov_cols.append(f"tnx_{curr_cov}")
    fin_cov_cols.extend(f"tnx_{curr_cov}_{curr_stat}" for curr_stat in cov_stats)

# All-NaN placeholder block with one row per test.
tmp_arr = np.full((len(cov_res), len(fin_cov_cols)), np.nan)
tmp = pd.DataFrame(tmp_arr, columns = fin_cov_cols)

# Both frames need a fresh 0..n-1 index so the column-wise concat lines up
# row for row.
cov_res = cov_res.reset_index(drop = True)
tmp = tmp.reset_index(drop = True)

# The three lengths printed here should be identical.
print(len(cov_res))
print(len(tmp))
cov_res = pd.concat([cov_res, tmp], axis = 1)
print(len(cov_res))

cov_res = cov_res.loc[:, id_cols + fin_cov_cols]

In [None]:
# Splits a "low-high" confidence interval on the hyphen that directly follows
# a digit. The look-behind keeps that digit in the lower bound
# ("0.85-1.20" -> ["0.85", "1.20"]); splitting on '\d-' would swallow it.
# Hyphens not preceded by a digit (e.g. the exponent sign in "1e-05") are
# left alone.
_CI_SPLIT_RE = re.compile(r'(?<=\d)-')


def _most_significant_cov_level(cov_p_spl, or_spl_ls, curr_cov):
    """Find the most significant level of one covariate.

    Parameters
    ----------
    cov_p_spl : list of str
        Entries of the form "<level>: <p>".
    or_spl_ls : list of str
        Entries of the form "<level>: <OR> [<low>-<high>]".
    curr_cov : str
        Covariate name; every entry of cov_p_spl starting with it is treated
        as one level of this covariate.

    Returns
    -------
    tuple or None
        (level_name, p, odds_ratio, ci_low, ci_hi) for the level with the
        smallest p-value (first one wins on ties), or None when the covariate
        has no entry in cov_p_spl.
    """
    level_entries = [x for x in cov_p_spl if x.startswith(curr_cov)]
    if not level_entries:
        return None

    best_level = None
    best_p = None
    for entry in level_entries:
        entry_fields = entry.split(':')
        level_name = entry_fields[0].replace(' ', '')
        level_p = float(entry_fields[1].replace(' ', ''))

        # Always accept the first level (or replace a NaN placeholder) so a
        # level name is selected even when no p-value is below 1.
        if best_level is None or np.isnan(best_p) or level_p < best_p:
            best_level = level_name
            best_p = level_p

    # Prefer the OR entry whose level name matches exactly, so that e.g.
    # 'sex' cannot pick up the 'same_sex' entry; fall back to a substring
    # match when no exact match exists.
    or_matches = [x for x in or_spl_ls
                  if x.split(':')[0].replace(' ', '') == best_level]
    if not or_matches:
        or_matches = [x for x in or_spl_ls if best_level in x]

    or_fields = or_matches[0].split(':')[1].split('[')
    level_or = float(or_fields[0].replace(' ', ''))

    ci_bounds = _CI_SPLIT_RE.split(or_fields[1].replace(']', ''))
    ci_low = float(ci_bounds[0])
    ci_hi = float(ci_bounds[1])

    return best_level, best_p, level_or, ci_low, ci_hi


def _store_cov_stats(cov_table, row_mask, prefix, cov_p_spl, or_spl_ls, cov_names):
    """Write the most significant level of every covariate into cov_table.

    For each name in cov_names that has an entry in cov_p_spl, the columns
    "<prefix>_<cov>", "<prefix>_<cov>_p", "<prefix>_<cov>_or",
    "<prefix>_<cov>_ci_low" and "<prefix>_<cov>_ci_hi" are set on the rows
    selected by row_mask.
    """
    for cov_name in cov_names:
        best = _most_significant_cov_level(cov_p_spl, or_spl_ls, cov_name)
        if best is None:
            # This model was not adjusted for this covariate.
            continue

        level_name, level_p, level_or, ci_low, ci_hi = best
        col_base = f"{prefix}_{cov_name}"
        cov_table.loc[row_mask, col_base] = level_name
        cov_table.loc[row_mask, f"{col_base}_p"] = level_p
        cov_table.loc[row_mask, f"{col_base}_or"] = level_or
        cov_table.loc[row_mask, f"{col_base}_ci_low"] = ci_low
        cov_table.loc[row_mask, f"{col_base}_ci_hi"] = ci_hi


# Walk every UKB test and fill in its row(s) of cov_res: the organism-level
# (TNX) summary if the test has one, then the UKB covariate statistics.
for _, curr_row in tqdm(ukb_res.iterrows(), total = len(ukb_res)):

    curr_icd = curr_row['icd']
    curr_org = curr_row['org']
    curr_anti = curr_row['anti']

    # Rows of cov_res belonging to this disease / organism / antibody.
    # The key columns are never modified in this loop, so the mask is
    # computed once per iteration and reused for every assignment.
    row_mask = ((cov_res['icd'] == curr_icd) &
                (cov_res['org'] == curr_org) &
                (cov_res['anti'] == curr_anti))

    org_lev_matches = org_lev_res.loc[((org_lev_res['icd'] == curr_icd) &
                                       (org_lev_res['org'] == curr_org) &
                                       (org_lev_res['anti'] == curr_anti)), :]

    # Not org lev
    if len(org_lev_matches) == 0:
        cov_res.loc[row_mask, 'org_mod'] = False

    # Process for tnx vars
    else:
        # Only the first matching organism-level row is used.
        curr_org_lev = org_lev_matches.reset_index(drop = True).iloc[0]

        cov_res.loc[row_mask, 'org_mod'] = True
        cov_res.loc[row_mask, 'tnx_bh_fdr'] = curr_org_lev['tnx_per_dis_bh_fdr_corr_p']
        cov_res.loc[row_mask, 'ukb_or'] = curr_org_lev['ukb_OR']
        cov_res.loc[row_mask, 'tnx_or'] = curr_org_lev['tnx_OR']

        # An OR below 1 is recorded as "not risk"; anything else as "risk".
        # NOTE(review): a missing (NaN) OR is not < 1 and is therefore
        # recorded as True - confirm that this is intended.
        cov_res.loc[row_mask, 'tnx_is_risk'] = not (curr_org_lev['tnx_OR'] < 1)
        cov_res.loc[row_mask, 'ukb_is_risk'] = not (curr_org_lev['ukb_OR'] < 1)

        # A float here means the field is NA, i.e. there are no TNX
        # covariates to process.
        tnx_cov_ps = curr_org_lev['tnx_cov_ps']
        if isinstance(tnx_cov_ps, float):
            cov_res.loc[row_mask, 'tnx_no_covs'] = True
        else:
            cov_res.loc[row_mask, 'tnx_no_covs'] = False
            _store_cov_stats(cov_res, row_mask, 'tnx',
                             tnx_cov_ps.split(', '),
                             curr_org_lev['tnx_cov_or'].split(', '),
                             cov_ls)

    # If cov_adj_for is a float then it's NA and thus we have no covs
    if isinstance(curr_row['cov_adj_for'], float):
        cov_res.loc[row_mask, 'ukb_no_covs'] = True
    # We have covs
    else:
        cov_res.loc[row_mask, 'ukb_no_covs'] = False
        _store_cov_stats(cov_res, row_mask, 'ukb',
                         curr_row['cov_ps'].split(', '),
                         curr_row['cov_ors'].split(', '),
                         cov_ls)

In [None]:
# Save the per-test covariate table to Excel, without the index column.
cov_usage_path = '../manuscript/tables/final_cov_usage_for_ukb_and_tnx.xlsx'
cov_res.to_excel(cov_usage_path, index = False)

# Plotting

## Summarize results first

In [None]:
# Significance thresholds used by the summaries below.
UKB_FDR_THRESH, TNX_FDR_THRESH, COV_THRESH = 0.3, 0.01, 0.05

# Load back in data if needed

# Covariate usage table generated in the steps above.
# NOTE(review): the table is written to '../manuscript/tables/' earlier in
# this notebook but read from '../manuscript/tables/other/' here - confirm
# the file is moved between the two steps.
cov_usage_path = '../manuscript/tables/other/final_cov_usage_for_ukb_and_tnx.xlsx'
cov_res = pd.read_excel(cov_usage_path)

# All disease-antibody results (using nominal p-value, not empirical, but we
# can still grab it from the emp file).
ukb_results_path = 'emp_results_01_17_2023_case_filt.tsv'
ukb_res = pd.read_csv(ukb_results_path, sep = '\t')

In [None]:
# Load the 'Results' sheet of supplemental dataset 2 as the organism-level table.
supp_dataset_path = '../manuscript/latest/supplemental_datasets/supplemental_dataset_2.xlsx'
org_lev_res = pd.read_excel(supp_dataset_path, sheet_name = 'Results')

In [None]:
# Translate the human-friendly spreadsheet labels of org_lev_res back to the
# short, code-friendly names used throughout the analysis.

# Column headers.
human_to_computer_column_dict = {
    'Disease' : 'disease',
    'ICD10' : 'icd',
    'Organism' : 'org',
    'Antibody' : 'anti',
    'Pair is Associated' : 'pair_is_associated',
    'Standard Level' : 'std_lev',
    'Replication Status' : 'rep_stat',
    'UKB adj p' : 'ukb_per_dis_bh_fdr_corr_nom_p',
    'TNX adj p' : 'tnx_per_dis_bh_fdr_corr_p',
    'UKB OR' : 'ukb_OR',
    'TNX OR' : 'tnx_OR',
    'UKB CI' : 'ukb_anti_CI',
    'TNX CI' : 'tnx_CI',
    'UKB nCase' : 'ukb_nCase',
    'UKB nControl' : 'ukb_nControl',
    'TNX nCase' : 'tnx_nCase',
    'TNX nControl' : 'tnx_nControl',
}
org_lev_res = org_lev_res.rename(columns = human_to_computer_column_dict)

# Organism names.
human_to_computer_org_dict = {
    'BKV': 'bkv',
    'C. trachomatis': 'c_trach',
    'CMV': 'cmv',
    'EBV': 'ebv',
    'H. pylori': 'h_pylor',
    'HBV': 'hbv',
    'HCV': 'hcv',
    'HHV-6': 'hhv_6',
    'HHV-7': 'hhv_7',
    'HIV': 'hiv',
    'HPV-16': 'hpv_16',
    'HPV-18': 'hpv_18',
    'HSV-1': 'hsv_1',
    'HSV-2': 'hsv_2',
    'HTLV-1': 'htlv',
    'JCV': 'jcv',
    'KSHV': 'kshv',
    'MCV': 'mcv',
    'T. gondii': 't_gond',
    'VZV': 'vzv',
}

# Antibody names.
human_to_computer_anti_dict = {
    'K8.1' : 'K8_1',
    'gE / gI' : 'gE_gl',
    'pp 52' : 'pp52',
    'pp 28' : 'pp28',
    '2mgG' : 'IgG',
    'momp A' : 'momp_A',
    'pp150 Nter' : 'pp150_Nter',
    '1gG' : 'IgG',
    'p101 k' : 'p101_k',
    'tarp-D F2' : 'tarp-D_F2',
    'momp D' : 'momp_D',
    'tarp-D F1' : 'tarp-D_F1',
    'VCA p18' : 'VCA_p18',
}

# Standard-level labels.
human_to_computer_std_lev_dict = {
    'Unknown' : 'unk',
    'Exp. Negative' : 'exp_neg',
}

# Replication-status labels.
human_to_computer_rep_dict = {
    'Did not attempt' : 'did_not_attempt',
    'Replicated' : 'replicated',
    'Failed Replication' : 'did_not',
    'Could not attempt' : 'could_not',
}

# Apply each value mapping to its column; values without an entry are kept.
for col_name, value_map in (('org', human_to_computer_org_dict),
                            ('anti', human_to_computer_anti_dict),
                            ('std_lev', human_to_computer_std_lev_dict),
                            ('rep_stat', human_to_computer_rep_dict)):
    org_lev_res.loc[:, col_name] = org_lev_res.loc[:, col_name].replace(value_map)

In [None]:
# Row counts recorded from a previous run:
#   ukb_res: 19,289 | cov_res: 19,289 | org_lev_res: 8,616
print(ukb_res.shape[0])
print(cov_res.shape[0])
print(org_lev_res.shape[0])

### Verify alignment of datasets

In [None]:
# Bring the freshly reloaded UKB organism labels in line with the short tags.

# Labels containing a non-breaking space (\xa0) are normalised first.
nbsp_org_fixes = {'MCV\xa0' : 'MCV',
                  'H.\xa0pylori' : 'H.pylori'}

# Organism label -> short tag (some organisms appear with and without a space).
org_to_tag_dict = {
    'HSV1'           : 'hsv_1',
    'HSV2'           : 'hsv_2',
    'VZV'            : 'vzv',
    'EBV'            : 'ebv',
    'CMV'            : 'cmv',
    'HHV-6'          : 'hhv_6',
    'HHV-7'          : 'hhv_7',
    'KSHV/HHV-8'     : 'kshv',
    'HBV'            : 'hbv',
    'HCV'            : 'hcv',
    'T. gondii'      : 't_gond',
    'T.gondii'       : 't_gond',
    'HTLV-1'         : 'htlv',
    'BKV'            : 'bkv',
    'JCV'            : 'jcv',
    'MCV'            : 'mcv',
    'HPV-16'         : 'hpv_16',
    'HPV-18'         : 'hpv_18',
    'C. trachomatis' : 'c_trach',
    'C.trachomatis'  : 'c_trach',
    'H.pylori'       : 'h_pylor',
    'HIV'            : 'hiv',
}

ukb_res['org'] = ukb_res['org'].replace(nbsp_org_fixes).replace(org_to_tag_dict)

In [None]:
# For each key column, print the values found only in cov_res and then the
# values found only in ukb_res.
for heading, key_col in (("Orgs:", 'org'),
                         ("Antibodies:", 'anti'),
                         ("Diseases:", 'icd')):
    cov_values = set(cov_res[key_col].unique().tolist())
    ukb_values = set(ukb_res[key_col].unique().tolist())

    print(heading)
    print(cov_values - ukb_values)
    print(ukb_values - cov_values)

In [None]:
# Recode the two risk-direction flags from NaN / 1.0 / 0.0 to the strings
# 'NA' / 'True' / 'False', printing the value counts before and after.
risk_flag_cols = ['ukb_is_risk', 'tnx_is_risk']
risk_flag_labels = {np.nan : 'NA',
                    1.0 : 'True',
                    0.0 : 'False'}

# Counts before recoding, recorded from a previous run:
#   ukb_is_risk: NaN 10673 | 1.0 4608 | 0.0 4008
#   tnx_is_risk: NaN 10673 | 1.0 8416 | 0.0  200
for flag_col in risk_flag_cols:
    print(cov_res[flag_col].value_counts(dropna = False))

for flag_col in risk_flag_cols:
    cov_res.loc[:, flag_col] = cov_res[flag_col].replace(risk_flag_labels)

# Counts after recoding, recorded from a previous run:
#   ukb_is_risk: NA 10673 | True 4608 | False 4008
#   tnx_is_risk: NA 10673 | True 8416 | False  200
for flag_col in risk_flag_cols:
    print(cov_res[flag_col].value_counts(dropna = False))

In [None]:
# Shorten the UKB adjusted p-value column name to 'bh_fdr'.
bh_fdr_rename = {'per_dis_bh_fdr_corr_nom_p' : 'bh_fdr'}
cov_res = cov_res.rename(columns = bh_fdr_rename)

In [None]:
# Row-level conditions reused by the totals below.
ukb_sig_mask = cov_res['bh_fdr'] < UKB_FDR_THRESH
tnx_sig_mask = cov_res['tnx_bh_fdr'] < TNX_FDR_THRESH
same_direction_mask = cov_res['ukb_is_risk'] == cov_res['tnx_is_risk']

# Values recorded from a previous run: 19,289 / 8,616 / 841 / 215
TOT_NUM_AB_TESTS = len(cov_res)
TOT_NUM_ORG_TESTS = sum(cov_res['org_mod'] == True)

TOT_AB_TESTS_SIG = sum(ukb_sig_mask)
# Significant in both cohorts, with matching risk-direction flags.
TOT_ORG_TESTS_SIG = len(cov_res.loc[(ukb_sig_mask & tnx_sig_mask) & same_direction_mask, :])

# Organism-level rows that have a TNX test id (490 in a previous run).
has_tnx_test = org_lev_res['tnx_test_id'].notnull()
TOT_NUM_TNX_ORG_TESTS = len(org_lev_res.loc[has_tnx_test, :])

print(TOT_NUM_AB_TESTS)
print(TOT_NUM_ORG_TESTS)
print(TOT_AB_TESTS_SIG)
print(TOT_ORG_TESTS_SIG)
print(TOT_NUM_TNX_ORG_TESTS)

In [None]:
# A small trick to keep older code reusable now that no meta p-value is
# calculated: 'meta' is True where the test passes both FDR thresholds and
# the UKB and TNX risk-direction flags are equal, False everywhere else.
passes_ukb_fdr = cov_res['bh_fdr'] < UKB_FDR_THRESH
passes_tnx_fdr = cov_res['tnx_bh_fdr'] < TNX_FDR_THRESH
risk_flags_agree = cov_res['ukb_is_risk'] == cov_res['tnx_is_risk']

cov_res['meta'] = (passes_ukb_fdr & passes_tnx_fdr) & risk_flags_agree

In [None]:
# Recode tnx_no_covs from NaN / 1.0 / 0.0 to the strings 'NA' / 'True' / 'False'.
tnx_no_covs_labels = {np.nan : 'NA',
                      1.0 : 'True',
                      0.0 : 'False'}
cov_res.loc[:, 'tnx_no_covs'] = cov_res['tnx_no_covs'].replace(tnx_no_covs_labels)

In [None]:
cov_ls = ['no_covs', 'age', 'sex', 'ethnic', 'bmi', 'tdi_quant', 
              'tobac', 'alc', 'num_in_house', 'num_sex_part', 'same_sex']


# Tally, per covariate, how many models used it and in how many of those the
# covariate's own p-value fell below COV_THRESH. Three panels are tallied:
#   * all UKB rows of cov_res            (the "ab_mods" counts)
#   * UKB rows where org_mod == True     (the "org_mods" counts)
#   * rows that have a TNX odds ratio    (the "tnx" counts)
cov_sum_ls = []

# Only rows with a non-null TNX odds ratio take part in the TNX tallies.
tnx_cov_res = cov_res.loc[cov_res['tnx_or'].notnull(), :].copy(deep = True)
for curr_cov in tqdm(cov_ls):

    ukb_curr_cov = f"ukb_{curr_cov}"
    tnx_curr_cov = f"tnx_{curr_cov}"

    ukb_cov_p_name = f"ukb_{curr_cov}_p"
    tnx_cov_p_name = f"tnx_{curr_cov}_p"

    print(f"[{curr_cov}] {ukb_curr_cov}: {ukb_cov_p_name} | {tnx_curr_cov}: {tnx_cov_p_name}")

    # The 'no_covs' column is tested against True, whereas every other
    # covariate counts as "used" whenever its column is non-null. 'no_covs'
    # also has no p-value of its own, so its "significant" counts are 0.
    is_no_covs = 'no_covs' in curr_cov

    ###################
    # UKB
    ###################
    if is_no_covs:
        ab_models_with_ukb_cov = cov_res.loc[cov_res[ukb_curr_cov] == True, :]
    else:
        ab_models_with_ukb_cov = cov_res.loc[cov_res[ukb_curr_cov].notnull(), :]

    org_models_with_ukb_cov = ab_models_with_ukb_cov.loc[ab_models_with_ukb_cov['org_mod'] == True, :]

    n_ukb_cov_used_ab_mods = len(ab_models_with_ukb_cov)
    n_ukb_cov_used_org_mods = len(org_models_with_ukb_cov)

    if is_no_covs:
        n_ukb_cov_used_ab_mods_where_ukb_cov_sig = 0
        n_ukb_cov_used_org_mods_where_ukb_cov_sig = 0
    else:
        n_ukb_cov_used_ab_mods_where_ukb_cov_sig = int(
            (ab_models_with_ukb_cov[ukb_cov_p_name] < COV_THRESH).sum())
        n_ukb_cov_used_org_mods_where_ukb_cov_sig = int(
            (org_models_with_ukb_cov[ukb_cov_p_name] < COV_THRESH).sum())

    ###################
    # TNX
    ###################
    if tnx_curr_cov not in tnx_cov_res.columns:
        # No TNX column for this covariate: report it as never used.
        n_tnx_cov_used_org_mods = 0
        n_tnx_cov_used_org_mods_where_tnx_cov_sig = 0

    elif is_no_covs:
        # Use the TNX flag here. The previous version filtered on the UKB
        # 'no_covs' column, so the TNX tally mirrored the UKB models instead
        # of describing the TNX ones.
        tnx_models_with_tnx_cov = tnx_cov_res.loc[tnx_cov_res[tnx_curr_cov] == True, :]
        n_tnx_cov_used_org_mods = len(tnx_models_with_tnx_cov)
        n_tnx_cov_used_org_mods_where_tnx_cov_sig = 0

    else:
        tnx_models_with_tnx_cov = tnx_cov_res.loc[tnx_cov_res[tnx_curr_cov].notnull(), :]
        n_tnx_cov_used_org_mods = len(tnx_models_with_tnx_cov)
        n_tnx_cov_used_org_mods_where_tnx_cov_sig = int(
            (tnx_models_with_tnx_cov[tnx_cov_p_name] < COV_THRESH).sum())

    cov_sum_ls.append([curr_cov,
                       n_ukb_cov_used_ab_mods, n_ukb_cov_used_ab_mods_where_ukb_cov_sig,
                       n_ukb_cov_used_org_mods, n_ukb_cov_used_org_mods_where_ukb_cov_sig,
                       n_tnx_cov_used_org_mods, n_tnx_cov_used_org_mods_where_tnx_cov_sig
                      ])

cov_sum = pd.DataFrame(cov_sum_ls, columns = ['cov',
                           'n_ukb_cov_used_ab_mods', 'n_ukb_cov_used_ab_mods_where_ukb_cov_sig',
                           'n_ukb_cov_used_org_mods', 'n_ukb_cov_used_org_mods_where_ukb_cov_sig',
                           'n_tnx_cov_used_org_mods', 'n_tnx_cov_used_org_mods_where_tnx_cov_sig'])


# Express every count as a fraction of the total number of tests in its panel.
cov_sum['perc_ukb_cov_used_ab_mods'] = cov_sum['n_ukb_cov_used_ab_mods'] / TOT_NUM_AB_TESTS
cov_sum['perc_ukb_cov_used_org_mods'] = cov_sum['n_ukb_cov_used_org_mods'] / TOT_NUM_ORG_TESTS
cov_sum['perc_tnx_cov_used_org_mods'] = cov_sum['n_tnx_cov_used_org_mods'] / TOT_NUM_TNX_ORG_TESTS

cov_sum['perc_ukb_cov_used_ab_mods_where_ukb_cov_sig'] = cov_sum['n_ukb_cov_used_ab_mods_where_ukb_cov_sig'] / TOT_NUM_AB_TESTS
cov_sum['perc_ukb_cov_used_org_mods_where_ukb_cov_sig'] = cov_sum['n_ukb_cov_used_org_mods_where_ukb_cov_sig'] / TOT_NUM_ORG_TESTS
cov_sum['perc_tnx_cov_used_org_mods_where_tnx_cov_sig'] = cov_sum['n_tnx_cov_used_org_mods_where_tnx_cov_sig'] / TOT_NUM_TNX_ORG_TESTS


# Keep only the fractional columns (plus the covariate name) for plotting.
perc_covs = cov_sum.loc[:, ['cov',
               'perc_ukb_cov_used_ab_mods', 'perc_ukb_cov_used_org_mods', 'perc_tnx_cov_used_org_mods',
               'perc_ukb_cov_used_ab_mods_where_ukb_cov_sig', 'perc_ukb_cov_used_org_mods_where_ukb_cov_sig',
               'perc_tnx_cov_used_org_mods_where_tnx_cov_sig'
                       ]]

## Now start plotting

In [None]:
# Rank covariates from most to least used across the UKB antibody models,
# then move 'no_covs' to the front of the list.
usage_ranked = perc_covs.sort_values(by = 'perc_ukb_cov_used_ab_mods', ascending = False)
cov_order = list(usage_ranked['cov'])
cov_order.insert(0, cov_order.pop(cov_order.index('no_covs')))

### Small multiples using shading for significance of Cov

In [None]:
# Small-multiples bar chart of covariate usage. One panel each for the UKB
# antibody models, the UKB pathogen models and the TNX pathogen models. Per
# covariate, a white outlined bar shows the share of models that used it and a
# shaded bar drawn over it shows the share in which the covariate itself was
# significant. Counts from cov_sum are printed as bar annotations.
title = "Covariate Use Across All Models"
# NOTE(review): the plotted values are fractions (count / total), while the
# label says "Percent" -- confirm this is intended.
y_lab = "Percent of All Models"

UKB_COLOR = '#5b9bd5'
TNX_COLOR = '#f4b183'

TITLE_FONT_SIZE = 24
TITLE_X = 0.5
TITLE_Y = 0.93

SUB_TITLE_FONT_DICT = {
    'fontsize' : 16,
    'fontweight': 'bold'

}

ANNO_FONT_DICT = {
    'fontsize' : 12
}

TNX_MISS_FD = {
    'fontsize' : 36,
    'fontweight': 'bold',
    'color' : TNX_COLOR,

}

TNX_MISS_LAB_Y = 0.025

FACE_ALPHA = 0.3

PLOT_W = 15
PLOT_H = 15

BAR_WIDTH = 2
BTW_BAR_W = 1

COV_WIDTH = 4
BTW_COV_W = 2

COL_LS = plt.cm.tab10.colors

BAR_LAB_ADD = 0.01

# Display label for each covariate on the x axis; covariates not listed here
# fall back to their title-cased name.
COV_TICK_LABS = {
    'tdi_quant'    : 'TDI Quantile',
    'tobac'        : 'Tobacco',
    'alc'          : 'Alcohol',
    'num_in_house' : 'Number in House',
    'ethnic'       : 'Ethnicity',
    'same_sex'     : 'Same Sex Inter.',
    'num_sex_part' : 'Number Sex Part.',
    'bmi'          : 'BMI',
    'no_covs'      : 'No Covariates',
}


def _cov_count(cov, col):
    """Return the count stored in cov_sum[col] for the row whose 'cov' equals cov."""
    return cov_sum.loc[cov_sum['cov'] == cov, col].tolist()[0]


def _draw_cov_bar(ax, x, perc_used, perc_sig, n_used, n_sig, color, label_used, label_sig):
    """Draw one covariate on one panel.

    A white bar outlined in `color` with height `perc_used` is drawn first,
    then a translucent bar filled with `color` and height `perc_sig` is drawn
    over it. When `label_used` is true, `n_used` is printed just above the
    outlined bar; when `label_sig` is true, `n_sig` is printed at half the
    height of the filled bar.
    """
    # Outlined bar: every model that used the covariate
    ax.bar(x      = x,
           height = perc_used,
           width  = BAR_WIDTH,
           bottom = 0,
           align  = 'center',
           fc  = 'white',
           ec  = color)

    if label_used:
        ax.text(x = x,
                y = perc_used + BAR_LAB_ADD,
                s = format(n_used, ','),
                va = 'center',
                ha = 'center',
                fontdict = ANNO_FONT_DICT
                )

    # Shaded bar: models where the covariate itself was significant
    ax.bar(x      = x,
           height = perc_sig,
           width  = BAR_WIDTH,
           bottom = 0,
           align  = 'center',
           fc  = color,
           ec  = color,
           alpha = FACE_ALPHA)

    if label_sig:
        ax.text(x = x,
                y = perc_sig / 2,
                s = format(n_sig, ','),
                va = 'center',
                ha = 'center',
                fontdict = ANNO_FONT_DICT
                )


# Position of every covariate in the plotting order
x_dict = {val: num for num, val in enumerate(cov_order)}

# Reorder the rows so they follow cov_order
perc_covs = perc_covs.iloc[perc_covs['cov'].map(x_dict).argsort()]


fig, (ukb_ab_ax, ukb_org_ax, tnx_org_ax) = plt.subplots(nrows = 3, ncols = 1,
                                                        sharex = True,
                                                        figsize = (PLOT_W, PLOT_H),
                                                        facecolor = 'white')

# (axis, column stem for "used", column stem for "significant", is TNX panel)
# The stems are prefixed with 'perc_' to read fractions from perc_covs and
# with 'n_' to read counts from cov_sum.
panels = [
    (ukb_ab_ax,  'ukb_cov_used_ab_mods',  'ukb_cov_used_ab_mods_where_ukb_cov_sig',  False),
    (ukb_org_ax, 'ukb_cov_used_org_mods', 'ukb_cov_used_org_mods_where_ukb_cov_sig', False),
    (tnx_org_ax, 'tnx_cov_used_org_mods', 'tnx_cov_used_org_mods_where_tnx_cov_sig', True),
]

x_lab_dict = dict()
curr_x = 0
for _, curr_row in tqdm(perc_covs.iterrows(), total = len(perc_covs)):
    curr_cov = curr_row['cov']
    curr_cov_x = x_dict[curr_cov]
    is_no_covs = curr_cov == 'no_covs'

    if is_no_covs:
        curr_sig_col = 'black'
    else:
        # cov_order puts 'no_covs' first, so the remaining covariates start at
        # palette index 0; wrap around rather than fail if there are more
        # covariates than palette colours.
        curr_sig_col = COL_LS[(curr_cov_x - 1) % len(COL_LS)]

    for curr_ax, used_stem, sig_stem, is_tnx in panels:
        curr_perc_used = curr_row[f"perc_{used_stem}"]
        curr_perc_sig  = curr_row[f"perc_{sig_stem}"]

        if is_tnx:
            # TNX panel: only annotate bars that are actually there
            label_used = curr_perc_used != 0
            label_sig  = curr_perc_sig != 0
        else:
            # UKB panels: always print the total; skip the "significant"
            # count for 'no_covs'
            label_used = True
            label_sig  = not is_no_covs

        _draw_cov_bar(ax = curr_ax,
                      x = curr_x,
                      perc_used = curr_perc_used,
                      perc_sig  = curr_perc_sig,
                      n_used = _cov_count(curr_cov, f"n_{used_stem}"),
                      n_sig  = _cov_count(curr_cov, f"n_{sig_stem}"),
                      color = curr_sig_col,
                      label_used = label_used,
                      label_sig  = label_sig)

    x_lab_dict[curr_cov] = curr_x

    curr_x = curr_x + BAR_WIDTH + BTW_COV_W


# One tick per covariate, centred on its bar
ticks_x = list(x_lab_dict.values())
tick_labs = [COV_TICK_LABS.get(k, k.title()) for k in x_lab_dict]

tnx_org_ax.set_xticks(ticks_x)
tnx_org_ax.set_xticklabels(tick_labs, ha = 'center', va = 'top')

tnx_org_ax.tick_params(axis = 'x', labelrotation = 90)



title_obj = fig.suptitle(title)
title_obj.set_size(TITLE_FONT_SIZE)  # set font size
title_obj.set_position([TITLE_X, TITLE_Y])

ukb_ab_ax.set_ylim(0, 0.45)
ukb_ab_ax.set_ylabel(y_lab)

ukb_org_ax.set_ylim(0, 0.45)
ukb_org_ax.set_ylabel(y_lab)

tnx_org_ax.set_ylim(0, 0.55)
tnx_org_ax.set_ylabel(y_lab)

ukb_ab_ax.set_title(f'UKB Disease-Antibody Models [n = {format(TOT_NUM_AB_TESTS, ",")}]', loc = 'left',
                    fontdict = SUB_TITLE_FONT_DICT, color = UKB_COLOR)
ukb_org_ax.set_title(f'UKB Disease-Pathogen Models [n = {format(TOT_NUM_ORG_TESTS, ",")}]', loc = 'left',
                     fontdict = SUB_TITLE_FONT_DICT, color = UKB_COLOR)
tnx_org_ax.set_title(f'TNX Disease-Pathogen Models [n = {format(TOT_NUM_TNX_ORG_TESTS, ",")}]', loc = 'left',
                     fontdict = SUB_TITLE_FONT_DICT, color = TNX_COLOR)


# Show y ticks and tick labels on both sides of every panel
for curr_ax in [ukb_ab_ax, ukb_org_ax, tnx_org_ax]:
    curr_ax.tick_params(axis = 'y',
                        left = True, labelleft = True,
                        right = True, labelright = True,)


# Covariates that get a large 'X' on the TNX panel (presumably the ones that
# were not available in TNX -- confirm against the TNX model definitions).
tnx_miss_covs = ['BMI', 'TDI Quantile', 'Tobacco', 'Number in House',
                 'Alcohol', 'Number Sex Part.', 'Same Sex Inter.']

# The tick positions are already known from ticks_x / tick_labs, so there is
# no need to read them back from matplotlib's tick label objects.
for tick_x, tick_lab in zip(ticks_x, tick_labs):
    if tick_lab in tnx_miss_covs:
        tnx_org_ax.text(x = tick_x, y = TNX_MISS_LAB_Y,
                s = 'X',
                ha = 'center', va = 'center',
                fontdict = TNX_MISS_FD)

In [None]:
# Write the covariate-usage figure to disk as a 600-dpi PDF, trimming the
# surrounding whitespace.
out_dir = '../manuscript/figures/supp_fig_1'
fn = f"{out_dir}/supp_fig_1.pdf"
fig.savefig(
    fn,
    format = 'pdf',
    dpi = 600,
    bbox_inches = "tight",
)