# Load Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data science
import math
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.multitest import multipletests as mt

# Plots
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

# Working with dates
from datetime import date,datetime
import dateutil

# Looping  progress
from tqdm.notebook import tqdm

# Reg expressions
import re

# Pretty table printing
import tabulate

import os
import subprocess

# Misc libraries
from IPython.display import display, HTML
#from IPython.core.display import display, HTML

# Seaborn theme: figure size and a 1.5x font scale in one call, followed by
# the "white" axes style.
sns.set(font_scale=1.5, rc={"figure.figsize": (11.7, 8.27)})
sns.set_style("white")

# Lift the pandas display limits so entire dataframes are shown
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)
pd.set_option("display.max_colwidth", None)

# Report the pandas version in use so the working environment can be checked
print(f"Pandas version: {pd.__version__}")


# Widen the notebook container (removes the grey side bars)
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# Root directory holding the UKB / TNX phecode inputs. Defaults to the original
# absolute location; set the PATHOGEN_NCD_HOME environment variable to run the
# notebook against a copy of the data stored elsewhere.
HOME_DIR = os.environ.get("PATHOGEN_NCD_HOME", "/data/pathogen_ncd")

Concatenate the separate TNX result files into a single, final TNX results
file that keeps exactly one header row:

```bash

# HOME_DIR must be set in the shell to the same root the notebook uses.
RES_DIR="${HOME_DIR}/phecode/tnx/path_analysis"

res="${RES_DIR}/res"

# Write the collected file inside res/ - that is the location the notebook
# reads it from. Its name does not match the phe_*_results.tsv glob, so it is
# never concatenated into itself on a re-run.
res_fp="${res}/phecode_collected_res_12_5.tsv"

# Scratch file for the header line (kept out of the current working directory)
header="$(mktemp)"

# Combine all result files 
cat "${res}"/phe_*_results.tsv > "${res_fp}"

# Grab the header
head -n1 "${res_fp}"  > "${header}"

# Remove the header lines that are scattered throughout the combined file
grep -v  "Disease_Description" "${res_fp}" | sponge "${res_fp}"

# Now add a single header line back at the top of the file
cat "${header}" "${res_fp}" | sponge "${res_fp}"

rm -v "${header}"

```

# Prep UKB and TNX results

## Read in UKB and TNX Result Files

In [None]:
# UKB results file
# NOTE(review): the dtype mapping only takes effect if the sheet's column is
# literally named 'phecode'. If it is 'Phecode' (which is renamed below), codes
# such as '053' could lose their leading zero on read - confirm against the workbook.
all_ukb = pd.read_excel(f'{HOME_DIR}/phecode/ukb/path_analysis/ukb_phecode_results_MCC_of_ONE_2024_10_22_with_std_lev.xlsx',
                        dtype = {'phecode' : str})

# Cast the counts with DataFrame.astype and rebind. Assigning through
# `.loc[:, cols] = ...` writes into the existing columns, so their old dtype
# can survive the assignment.
all_ukb = all_ukb.astype({'nCase' : int, 'nControl' : int})
all_ukb['num_tot_samples'] = all_ukb['nCase'] + all_ukb['nControl']

# Harmonise the UKB column names with the TNX ones
all_ukb = all_ukb.rename(columns = {'organism' : 'org', 'Phecode' : 'phecode', 'Antigen' : 'anti'})

# TNX categorical test results
all_tnx_cat = pd.read_csv(f'{HOME_DIR}/phecode/tnx/path_analysis/res/phecode_collected_res_12_5.tsv', 
                          dtype = {'phecode' : str}, sep = '\t')

# Work on a copy so the table as read from disk stays untouched
all_tnx = all_tnx_cat.copy(deep = True)
all_tnx = all_tnx.astype({'num_case' : int, 'num_con' : int})
all_tnx['num_tot_samples'] = all_tnx['num_case'] + all_tnx['num_con']
all_tnx['mcc'] = 'mcc1'

In [None]:
#   Number of rows in clean Path data: 21,900

# Only the last expression of a cell is rendered; bare expressions in the
# middle are evaluated and discarded. Each check therefore goes through
# display() so it is actually shown.

# All orgs have 1,095 results
display(all_ukb['org'].value_counts(dropna = False))

# All phecodes have 20 results
# Shown as "results per phecode -> number of phecodes" to keep the output short.
display(all_ukb['phecode'].value_counts(dropna = False).value_counts())

# 1,095 unique Phecodes each with 20 orgs = 21,900
# 21,900
print(all_ukb.shape[0])

# All orgs have variable # results
display(all_tnx['org'].value_counts(dropna = False))

# All phecodes have variable # results
# Summarised with describe() instead of listing every phecode.
display(all_tnx['phecode'].value_counts(dropna = False).describe())

# 437 Phecodes each with variable number of tests.
# 117,966- only ran UKB sig
print(all_tnx.shape[0])

## Align UKB and TNX Pathogen tags

In [None]:
# The UKB and TNX antigen names are already aligned; only the organism names
# need harmonising.

# Non-breaking spaces (\xa0) found in a couple of organism names
nbsp_fixes = {'MCV\xa0' : 'MCV', 
              'H.\xa0pylori' : 'H.pylori'}

# Every organism spelling used by either dataset -> one shared pathogen tag
org_to_tag_dict = {
                            'HSV1'           : 'hsv1',
                            'hsv_1'          : 'hsv1',
                            'HSV2'           : 'hsv2',
                            'hsv_2'          : 'hsv2',
                            'VZV'            : 'vzv',
                            'EBV'            : 'ebv',
                            'CMV'            : 'cmv',
                            'HHV-6'          : 'hhv6',
                            'hhv_6'          : 'hhv6',
                            'HHV-7'          : 'hhv7',
                            'hhv_7'          : 'hhv7',
                            'KSHV/HHV-8'     : 'kshv',
                            'HBV'            : 'hbv',
                            'HCV'            : 'hcv',
                            'T. gondii'      : 'tox',
                            'T.gondii'       : 'tox',
                            't_gond'         : 'tox',
                            'HTLV-1'         : 'htlv',
                            'BKV'            : 'bkv',
                            'JCV'            : 'jcv',
                            'MCV'            : 'mcv',
                            'HPV-16'         : 'hpv16',
                            'hpv_16'         : 'hpv16',
                            'HPV-18'         : 'hpv18',
                            'hpv_18'         : 'hpv18',
                            'C. trachomatis' : 'chlam',
                            'C.trachomatis'  : 'chlam',
                            'c_trach'        : 'chlam',

                            'H.pylori'       : 'hpylori',
                            'h_pylor'        : 'hpylori',

                            'HIV'            : 'hiv'
                        }

# Same two steps for both result tables: clean the organism name, then derive
# the tag from it. Names missing from the mapping pass through unchanged.
for res_df in (all_ukb, all_tnx):
    res_df.loc[:, 'org'] = res_df.loc[:, 'org'].replace(nbsp_fixes)
    res_df['tag'] = res_df.loc[:, 'org'].replace(org_to_tag_dict)

In [None]:
ukb_tags = set(all_ukb['tag'].unique().tolist())
tnx_tags = set(all_tnx['tag'].unique().tolist())

# Tags present in the UKB results but absent from the TNX results.
# Recorded output: {'mcv'} - no TNX results for MCV, as expected
print(ukb_tags - tnx_tags)

# Tags present in the TNX results but absent from the UKB results.
# Recorded output: {} - every TNX pathogen also appears in UKB
print(tnx_tags - ukb_tags)

# Do statistical power filtering

## Case and Total Sample Filtering

In [None]:
# Statistical power requirements used by the filtering step: the minimum number
# of cases and the minimum total sample count (cases + controls) a result must
# have to be kept.
# NOTE(review): how 17 and 187 were derived is not shown in this notebook.
MIN_CASE_THRESH = 17
MIN_TOT_SAMP_THRESH  = 187

# Tier 1 phecodes - the reverse would be expected negative

# | Phecode | Disease_Description        | Disease_Group       | ICD10(s) in PheCode     | ICDs not in ICD10-based Tier 1 | ICDs in ICD10-based Tier 1 | is_interesting | man_rev_interested | Tier 1 Pathogen | Notes                                                                                |
# |---------|----------------------------|---------------------|-------------------------|--------------------------------|----------------------------|----------------|--------------------|-----------------|--------------------------------------------------------------------------------------|
# | 053     | Herpes zoster              | infectious diseases | B02, G53                | G53                            | B02                        | Y              | Y                  | VZV             | G53.0: Postzoster neuralgia                                                          |
# | 054     | Herpes simplex             | infectious diseases | A60, B00, B08           | B08                            | A60, B00                   | Y              | Y                  | HSV1, HSV2      | B08.8: Other forms of   herpesviral infection                                        |
# | 070     | Viral   hepatitis          | infectious diseases | B17, B18, B19           | B17, B18                       | B19                        | Y              | Y                  | HBV, HCV        |                                                                                      |
# | 070.2   | Viral   hepatitis B        | infectious diseases | B16, B18                | B16, B18                       | -                          | Y              | Y                  | HBV             |                                                                                      |
# | 070.3   | Viral hepatitis C          | infectious diseases | B17, B18                | B17, B18                       | -                          | Y              | Y                  | HCV             |                                                                                      |
# | 070.9   | Hepatitis NOS              | infectious diseases | K71, K75, K76           | K71                            | K75, K76                   | Y              | Y                  | HBV, HCV        |                                                                                      |
# | 071     | HIV infection, symptomatic | infectious diseases | B20, B21, B22, B23, B24 | B20, B21, B22, B23             | B24                        | Y              | Y                  | HIV             | All codes mean they have HIV   infection, just usually indicate additional infection |
# | 078     | Viral warts & HPV          | infectious diseases | A63, B07                | A63, B07                       | -                          | Y              | Y                  | HPV16, HPV18    |                                                                                      |
# | 079.2   | Infectious mononucleosis   | infectious diseases | B27                     |                                | B27                        | Y              | Y                  | EBV             | Exact match B27 only                                                                 |

# Phecodes of the Tier 1 table, one entry per table row
tier_1_phecodes = [
    '053',    # Herpes zoster
    '054',    # Herpes simplex
    '070',    # Viral hepatitis
    '070.2',  # Viral hepatitis B
    '070.3',  # Viral hepatitis C
    '070.9',  # Hepatitis NOS
    '071',    # HIV infection, symptomatic
    '078',    # Viral warts & HPV
    '079.2',  # Infectious mononucleosis
]

In [None]:
# Recorded output of this cell:
# Total in UKB:  21900
# Total in UKB that meet case threshold [n >= 17]:  21900
# Total in TNX:  117966
# Total in TNX stds:  926
# Total in TNX non-stds:  117040
# Total in TNX standards that meet case threshold [n >= 17] and total sample threshold [n >= 187]:  864
# Total in TNX non-standards that meet case threshold [n >= 17] and total sample threshold [n >= 187]:  107908
# Total in combined TNX that meet thresholds: 108772

# --- UKB: keep Tier 1 phecodes unconditionally, everything else only when it
# --- meets both the case and the total-sample thresholds
print(f"Total in UKB:  {len(all_ukb)}")
ukb = all_ukb.copy(deep = True)
ukb_is_tier_1 = ukb['phecode'].isin(tier_1_phecodes)
ukb_is_powered = ((ukb['nCase'] >= MIN_CASE_THRESH) &
                  (ukb['num_tot_samples'] >= MIN_TOT_SAMP_THRESH))
ukb = ukb.loc[ukb_is_tier_1 | ukb_is_powered, :]

print(f"Total in UKB that meet case threshold [n >= {MIN_CASE_THRESH}]:  {len(ukb)}")


# --- TNX: Tier 1 ("standards") and all other phecodes are counted separately,
# --- but both must meet the same two thresholds
print(f"Total in TNX:  {len(all_tnx)}")
tnx_is_tier_1 = all_tnx['phecode'].isin(tier_1_phecodes)
tnx_is_powered = ((all_tnx['num_case'] >= MIN_CASE_THRESH) &
                  (all_tnx['num_tot_samples'] >= MIN_TOT_SAMP_THRESH))

tnx_stds = all_tnx.loc[tnx_is_tier_1, :]
tnx_non_stds = all_tnx.loc[~tnx_is_tier_1, :]
print(f"Total in TNX stds:  {len(tnx_stds)}")
print(f"Total in TNX non-stds:  {len(tnx_non_stds)}")

tnx_stds = all_tnx.loc[tnx_is_tier_1 & tnx_is_powered, :]
print(f"Total in TNX standards that meet case threshold [n >= {MIN_CASE_THRESH}] and total sample threshold [n >= {MIN_TOT_SAMP_THRESH}]:  {len(tnx_stds)}")

tnx_non_stds = all_tnx.loc[~tnx_is_tier_1 & tnx_is_powered, :]
print(f"Total in TNX non-standards that meet case threshold [n >= {MIN_CASE_THRESH}] and total sample threshold [n >= {MIN_TOT_SAMP_THRESH}]:  {len(tnx_non_stds)}")

# Standards first, then non-standards
tnx = pd.concat([tnx_stds, tnx_non_stds])
print(f"Total in combined TNX that meet thresholds: {len(tnx)}")

# Work on UKB

## Calculate per-disease FDR corrected NOMINAL p-value

In [None]:
# Benjamini-Hochberg FDR correction of the nominal UKB p-values, applied
# separately within each phecode.
#
# The table is grouped once and the per-phecode frames are concatenated, so
# every column keeps its dtype. Rebuilding the frame from `.values.tolist()`
# re-infers all dtypes from Python objects and rescans the whole table for
# every phecode.
fin_ukb_ls = [] 
for curr_phe, curr_dis_res in tqdm(ukb.groupby('phecode', sort = False)):
    # Copy so the new column is not written into a slice of `ukb`
    curr_dis_res = curr_dis_res.copy(deep = True)

    # multipletests returns (reject, corrected p-values, ...); keep the p-values
    curr_dis_res['per_dis_bh_fdr_corr_nom_p'] = mt(curr_dis_res['p_val'], 
                                                   alpha = 0.05, method = 'fdr_bh')[1]

    fin_ukb_ls.append(curr_dis_res)

if fin_ukb_ls:
    # sort = False above keeps phecodes in order of first appearance
    fin_ukb = pd.concat(fin_ukb_ls, ignore_index = True)
else:
    # No rows to correct: keep the expected columns on an empty frame
    fin_ukb = pd.DataFrame(columns = ukb.columns.tolist() + ['per_dis_bh_fdr_corr_nom_p'])

# Work on TNX results

## Limit TNX to selected UKB dis-org pairs

In [None]:
# For each key column print two sorted lists:
#   1) values present in TNX but missing from UKB
#   2) values present in UKB but missing from TNX
for heading, key_col in (("Orgs:", 'tag'),
                         ("Antibodies:", 'anti'),
                         ("Diseases:", 'phecode')):
    print(heading)

    in_tnx = set(tnx[key_col].unique().tolist())
    in_ukb = set(ukb[key_col].unique().tolist())

    print(sorted(list(in_tnx.difference(in_ukb))))
    print(sorted(list(in_ukb.difference(in_tnx))))

    # Blank line between sections, except after the last one
    if key_col != 'phecode':
        print()

In [None]:
# Some diseases in TNX that are not in UKB results most likely filtered out
# by minimum number of case/control requirements

# No TNX tests for the following orgs: 
# 'hhv7'
# 'htlv': Only org for U14 so that shows up in diff of Abs.
# 'mcv'

# No UKB results for the following Phecodes by BH Corrected NOMINAL P:

# No TNX diags for the following Phecodes:
# '008.52', '041.2', '041.21', '078', '081', '1000', '1001', '1006', 
# '1015', '110.12', '110.2', '1100', '112.3', '117', '157', '165.1', 
# '170', '170.2', '172', '172.1', '172.11', '184.1', '187', '189.1', 
# '189.11', '191', '191.1', '193', '198.5', '202.2', '202.24', '204.2', 
# '208', '212', '214', '215', '222', '225', '225.1', '227', '227.2', 
# '229', '242', '244.1', '246', '250.4', '250.5', '250.7', '251', 
# '252', '252.1', '256.4', '260.6', '261', '261.2', '271', '271.3', 
# '272.9', '274.11', '274.2', '275', '275.1', '276.1', '276.11', 
# '276.14', '276.4', '277', '277.4', '277.5', '278', '278.1', '279', 
# '280.2', '281.11', '281.12', '282', '284', '285.22', '286.7', 
# '287.31', '290', '290.1', '290.11', '290.2', '292', '292.3', '292.4', 
# '292.6', '295', '295.1', '296', '296.1', '296.2', '296.22', '300.1',
# '300.11', '300.13', '300.9', '301', '302.1', '303', '303.3', '305.2', 
# '306.9', '316', '324', '327', '327.3', '327.4', '327.41', '331', 
# '331.1', '333', '333.1', '340.1', '342', '344', '345.1', '345.11', 
# '345.12', '346', '348', '348.2', '348.7', '348.9', '361', '361.1', 
# '362.3', '362.31', '362.4', '364.4', '364.5', '365', '365.2', '366.2',
# '367', '367.9', '368', '368.2', '368.3', '368.9', '370', '371.1', 
# '371.3', '374.1', '374.3', '378.1', '378.5', '379.1', '379.3', '380',
# '380.1', '381.1', '382', '383', '385.3', '386.1', '386.2', '386.3', 
# '386.9', '388', '389', '389.2', '394.7', '395.4', '395.6', '396', 
# '401.21', '411.9', '414', '418.1', '420', '420.2', '425.1', '426', 
# '426.23', '426.3', '426.31', '426.32', '426.9', '426.91', '427.2', '427.3', 
# '427.41', '427.5', '427.7', '428', '429', '429.2', '429.3', '430.3', 
# '433', '433.2', '433.21', '433.3', '433.31', '440', '441.1', '444', 
# '444.1', '446', '446.5', '446.9', '454', '454.11', '458', '458.2', 
# '458.9', '465.4', '470', '471', '472', '473', '473.3', '480.5', '501',
# '504', '509.8', '512.1', '513', '513.4', '516.1', '520', '520.2', '522',
# '522.5', '523', '523.1', '523.3', '523.32', '525', '527', '527.2', 
# '528.1', '528.11', '528.12', '528.6', '529', '530.2', '530.9', '535.1', 
# '537', '540.1', '550.1', '550.3', '550.4', '555.1', '559', '560.3', 
# '564.9', '568', '568.1', '569.1', '569.2', '572', '573.3', '573.5', 
# '574.12', '575.1', '578.1', '578.2', '578.9', '579', '580.14', '580.3', 
# '580.32', '585.3', '586.1', '586.11', '590', '594', '594.1', '594.2', 
# '594.3', '596.1', '598.9', '599', '599.2', '599.9', '600', '601', 
# '601.12', '601.4', '610.2', '610.4', '612', '612.2', '613', '613.1', 
# '613.7', '613.8', '613.9', '614.1', '614.51', '614.53', '615', '618.2',
# '619.2', '619.3', '619.5', '621', '622', '622.1', '622.2', '623', 
# '625.1', '626.11', '626.12', '626.14', '626.4', '626.8', '627', '627.4',
# '628', '634.3', '635', '635.2', '635.3', '645', '647', '651', '652', 
# '653', '679', '681.2', '686.2', '686.4', '687.1', '690', '690.1', '691',
# '695.3', '695.8', '696.3', '696.42', '701.2', '701.5', '704.2', '704.8',
# '705.1', '709', '709.2', '709.7', '716.2', '720', '721.8', '722.1', 
# '722.7', '723', '723.1', '724', '724.9', '726.3', '728', '728.7', 
# '728.71', '732.1', '735.2', '735.23', '736', '736.2', '738', '740.11', 
# '742', '742.9', '743.2', '747.11', '747.12', '751', '751.11', '751.22', 
# '752', '752.1', '755', '756', '766', '771', '772', '781', '797', 
# '798.1', '800', '800.2', '800.3', '801', '803.1', '803.3', '804', 
# '830', '840', '850', '851', '854', '857', '870.1', '870.5', '871', 
# '913', '915', '916', '949', '958', '960', '960.2', '979', '990'

## First Limit TNX Results to just UKB Significant Pairs

In [None]:
# Rows in the power-filtered TNX table before restricting to UKB pairs.
# Recorded output: 108,772
# (an older note on this cell recorded 69,595 - presumably from an earlier run)
print(len(tnx))

key_cols = ['phecode', 'tag', 'anti']

# UKB (phecode, pathogen tag, antigen) triples to keep, in UKB order. Triples
# with a missing key are dropped first: an equality test can never match a
# missing value, whereas a merge would pair missing keys with each other.
ukb_keys = fin_ukb.loc[:, key_cols].dropna()

# A single inner join replaces one full scan of `tnx` per UKB row. The join
# keeps the order of the left (UKB) keys; the columns are then put back into
# the TNX column order.
filt_tnx = ukb_keys.merge(tnx, how = 'inner', on = key_cols)
filt_tnx = filt_tnx.loc[:, tnx.columns.tolist()].reset_index(drop = True)

# 108,772
print(len(filt_tnx))

In [None]:
pair_cols = ['phecode', 'tag', 'anti']

# Unique (phecode, tag, anti) combinations in the UKB results
# Recorded output: 21,900
print(len(fin_ukb.drop_duplicates(subset = pair_cols)))

# Unique (phecode, tag, anti) combinations left in the filtered TNX results
# Recorded output: 11,218
print(len(filt_tnx.drop_duplicates(subset = pair_cols)))

## Calculate TNX per-phecode FDR corrected NOMINAL p-value

In [None]:
# Make sure the p-values are numeric before correcting them. Plain column
# assignment replaces the column together with its dtype, whereas
# `.loc[:, col] = ...` writes into the existing column and can keep the old dtype.
filt_tnx['p_val'] = filt_tnx['p_val'].astype(float)

# Benjamini-Hochberg FDR correction of the TNX p-values, applied separately
# within each phecode. Grouping once and concatenating the per-phecode frames
# keeps column dtypes and avoids rescanning the table for every phecode.
fin_tnx_ls = [] 
for curr_phecode, curr_dis_res in tqdm(filt_tnx.groupby('phecode', sort = False)):
    # Copy so the new column is not written into a slice of `filt_tnx`
    curr_dis_res = curr_dis_res.copy(deep = True)

    # multipletests returns (reject, corrected p-values, ...); keep the p-values
    curr_dis_res['per_dis_bh_fdr_corr_p'] = mt(curr_dis_res['p_val'], 
                                                   alpha = 0.05, method = 'fdr_bh')[1]

    fin_tnx_ls.append(curr_dis_res)

if fin_tnx_ls:
    # sort = False above keeps phecodes in order of first appearance
    fin_tnx = pd.concat(fin_tnx_ls, ignore_index = True)
else:
    # No rows to correct: keep the expected columns on an empty frame
    fin_tnx = pd.DataFrame(columns = filt_tnx.columns.tolist() + ['per_dis_bh_fdr_corr_p'])

# Merge cleaned UKB and cleaned TNX results

In [None]:
# Prefix every column with its data source so the two tables can sit side by
# side after the merge without name clashes
fin_ukb = fin_ukb.add_prefix('ukb_')
fin_tnx = fin_tnx.add_prefix('tnx_')

In [None]:
# Recorded output of this cell:
# nrow UKB: 21900
# nrow TNX: 108772
# nrow combo: 119454
print(f"nrow UKB: {len(fin_ukb)}")
print(f"nrow TNX: {len(fin_tnx)}")

# Join on (phecode, tag, anti). A left join keeps every UKB row, including the
# ones without any TNX test.
merge_keys = ['phecode', 'tag', 'anti']
combo = pd.merge(fin_ukb, fin_tnx,
                 how = 'left',
                 left_on = [f'ukb_{key}' for key in merge_keys],
                 right_on = [f'tnx_{key}' for key in merge_keys])

print(f"nrow combo: {len(combo)}")

In [None]:
# Number of merged rows that carry a TNX result
# Recorded output: 108,772
int(combo['tnx_phecode'].notna().sum())

In [None]:
# Columns to keep, in output order, built up group by group
pair_cols = ['ukb_Disease_Description', 'ukb_Disease_Group', 'ukb_phecode', 
             'ukb_tag', 'ukb_anti',  'ukb_std_lev']

tnx_test_cols = ['tnx_test_type', 'tnx_test_id', 'tnx_test', 'tnx_mod_method']

cohort_cols = ['ukb_sex_specific_dis', 'ukb_nCase', 'ukb_nControl',
               'ukb_control_set', 
               'tnx_dis_sex_str', 'tnx_num_case', 'tnx_num_con',
               'tnx_con_str', 'tnx_n_mixed']

p_value_cols = ['ukb_per_dis_bh_fdr_corr_nom_p', 'ukb_p_val',
                'tnx_per_dis_bh_fdr_corr_p', 'tnx_p_val']

odds_ratio_cols = ['ukb_anti_OR',  'tnx_OR']

model_cols = ['ukb_sig_covs', 'ukb_cov_adj_for',
              'ukb_Warnings', 'ukb_is_warning',
              'tnx_glm_warn_msg', 'tnx_glm_warn_bool',
              'ukb_model', 'tnx_model',
              'ukb_cov_ps',  'ukb_cov_ors', 'tnx_cov_adj',
              'tnx_cov_ps', 'tnx_cov_or',
              'ukb_anti_CI',  'tnx_CI']

ukb_summary_cols = ['ukb_avg_age_case',  'ukb_avg_avg_con', 
                    'ukb_avg_titer_case', 'ukb_avg_titer_con',
                    'ukb_std_titer_case', 'ukb_std_titer_con', 
                    'ukb_med_titer_case', 'ukb_med_titer_con']

tnx_summary_cols = ['tnx_case_age', 'tnx_con_age', 'tnx_case_titer', 'tnx_con_titer',
                    'tnx_case_titer_std', 'tnx_con_titer_std',
                    'tnx_case_titer_med', 'tnx_con_titer_med',
                    'tnx_n_con_neg', 'tnx_n_con_pos', 'tnx_n_case_neg', 'tnx_n_case_pos',
                    'tnx_log_trans', 
                    'tnx_note_str']

keep_cols = (pair_cols + tnx_test_cols + cohort_cols + p_value_cols +
             odds_ratio_cols + model_cols + ukb_summary_cols + tnx_summary_cols)

# The pair-level columns are taken from the UKB side and lose their prefix
shared_names = {
    'ukb_Disease_Description' : 'Disease_Description',
    'ukb_Disease_Group' : 'Disease_Group',
    'ukb_phecode' : 'phecode',
    'ukb_tag' : 'org',
    'ukb_anti' : 'anti',
    'ukb_std_lev' : 'std_lev',
}

combo = combo.loc[:, keep_cols].rename(columns = shared_names)

In [None]:
# Consistency checks on the merged table. Both are limited to rows that have
# a TNX test (tnx_test_type not null).
has_tnx_test = combo['tnx_test_type'].notnull()

# Rows where the disease sex differs between UKB and TNX
# Recorded result: all good - empty
sex_differs = combo['ukb_sex_specific_dis'] != combo['tnx_dis_sex_str']
print(combo.loc[sex_differs & has_tnx_test, :])

# Rows where the control set differs between UKB and TNX
# Recorded result: all good - empty 
controls_differ = combo['ukb_control_set'] != combo['tnx_con_str']
print(combo.loc[controls_differ & has_tnx_test, :])

# Do some cleaning

## Convert some columns to ints

In [None]:
# Number of rows with a non-null tnx_n_mixed value
# Recorded output: 108,772
int(combo['tnx_n_mixed'].notna().sum())

In [None]:
# Count columns to store with pandas' nullable integer dtype ("Int64"), which
# can hold the missing values of UKB rows that have no matching TNX test.
int_cols = ['ukb_nCase', 'ukb_nControl', 
            'tnx_num_case', 'tnx_num_con',
            'tnx_n_mixed',
            'tnx_n_con_neg', 'tnx_n_con_pos',
            'tnx_n_case_neg', 'tnx_n_case_pos']

# Cast with DataFrame.astype and rebind the frame. Assigning through
# `combo.loc[:, cols] = ...` writes the converted values into the existing
# columns, so the previous dtype can survive the assignment - the behaviour
# the earlier per-column workaround (https://stackoverflow.com/a/60024263) was
# trying to get around.
combo = combo.astype({col : "Int64" for col in int_cols})

In [None]:
# Verify type conversion: dtype and non-null count of every count column
converted_cols = ['ukb_nCase', 'ukb_nControl', 
                  'tnx_num_case', 'tnx_num_con', 'tnx_n_mixed', 
                  'tnx_n_con_neg', 'tnx_n_con_pos', 
                  'tnx_n_case_neg', 'tnx_n_case_pos']
combo[converted_cols].info()

## Annotate risk

In [None]:
# Odds ratios as floats. Plain column assignment replaces each column together
# with its dtype; `.loc[:, cols] = ...` writes into the existing columns and
# can leave the old dtype in place.
for or_col in ['ukb_anti_OR', 'tnx_OR']:
    combo[or_col] = combo[or_col].astype(float)

# A result is flagged as "risk" when its odds ratio is above 1. An odds ratio
# of 1 or below, or a missing one (no TNX test), leaves the flag False.
combo['ukb_risk'] = combo['ukb_anti_OR'] > 1
combo['tnx_risk'] = combo['tnx_OR'] > 1

# Collapse to best TNX result

## Make small function

In [None]:
def make_small(num, digits):
    """Round ``num``, switching to scientific notation for extreme magnitudes.

    Works like ``round(num, digits)`` while the magnitude of ``num`` lies
    within ``[10 ** -(digits - 1), 10 ** (digits - 1)]``. Outside that range
    the value is rendered in scientific notation with ``digits - 1`` decimals
    in the significand (so ``digits`` significant figures in total).

    Parameters
    ----------
    num : int or float
        Value to format.
    digits : int
        Number of digits to keep; must be at least 1.

    Returns
    -------
    float, str or None
        The rounded number, or a scientific-notation string when ``num`` is
        outside the range above. ``None`` (after printing a message) when
        ``digits`` is smaller than 1.
    """
    if digits < 1:
        print("Requires positive number of digits")
        return

    dig_min_1 = digits - 1

    low_bound = 1 / (10 ** (dig_min_1))
    up_bound = 10 ** (dig_min_1)

    # Compare magnitudes: with a signed comparison every negative number is
    # below `low_bound` and would always be pushed into scientific notation.
    magnitude = abs(num)

    if (magnitude < low_bound) or (magnitude > up_bound):
        return "{:.{}e}".format(num, dig_min_1)

    return round(num, digits)

## Actual loop to collapse TNX results

In [None]:
# Collapse the merged table to one row per (phecode, org) pair.
#
# For each pair the most significant TNX test (smallest corrected p-value) whose
# odds ratio points in the same direction as the UKB result is kept as the
# "best" row. When no TNX test agrees in direction, the most significant test
# in the opposite direction is kept instead. The remaining tests are summarised
# as formatted text in two extra fields, 'other_test_str' and 'or_flip_tnx_tests'.
#
# NOTE(review): rows are collected as plain lists and only labelled with column
# names when `fin_combo` is built, so the ORDER in which the two extra fields
# are added to each row decides which column they end up in.
phe_org_ls = combo.loc[:, ['phecode', 'org']].drop_duplicates().values.tolist()

fin_combo_ls = []
for curr_phe, curr_org in tqdm(phe_org_ls):
    
    #print(f"{curr_icd} {curr_org}")
    # All merged rows (UKB result x TNX tests) for this phecode-organism pair
    curr_res = combo.loc[((combo['phecode'] == curr_phe) &
                          (combo['org'] == curr_org)), :].copy(deep = True)

    
    # No TNX tests: the left merge produced a single row with empty TNX columns.
    # Keep it as-is, with both summary fields empty.
    if (len(curr_res) == 1) & (all(curr_res['tnx_test_id'].isna())):

        curr_res['other_test_str'] = ''
        curr_res['or_flip_tnx_tests'] = ''

        fin_combo_ls.extend(curr_res.values.tolist())

        continue
    
    
    # Direction of the UKB result (True = odds ratio above 1); the first unique
    # value is used for the whole pair
    is_ukb_risk = curr_res['ukb_risk'].unique().tolist()[0]

    # Risk: TNX tests that are also risk agree, the others are "opposite"
    if is_ukb_risk:

        curr_tnx_res = curr_res.loc[curr_res['tnx_risk'] == True, :]
        curr_tnx_opp_res = curr_res.loc[curr_res['tnx_risk'] == False, :]

    # Protective: TNX tests that are not risk agree, the others are "opposite"
    else:

        curr_tnx_res = curr_res.loc[curr_res['tnx_risk'] == False, :]
        curr_tnx_opp_res = curr_res.loc[curr_res['tnx_risk'] == True, :]

    # Same-direction tests, most significant first
    curr_tnx_res = curr_tnx_res.sort_values(['tnx_per_dis_bh_fdr_corr_p'], ascending = True).reset_index(drop = True)

    
    # No tests in the same direction, so get most sig in opposite direction
    if len(curr_tnx_res) == 0:    
        curr_tnx_opp_res = curr_tnx_opp_res.sort_values(['tnx_per_dis_bh_fdr_corr_p'], ascending = True).reset_index(drop = True)
        # get most sig TNX p
        best_tnx = curr_tnx_opp_res.iloc[0].copy(deep = True)

        # Grab the rest of the tests (less sig)
        rest_tnx = curr_tnx_opp_res.iloc[1:].copy(deep = True)
        rest_tnx = rest_tnx.reset_index(drop = True)

        
        # Throw the rest of the tests in a column of best_tnx
        # NOTE(review): in this branch 'or_flip_tnx_tests' is added BEFORE
        # 'other_test_str', while `fin_combo` labels the last two positions as
        # ['other_test_str', 'or_flip_tnx_tests']. For these rows the two values
        # therefore land under the swapped column names - confirm which
        # labelling is intended.
        # NOTE(review): agg(..., axis = 1) yields one formatted string per row,
        # so what is assigned looks like a whole Series rather than a single
        # joined string - confirm the rendered cell is as intended.
        if len(rest_tnx) > 0:
            best_tnx['or_flip_tnx_tests'] =  rest_tnx.agg( lambda x: f"{x['tnx_test_id']} [{x['tnx_test']}, {x['tnx_mod_method']}]:  nCase: {x['tnx_num_case']} | nCon: {x['tnx_num_con']}, corr p-val: {make_small(x['tnx_per_dis_bh_fdr_corr_p'],3)}, uncorr p-val: {make_small(x['tnx_p_val'], 3)}, OR: {make_small(x['tnx_OR'], 3)} | model: {x['tnx_model']}, glm_warn [{x['tnx_glm_warn_bool']}]: {x['tnx_glm_warn_msg']} log10 trans: {x['tnx_log_trans']} | Notes: {x['tnx_note_str']}", axis = 1)
        else:
            best_tnx['or_flip_tnx_tests'] = ''


        # All opposite-direction tests, including the one picked as best
        if len(curr_tnx_opp_res) > 0:
            best_tnx['other_test_str'] =  curr_tnx_opp_res.agg( lambda x: f"{x['tnx_test_id']} [{x['tnx_test']}, {x['tnx_mod_method']}]:  nCase: {x['tnx_num_case']} | nCon: {x['tnx_num_con']}, corr p-val: {make_small(x['tnx_per_dis_bh_fdr_corr_p'],3)}, uncorr p-val: {make_small(x['tnx_p_val'], 3)}, OR: {make_small(x['tnx_OR'], 3)} | model: {x['tnx_model']}, glm_warn [{x['tnx_glm_warn_bool']}]: {x['tnx_glm_warn_msg']} log10 trans: {x['tnx_log_trans']} | Notes: {x['tnx_note_str']}", axis = 1)
        else:
            best_tnx['other_test_str'] = ''

        fin_combo_ls.append(best_tnx.values.tolist())
    
    
    else:
        # get most sig TNX p
        best_tnx = curr_tnx_res.iloc[0].copy(deep = True)

        # Grab the rest of the tests (less sig)
        rest_tnx = curr_tnx_res.iloc[1:].copy(deep = True)
        rest_tnx = rest_tnx.reset_index(drop = True)

        # Throw the rest of the tests in a column of best_tnx
        # Here the fields are added in the same order as the final column
        # labels ('other_test_str' first, then 'or_flip_tnx_tests').
        # NOTE(review): this format string has two spaces after the test id;
        # the other three have one.
        if len(rest_tnx) > 0:
            best_tnx['other_test_str'] =  rest_tnx.agg( lambda x: f"{x['tnx_test_id']}  [{x['tnx_test']}, {x['tnx_mod_method']}]:  nCase: {x['tnx_num_case']} | nCon: {x['tnx_num_con']}, corr p-val: {make_small(x['tnx_per_dis_bh_fdr_corr_p'],3)}, uncorr p-val: {make_small(x['tnx_p_val'], 3)}, OR: {make_small(x['tnx_OR'], 3)} | model: {x['tnx_model']}, glm_warn [{x['tnx_glm_warn_bool']}]: {x['tnx_glm_warn_msg']} log10 trans: {x['tnx_log_trans']} | Notes: {x['tnx_note_str']}", axis = 1)
        else:
            best_tnx['other_test_str'] = ''


        # Tests whose odds ratio points the other way from the UKB result
        if len(curr_tnx_opp_res) > 0:
            best_tnx['or_flip_tnx_tests'] =  curr_tnx_opp_res.agg( lambda x: f"{x['tnx_test_id']} [{x['tnx_test']}, {x['tnx_mod_method']}]:  nCase: {x['tnx_num_case']} | nCon: {x['tnx_num_con']}, corr p-val: {make_small(x['tnx_per_dis_bh_fdr_corr_p'],3)}, uncorr p-val: {make_small(x['tnx_p_val'], 3)}, OR: {make_small(x['tnx_OR'], 3)} | model: {x['tnx_model']}, glm_warn [{x['tnx_glm_warn_bool']}]: {x['tnx_glm_warn_msg']} log10 trans: {x['tnx_log_trans']} | Notes: {x['tnx_note_str']}", axis = 1)
        else:
            best_tnx['or_flip_tnx_tests'] = ''

        fin_combo_ls.append(best_tnx.values.tolist())
    
# Label the collected rows; the two summary fields are the last two columns
fin_combo = pd.DataFrame(fin_combo_ls, columns = combo.columns.tolist() + ['other_test_str', 'or_flip_tnx_tests'])

fin = fin_combo.copy(deep = True)

# TNX: 21,900 -> Matching the # UKB Phe-Org pairs
print(len(fin))

In [None]:
# Order the collapsed table by phecode, then organism, then UKB corrected p-value
fin = fin.sort_values(by = ['phecode', 'org', 'ukb_per_dis_bh_fdr_corr_nom_p'],
                      ascending = True)

# Mark replication status for each Disease-Pathogen Pair

In [None]:
# Significance cut-offs applied to the per-phecode FDR-corrected p-values
UKB_THRESH = 0.3
TNX_THRESH = 0.01

org_lev_res = fin.copy(deep = True)

# Building blocks for the status rules
ukb_corr_p = org_lev_res['ukb_per_dis_bh_fdr_corr_nom_p']
tnx_corr_p = org_lev_res['tnx_per_dis_bh_fdr_corr_p']

ukb_sig = ukb_corr_p < UKB_THRESH
tnx_sig = tnx_corr_p < TNX_THRESH
tnx_not_sig = tnx_corr_p >= TNX_THRESH
no_tnx_test = org_lev_res['tnx_p_val'].isna()
direction_differs = org_lev_res['ukb_risk'] != org_lev_res['tnx_risk']

# The rules are applied in this order; a later rule overwrites an earlier one.
# Default: 'did_not_attempt'
org_lev_res['rep_stat'] = 'did_not_attempt'

# Rep: UKB and TNX sig
org_lev_res.loc[ukb_sig & tnx_sig, 'rep_stat'] = 'replicated'

# Could not: UKB sig but no TNX test
org_lev_res.loc[ukb_sig & no_tnx_test, 'rep_stat'] = 'could_not'

# Did not: UKB sig and either (TNX not sig) or (TNX sig but OR in opposite direction of UKB)
org_lev_res.loc[ukb_sig & (tnx_not_sig | (tnx_sig & direction_differs)),
                'rep_stat'] = 'did_not'

# Rename the standard levels to their reporting labels
std_lev_labels = {
    'true_neg' : 'exp_neg',
    'Gold' : 'Tier 1',
    'Silver' : 'Tier 2',
    'unk'   : 'unk'
}
org_lev_res.loc[:, 'std_lev'] = org_lev_res.loc[:, 'std_lev'].replace(std_lev_labels)

# A pair counts as associated only when it replicated
org_lev_res['pair_is_associated'] = np.where(org_lev_res['rep_stat'] == 'replicated', 'Yes', 'No')

In [None]:
# Set all did not attempt TNX stuff to NA: rows still carrying the default
# 'did_not_attempt' status keep their UKB fields only.
did_not_attempt = org_lev_res['rep_stat'] == 'did_not_attempt'

# The merged table stores the TNX disease sex as 'tnx_dis_sex_str'. Writing to
# the non-existent 'tnx_dis_sex' through .loc silently created a brand-new
# column that was NaN on every row and left the real values untouched. The
# values are published under 'tnx_dis_sex' (the name selected when the columns
# are re-ordered afterwards) and both spellings are blanked below.
if 'tnx_dis_sex' not in org_lev_res.columns:
    org_lev_res['tnx_dis_sex'] = org_lev_res['tnx_dis_sex_str']

tnx_cols_to_blank = [
    'tnx_test_type', 'tnx_test_id', 'tnx_test', 'tnx_mod_method',
    'tnx_dis_sex', 'tnx_dis_sex_str',
    'tnx_num_case', 'tnx_num_con', 'tnx_con_str', 'tnx_n_mixed',
    'tnx_per_dis_bh_fdr_corr_p', 'tnx_p_val', 'tnx_OR',
    'tnx_glm_warn_msg', 'tnx_glm_warn_bool', 'tnx_model',
    'tnx_cov_adj', 'tnx_cov_ps', 'tnx_cov_or', 'tnx_CI',
    'tnx_case_age', 'tnx_con_age', 'tnx_case_titer', 'tnx_con_titer',
    'tnx_case_titer_std', 'tnx_con_titer_std',
    'tnx_case_titer_med', 'tnx_con_titer_med',
    'tnx_n_con_neg', 'tnx_n_con_pos', 'tnx_n_case_neg', 'tnx_n_case_pos',
    'tnx_log_trans', 'tnx_note_str',
    'other_test_str', 'or_flip_tnx_tests',
]

# One column at a time, so every column keeps its own dtype handling
for tnx_col in tnx_cols_to_blank:
    org_lev_res.loc[did_not_attempt, tnx_col] = np.nan

# The risk flag is blanked with an empty string rather than NaN
org_lev_res.loc[did_not_attempt, 'tnx_risk'] = ''

## Re-org some cols

In [None]:
# Column order for the organism-level results. Any column not listed here is
# dropped, and a missing column raises a KeyError.
org_lev_col_order = [
    'Disease_Description', 'Disease_Group', 'phecode', 'org', 'anti',
    'pair_is_associated', 'std_lev', 'rep_stat',
    'tnx_test_type', 'tnx_test_id', 'tnx_test', 'tnx_mod_method',
    'ukb_sex_specific_dis', 'ukb_nCase', 'ukb_nControl', 'ukb_control_set',
    'tnx_dis_sex', 'tnx_num_case', 'tnx_num_con', 'tnx_con_str', 'tnx_n_mixed',
    'ukb_per_dis_bh_fdr_corr_nom_p', 'tnx_per_dis_bh_fdr_corr_p',
    'ukb_p_val', 'tnx_p_val',
    'ukb_anti_OR', 'tnx_OR',
    'ukb_sig_covs', 'ukb_cov_adj_for',
    'ukb_Warnings', 'ukb_is_warning',
    'tnx_glm_warn_msg', 'tnx_glm_warn_bool',
    'ukb_model', 'tnx_model',
    'ukb_cov_ps', 'ukb_cov_ors',
    'tnx_cov_adj', 'tnx_cov_ps', 'tnx_cov_or',
    'ukb_anti_CI', 'tnx_CI',
    'ukb_avg_age_case', 'ukb_avg_avg_con',
    'ukb_avg_titer_case', 'ukb_avg_titer_con',
    'ukb_std_titer_case', 'ukb_std_titer_con',
    'ukb_med_titer_case', 'ukb_med_titer_con',
    'tnx_case_age', 'tnx_con_age',
    'tnx_case_titer', 'tnx_con_titer',
    'tnx_case_titer_std', 'tnx_con_titer_std',
    'tnx_case_titer_med', 'tnx_con_titer_med',
    'tnx_n_con_neg', 'tnx_n_con_pos', 'tnx_n_case_neg', 'tnx_n_case_pos',
    'tnx_log_trans', 'tnx_note_str',
    'ukb_risk', 'tnx_risk',
    'other_test_str', 'or_flip_tnx_tests',
]

org_lev_res = org_lev_res.loc[:, org_lev_col_order]

### Assemble final File

In [None]:
# The additional metadata that we add in the other notebook,
# ukb_tnx_icd_combining_results_pub.ipynb, is keyed on ICD codes, and I don't
# think there is a way to change that, so we can't run that notebook on these
# results.

# Instead, we output our final file from here.

In [None]:
# Work on an independent (deep) copy so org_lev_res itself is left untouched
integ_res = org_lev_res.copy()

### Detailed Organism Tagging

In [None]:
# Load the organism/antigen dictionary; 'Baltimore' is read as a nullable
# integer column ("Int64")
ant_dict = pd.read_excel(
    f'{HOME_DIR}/dicts/antigen_dict.xlsx',
    sheet_name = 'Sheet1',
    engine = 'openpyxl',
    dtype = {'Baltimore' : "Int64"},
)

# Fix minor issue with space coding: first swap the exact non-breaking-space
# spelling of H. pylori for a regular space, then strip any remaining
# non-ASCII characters from the abbreviations
abbrev_clean = (
    ant_dict.loc[:, 'Abbrev']
    .replace({'H.\xa0pylori' : 'H. pylori'})
    .replace({r'[^\x00-\x7F]+' : ''}, regex = True)
)
ant_dict.loc[:, 'Abbrev'] = abbrev_clean

In [None]:
# Map the dictionary's organism abbreviations onto the short organism names
# used in the results ('org' column). Abbreviations without an entry here are
# carried over unchanged.
abbrev_to_tag = {
    'C. trachomatis' : 'chlam',
    'EBV'            : 'ebv',
    'H. pylori'      : 'hpylori',
    'H.pylori'       : 'hpylori',
    'HBV'            : 'hbv',
    'HCV'            : 'hcv',
    'HSV1'           : 'hsv1',
    'HSV2'           : 'hsv2',
    'CMV'            : 'cmv',
    'HHV-6'          : 'hhv6',
    'HHV-7'          : 'hhv7',
    'HIV'            : 'hiv',
    'JCV'            : 'jcv',
    'HTLV-1'         : 'htlv',
    'T. gondii'      : 'tox',
    'VZV'            : 'vzv',
    'KSHV/HHV-8'     : 'kshv',
    'HPV-16'         : 'hpv16',
    'HPV-18'         : 'hpv18',
    'BKV'            : 'bkv',
    'MCV'            : 'mcv',
}

ant_dict.loc[:, 'tag'] = ant_dict.loc[:, 'Abbrev'].replace(abbrev_to_tag)

In [None]:
# Check that the organisms in the results and the tags in the dictionary
# agree in both directions. Output recorded by the original author:
# set()
# set()
res_orgs = set(integ_res['org'].unique().tolist())
dict_tags = set(ant_dict['tag'].unique().tolist())

print(res_orgs - dict_tags)
print(dict_tags - res_orgs)

In [None]:
# Keep only the join key ('tag') and the organism annotation columns
ant_cols = ['tag', 'Organism Type', 'Family', 'Sub-family', 'Species',
            'Baltimore', 'Balt_reason', 'Herpes']
ant = ant_dict.loc[:, ant_cols]

In [None]:
# Shape before de-duplication (recorded by the original author): 45 x 8
print(ant.shape)

# Collapse fully identical rows
ant = ant.drop_duplicates()

# Shape after de-duplication (recorded by the original author): 20 x 8
print(ant.shape)

#### Merging

In [None]:
# 21,900 x 68
print(integ_res.shape)

# Left-join the organism annotations onto the results by organism name.
# validate = 'many_to_one' makes pandas raise a MergeError if any tag occurs
# more than once in ant; without it, a duplicated tag would silently
# duplicate result rows. The join must leave the row count unchanged and add
# the 8 columns of ant.
integ_res = pd.merge(integ_res, ant, left_on = 'org',
                     right_on = 'tag', how = 'left',
                     validate = 'many_to_one')

# 21,900 x 76
print(integ_res.shape)

### Rearrange some columns

In [None]:
# Rename columns to their final, publication-facing labels.
#
# DataFrame.rename applies the mapping once per existing label; renames are
# NOT chained. Every source column therefore maps straight to its final name
# (e.g. 'ukb_anti_OR' -> 'UKB OR', not via an intermediate 'ukb_OR').
# 'ukb_anti_CI', 'tnx_CI' and 'Organism Type' are renamed here as well so that
# the 'UKB CI', 'TNX CI' and 'Pathogen Type' columns selected in the next cell
# actually exist.
final_col_names = {
    # Identifiers
    'tag'                           : 'Pathogen',
    'phecode'                       : 'Phecode',
    'anti'                          : 'Antibody',

    # Pair-level status
    'pair_is_associated'            : 'Pair is Associated',
    'std_lev'                       : 'Standard Level',
    'rep_stat'                      : 'Replication Status',

    # Headline statistics
    'ukb_per_dis_bh_fdr_corr_nom_p' : 'UKB FDR',
    'tnx_per_dis_bh_fdr_corr_p'     : 'TNX FDR',
    'ukb_anti_OR'                   : 'UKB OR',
    'tnx_OR'                        : 'TNX OR',
    'ukb_anti_CI'                   : 'UKB CI',
    'tnx_CI'                        : 'TNX CI',
    'ukb_nCase'                     : 'UKB nCase',
    'ukb_nControl'                  : 'UKB nControl',
    'tnx_num_case'                  : 'TNX nCase',
    'tnx_num_con'                   : 'TNX nControl',

    # TNX test description
    'tnx_test_id'                   : 'TNX Test ID',
    'tnx_test'                      : 'TNX Test',
    'tnx_mod_method'                : 'TNX Model Meth',
    'tnx_con_str'                   : 'tnx_control_set',

    # Covariates and ages
    'ukb_cov_adj_for'               : 'ukb_covs',
    'tnx_cov_adj'                   : 'tnx_covs',
    'ukb_avg_avg_con'               : 'ukb_avg_age_con',
    'tnx_case_age'                  : 'tnx_avg_age_case',
    'tnx_con_age'                   : 'tnx_avg_age_con',

    # GLM warnings
    'ukb_is_warning'                : 'ukb_glm_warn_bool',
    'ukb_Warnings'                  : 'ukb_glm_warn_msg',

    # Organism annotations merged in from the antigen dictionary
    'Organism Type'                 : 'Pathogen Type',
    'Family'                        : 'Pathogen Family',
    'Sub-family'                    : 'Pathogen Sub-family',
    'Species'                       : 'Pathogen Species',
    'Balt_reason'                   : 'Pathogen Baltimore Reason',
}

integ_res = integ_res.rename(columns = final_col_names)






In [None]:
# Final column selection and order for the exported results.
#
# Only post-rename labels may appear here: 'tnx_test_id', 'tnx_test' and
# 'tnx_mod_method' were renamed to 'TNX Test ID', 'TNX Test' and
# 'TNX Model Meth' in the cell above, so listing the old names as well would
# raise a KeyError.
final_col_order = [
    'Disease_Description', 'Disease_Group', 'Phecode',
    'Pathogen', 'Antibody',
    'Pair is Associated', 'Standard Level', 'Replication Status',
    'UKB FDR', 'TNX FDR',
    'UKB OR', 'TNX OR', 'UKB CI', 'TNX CI',
    'UKB nCase', 'UKB nControl', 'TNX nCase', 'TNX nControl',
    'TNX Test ID', 'TNX Test', 'TNX Model Meth',
    'ukb_covs', 'tnx_covs',
    'ukb_control_set', 'tnx_control_set', 'tnx_n_mixed',
    'ukb_model', 'tnx_model',
    'ukb_avg_titer_case', 'ukb_avg_titer_con',
    'ukb_std_titer_case', 'ukb_std_titer_con',
    'ukb_med_titer_case', 'ukb_med_titer_con',
    'tnx_n_con_neg', 'tnx_n_con_pos',
    'tnx_n_case_neg', 'tnx_n_case_pos',
    'ukb_avg_age_case', 'ukb_avg_age_con',
    'tnx_avg_age_case', 'tnx_avg_age_con',
    'ukb_p_val', 'tnx_p_val',
    'ukb_sig_covs', 'ukb_cov_ps', 'ukb_cov_ors',
    'tnx_cov_ps', 'tnx_cov_or',
    'tnx_note_str', 'other_test_str', 'or_flip_tnx_tests',
    'ukb_glm_warn_bool', 'ukb_glm_warn_msg',
    'tnx_glm_warn_bool', 'tnx_glm_warn_msg',
    'Pathogen Type', 'Pathogen Family',
    'Pathogen Sub-family', 'Pathogen Species',
]

integ_res = integ_res.loc[:, final_col_order]

In [None]:
# Data dictionary written to the 'Dictionary' sheet of the supplemental
# dataset: one entry per exported results column (column name -> meaning).
data_dict = {
 # Identifiers
 'Disease_Description' : 'Phecode disease description',
 'Disease_Group' : 'Phecode broad grouping',
 'Phecode' : 'Phecode',
 'Pathogen' : 'Organism Abbreviation',
 'Antibody': 'Antibody with best UKB association result',

 # Pair-level status
 'Pair is Associated' : 'Boolean indicating whether this pair is replicated',
 'Standard Level' : 'Whether and what kind of standard this disease-organism pair is',
 'Replication Status' : 'The UKB-TNX replication status for this dis-org pair',

 # Headline statistics
 'UKB FDR' : 'Disease-wide BH FDR adjusted UKB nominal p-value',
 'TNX FDR' : 'Disease-wide BH FDR adjusted TNX nominal p-value',
 'UKB OR' : 'Odds ratio for UKB titer [continuous]',
 'TNX OR' : 'Odds ratio for TNX association test [categorical]',
 'UKB CI' : '95% confidence intervals for UKB odds ratio',
 'TNX CI' : '95% confidence intervals for TNX odds ratio',
 'UKB nCase' : 'Number of cases in UKB analysis',
 'UKB nControl' : 'Number of controls in UKB analysis',
 'TNX nCase' : 'Number of cases in TNX test analysis',
 'TNX nControl' : 'Number of controls in TNX test analysis',

 # TNX test description
 'TNX Test ID':  'LOINC code for TNX test used for replication',
 'TNX Test' : 'Name of TNX test used for replication',
 'TNX Model Meth' : 'Logistic regression method used, either GLM or Firth (exact test)',

 # Model set-up
 'ukb_covs' : 'Confounders that UKB logistic regression model was adjusted for',
 'tnx_covs' : 'Confounders that TNX logistic regression model was adjusted for, ideally this is the same as ukb_covs, but did not have access to data for some of the possible confounders in the TNX data set',
 'ukb_control_set' : "Controls used for UKB analysis (all, Female, Male, 'O80,O81,O82,O83,O84' (healthy pregnancies)",
 'tnx_control_set' : "Controls used for TNX analysis (all, Female, Male, 'O80,O81,O82,O83,O84' (healthy pregnancies)",
 'tnx_n_mixed' : 'Number of TNX participants that fell into cases and controls',
 'ukb_model' : 'Full UKB logistic regression model, in the form coefficient * coef_name [coef p-value] ',
 'tnx_model': 'Full TNX logistic regression model, in the form coefficient * coef_name [coef p-value] ',

 # Titer summaries (UKB) and test-result counts (TNX)
 'ukb_avg_titer_case' : 'Average titer value for UKB cases',
 'ukb_avg_titer_con' : 'Average titer value for UKB controls',
 'ukb_std_titer_case': 'Standard deviation for titer value for UKB cases',
 'ukb_std_titer_con': 'Standard deviation for titer value for UKB controls',
 'ukb_med_titer_case': 'Median titer value for UKB cases',
 'ukb_med_titer_con': 'Median titer value for UKB controls',
 'tnx_n_con_neg' : 'Number of disease controls with negative test result for this TNX test',
 'tnx_n_con_pos' : 'Number of disease controls with positive test result for this TNX test',
 'tnx_n_case_neg' : 'Number of disease cases with negative test result for this TNX test',
 'tnx_n_case_pos' : 'Number of disease cases with positive test result for this TNX test',

 # Ages
 'ukb_avg_age_case' : 'Average age of UKB disease cases',
 'ukb_avg_age_con' : 'Average age of UKB disease controls',
 'tnx_avg_age_case' : 'Average age of diseases cases for this TNX test',
 'tnx_avg_age_con' : 'Average age of diseases controls for this TNX test',

 # Nominal p-values (the TNX entry previously said "UKB" by copy-paste)
 'ukb_p_val' : 'Unadjusted nominal UKB p-value',
 'tnx_p_val' : 'Unadjusted nominal TNX p-value',

 # Covariate details
 'ukb_sig_covs' : 'Covariates that were significantly associated with both disease status and titer level in UKB. This may not be the same as ukb_covs because we applied a backwards elimination procedure to this list of significant covariates to end up with the final list of covariates to adjust for, ukb_covs.',
 'ukb_cov_ps' : 'P-values for covariates included in UKB model',
 'ukb_cov_ors' : 'Odds ratios for covariates included in UKB model',
 'tnx_cov_ps' : 'P-values for covariates included in TNX model',
 'tnx_cov_or' : 'Odds ratios for covariates included in TNX model',

 # Notes on the TNX tests
 'tnx_note_str' : 'Notes about TNX model',
 'other_test_str' : 'Less significant TNX tests that were also run for replication of this pair',
 # This key replaces a duplicated 'tnx_note_str' entry; the exported column
 # 'or_flip_tnx_tests' had no dictionary entry at all.
 # NOTE(review): the description is inferred from the column name - confirm wording.
 'or_flip_tnx_tests' : 'TNX tests for this pair flagged as having a flipped odds ratio direction',

 # GLM warnings
 'ukb_glm_warn_bool' : 'Boolean indicating if GLM threw warning for this UKB model',
 'ukb_glm_warn_msg' : 'Warning message if any, given by R glm model during UKB logistic regression modeling',
 'tnx_glm_warn_bool' : 'Boolean indicating if GLM threw warning for this TNX model',
 'tnx_glm_warn_msg': 'Warning message if any, given by R glm model during TNX logistic regression modeling',

 # Organism annotations
 'Pathogen Type' : 'What type of organism the current organism is, either virus or bacteria (T. gondii is not really a bacteria though)',
 'Pathogen Family' : 'Phylogenetic family of organism',
 'Pathogen Sub-family' : 'Phylogenetic sub-family of organism',
 'Pathogen Species' : 'Phylogenetic species of organism'
}

In [None]:
# The dictionary keys and the results columns should match exactly, so both
# differences are expected to print an empty set:
# set()
# set()
dict_cols = set(data_dict.keys())
res_cols = set(integ_res.columns.tolist())

print(dict_cols - res_cols)
print(res_cols - dict_cols)

# Additional filtering of Phecode Results

In [None]:
# Filter an independent (deep) copy so integ_res keeps the unfiltered results
res = integ_res.copy()

In [None]:
# Disease groups whose results are kept in the final file
grps_to_keep = [
    'circulatory system', 'dermatologic', 'digestive', 'endocrine/metabolic',
    'genitourinary', 'hematopoietic', 'mental disorders', 'musculoskeletal',
    'neoplasms', 'neurological', 'pregnancy complications', 'respiratory',
    'sense organs',
]

# Disease groups to remove: any infectious diseases except our standards, then
#           other things that are either not diseases or things we couldn't
#           detect (congenital anom.)
grps_to_rem = [
    'infectious diseases', 'injuries & poisonings', 'congenital anomalies',
    'symptoms',
]

# Diseases that appear in at least one expected-negative ('exp_neg') pair;
# these are kept even when their group is not in grps_to_keep
is_exp_neg = res['Standard Level'] == 'exp_neg'
inf_dis_to_keep = res.loc[is_exp_neg, 'Disease_Description'].unique().tolist()

### HIV

In [None]:
# Double check HIV among our Tier 1 and expected-negative standards.
# Rows with a Standard Level of 'NAN' need to be removed: per the original
# note, those are the HIV with Tier 1 results, which are not valid.

# Only the last expression of a cell is rendered automatically, so the first
# table is shown explicitly with display().

# Unknown    21780
# exp_neg      104
# Tier 1        10
# NAN            6
display(res['Standard Level'].value_counts(dropna = False))


# Standard Level of the HIV rows among the diseases we keep as standards
# std_lev
# NAN    6
is_hiv_standard_dis = ((res['Disease_Description'].isin(inf_dis_to_keep)) &
                       (res['Pathogen'] == 'hiv'))

res.loc[is_hiv_standard_dis, 'Standard Level'].value_counts(dropna = False)

In [None]:
# Looks good!

# Rows to keep: everything whose Standard Level is not the literal string 'NAN'
has_valid_std_lev = res['Standard Level'] != 'NAN'

# Preview of what remains after the filter. It is shown with display()
# because a bare expression in the middle of a cell produces no output.
# std_lev
# Unknown    21780
# exp_neg      104
# Tier 1        10
display(res.loc[has_valid_std_lev, 'Standard Level'].value_counts(dropna = False))

res = res.loc[has_valid_std_lev, :]


# Confirm the filtered frame matches the preview
# std_lev
# Unknown    21780
# exp_neg      104
# Tier 1        10
res['Standard Level'].value_counts(dropna = False)

### Disease Group

In [None]:
# Disease_Group
# digestive                  2460
# genitourinary              2360
# circulatory system         2240
# neoplasms                  1780
# sense organs               1640
# endocrine/metabolic        1540
# musculoskeletal            1360
# dermatologic               1320
# respiratory                1140
# neurological                960
# mental disorders            940
# hematopoietic               580
# pregnancy complications     500

# Remove
# infectious diseases         740
# injuries & poisonings      1100
# congenital anomalies        440
# symptoms                    520

# Take a closer look at. -> These can all be removed
    # NaN                                                                280
    # Burns                                                              20
    # Foreign body injury                                                20
    # Symptoms concerning nutrition, metabolism, and development         20
    # Other signs and symptoms involving emotional state                 20
    # Other symptoms                                                     20
    # Crushing injury                                                    20
    # Crushing or internal injury to organs                              20
    # Injury, NOS                                                        20
    # Other tests                                                        20
    # Complications of surgical and medical procedures                   20
    # Effects of heat, cold and air pressure                             20
    # Effects of other external causes                                   20
    # Other ill-defined and unknown causes of morbidity and mortality    20
    # Family history                                                     20

In [None]:
# Keep a row when its disease group is one we retain, or when its disease is
# one of the standards collected in inf_dis_to_keep. The mask is built once
# and reused for both previews and the final filter.
keep_row = ((res['Disease_Group'].isin(grps_to_keep)) |
            (res['Disease_Description'].isin(inf_dis_to_keep)))

# Standard Level counts after filtering. The column was renamed from
# 'std_lev' to 'Standard Level' earlier, so the new label must be used here.
# Shown with display() because nothing else in this cell renders output.
# std_lev
# Unknown    18820
# exp_neg      104
# Tier 1        10
display(res.loc[keep_row, 'Standard Level'].value_counts(dropna = False))


# We have 114 infectious diseases left which is equal to our
# number of positive and negative controls combined, so we are good.

# Disease_Group
# digestive                  2460
# genitourinary              2360
# circulatory system         2240
# neoplasms                  1780
# sense organs               1640
# endocrine/metabolic        1540
# musculoskeletal            1360
# dermatologic               1320
# respiratory                1140
# neurological                960
# mental disorders            940
# hematopoietic               580
# pregnancy complications     500
# infectious diseases         114
display(res.loc[keep_row, 'Disease_Group'].value_counts(dropna = False))


# Apply the filter
res = res.loc[keep_row, :]


# Everything looks good so let's write out the res

## Write our file out (supplemental_dataset_3.xlsx)

In [None]:
# Turn the {column name: meaning} mapping into a two-column table with one row
# per results column, in dictionary order.
data_dict = pd.DataFrame(list(data_dict.items()),
                         columns = ['Column Name', 'Meaning'])

In [None]:
# Write the filtered results and their data dictionary as two sheets of one
# workbook
supp_dataset_fp = f'{HOME_DIR}/manuscript/supplemental_datasets/supplemental_dataset_3.xlsx'

with pd.ExcelWriter(supp_dataset_fp, engine='openpyxl') as writer:
    for sheet_name, sheet_df in (('Results', res), ('Dictionary', data_dict)):
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)