# Load Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data science
import math
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.multitest import multipletests as mt

# Plots
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt

# Working with dates
from datetime import date,datetime
import dateutil

# Looping  progress
from tqdm.notebook import tqdm

# Reg expressions
import re

# Pretty table printing
import tabulate

# ***REMOVED*** Snippets Require these
import os
import subprocess

# Misc libraries
from IPython.display import display, HTML
#from IPython.core.display import display, HTML

# Seaborn defaults: figure size, font scale, and the plain "white" style
sns.set(rc={'figure.figsize': (11.7, 8.27)})
sns.set(font_scale=1.5)
sns.set_style("white")

# Lift the pandas display limits so whole dataframes (and full cell contents)
# are rendered instead of being truncated
for _opt_name, _opt_value in (
    ("display.max_rows", 10000),
    ("display.max_columns", 10000),
    ("display.max_colwidth", None),
):
    pd.set_option(_opt_name, _opt_value)

# Record the pandas version used for this run so that results can be tied
# back to a known-working environment
print(f"Pandas version: {pd.__version__}")

# Widen the notebook container (removes the grey side bars)
display(HTML("<style>.container { width:90% !important; }</style>"))

In [None]:
# Root directory holding the project's inputs and results. Defaults to the
# original location; set the PATHOGEN_NCD_HOME environment variable to run the
# notebook against a different checkout without editing this cell.
HOME_DIR = os.environ.get("PATHOGEN_NCD_HOME", "/data/pathogen_ncd")

Concatenate the separate results files (one per ICD-10 code) into a single file to produce the final set of TNX results

```bash
# Push all the per-ICD TNX results for all ICDs into a single file.
# Assumes HOME_DIR is set in the shell running these commands.
# The glob has to sit OUTSIDE the double quotes: a quoted `*.tsv` is not
# expanded by the shell, so `cat` would be handed a literal, non-existent path.

cat "${HOME_DIR}/other/trinetx/new_dataset/results/tnx_results_01_17_2023/res/"*.tsv > tnx_all_results.tsv

# Grab what will be the header line
grep 'Unparsed_Disease' tnx_all_results.tsv  | head -n 1 > header

# Remove extraneous header lines in our final file
grep -v 'Unparsed_Disease' tnx_all_results.tsv  | sponge  tnx_all_results.tsv

# Add header back to the top of our now cleaned TNX results file 
cat header tnx_all_results.tsv | sponge tnx_all_results.tsv

```

# Prep UKB and TNX results

## Read in UKB and TNX Result Files

In [None]:
# Locations of the two result files
ukb_results_path = f'{HOME_DIR}/results/emp_results_01_17_2023.tsv'
tnx_results_path = f'{HOME_DIR}/other/trinetx/new_dataset/results/tnx_results_01_17_2023/res/tnx_all_results.tsv'

# UKB results, with cases + controls recorded as the total sample size
all_ukb = pd.read_csv(ukb_results_path, sep='\t')
all_ukb['num_tot_samples'] = all_ukb['nCase'].add(all_ukb['nControl'])

# TNX categorical test results; drop any header rows that were left embedded
# in the concatenated file
all_tnx_cat = pd.read_csv(tnx_results_path, sep='\t')
is_embedded_header = all_tnx_cat['disease_name'] == 'disease_name'
all_tnx_cat = all_tnx_cat.loc[~is_embedded_header, :]

# Working copy with integer counts and the total sample size
all_tnx = all_tnx_cat.copy(deep=True)
tnx_count_cols = ['num_case', 'num_con']
all_tnx.loc[:, tnx_count_cols] = all_tnx.loc[:, tnx_count_cols].astype(int)
all_tnx['num_tot_samples'] = all_tnx['num_case'].add(all_tnx['num_con'])

In [None]:
# Row counts seen on the original run:
#   UKB: 23,122
#   TNX: 198,225
for _results in (all_ukb, all_tnx_cat):
    print(len(_results))

## Align UKB and TNX Diseases

In [None]:
# TNX has no B24 results, so TNX B20 is relabelled as B24 to merge with
# UKB's B24 (HIV infection)
hiv_icds = ('B24', 'B20')

# Before the change - expected: 0 (B24), then 44 (B20)
for _icd in hiv_icds:
    print(len(all_tnx.loc[all_tnx['icd'] == _icd, :]))

is_b20 = all_tnx['icd'] == 'B20'
all_tnx.loc[is_b20, 'icd'] = 'B24'

# After the change - expected: 44 (B24), then 0 (B20)
for _icd in hiv_icds:
    print(len(all_tnx.loc[all_tnx['icd'] == _icd, :]))

## Align UKB and TNX Pathogen tags

In [None]:
# Antibody names are already aligned between UKB and TNX, so only the
# organism names need work.

# Some UKB organism names carry a non-breaking space (\xa0); normalise them
nbsp_fixes = {'MCV\xa0': 'MCV', 'H.\xa0pylori': 'H.pylori'}
all_ukb['org'] = all_ukb['org'].replace(nbsp_fixes)

# UKB organism name -> TNX organism tag. Both spellings (with and without a
# space) are listed for T. gondii and C. trachomatis.
org_to_tag_dict = {
    'HSV1': 'hsv1',
    'HSV2': 'hsv2',
    'VZV': 'vzv',
    'EBV': 'ebv',
    'CMV': 'cmv',
    'HHV-6': 'hhv6',
    'HHV-7': 'hhv7',
    'KSHV/HHV-8': 'kshv',
    'HBV': 'hbv',
    'HCV': 'hcv',
    'T. gondii': 'tox',
    'T.gondii': 'tox',
    'HTLV-1': 'htlv',
    'BKV': 'bkv',
    'JCV': 'jcv',
    'MCV': 'mcv',
    'HPV-16': 'hpv16',
    'HPV-18': 'hpv18',
    'C. trachomatis': 'chlam',
    'C.trachomatis': 'chlam',
    'H.pylori': 'hpylori',
    'HIV': 'hiv',
}

# Organism names without an entry above are carried through unchanged
all_ukb['tag'] = all_ukb['org'].replace(org_to_tag_dict)

# Do statistical power filtering

## Case and Total Sample Filtering

In [None]:
# Statistical power requirements
# Minimum number of cases a result needs to pass the filtering below
MIN_CASE_THRESH = 17
# Minimum total sample size (cases + controls) a result needs to pass the filtering below
MIN_TOT_SAMP_THRESH  = 187

In [None]:
# Counts seen on the original run:
#   Total in UKB:  23122
#   Total in UKB that meet case threshold [n >= 17]:  19289
#   Total in TNX:  198225
#   Total in TNX stds:  2073
#   Total in TNX non-stds:  196152
#   Total in TNX standards that meet case threshold [n >= 17] and total sample threshold [n >= 187]:  1876
#   Total in TNX non-standards that meet case threshold [n >= 17] and total sample threshold [n >= 187]:  178058
#   Total in combined TNX that meet thresholds: 179934

def _tnx_is_powered(tnx_frame):
    """Boolean mask of TNX rows meeting both the case and total-sample thresholds."""
    enough_cases = tnx_frame['num_case'] >= MIN_CASE_THRESH
    enough_samples = tnx_frame['num_tot_samples'] >= MIN_TOT_SAMP_THRESH
    return enough_cases & enough_samples


# Split TNX into "standards" (ICD codes starting with A or B) and the rest
tnx_icd_is_std = all_tnx['icd'].str.startswith(('A', 'B'))
tnx_stds = all_tnx.loc[tnx_icd_is_std, :]
tnx_non_stds = all_tnx.loc[~tnx_icd_is_std, :]

print(f"Total in UKB:  {len(all_ukb)}")

# UKB: rows in ICD10 categories A/B are always kept; every other row has to
# meet both thresholds.
# NOTE(review): the message printed below only mentions the case threshold.
ukb = all_ukb.copy(deep = True)
ukb_is_std = ukb['ICD10_Cat'].isin(['A', 'B'])
ukb_is_powered = ((ukb['nCase'] >= MIN_CASE_THRESH) &
                  (ukb['num_tot_samples'] >= MIN_TOT_SAMP_THRESH))
ukb = ukb.loc[ukb_is_std | ukb_is_powered, :]

print(f"Total in UKB that meet case threshold [n >= {MIN_CASE_THRESH}]:  {len(ukb)}")

print(f"Total in TNX:  {len(all_tnx)}")
print(f"Total in TNX stds:  {len(tnx_stds)}")
print(f"Total in TNX non-stds:  {len(tnx_non_stds)}")

# TNX: both groups have to meet both thresholds
tnx_stds = tnx_stds.loc[_tnx_is_powered(tnx_stds), :]
print(f"Total in TNX standards that meet case threshold [n >= {MIN_CASE_THRESH}] and total sample threshold [n >= {MIN_TOT_SAMP_THRESH}]:  {len(tnx_stds)}")

tnx_non_stds = tnx_non_stds.loc[_tnx_is_powered(tnx_non_stds), :]
print(f"Total in TNX non-standards that meet case threshold [n >= {MIN_CASE_THRESH}] and total sample threshold [n >= {MIN_TOT_SAMP_THRESH}]:  {len(tnx_non_stds)}")

# Standards first, then non-standards
tnx = pd.concat([tnx_stds, tnx_non_stds])
print(f"Total in combined TNX that meet thresholds: {len(tnx)}")

# Work on UKB

## Calculate per-disease FDR corrected NOMINAL p-value

In [None]:
# Benjamini-Hochberg correction of the nominal p-values, applied separately
# within each disease (ICD code)
ukb_fdr_col = 'per_dis_bh_fdr_corr_nom_p'

icd_ls = ukb['icd'].unique().tolist()

fin_ukb_ls = []
for curr_icd in tqdm(icd_ls):
    curr_dis_res = ukb.loc[ukb['icd'] == curr_icd, :].copy(deep = True)

    # multipletests returns (reject, corrected p-values, alphacSidak, alphacBonf)
    _, curr_corr_p, _, _ = mt(curr_dis_res['p_val'], alpha = 0.05, method = 'fdr_bh')
    curr_dis_res[ukb_fdr_col] = curr_corr_p

    fin_ukb_ls.extend(curr_dis_res.values.tolist())

# Rebuild one frame from the per-disease rows, with the new column last
fin_ukb = pd.DataFrame(fin_ukb_ls, columns = [*ukb.columns, ukb_fdr_col])

## Collapse from antibody to pathogen level (keeping the most significant FDR-corrected nominal p)

In [None]:
# Before collapsing there is one row per disease-antibody pair
# (19,289 on the original run)
print(f"Total dis-Ab pairs in UKB: {len(fin_ukb)}")

# Sort ascending by corrected p within each disease-organism pair, then keep
# the first (smallest p) row of every pair
fin_ukb = (
    fin_ukb
    .sort_values(['icd', 'org', 'per_dis_bh_fdr_corr_nom_p'])
    .drop_duplicates(['icd', 'org'], keep = 'first')
)

# After collapsing there is one row per disease-organism pair
# (8,616 on the original run)
print(f"Total dis-Org pairs in UKB: {len(fin_ukb)}")

# Work on TNX results

## Limit TNX to selected UKB dis-org pairs

In [None]:
# For organisms, antibodies and diseases, print the values found only in TNX
# followed by the values found only in UKB
for _label, _tnx_col, _ukb_col in (
    ("Orgs:", 'org', 'tag'),
    ("Antibodies:", 'anti', 'anti'),
    ("Diseases:", 'icd', 'icd'),
):
    _tnx_only_src = set(tnx[_tnx_col].unique().tolist())
    _ukb_only_src = set(ukb[_ukb_col].unique().tolist())

    print(_label)
    print(_tnx_only_src - _ukb_only_src)
    print(_ukb_only_src - _tnx_only_src)

In [None]:
# Some diseases in TNX that are not in UKB results most likely filtered out
# by minimum number of case/control requirements

# No TNX tests for the following orgs: 
# 'hhv7'
# 'htlv'
# 'mcv'

# No UKB results for the following ICDs by BH Corrected NOMINAL P:
# 'C15','C16','C25','C49','C53','C62','C71','C73','C82',
# 'D07','D09','D56',
# 'E29','E61','E73','E88',
# 'F00','F06','F09','F22','F42',
# 'G31','G37','G52','G60','G63','G64','G82','G83','G90','G96',
# 'H21','H46','H49','H71',
# 'I05','I07','I11','I62','I85','I99',
# 'J10','J13','J15','J36','J37','J81','J86','J91',
# 'K06','K09',
# 'L04','L23','L63','L68','L93','L94',
# 'M00','M11','M40','M60','M92','M96',
# 'N05','N08','N46','N71',
# 'O00','O13','O16','O20','O24','O30','O42','O46','O47','O60','O62','O66','O69','O75'

# No TNX diags for the following ICDs:
# 'E14'
# 'I64'
# 'I84'
# 'K07'
# 'K10'

## First filter our TNX results to just those for the necessary Dis-Org-Ab pair from UKB

In [None]:
# Disease / organism tag / antibody triples selected from UKB
ukb_vals = fin_ukb.loc[:, ['icd', 'tag', 'anti']].values.tolist()

# Key columns looked up on every iteration
tnx_icd_col = tnx['icd']
tnx_org_col = tnx['org']
tnx_anti_col = tnx['anti']

# Collect, in UKB order, every TNX row matching each UKB triple
filt_tnx_ls = []
for curr_icd, curr_org, curr_anti in tqdm(ukb_vals):
    is_curr_triple = ((tnx_icd_col == curr_icd) &
                      (tnx_org_col == curr_org) &
                      (tnx_anti_col == curr_anti))
    filt_tnx_ls.extend(tnx.loc[is_curr_triple, :].values.tolist())

filt_tnx = pd.DataFrame(filt_tnx_ls, columns = tnx.columns)

# 56,277 on the original run
print(len(filt_tnx))

## Calculate TNX per-disease FDR corrected NOMINAL p-value

In [None]:
# p-values have to be numeric before they can be corrected
filt_tnx.loc[:, 'p_val'] = filt_tnx.loc[:, 'p_val'].astype(float)

# Benjamini-Hochberg correction, applied separately within each disease
tnx_fdr_col = 'per_dis_bh_fdr_corr_p'

icd_ls = filt_tnx['icd'].unique().tolist()

fin_tnx_ls = []
for curr_icd in tqdm(icd_ls):
    curr_dis_res = filt_tnx.loc[filt_tnx['icd'] == curr_icd, :].copy(deep = True)

    # multipletests returns (reject, corrected p-values, alphacSidak, alphacBonf)
    _, curr_corr_p, _, _ = mt(curr_dis_res['p_val'], alpha = 0.05, method = 'fdr_bh')
    curr_dis_res[tnx_fdr_col] = curr_corr_p

    fin_tnx_ls.extend(curr_dis_res.values.tolist())

# Rebuild one frame from the per-disease rows, with the new column last
fin_tnx = pd.DataFrame(fin_tnx_ls, columns = [*filt_tnx.columns, tnx_fdr_col])

# Merge cleaned UKB and cleaned TNX results

In [None]:
# Tag every column with its source so the two frames can be merged side by side
fin_ukb = fin_ukb.add_prefix('ukb_')
fin_tnx = fin_tnx.add_prefix('tnx_')

In [None]:
# Row counts seen on the original run:
#   nrow UKB: 8616
#   nrow TNX: 56277
#   nrow combo: 58398
print(f"nrow UKB: {len(fin_ukb)}")
print(f"nrow TNX: {len(fin_tnx)}")

# Left join: every UKB row is kept, repeated once per matching TNX test
ukb_merge_keys = ['ukb_icd', 'ukb_tag', 'ukb_anti']
tnx_merge_keys = ['tnx_icd', 'tnx_org', 'tnx_anti']
combo = pd.merge(fin_ukb, fin_tnx, how = 'left',
                 left_on = ukb_merge_keys, right_on = tnx_merge_keys)

print(f"nrow combo: {len(combo)}")


In [None]:
# Columns to keep, in output order, built up group by group
combo_keep_cols = (
    # Pair identifiers and TNX test description
    ['ukb_Disease', 'ukb_icd', 'ukb_tag', 'ukb_anti', 'ukb_std_lev',
     'tnx_test_type', 'tnx_test_id', 'tnx_test', 'tnx_mod_method']
    # Sample descriptions
    + ['ukb_sex_specific_dis', 'ukb_nCase', 'ukb_nControl',
       'ukb_control_set', 'ukb_n_mixed',
       'tnx_dis_sex', 'tnx_num_case', 'tnx_num_con',
       'tnx_con_str', 'tnx_n_mixed']
    # p-values
    + ['ukb_per_dis_bh_fdr_corr_nom_p', 'ukb_mod_3_emp_p', 'ukb_p_val',
       'tnx_per_dis_bh_fdr_corr_p', 'tnx_p_val']
    # Odds ratios
    + ['ukb_anti_OR', 'tnx_OR']
    # Covariates
    + ['ukb_sig_covs', 'ukb_cov_adj_for']
    # Warnings
    + ['ukb_Warnings', 'ukb_is_warning',
       'tnx_glm_warn_msg', 'tnx_glm_warn_bool']
    # Models
    + ['ukb_model', 'tnx_model']
    # Covariate statistics and confidence intervals
    + ['ukb_cov_ps', 'ukb_cov_ors', 'tnx_cov_adj',
       'tnx_cov_ps', 'tnx_cov_or',
       'ukb_anti_CI', 'tnx_CI']
    # UKB age and titer summaries
    + ['ukb_avg_age_case', 'ukb_avg_avg_con',
       'ukb_avg_titer_case', 'ukb_avg_titer_con',
       'ukb_std_titer_case', 'ukb_std_titer_con',
       'ukb_med_titer_case', 'ukb_med_titer_con']
    # TNX age, titer and count summaries
    + ['tnx_case_age', 'tnx_con_age', 'tnx_case_titer', 'tnx_con_titer',
       'tnx_case_titer_std', 'tnx_con_titer_std',
       'tnx_case_titer_med', 'tnx_con_titer_med',
       'tnx_n_con_neg', 'tnx_n_con_pos', 'tnx_n_case_neg', 'tnx_n_case_pos',
       'tnx_log_trans']
    # Permutation bookkeeping and notes
    + ['ukb_tot_dis_perms', 'ukb_perms_lt_mod_3_p',
       'tnx_note_str']
)

# The pair identifiers lose their source prefix
combo_renames = {
    'ukb_Disease': 'disease',
    'ukb_icd': 'icd',
    'ukb_tag': 'org',
    'ukb_anti': 'anti',
    'ukb_std_lev': 'std_lev',
}

combo = combo[combo_keep_cols].rename(columns = combo_renames)

In [None]:
# Combo was verified to have:
#  * a row for every UKB result, whether or not there is a TNX test
#      1 row (UKB result) - no TNX tests for A60 x BKV
#      combo.loc[((combo['icd'] == 'A60') & (combo['org'] == 'bkv')), :]
#  * a row for each TNX test, matched to its correct UKB row
#      19 rows - 1 row for each TNX test
#      combo.loc[((combo['icd'] == 'A60') & (combo['org'] == 'chlam')), :]

# Rows that actually carry a TNX test
has_tnx_test = combo['tnx_test_type'].notnull()

# Rows with a TNX test whose disease sex differs between UKB and TNX
# All good - empty
sex_differs = combo['ukb_sex_specific_dis'] != combo['tnx_dis_sex']
print(combo.loc[sex_differs & has_tnx_test, :])

# Rows with a TNX test whose control set differs between UKB and TNX
# All good - empty
control_set_differs = combo['ukb_control_set'] != combo['tnx_con_str']
print(combo.loc[control_set_differs & has_tnx_test, :])

# Do some cleaning

## Convert some columns to ints

In [None]:
# Count columns that are cast to nullable Int64 directly
direct_int_cols = ['ukb_nCase', 'ukb_nControl', 'ukb_n_mixed',
                   'tnx_num_case', 'tnx_num_con']

# Count columns that are cast to float first and only then to Int64
# Workaround for apparently known bug?
# https://stackoverflow.com/a/60024263
via_float_int_cols = ['tnx_n_mixed',
                      'tnx_n_con_neg', 'tnx_n_con_pos',
                      'tnx_n_case_neg', 'tnx_n_case_pos']

# The casts go through DataFrame.astype rather than `combo.loc[:, cols] = ...`:
# from pandas 2.0 a full-slice .loc assignment tries to write into the
# existing column and can keep its old dtype, so the Int64 conversion would
# silently not take effect. astype returns columns of the requested dtype on
# every pandas version.
combo = combo.astype({col: "Int64" for col in direct_int_cols})
combo = combo.astype({col: float for col in via_float_int_cols})
combo = combo.astype({col: "Int64" for col in via_float_int_cols})


In [None]:
# Verify the dtypes of the count columns converted above
converted_count_cols = [
    'ukb_nCase', 'ukb_nControl', 'ukb_n_mixed',
    'tnx_num_case', 'tnx_num_con', 'tnx_n_mixed',
    'tnx_n_con_neg', 'tnx_n_con_pos',
    'tnx_n_case_neg', 'tnx_n_case_pos',
]
combo[converted_count_cols].info()

## Annotate risk

In [None]:
# Odds ratios have to be numeric before they can be compared against 1
or_cols = ['ukb_anti_OR', 'tnx_OR']
combo.loc[:, or_cols] = combo.loc[:, or_cols].astype(float)

# Flag a result as "risk" when its odds ratio is above 1; everything else
# (including a missing odds ratio) stays False
for _risk_col, _or_col in (('ukb_risk', 'ukb_anti_OR'), ('tnx_risk', 'tnx_OR')):
    combo[_risk_col] = False
    combo.loc[combo[_or_col] > 1, _risk_col] = True

# Collapse to best TNX result

## Make small function

In [None]:
# Similar to round, but when the number falls outside the range that the
# requested number of digits covers, switch to scientific notation.
def make_small(num, digits):
    """Format `num` compactly using `digits` digits.

    Values between 10**-(digits - 1) and 10**(digits - 1), bounds included,
    are returned as `round(num, digits)`. Anything smaller or larger is
    returned as a string in scientific notation with `digits - 1` decimals.

    Prints a message and returns None when `digits` is below 1.
    """
    if digits < 1:
        print("Requires positive number of digits")
        return None

    n_decimals = digits - 1
    upper = 10 ** n_decimals
    lower = 1 / upper

    outside_range = (num < lower) | (num > upper)
    if not outside_range:
        return round(num, digits)

    return f"{num:.{n_decimals}e}"

## Actual loop

In [None]:
def _describe_tnx_tests(tests, id_sep = " "):
    """Summarise each TNX test in `tests` as one formatted string.

    Returns '' when `tests` has no rows, otherwise the result of a row-wise
    `agg` (one string per row). `id_sep` is the text placed between the test
    id and the opening bracket.
    """
    if len(tests) == 0:
        return ''

    return tests.agg(
        lambda x: (
            f"{x['tnx_test_id']}{id_sep}[{x['tnx_test']}, {x['tnx_mod_method']}]:  "
            f"nCase: {x['tnx_num_case']} | nCon: {x['tnx_num_con']}, "
            f"corr p-val: {make_small(x['tnx_per_dis_bh_fdr_corr_p'],3)}, "
            f"uncorr p-val: {make_small(x['tnx_p_val'], 3)}, "
            f"OR: {make_small(x['tnx_OR'], 3)} | model: {x['tnx_model']}, "
            f"glm_warn [{x['tnx_glm_warn_bool']}]: {x['tnx_glm_warn_msg']} "
            f"log10 trans: {x['tnx_log_trans']} | Notes: {x['tnx_note_str']}"
        ),
        axis = 1,
    )


# Column order of the collapsed frame. Every row is selected by these labels
# before being appended, so a value can never land under the wrong header.
# (Previously rows were appended positionally, and the branch handling
# "no test in the UKB direction" added the two extra fields in the opposite
# order, which swapped their contents in the final frame.)
fin_combo_cols = combo.columns.tolist() + ['other_test_str', 'or_flip_tnx_tests']

icd_org_ls = combo.loc[:, ['icd', 'org']].drop_duplicates().values.tolist()

fin_combo_ls = []
for curr_icd, curr_org in tqdm(icd_org_ls):

    curr_res = combo.loc[((combo['icd'] == curr_icd) &
                          (combo['org'] == curr_org)), :].copy(deep = True)

    # No TNX tests: keep the single UKB row as is
    if (len(curr_res) == 1) & (all(curr_res['tnx_test_id'].isna())):
        curr_res['other_test_str'] = ''
        curr_res['or_flip_tnx_tests'] = ''

        fin_combo_ls.extend(curr_res.loc[:, fin_combo_cols].values.tolist())
        continue

    # Direction of the UKB association (risk vs protective), taken from the
    # first value found for this pair
    is_ukb_risk = bool(curr_res['ukb_risk'].unique().tolist()[0])

    # TNX tests in the same direction as UKB, most significant first, and
    # TNX tests in the opposite direction
    curr_tnx_res = curr_res.loc[curr_res['tnx_risk'] == is_ukb_risk, :]
    curr_tnx_opp_res = curr_res.loc[curr_res['tnx_risk'] == (not is_ukb_risk), :]

    curr_tnx_res = curr_tnx_res.sort_values(['tnx_per_dis_bh_fdr_corr_p'], ascending = True).reset_index(drop = True)

    # NOTE(review): when there is more than one test to summarise, the value
    # stored in the two extra fields is the Series returned by `.agg`, not a
    # single joined string - confirm that is the representation wanted in the
    # exported sheet.
    if len(curr_tnx_res) == 0:
        # No tests in the same direction, so take the most significant test
        # in the opposite direction
        curr_tnx_opp_res = curr_tnx_opp_res.sort_values(['tnx_per_dis_bh_fdr_corr_p'], ascending = True).reset_index(drop = True)

        best_tnx = curr_tnx_opp_res.iloc[0].copy(deep = True)

        # The remaining (less significant) opposite-direction tests
        rest_tnx = curr_tnx_opp_res.iloc[1:].reset_index(drop = True)

        other_test_str = _describe_tnx_tests(curr_tnx_opp_res)
        or_flip_tnx_tests = _describe_tnx_tests(rest_tnx)

    else:
        # Most significant test in the same direction as UKB
        best_tnx = curr_tnx_res.iloc[0].copy(deep = True)

        # The remaining (less significant) same-direction tests
        rest_tnx = curr_tnx_res.iloc[1:].reset_index(drop = True)

        other_test_str = _describe_tnx_tests(rest_tnx, id_sep = "  ")
        or_flip_tnx_tests = _describe_tnx_tests(curr_tnx_opp_res)

    best_tnx['other_test_str'] = other_test_str
    best_tnx['or_flip_tnx_tests'] = or_flip_tnx_tests

    fin_combo_ls.append(best_tnx.loc[fin_combo_cols].values.tolist())

fin_combo = pd.DataFrame(fin_combo_ls, columns = fin_combo_cols)

# 8,616
print(len(fin_combo))

In [None]:
# Work on an independent copy of the collapsed results
fin = fin_combo.copy()

# TNX: 8,616 -> dis-Ab to dis-org collapsing has already occurred.
print(fin.shape[0])

In [None]:
# Order by disease, then organism, then UKB corrected p (ascending)
fin_sort_keys = ['icd', 'org', 'ukb_per_dis_bh_fdr_corr_nom_p']
fin = fin.sort_values(by = fin_sort_keys)

# Mark replication status for each Disease-Pathogen Pair

In [None]:
# Significance cut-offs applied to the per-disease corrected p-values
UKB_THRESH = 0.3
TNX_THRESH = 0.01

org_lev_res = fin.copy(deep = True)

# Building blocks for the status rules below
ukb_sig = org_lev_res['ukb_per_dis_bh_fdr_corr_nom_p'] < UKB_THRESH
tnx_sig = org_lev_res['tnx_per_dis_bh_fdr_corr_p'] < TNX_THRESH
tnx_not_sig = org_lev_res['tnx_per_dis_bh_fdr_corr_p'] >= TNX_THRESH
tnx_missing = org_lev_res['tnx_p_val'].isna()
or_direction_differs = org_lev_res['ukb_risk'] != org_lev_res['tnx_risk']

# Default: 'did_not_attempt'
org_lev_res['rep_stat'] = 'did_not_attempt'

# Rep: UKB and TNX sig
org_lev_res.loc[ukb_sig & tnx_sig, 'rep_stat'] = 'replicated'

# Could not: UKB sig but no TNX test
org_lev_res.loc[ukb_sig & tnx_missing, 'rep_stat'] = 'could_not'

# Did not: UKB sig and either (TNX not sig) or (TNX sig but OR in opposite
# direction of UKB). Applied last, so it overrides 'replicated' for pairs
# whose odds ratios point in opposite directions.
did_not_replicate = ukb_sig & (tnx_not_sig | (tnx_sig & or_direction_differs))
org_lev_res.loc[did_not_replicate, 'rep_stat'] = 'did_not'

# Relabel the standard levels
std_lev_labels = {
    'true_neg': 'exp_neg',
    'Gold': 'Tier 1',
    'Silver': 'Tier 2',
    'unk': 'unk',
}
org_lev_res['std_lev'] = org_lev_res['std_lev'].replace(std_lev_labels)

# Only replicated pairs count as associated
org_lev_res['pair_is_associated'] = 'No'
is_replicated = org_lev_res['rep_stat'] == 'replicated'
org_lev_res.loc[is_replicated, 'pair_is_associated'] = 'Yes'

In [None]:
# TNX columns that are set to NA for every pair whose replication was not
# attempted
tnx_cols_to_blank = [
    'tnx_test_type', 'tnx_test_id', 'tnx_test', 'tnx_mod_method',
    'tnx_dis_sex', 'tnx_num_case', 'tnx_num_con', 'tnx_con_str',
    'tnx_n_mixed',
    'tnx_per_dis_bh_fdr_corr_p', 'tnx_p_val', 'tnx_OR',
    'tnx_glm_warn_msg', 'tnx_glm_warn_bool',
    'tnx_model', 'tnx_cov_adj', 'tnx_cov_ps', 'tnx_cov_or', 'tnx_CI',
    'tnx_case_age', 'tnx_con_age',
    'tnx_case_titer', 'tnx_con_titer',
    'tnx_case_titer_std', 'tnx_con_titer_std',
    'tnx_case_titer_med', 'tnx_con_titer_med',
    'tnx_n_con_neg', 'tnx_n_con_pos', 'tnx_n_case_neg', 'tnx_n_case_pos',
    'tnx_log_trans', 'tnx_note_str',
]

not_attempted = org_lev_res['rep_stat'] == 'did_not_attempt'

# One column at a time, in the order listed above
for _tnx_col in tnx_cols_to_blank:
    org_lev_res.loc[not_attempted, _tnx_col] = np.nan

## Re-org some cols

In [None]:
# Final column order of the exported sheet
final_col_order = [
    # Pair identifiers and replication call
    'disease', 'icd', 'org', 'anti',
    'pair_is_associated', 'std_lev', 'rep_stat',
    # TNX test description
    'tnx_test_type', 'tnx_test_id', 'tnx_test', 'tnx_mod_method',
    # Sample descriptions
    'ukb_sex_specific_dis',
    'ukb_nCase', 'ukb_nControl', 'ukb_control_set', 'ukb_n_mixed',
    'tnx_dis_sex', 'tnx_num_case', 'tnx_num_con', 'tnx_con_str',
    'tnx_n_mixed',
    # p-values
    'ukb_per_dis_bh_fdr_corr_nom_p', 'tnx_per_dis_bh_fdr_corr_p',
    'ukb_mod_3_emp_p', 'ukb_p_val', 'tnx_p_val',
    # Odds ratios and covariates
    'ukb_anti_OR', 'tnx_OR', 'ukb_sig_covs', 'ukb_cov_adj_for',
    # Warnings and models
    'ukb_Warnings', 'ukb_is_warning', 'tnx_glm_warn_msg', 'tnx_glm_warn_bool',
    'ukb_model', 'tnx_model',
    # Covariate statistics and confidence intervals
    'ukb_cov_ps', 'ukb_cov_ors', 'tnx_cov_adj', 'tnx_cov_ps', 'tnx_cov_or',
    'ukb_anti_CI', 'tnx_CI',
    # UKB age and titer summaries
    'ukb_avg_age_case', 'ukb_avg_avg_con',
    'ukb_avg_titer_case', 'ukb_avg_titer_con',
    'ukb_std_titer_case', 'ukb_std_titer_con',
    'ukb_med_titer_case', 'ukb_med_titer_con',
    # TNX age, titer and count summaries
    'tnx_case_age', 'tnx_con_age',
    'tnx_case_titer', 'tnx_con_titer',
    'tnx_case_titer_std', 'tnx_con_titer_std',
    'tnx_case_titer_med', 'tnx_con_titer_med',
    'tnx_n_con_neg', 'tnx_n_con_pos', 'tnx_n_case_neg', 'tnx_n_case_pos',
    'tnx_log_trans',
    # Permutation bookkeeping, notes, risk flags and the other-test summaries
    'ukb_tot_dis_perms', 'ukb_perms_lt_mod_3_p',
    'tnx_note_str', 'ukb_risk', 'tnx_risk',
    'other_test_str', 'or_flip_tnx_tests',
]

org_lev_res = org_lev_res[final_col_order]

## Save the final results file

In [None]:
# Write the organism-level results to the final Excel sheet (no index column)
final_results_path = f'{HOME_DIR}/results/ukb_tnx_combined_results_03_08_23_final.xlsx'
org_lev_res.to_excel(final_results_path, index = False)