# Load Libraries

In [None]:
# Data manipulation
import numpy as np
import pandas as pd

# Data science
import math
import scipy.stats as stats
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.stats.multitest import multipletests as mt

# Plots
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import ticker, gridspec
from matplotlib import font_manager as fm


# Working with dates
from datetime import date,datetime
import dateutil

# Loop progress bars
from tqdm.notebook import tqdm

# Reg expressions
import re

# Pretty table printing
import tabulate

import os
import subprocess

# Misc libraries
from IPython.display import display, HTML
#from IPython.core.display import display, HTML

# Seaborn theme: A4-landscape default figure size, 1.5x font scale, white background
sns.set(rc = {'figure.figsize': (11.7, 8.27)}, font_scale = 1.5)
sns.set_style("white")

# Lift the pandas display limits so an entire dataframe can be shown
for opt_name, opt_value in (("display.max_rows", 10000),
                            ("display.max_columns", 10000),
                            ("display.max_colwidth", None)):
    pd.set_option(opt_name, opt_value)

# Print the version of the packages we rely on, so we can confirm
# we are running the versions this notebook was written for.
print(f"Pandas version: {pd.__version__}")


# Directory where the custom fonts are stored
font_dir = '/users/lapt3u/.fonts'

# Register every font file found in that directory with Matplotlib's font manager
font_files = fm.findSystemFonts(fontpaths = [font_dir])
for font_file in font_files:
    fm.fontManager.addfont(font_file)

# Request Arial as the default font family. This is set unconditionally;
# no availability check is performed here.
plt.rcParams.update({'font.family': 'Arial'})

print(plt.rcParams['font.family'])

# Prep Environment

In [None]:
# Project root; every relative file name used below is resolved inside its
# `results` sub-directory because the working directory is switched to it.
HOME_DIR = "/data/pathogen_ncd"
RESULTS_DIR = f'{HOME_DIR}/results'
os.chdir(RESULTS_DIR)

# Helper functions

In [None]:
# Function to return coordinate path for green dashed line given heights
def get_coord_path(heights, top_fudge = 1.005):
    """Build the closed outline that hugs the tops of a row of bars.

    Bar ``i`` is treated as spanning ``x = i - 0.5`` to ``x = i + 0.5``. The
    outline starts on the baseline at the left edge of the first bar, runs
    along the (slightly raised) top of every bar, drops back to the baseline
    after the last bar and returns to the starting point.

    Parameters
    ----------
    heights : iterable of numbers
        Bar heights, in left-to-right order. Any iterable works (list, numpy
        array, pandas Series); values are read by iteration, not by label.
    top_fudge : float, optional
        Multiplier applied to every height so the outline sits just above the
        bar instead of on top of it. Defaults to 1.005.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(2 * n + 3, 2)`` holding ``(x, y)`` vertices, or an
        empty ``(0, 2)`` array when ``heights`` is empty.
    """
    tops = [curr_height * top_fudge for curr_height in heights]

    # Nothing to outline: keep the (N, 2) shape so the result can still be
    # handed to a polygon patch without a shape-unpacking error.
    if not tops:
        return np.empty((0, 2))

    left_edge = -0.5

    # Start at the baseline under the left edge of the first bar
    coords = [(left_edge, 0)]

    # Left and right corner of the top of every bar
    for rect_ind, curr_top in enumerate(tops):
        coords.append((rect_ind - 0.5, curr_top))
        coords.append((rect_ind + 0.5, curr_top))

    # Drop to the baseline after the last bar, then close the path
    right_edge = (len(tops) - 1) + 0.5
    coords.append((right_edge, 0))
    coords.append((left_edge, 0))

    return np.array(coords)

# Setup the side-by-side plots

In [None]:
# Plot
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.patches as patches
from matplotlib.patches import FancyArrowPatch as Arrow
import matplotlib.lines as lines


# Bar colours for the two data sources
UKB_COLOR, TNX_COLOR = '#5b9bd5', '#f4b183'

# Font sizes for bar data labels and axis tick labels
TEXT_LABEL_FONT_SIZE = 12
X_TICK_LABEL_FONT_SIZE = Y_TICK_LABEL_FONT_SIZE = 13

# Dashed outline / arrow line styling
DASHED_LWD = 4
DASHED_LINE_ALPHA = 0.95

# Figure sizes: single panel and the side-by-side (duo) layout
FIG_W, FIG_H = 14, 6
DUO_FIG_W, DUO_FIG_H = 28, 6

# Bar widths
BAR_W = 0.95
TNX_BAR_W = 0.95

# Arrow start / end, expressed in axes-fraction coordinates of the UKB axes
ARROW_ST_X, ARROW_END_X = 0.4, 1.20
ARROW_ST_Y, ARROW_END_Y = 0.80, 0.80

# Caption sits at the horizontal midpoint of the arrow, a bit above it
ARROW_TEXT_X = ARROW_ST_X + ((ARROW_END_X - ARROW_ST_X) / 2)
ARROW_TEXT_Y = ARROW_ST_Y + .175

# Relative width of the empty grid column separating the two plot pairs
PLOT_SEP_W = 0.1

fig = plt.figure(figsize = (DUO_FIG_W, DUO_FIG_H), facecolor = 'white')

# One row, five columns: [UKB | TNX | spacer | UKB | TNX]
gs = gridspec.GridSpec(1, 5, width_ratios = [1, 1, PLOT_SEP_W, 1, 1])

# Column 2 is the spacer and deliberately gets no axes
left_ukb_ax, left_tnx_ax, right_ukb_ax, right_tnx_ax = [
    fig.add_subplot(gs[0, grid_col]) for grid_col in (0, 1, 3, 4)
]

# Add in ICD Results

## Load in data

In [None]:
# ICD-level results workbook; the name is relative to the current working directory
ICD_RES_FILE = 'final_res_with_merged_data_04_12_2023.xlsx'
dat = pd.read_excel(ICD_RES_FILE)

In [None]:
# Short plotting codes for each replication status
REP_STAT_CODES = {
    'did_not_attempt': 'DNA',
    'replicated':      'REP',
    'did_not':         'DNR',
    'could_not':       'CNR',
}

# Display names for the standard levels that need renaming; other values pass through
STD_LEV_LABELS = {
    'exp_neg': 'Exp Neg',
    'unk':     'Unknown',
}

dat['pl_rep_stat'] = dat['rep_stat'].replace(REP_STAT_CODES)
dat['pl_std_lev'] = dat['std_lev'].replace(STD_LEV_LABELS)

## Collect metrics

In [None]:
# Mets
# Build the long-format metrics table (`mets`) with one row per
# (Result, Metric, Group):
#   * Result: 'Total ORG tests', 'UKB Sig', or a replication status code
#   * Metric: 'Count', 'Overlap' ("numerator | denominator" text) or
#             'Percent' (stored here as a 0-1 fraction)
#   * Group : 'Total' or one of the standard levels in `grp_ls`

# Cutoff applied to the `ukb_per_dis_bh_fdr_corr_nom_p` column
UKB_THRESH = 0.3

MET_COL_LS = ['Result', 'Metric', 'Group', 'Value']
rep_ls = ['REP', 'DNR', 'DNA', 'CNR']
grp_ls = ['Tier 1', 'Tier 2', 'Exp Neg', 'Unknown']


def _safe_frac(num, denom):
    """Return num / denom, or 0 when denom is 0 (empty group)."""
    return num / denom if denom else 0


ukb_sig_mask = dat['ukb_per_dis_bh_fdr_corr_nom_p'] < UKB_THRESH

# Number of tested pairs and of UKB-significant pairs, overall and per group
tot_counts = {'Total': len(dat)}
sig_counts = {'Total': int(ukb_sig_mask.sum())}
for curr_grp in grp_ls:
    in_grp = dat['pl_std_lev'] == curr_grp
    tot_counts[curr_grp] = int(in_grp.sum())
    sig_counts[curr_grp] = int((in_grp & ukb_sig_mask).sum())

met_ls = []

# Total ORG tests, Count
for curr_grp, curr_n in tot_counts.items():
    met_ls.append(['Total ORG tests', 'Count', curr_grp, curr_n])

# UKB Sig, Count
for curr_grp, curr_n in sig_counts.items():
    met_ls.append(['UKB Sig', 'Count', curr_grp, curr_n])

# UKB Sig, Overlap / Percent: significant pairs out of all tested pairs per group
for curr_grp in grp_ls:
    sig_tests = sig_counts[curr_grp]
    tot_tests = tot_counts[curr_grp]

    met_ls.append(['UKB Sig', 'Overlap', curr_grp, f"{sig_tests} | {tot_tests}"])
    met_ls.append(['UKB Sig', 'Percent', curr_grp, _safe_frac(sig_tests, tot_tests)])

# Replication status rows
for curr_rep in rep_ls:

    curr_dat = dat.loc[dat['pl_rep_stat'] == curr_rep, :]

    # For did not attempt total tests is total org tests not just sig
    denoms = tot_counts if curr_rep == 'DNA' else sig_counts

    rep_counts = {'Total': len(curr_dat)}
    for curr_grp in grp_ls:
        rep_counts[curr_grp] = int((curr_dat['pl_std_lev'] == curr_grp).sum())

    for curr_grp, curr_n in rep_counts.items():
        curr_denom = denoms[curr_grp]

        met_ls.append([curr_rep, 'Count', curr_grp, curr_n])
        met_ls.append([curr_rep, 'Overlap', curr_grp, f"{curr_n} | {curr_denom}"])
        met_ls.append([curr_rep, 'Percent', curr_grp, _safe_frac(curr_n, curr_denom)])

# Build the table once so the index is a clean 0..n-1 range
mets = pd.DataFrame(met_ls, columns = MET_COL_LS)

# Rename some of our dfs
# NOTE: from here on `dat` is the metrics table and `res` holds the raw
# results, so this cell cannot be re-run without reloading `dat` first.
res = dat.copy(deep = True)
dat = mets.copy(deep = True)

In [None]:
# Convert percents to percents instead of decimals.
# Start from the untouched fraction table (`mets`) so that re-running this
# cell cannot multiply the values by 100 a second time.
dat = mets.copy(deep = True)
is_percent = dat['Metric'] == 'Percent'
dat.loc[is_percent, 'Value'] = dat.loc[is_percent, 'Value'] * 100

In [None]:
# Custom sort dict: maps each group name to its rank when rows are sorted by group
sort_dict = {
    grp_name: grp_rank
    for grp_rank, grp_name in enumerate(['Tier 1', 'Tier 2', 'Unknown', 'Exp Neg'])
}

In [None]:
# Total number of tests per group, as a {group name: count} dictionary
is_total_count = (dat['Result'] == 'Total ORG tests') & (dat['Metric'] == 'Count')

tmp = dat.loc[is_total_count, ['Group', 'Value']].set_index('Group')

tots = tmp['Value'].to_dict()

## Split data sources out

In [None]:
# UKB Res: Percents of total UKB tests that were sig or not
# TNX Res: Percents of significant UKB tests that were sig or not
UKB_RESULTS = ['UKB Sig', 'DNA']
TNX_RESULTS = ['CNR', 'DNR', 'REP']

ukb = dat[dat['Result'].isin(UKB_RESULTS)]
tnx = dat[dat['Result'].isin(TNX_RESULTS)]

## Plotting

In [None]:
# Per-group percent rows for each data source: drop the 'Total' group and any
# 'Total ORG tests' rows, then order the groups by `sort_dict`.
ukb_percs, tnx_percs = [
    src.loc[(src['Metric'] == 'Percent')
            & (src['Group'] != "Total")
            & (src['Result'] != "Total ORG tests")]
       .sort_values(by = ['Group'], key = lambda grp_col: grp_col.map(sort_dict))
    for src in (ukb, tnx)
]

### Split into our 2 categories

In [None]:
# Split out the data: one frame per UKB result type, same row order as `ukb_percs`
ukb_dna, ukb_sig = (
    ukb_percs[ukb_percs['Result'] == res_name] for res_name in ('DNA', 'UKB Sig')
)

### Build the data labels

#### UKB

In [None]:
# Ukb
# Only the 'UKB Sig' series is labelled
plt_order = ['UKB Sig']
ukb_x_labs = ukb_percs['Group'].unique().tolist()

# Raw "numerator | denominator" strings, one per group, in x-axis order
is_overlap = ukb['Metric'] == 'Overlap'
raw_ukb_labs = [
    ukb.loc[is_overlap & (ukb['Result'] == res_name) & (ukb['Group'] == grp_name),
            'Value'].values[0]
    for res_name in plt_order
    for grp_name in ukb_x_labs
]

# Add thousands separators to data labels
ukb_data_labs = []
for raw_lab in raw_ukb_labs:
    lab_parts = raw_lab.split("|")

    # int() tolerates the spaces left around each piece by the split
    num = f'{int(lab_parts[0]):,}'
    denom = f'{int(lab_parts[1]):,}'

    ukb_data_labs.append(f"{num} | {denom}")

#### TNX

In [None]:
# TNX labels
# Only the replicated ('REP') series is plotted
plt_order = ['REP']

tnx_x_labs = tnx_percs['Group'].unique().tolist()

tnx_counts = tnx[tnx['Metric'] == 'Count']

tnx_data_labs = []
rep_rows = []

# Build labels and data for plotting
for curr_grp_name in tnx_x_labs:
    grp_counts = tnx_counts[tnx_counts['Group'] == curr_grp_name]

    curr_dnr = grp_counts[grp_counts['Result'] == 'DNR']['Value'].values[0]
    curr_rep = grp_counts[grp_counts['Result'] == 'REP']['Value'].values[0]

    # Denominator is DNR + REP only; CNR pairs are not counted
    curr_tot = curr_dnr + curr_rep

    # A group with no DNR/REP pairs gets a 0% bar instead of a division error
    rep_perc = (curr_rep / curr_tot) * 100 if curr_tot else 0

    # Data label with thousands separators, e.g. "1,234 | 5,678"
    tnx_data_labs.append(f'{int(curr_rep):,} | {int(curr_tot):,}')

    # Percent row used for the bar heights
    rep_rows.append(['REP', 'Percent', curr_grp_name, rep_perc])

# Build the plotting frame once instead of appending row by row
tnx_rep = pd.DataFrame(rep_rows, columns = ['Result', 'Metric', 'Group', 'Value'])

### Build the plot

In [None]:
# Plot
# Styling shared by both bar charts: thick white edges visually separate the bars
shared_bar_kw = dict(bottom = 0, edgecolor = 'white', linewidth = 5)

# UKB bar plot
left_ukb_ax.bar(ukb_x_labs,
                height = ukb_sig['Value'],
                label = 'UKB Significant',
                color = UKB_COLOR,
                width = BAR_W,
                **shared_bar_kw)

# TNX bar plot
left_tnx_ax.bar(tnx_x_labs,
                height = tnx_rep['Value'],
                label = 'Replicated',
                color = TNX_COLOR,
                width = TNX_BAR_W,
                **shared_bar_kw)




# get our rectangles
ukb_rects = left_ukb_ax.patches
tnx_rects = left_tnx_ax.patches

# Vertical gap, in y-axis data units, between the top of a bar and its label
DATA_LABEL_Y_PAD = 2


def _label_bars(ax, rects, labels):
    """Write each label just above its bar and report the bar sizes.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes the bars live on; the text is added to it.
    rects : iterable of bar patches
        Bars to label; paired positionally with ``labels``.
    labels : iterable of str
        Text for each bar.

    Returns
    -------
    list of (height, width) tuples
        One entry per (bar, label) pair, including zero-height bars.
    """
    dims = []
    for curr_rect, curr_lab in zip(rects, labels):
        curr_height = curr_rect.get_height()
        curr_width = curr_rect.get_width()
        dims.append((curr_height, curr_width))

        # If the height is 0 then it doesn't actually show up
        # in plot and thus we don't want to label it.
        if curr_height == 0:
            continue

        # Horizontally centred on the bar, sitting just above its top
        ax.text(curr_rect.get_x() + (curr_width / 2),
                curr_height + DATA_LABEL_Y_PAD,
                curr_lab,
                color = 'black',
                ha = "center", va = "bottom",
                fontsize = TEXT_LABEL_FONT_SIZE)

    return dims


# Add the data labels
# https://stackoverflow.com/a/28931750
# UKB: also keep the bar sizes, the heights are needed for the dashed outline
ukb_dims = _label_bars(left_ukb_ax, ukb_rects, ukb_data_labs)
heights = [curr_dim[0] for curr_dim in ukb_dims]
widths = [curr_dim[1] for curr_dim in ukb_dims]

# TNX
_label_bars(left_tnx_ax, tnx_rects, tnx_data_labs)
                
# Same cosmetic treatment for both left-hand axes
for curr_ax in (left_ukb_ax, left_tnx_ax):

    # Open frame: hide the top and right spines
    for spine_side in ('top', 'right'):
        curr_ax.spines[spine_side].set_visible(False)

    # x-axis: labels underneath, no tick marks
    curr_ax.tick_params(axis = "x",
                        bottom = False, top = False,
                        labelbottom = True, labeltop = False,
                        labelsize = X_TICK_LABEL_FONT_SIZE)

    # y-axis: outward ticks and labels on the left only
    curr_ax.tick_params(axis = "y",
                        left = True, right = False,
                        labelleft = True, labelright = False,
                        labelsize = Y_TICK_LABEL_FONT_SIZE,
                        direction = 'out')


# Draw the dashed green lines around signif bars in UKB
coord_path = get_coord_path(heights)

# Unfilled polygon: only the dashed green edge is drawn
outline_kw = dict(alpha = 1,
                  linestyle = "dashed",
                  linewidth = DASHED_LWD,
                  ec = '#55a868',
                  fc = 'None')
path_poly = patches.Polygon(coord_path, **outline_kw)

left_ukb_ax.add_patch(path_poly)


# Draw arrow over from UKB results to TNX results
# Using workaround from:
# https://github.com/matplotlib/matplotlib/issues/17284#issuecomment-772820638
# The arrow is drawn as two patches sharing the same curve: a dashed,
# head-less line plus a separate filled arrow head.

ARROW_COLOR = '#55a868'
ARROW_CURVE = "arc3, rad = -0.25"

arrow_start = (ARROW_ST_X, ARROW_ST_Y)
arrow_stop = (ARROW_END_X, ARROW_END_Y)

# Dashed body of the arrow (no head)
arrow_line = Arrow(arrow_start, arrow_stop,
                   arrowstyle = '-',
                   shrinkA = 0, shrinkB = 5,
                   connectionstyle = ARROW_CURVE,
                   linestyle = "dashed",
                   linewidth = DASHED_LWD,
                   ec = ARROW_COLOR,
                   fc = ARROW_COLOR,
                   transform = left_ukb_ax.transAxes)

# Filled head only: zero line width, so just the head is visible
ar_style = patches.ArrowStyle.CurveFilledB(angleA = 0)
arrow_head = Arrow(arrow_start, arrow_stop,
                   arrowstyle = ar_style,
                   shrinkA = 0, shrinkB = 0,
                   connectionstyle = ARROW_CURVE,
                   linestyle = "solid",
                   linewidth = 0,
                   ec = None,
                   fc = ARROW_COLOR, mutation_scale = 50,
                   transform = left_ukb_ax.transAxes)

# Caption above the arrow
ar_text_va = 'center'
ar_text_ha = 'center'

ar_text_fd = dict(family = 'Arial', color = 'black', weight = 'normal', size = 14)

ar_text = left_ukb_ax.text(x = ARROW_TEXT_X,
                           y = ARROW_TEXT_Y,
                           s = "Test significant pairs\nfor replication",
                           ha = ar_text_ha,
                           va = ar_text_va,
                           fontdict = ar_text_fd,
                           transform = left_ukb_ax.transAxes)

# Attach both arrow patches to the figure rather than to a single axes
fig.patches.extend([arrow_head, arrow_line])



# Position (axes-fraction coordinates) and alignment of the data-source labels
test_label_x, test_label_y = 0.0, 1.08
test_label_ha = 'left'

# Data-source label fonts: identical apart from the colour, which matches the bars
ukb_label_font = dict(family = 'Arial', color = UKB_COLOR, weight = 'normal', size = 16)
tnx_label_font = dict(family = 'Arial', color = TNX_COLOR, weight = 'normal', size = 16)

# Font for the y-axis titles
Y_TITLE_FONT = dict(family = 'Arial', color = 'black', weight = 'normal', size = 15)

# Datasource labels
source_label_kw = dict(x = test_label_x, y = test_label_y,
                       va = 'bottom', ha = test_label_ha)

ukb_data_lab = left_ukb_ax.text(s = "UK Biobank: Discovery",
                                transform = left_ukb_ax.transAxes,
                                fontdict = ukb_label_font,
                                **source_label_kw)

tnx_data_lab = left_tnx_ax.text(s = "TriNetX: Replication",
                                transform = left_tnx_ax.transAxes,
                                fontdict = tnx_label_font,
                                **source_label_kw)

# y-axis titles
left_ukb_ax.set_ylabel('% Sig Disease-Pathogen of All Pairs Tested', fontdict = Y_TITLE_FONT)
left_tnx_ax.set_ylabel('% UKB Sig Pairs with TNX Data', fontdict = Y_TITLE_FONT)


# Two-line display names for the x tick labels
TIER_1_TEXT = 'Tier 1\nPositives'
TIER_2_TEXT = 'Tier 2\nPositives'
EXP_NEG_TEXT = 'Expected\nNegatives'

# Group name -> display text; groups not listed keep their current text
TICK_TEXT_BY_GROUP = {
    'Tier 1':  TIER_1_TEXT,
    'Tier 2':  TIER_2_TEXT,
    'Exp Neg': EXP_NEG_TEXT,
}

ukb_ticks = left_ukb_ax.get_xticklabels()
tnx_ticks = left_tnx_ax.get_xticklabels()

# Rewrite the text of the tick labels in place
for curr_tick in list(ukb_ticks) + list(tnx_ticks):
    new_tick_text = TICK_TEXT_BY_GROUP.get(curr_tick.get_text())
    if new_tick_text is not None:
        curr_tick.set_text(new_tick_text)

# Pin the tick positions first, then hand the edited labels back to the axes
ukb_ticks_loc = left_ukb_ax.get_xticks()
tnx_ticks_loc = left_tnx_ax.get_xticks()

left_ukb_ax.set_xticks(ukb_ticks_loc)
left_ukb_ax.set_xticklabels(ukb_ticks)

left_tnx_ax.set_xticks(tnx_ticks_loc)
left_tnx_ax.set_xticklabels(tnx_ticks)

# Same y-range on both axes, leaving a little room above 100
left_ukb_ax.set_ylim(0, 102)
left_tnx_ax.set_ylim(0, 102)

# Add in Phecode Results

## Load in data

In [None]:
# Phecode-level results workbook; the name is relative to the current working directory
PHECODE_RES_FILE = 'final_phecode_mcc1_res_with_merged_data_12_05_2024.xlsx'

# Read `phecode` as text rather than letting pandas infer a numeric type
dat = pd.read_excel(PHECODE_RES_FILE, dtype = {'phecode': str})

In [None]:
# Short plotting codes for each replication status
dat['pl_rep_stat'] = dat['rep_stat'].replace(dict(did_not_attempt = 'DNA',
                                                  replicated = 'REP',
                                                  did_not = 'DNR',
                                                  could_not = 'CNR'))

# Display names for the standard levels; rows with a missing level become 'Unknown'
dat['pl_std_lev'] = (
    dat['std_lev']
    .replace(dict(exp_neg = 'Exp Neg', unk = 'Unknown'))
    .fillna('Unknown')
)

In [None]:
# Phecode 71.1 is HIV infection, symptomatic, which should be a tier 1 with HIV but
# should be excluded from expected negatives. Also, we remove HIV the pathogen
# from Tier 1's/Expected negatives

# Tier 1 phecodes - the reverse would be expected negative

# | Phecode | Disease_Description        | Disease_Group       | ICD10(s) in PheCode     | ICDs not in ICD10-based Tier 1 | ICDs in ICD10-based Tier 1 | is_interesting | man_rev_interested | Tier 1 Pathogen | Notes                                                                                |
# |---------|----------------------------|---------------------|-------------------------|--------------------------------|----------------------------|----------------|--------------------|-----------------|--------------------------------------------------------------------------------------|
# | 053     | Herpes zoster              | infectious diseases | B02, G53                | G53                            | B02                        | Y              | Y                  | VZV             | G53.0: Postzoster neuralgia                                                          |
# | 054     | Herpes simplex             | infectious diseases | A60, B00, B08           | B08                            | A60, B00                   | Y              | Y                  | HSV1, HSV2      | B08.8: Other forms of   herpesviral infection                                        |
# | 070     | Viral   hepatitis          | infectious diseases | B17, B18, B19           | B17, B18                       | B19                        | Y              | Y                  | HBV, HCV        |                                                                                      |
# | 070.2   | Viral   hepatitis B        | infectious diseases | B16, B18                | B16, B18                       | -                          | Y              | Y                  | HBV             |                                                                                      |
# | 070.3   | Viral hepatitis C          | infectious diseases | B17, B18                | B17, B18                       | -                          | Y              | Y                  | HCV             |                                                                                      |
# | 070.9   | Hepatitis NOS              | infectious diseases | K71, K75, K76           | K71                            | K75, K76                   | Y              | Y                  | HBV, HCV        |                                                                                      |
# | 071     | HIV infection, symptomatic | infectious diseases | B20, B21, B22, B23, B24 | B20, B21, B22, B23             | B24                        | Y              | Y                  | HIV             | All codes mean they have HIV   infection, just usually indicate additional infection |
# | 078     | Viral warts & HPV          | infectious diseases | A63, B07                | A63, B07                       | -                          | Y              | Y                  | HPV16, HPV18    |                                                                                      |
# | 079.2   | Infectious mononucleosis   | infectious diseases | B27                     |                                | B27                        | Y              | Y                  | EBV             | Exact match B27 only                                                                 |

# Phecodes from the table above, kept as strings so leading zeros survive
tier_1_phecodes = [
    '053',    # Herpes zoster
    '054',    # Herpes simplex
    '070',    # Viral hepatitis
    '070.2',  # Viral hepatitis B
    '070.3',  # Viral hepatitis C
    '070.9',  # Hepatitis NOS
    '071',    # HIV infection, symptomatic
    '078',    # Viral warts & HPV
    '079.2',  # Infectious mononucleosis
]

# Row count of the loaded results
orig_len = dat.shape[0]

# Before: 21,900
print(f'Before: {orig_len}')

# All looks good!

In [None]:
# Sanity check: value counts (missing values included) of the raw and recoded columns.
# Counts recorded from an earlier run:
#
# rep_stat
# could_not            508
# did_not             1452
# did_not_attempt    15619
# replicated          1355
#
# pl_rep_stat
# CNR      508
# DNA    15619
# DNR     1452
# REP     1355
#
# pl_std_lev
# Exp Neg      104
# Tier 1        10
# Unknown    18820
for count_col in ('rep_stat', 'pl_rep_stat', 'pl_std_lev'):
    print(dat[count_col].value_counts(dropna = False).sort_index())

## Collect metrics

In [None]:
# met
# Build the long-format metrics table (`mets`) with one row per
# (Result, Metric, Group):
#   * Result: 'Total ORG tests', 'UKB Sig', or a replication status code
#   * Metric: 'Count', 'Overlap' ("numerator | denominator" text) or
#             'Percent' (0-100 scale)
#   * Group : 'Total' or a standard level

# Cutoff applied to the `ukb_per_dis_bh_fdr_corr_nom_p` column
UKB_THRESH = 0.3

MET_COL_LS = ['Result', 'Metric', 'Group', 'Value']
rep_ls = ['REP', 'DNR', 'DNA', 'CNR']

# Groups that get Overlap / Percent rows
grp_ls = ['Tier 1', 'Exp Neg', 'Unknown']

# Groups that get Count rows; 'Tier 2' is kept here so its count is still reported
count_grp_ls = ['Tier 1', 'Tier 2', 'Exp Neg', 'Unknown']


def _safe_pct(num, denom):
    """Return 100 * num / denom, or 0 when denom is 0 (empty group)."""
    return (num / denom) * 100 if denom else 0


ukb_sig_mask = dat['ukb_per_dis_bh_fdr_corr_nom_p'] < UKB_THRESH

# Number of tested pairs and of UKB-significant pairs, overall and per group
tot_counts = {'Total': len(dat)}
sig_counts = {'Total': int(ukb_sig_mask.sum())}
for curr_grp in count_grp_ls:
    in_grp = dat['pl_std_lev'] == curr_grp
    tot_counts[curr_grp] = int(in_grp.sum())
    sig_counts[curr_grp] = int((in_grp & ukb_sig_mask).sum())

met_ls = []

# Total ORG tests, Count
for curr_grp, curr_n in tot_counts.items():
    met_ls.append(['Total ORG tests', 'Count', curr_grp, curr_n])

# UKB Sig, Count
for curr_grp, curr_n in sig_counts.items():
    met_ls.append(['UKB Sig', 'Count', curr_grp, curr_n])

# UKB Sig, Overlap / Percent: significant pairs out of all tested pairs per group
for curr_grp in grp_ls:
    sig_tests = sig_counts[curr_grp]
    tot_tests = tot_counts[curr_grp]

    met_ls.append(['UKB Sig', 'Overlap', curr_grp, f"{sig_tests} | {tot_tests}"])
    met_ls.append(['UKB Sig', 'Percent', curr_grp, _safe_pct(sig_tests, tot_tests)])

# Replication status rows
for curr_rep in rep_ls:

    curr_dat = dat.loc[dat['pl_rep_stat'] == curr_rep, :]

    # For did not attempt total tests is total org tests not just sig
    denoms = tot_counts if curr_rep == 'DNA' else sig_counts

    rep_counts = {'Total': len(curr_dat)}
    for curr_grp in grp_ls:
        rep_counts[curr_grp] = int((curr_dat['pl_std_lev'] == curr_grp).sum())

    # The 'Total' row goes through the same guarded 0-100 computation as the
    # per-group rows, so every Percent value in the table shares one scale.
    for curr_grp, curr_n in rep_counts.items():
        curr_denom = denoms[curr_grp]

        met_ls.append([curr_rep, 'Count', curr_grp, curr_n])
        met_ls.append([curr_rep, 'Overlap', curr_grp, f"{curr_n} | {curr_denom}"])
        met_ls.append([curr_rep, 'Percent', curr_grp, _safe_pct(curr_n, curr_denom)])

# Build the table once so the index is a clean 0..n-1 range
mets = pd.DataFrame(met_ls, columns = MET_COL_LS)

# Rename some of our dfs
# NOTE: from here on `dat` is the metrics table and `res` holds the raw
# results, so this cell cannot be re-run without reloading `dat` first.
res = dat.copy(deep = True)
dat = mets.copy(deep = True)

In [None]:
# Cross-tabulate replication status within each standard level.
# Counts recorded from an earlier run:
#
# pl_std_lev  pl_rep_stat
# Exp Neg     CNR                3
#             DNA               86
#             DNR                9
#             REP                6
# Tier 1      DNA                2
#             REP                8
# Unknown     CNR              505
#             DNA            15531
#             DNR             1443
#             REP             1341
rep_stat_by_level = res.groupby('pl_std_lev')['pl_rep_stat']
rep_stat_by_level.value_counts(dropna = False).sort_index()

In [None]:
# Custom sort dict: maps each group name to its rank when rows are sorted by group
GROUP_DISPLAY_ORDER = ['Tier 1', 'Tier 2', 'Unknown', 'Exp Neg']
sort_dict = dict(zip(GROUP_DISPLAY_ORDER, range(len(GROUP_DISPLAY_ORDER))))

In [None]:
# Total number of tests per group, as a {group name: count} dictionary
total_count_rows = dat[(dat['Metric'] == 'Count') & (dat['Result'] == 'Total ORG tests')]

tmp = total_count_rows[['Group', 'Value']].set_index('Group')

tots = tmp['Value'].to_dict()

# tots: {'Total': 21900, 'Tier 1': 10, 'Tier 2': 0, 'Exp Neg': 104, 'Unknown': 21786}

### Split data into UKB Res and TNX res

In [None]:
# UKB Res: Percents of total UKB tests that were sig or not
# Tnx Res: Percents of significant UKB tests that were sig or not
result_col = dat['Result']

ukb = dat[result_col.isin(['UKB Sig', 'DNA'])]
tnx = dat[result_col.isin(['CNR', 'DNR', 'REP'])]

## Plotting

In [None]:
# Per-group percent rows for each data source: drop the 'Total' group and any
# 'Total ORG tests' rows, then order the groups by `sort_dict`.
pct_frames = {}
for src_name, src in (('ukb', ukb), ('tnx', tnx)):
    keep_row = ((src['Metric'] == 'Percent')
                & (src['Group'] != "Total")
                & (src['Result'] != "Total ORG tests"))

    pct_frames[src_name] = src.loc[keep_row].sort_values(
        by = ['Group'], key = lambda grp_col: grp_col.map(sort_dict))

ukb_percs = pct_frames['ukb']
tnx_percs = pct_frames['tnx']

### Split into our 2 categories

In [None]:
# Split the UKB percent rows by result code
ukb_dna = ukb_percs[ukb_percs['Result'].eq('DNA')]
ukb_sig = ukb_percs[ukb_percs['Result'].eq('UKB Sig')]

### Build the data labels

#### UKB

In [None]:
# UKB labels
# Only the 'UKB Sig' result is labelled.
plt_order = ['UKB Sig']

# Groups in the order they appear in the (already sorted) percent rows
ukb_x_labs = ukb_percs['Group'].unique().tolist()

# Raw overlap strings ("numerator|denominator"), one per result/group pair,
# taking the first matching 'Overlap' row each time.
overlap_rows = ukb.loc[ukb['Metric'] == 'Overlap']

raw_overlaps = [
    overlap_rows.loc[(overlap_rows['Result'] == result_name) &
                     (overlap_rows['Group'] == group_name), 'Value'].values[0]
    for result_name in plt_order
    for group_name in ukb_x_labs
]

# Add thousands separators to data labels
up_ukb_data_labs = []

for raw_lab in raw_overlaps:
    parts = raw_lab.split("|")

    # parts[0] is the numerator, parts[1] the denominator
    up_ukb_data_labs.append(f"{int(parts[0]):,} | {int(parts[1]):,}")

ukb_data_labs = up_ukb_data_labs

#### TNX

In [None]:
# TNX labels
# Only the replicated ('REP') share is plotted.
plt_order = ['REP']

# Groups in the order they appear in the (already sorted) percent rows
tnx_x_labs = tnx_percs['Group'].unique().tolist()

# Raw counts per group / result code
tnx_counts = tnx.loc[tnx['Metric'] == 'Count', ['Group', 'Result', 'Value']]

tnx_data_labs = []
tnx_rep_rows = []

# Build labels and data for plotting
for curr_grp in tnx_x_labs:
    grp_counts = tnx_counts.loc[tnx_counts['Group'] == curr_grp]

    # First matching row for each code; int() keeps the label formatting
    # below well-defined even if a count was stored as a float.
    curr_dnr = int(grp_counts.loc[grp_counts['Result'] == 'DNR', 'Value'].iloc[0])
    curr_rep = int(grp_counts.loc[grp_counts['Result'] == 'REP', 'Value'].iloc[0])

    # Denominator is DNR + REP only; CNR counts are not part of it.
    curr_tot = curr_dnr + curr_rep

    # Guard against groups with nothing to replicate (avoid dividing by 0)
    rep_perc = (curr_rep / curr_tot) * 100 if curr_tot else 0.0

    # Data label "replicated | total", with thousands separators
    tnx_data_labs.append(f"{curr_rep:,} | {curr_tot:,}")

    # Percent row used for the bar heights
    tnx_rep_rows.append(['REP', 'Percent', curr_grp, rep_perc])

# Build the plotting frame once instead of growing it row by row
tnx_rep = pd.DataFrame(tnx_rep_rows,
                       columns = ['Result', 'Metric', 'Group', 'Value'])

### Build the plot

In [None]:
# Plot
# Styling shared by both bar charts: bars start at 0 and get a thick white
# outline.
shared_bar_style = {
    'bottom'    : 0,
    'edgecolor' : 'white',
    'linewidth' : 5,
}

# UKB: one bar per group, height taken from the 'UKB Sig' percent rows
right_ukb_ax.bar(ukb_x_labs, ukb_sig['Value'],
                 width = BAR_W,
                 color = UKB_COLOR,
                 label = 'UKB Significant',
                 **shared_bar_style)

# TNX: one bar per group, height taken from the 'REP' percent rows
right_tnx_ax.bar(tnx_x_labs, tnx_rep['Value'],
                 width = TNX_BAR_W,
                 color = TNX_COLOR,
                 label = 'Replicated',
                 **shared_bar_style)

# get our rectangles
ukb_rects = right_ukb_ax.patches
tnx_rects = right_tnx_ax.patches

# Add the data labels
# https://stackoverflow.com/a/28931750

# Vertical gap (in y data units) between the top of a bar and its label
LABEL_Y_PAD = 2

# Heights / widths of the UKB bars are recorded while labelling; the heights
# are used further down to draw the dashed outline (get_coord_path).
heights = []
widths = []

# (axis, bars, labels, record UKB dimensions?)
label_targets = [
    (right_ukb_ax, ukb_rects, ukb_data_labs, True),
    (right_tnx_ax, tnx_rects, tnx_data_labs, False),
]

for label_ax, rects, labs, keep_dims in label_targets:
    for curr_rect, curr_lab in zip(rects, labs):
        curr_height = curr_rect.get_height()
        curr_width  = curr_rect.get_width()

        if keep_dims:
            heights.append(curr_height)
            widths.append(curr_width)

        # If the height is 0 then it doesn't actually show up
        # in plot and thus we don't want to label it.
        if curr_height == 0:
            continue

        # Centre the label horizontally on the bar and sit it just above
        # the bar top (bars start at 0, so the top is the height).
        label_ax.text(curr_rect.get_x() + (curr_width / 2),
                      curr_height + LABEL_Y_PAD,
                      curr_lab,
                      color = 'black',
                      ha = "center", va = "bottom",
                      fontsize = TEXT_LABEL_FONT_SIZE)
                
# Tick styling applied identically to both panels.
# x: no tick marks, labels along the bottom only
x_tick_style = dict(axis = "x",
                    bottom = False, top = False,
                    labelbottom = True, labeltop = False,
                    labelsize = X_TICK_LABEL_FONT_SIZE)

# y: outward tick marks and labels on the left only
y_tick_style = dict(axis = "y", left = True, right = False,
                    labelleft = True, labelright = False,
                    labelsize = Y_TICK_LABEL_FONT_SIZE,
                    direction = 'out')

for curr_ax in (right_ukb_ax, right_tnx_ax):
    # Hide the top and right spines
    for side in ('top', 'right'):
        curr_ax.spines[side].set_visible(False)

    curr_ax.tick_params(**x_tick_style)
    curr_ax.tick_params(**y_tick_style)

# Green used for the dashed outline and for the arrow
dash_green = '#55a868'

# Draw the dashed green lines around signif bars in UKB
coord_path = get_coord_path(heights)

path_poly = patches.Polygon(coord_path,
                            alpha = 1,
                            linestyle = "dashed",
                            linewidth = DASHED_LWD,
                            ec = dash_green,
                            fc = 'None')

right_ukb_ax.add_patch(path_poly)


# Draw arrow over from UKB results to TNX results
# Using workaround from: https://github.com/matplotlib/matplotlib/issues/17284#issuecomment-772820638
#
# The arrow is assembled from two patches sharing the same end points and
# curvature: a dashed curve without a head, and a solid filled head.
arrow_ends = ((ARROW_ST_X, ARROW_ST_Y), (ARROW_END_X, ARROW_END_Y))

arrow_shared = {
    'connectionstyle' : "arc3, rad = -0.25",
    'fc'              : dash_green,
    'transform'       : right_ukb_ax.transAxes,
}

# Dashed curve (no head)
arrow_line = Arrow(*arrow_ends,
                   arrowstyle = '-',
                   shrinkA = 0, shrinkB = 5,
                   linestyle = "dashed",
                   linewidth = DASHED_LWD,
                   ec = dash_green,
                   **arrow_shared)

# Filled head, drawn without an outline
ar_style = patches.ArrowStyle.CurveFilledB(angleA = 0)
arrow_head = Arrow(*arrow_ends,
                   arrowstyle = ar_style,
                   shrinkA = 0, shrinkB = 0,
                   linestyle = "solid",
                   linewidth = 0,
                   ec = None,
                   mutation_scale = 50,
                   **arrow_shared)


# Caption next to the arrow, positioned in UKB axes coordinates
ar_text_va = 'center'
ar_text_ha = 'center'

ar_text_fd = dict(family = 'Arial',
                  color = 'black',
                  weight = 'normal',
                  size = 14)

ar_text = right_ukb_ax.text(ARROW_TEXT_X,
                            ARROW_TEXT_Y,
                            "Test significant pairs\nfor replication",
                            ha = ar_text_ha,
                            va = ar_text_va,
                            fontdict = ar_text_fd,
                            transform = right_ukb_ax.transAxes)

# Both arrow pieces are attached to the figure rather than to an axis
fig.patches.extend([arrow_head, arrow_line])



# Position (axes coordinates) and alignment of the data-source labels
test_label_x = 0.0
test_label_y = 1.08
test_label_ha = 'left'

# All three fonts are variations of one base definition
base_label_font = {
            'family': 'Arial',
            'color':  'black',
            'weight': 'normal',
            'size': 16,
}

ukb_label_font = {**base_label_font, 'color': UKB_COLOR}
tnx_label_font = {**base_label_font, 'color': TNX_COLOR}
Y_TITLE_FONT = {**base_label_font, 'size': 15}

# Datasource labels
source_label_kw = dict(x = test_label_x, y = test_label_y,
                       va = 'bottom', ha = test_label_ha)

ukb_data_lab = right_ukb_ax.text(s = "UK Biobank: Discovery",
                                 transform = right_ukb_ax.transAxes,
                                 fontdict = ukb_label_font,
                                 **source_label_kw)

tnx_data_lab = right_tnx_ax.text(s = "TriNetX: Replication",
                                 transform = right_tnx_ax.transAxes,
                                 fontdict = tnx_label_font,
                                 **source_label_kw)

# y-axis titles
y_titles = (
    (right_ukb_ax, '% Sig Disease-Pathogen of All Pairs Tested'),
    (right_tnx_ax, '% UKB Sig Pairs with TNX Data'),
)

for titled_ax, y_title in y_titles:
    titled_ax.set_ylabel(y_title, fontdict = Y_TITLE_FONT)


TIER_1_TEXT = 'Tier 1\nPositives'
TIER_2_TEXT = 'Tier 2\nPositives'
EXP_NEG_TEXT = 'Expected\nNegatives'

# Short group name -> two-line tick label. Any group not listed here keeps
# its current text.
tick_text_by_group = {
    'Tier 1'  : TIER_1_TEXT,
    'Tier 2'  : TIER_2_TEXT,
    'Exp Neg' : EXP_NEG_TEXT,
}

ukb_ticks = right_ukb_ax.get_xticklabels()
tnx_ticks = right_tnx_ax.get_xticklabels()

# Swap in the long-form text on the tick label objects of both panels
for curr_tick in [*ukb_ticks, *tnx_ticks]:
    replacement = tick_text_by_group.get(curr_tick.get_text())

    if replacement is not None:
        curr_tick.set_text(replacement)

# Re-apply the current tick positions first, then the edited labels
ukb_ticks_loc = right_ukb_ax.get_xticks()
tnx_ticks_loc = right_tnx_ax.get_xticks()

right_ukb_ax.set_xticks(ukb_ticks_loc)
right_tnx_ax.set_xticks(tnx_ticks_loc)

right_ukb_ax.set_xticklabels(ukb_ticks)
right_tnx_ax.set_xticklabels(tnx_ticks)

# Same y-range on both panels
right_ukb_ax.set_ylim(0, 102)
right_tnx_ax.set_ylim(0, 102)

## Last-minute touch-ups and save

### Add panel letters

In [None]:
# Add the letters on, nature comm med wants lowercase
panel_font = {'size' : 20, 'weight' : 'bold'}

TOP_Y  = 0.985
X_ADJUST = 0.02


def _left_edge_in_fig(ax):
    """Return the figure-coordinate x of ``ax``'s lower x-limit (at y = 0)."""
    data_to_fig = ax.transData + ax.figure.transFigure.inverted()
    return data_to_fig.transform((ax.get_xlim()[0], 0))[0]


# Each letter sits slightly to the left of its panel's left edge
LEFT_X = _left_edge_in_fig(left_ukb_ax) - X_ADJUST      # panel a
RIGHT_X = _left_edge_in_fig(right_ukb_ax) - X_ADJUST    # panel b

# Add the letters
for panel_letter, panel_x in (('a', LEFT_X), ('b', RIGHT_X)):
    fig.text(x = panel_x, y = TOP_Y, s = panel_letter, fontdict = panel_font)

fig

### Save the file

In [None]:
# Folder the manuscript figures are written to
out_dir = f'{HOME_DIR}/manuscript/figures'

# Make sure the folder exists so savefig does not fail on a fresh checkout
os.makedirs(out_dir, exist_ok = True)

# Write the figure as SVG, cropping surrounding whitespace
fn = f"{out_dir}/Figure_2.svg"
fig.savefig(fn, format = 'svg', dpi = 600, bbox_inches="tight")