## Summary results
For each of the four experiments (binary predictor/binary outcome; prescription count/binary outcome; binary predictor/age onset; prescription count/age onset):
1. Range of sample sizes
2. Range of % of each sex (can just show female)
3. Range of mean age
4. Range of odds ratios 
5. % statistically significant after correction

In [11]:
import pandas as pd
import os
import numpy as np
import statsmodels.api as sm

def importdata(ehr, predictor, outcome):
    """Load, filter and annotate the per-period VoE summary tables.

    Reads every ``voe_<start>_<start+3>.csv`` file (start years 1989-2019)
    that exists under the directory selected by the three arguments,
    concatenates them, drops exact duplicate rows, restricts the enrollment
    years, and adds derived columns.  The table shape is printed four times
    along the way (after loading, after de-duplication, before filtering and
    after filtering) as a quick sanity check.

    Parameters
    ----------
    ehr : str
        'sinai' or 'ukb'
    predictor : str
        'binary', 'prescription' or 'aud'
    outcome : str
        'binary' or 'age_onset'

    Returns
    -------
    df : pandas.DataFrame
        The filtered table with these columns added: ``total_N``,
        ``bonferroni``, ``bh_p``, ``opi_percent_ncd``, ``con_percent_ncd``,
        ``total_female%`` and ``total_mean_age``.  Unless ``outcome`` is
        'age_onset', ``coef``, ``.025`` and ``.975`` are exponentiated.

    Raises
    ------
    ValueError
        If any argument is not one of the accepted values, or if no period
        summary file exists under the resolved directory.
    """
    ehr_dirs = {'sinai': 'MSDW1794_V3', 'ukb': 'ukbiobank'}
    predictor_dirs = {
        'binary': 'binary_exposure',
        'prescription': 'prescription_count',
        'aud': 'aud',
    }
    outcome_dirs = {'binary': 'binary_outcome', 'age_onset': 'age_onset_ncd'}

    # Validate in the same order as before: EHR, then predictor, then outcome.
    if ehr not in ehr_dirs:
        raise ValueError('Invalid EHR: %s' % ehr)
    if predictor not in predictor_dirs:
        raise ValueError('Invalid predictor: %s' % predictor)
    if outcome not in outcome_dirs:
        raise ValueError('Invalid outcome: %s' % outcome)

    file_ehr = ehr_dirs[ehr]
    file_pred = predictor_dirs[predictor]
    file_out = outcome_dirs[outcome]

    if predictor == 'aud':
        # The AUD analyses use a different control-variable folder per EHR.
        control_var = 'controlVarOUD' if ehr == 'sinai' else 'controlVarSUD'
        path = f"{file_ehr}/voe_outputs/aud/controlsNoAUDDX/{file_out}/{control_var}/analyses/period_summaries/"
    else:
        path = f"{file_ehr}/voe_outputs/opioids/controlsLessThan3Opioids/{file_pred}/{file_out}/controlVarOUD/analyses/period_summaries/"

    # Collect whichever enrollment-period files are present; missing periods
    # are skipped silently.
    datasets = []
    for enrollment_year in range(1989, 2020):
        csv_path = path + f'voe_{enrollment_year}_{enrollment_year + 3}.csv'
        if os.path.exists(csv_path):
            datasets.append(pd.read_csv(csv_path))

    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # say which directory was searched instead.
    if not datasets:
        raise ValueError(
            f'No period summary files (voe_<start>_<start+3>.csv) found under {path!r}'
        )

    allexpts = pd.concat(datasets)
    print(allexpts.shape)

    # check for duplicate experiments
    allexpts = allexpts.drop_duplicates()
    print(allexpts.shape)

    # remove years with low sample sizes
    print(allexpts.shape)
    if ehr == 'sinai':
        allexpts = allexpts[allexpts.start_enroll >= 2008]
    else:  # 'ukb'
        allexpts = allexpts[(allexpts.start_enroll >= 2004) & (allexpts.start_enroll <= 2010)]
    if predictor != 'aud':
        allexpts = allexpts[allexpts['hx_MAT'] == 0]

    # Work on an independent copy so the column assignments below do not
    # write into a filtered view (avoids SettingWithCopyWarning).
    allexpts = allexpts.copy()
    print(allexpts.shape)

    # create total N column
    allexpts['total_N'] = allexpts['control_N'] + allexpts['opioid_N']

    # Exponentiate the coefficient and its interval bounds for every outcome
    # except age of onset (presumably log-odds -> odds ratios, per the
    # original "OR confidence intervals" comment).
    if outcome != 'age_onset':
        for column in ('coef', '.025', '.975'):
            allexpts[column] = np.exp(allexpts[column])

    # correct for multiple comparisons; element [1] of the result holds the
    # adjusted p-values
    allexpts['bonferroni'] = sm.stats.multipletests(allexpts['p'], alpha=0.05, method='bonferroni')[1]
    allexpts['bh_p'] = sm.stats.multipletests(allexpts['p'], alpha=0.05, method='fdr_bh')[1]

    # percentage of NCD for each group
    allexpts['opi_percent_ncd'] = 100 * (allexpts['num_opioid_ncd'] / allexpts['opioid_N'])
    allexpts['con_percent_ncd'] = 100 * (allexpts['num_control_ncd'] / allexpts['control_N'])

    # N-weighted female share for the total sample
    allexpts['total_female%'] = (
        (allexpts['control_female%'] * allexpts['control_N'])
        + (allexpts['opioid_female%'] * allexpts['opioid_N'])
    ) / allexpts['total_N']

    # N-weighted mean age for the total sample
    allexpts['total_mean_age'] = (
        (allexpts['control_AgeMean'] * allexpts['control_N'])
        + (allexpts['opioid_AgeMean'] * allexpts['opioid_N'])
    ) / allexpts['total_N']

    return allexpts

def get_summary(predictor, outcome):
    """Print summary ranges for one experiment pooled over both EHRs.

    Loads the Sinai and UKB tables for the given predictor/outcome via
    ``importdata``, stacks them (Sinai rows first), and prints the min/max
    of total N, female share and mean age, the min/max/median coefficient,
    the fraction of rows with ``bh_p < 0.05`` and the median ``bh_p``.

    Parameters
    ----------
    predictor : str
        Predictor name forwarded to ``importdata``.
    outcome : str
        Outcome name forwarded to ``importdata``.

    Returns
    -------
    pandas.DataFrame
        The stacked Sinai + UKB table.
    """
    pooled = pd.concat(
        [importdata(source, predictor, outcome) for source in ('sinai', 'ukb')]
    )

    sample_n = pooled.total_N
    female_share = pooled['total_female%']
    mean_age = pooled['total_mean_age']
    coefs = pooled.coef
    adjusted_p = pooled.bh_p

    # Each '\n' is passed as its own print argument, so every report line
    # after the first starts with the separator space.
    report = [
        predictor, outcome, '\n',
        'N', min(sample_n), max(sample_n), '\n',
        'Female %', min(female_share), max(female_share), '\n',
        'Mean age', min(mean_age), max(mean_age), '\n',
        'Coef', f'Min: {min(coefs)},', f'Max: {max(coefs)},', f'Median: {np.median(coefs)}', '\n',
        '% significant', sum(adjusted_p < 0.05) / len(pooled), f'Median p: {np.median(adjusted_p)}', '\n',
    ]
    print(*report)
    return pooled

def update_figures(predictor, outcome):
    """Run ``get_summary`` for the given predictor/outcome.

    This is currently a stub: it only triggers the summary (and its printed
    output), produces no figure, and returns ``None``.
    """
    get_summary(predictor, outcome)

In [12]:
# Summarise the 'aud' predictor / 'binary' outcome experiments (Sinai and UKB pooled).
get_summary('aud', 'binary')

(156, 23)
(156, 23)
(156, 23)
(156, 23)
(312, 23)
(312, 23)
(312, 23)
(312, 23)
aud binary 
 N 1778 416483 
 Female % 0.5113524448348372 0.6766005988877798 
 Mean age 49.70500071032817 76.96312695823299 
 Coef Min: 7.452639268664084e-09, Max: 37.0626740818265, Median: 5.087230434612368 
 % significant 0.8611111111111112 Median p: 2.5838293663917333e-11 



  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,control_N,opioid_N,control_AgeMean,control_AgeSD,opioid_AgeMean,opioid_AgeSD,control_male%,control_female%,opioid_male%,opioid_female%,...,ncd_age_threshold,hx_tobacco,hx_sud_covar,total_N,bonferroni,bh_p,opi_percent_ncd,con_percent_ncd,total_female%,total_mean_age
0,85440,624,65.221278,12.625339,58.544872,10.043554,0.437769,0.562207,0.682692,0.317308,...,45,0,0,86064,4.384900e-13,6.264144e-15,4.967949,2.297519,0.560432,65.172871
1,85440,624,65.221278,12.625339,58.544872,10.043554,0.437769,0.562207,0.682692,0.317308,...,45,0,1,86064,1.345582e-08,1.140324e-10,4.967949,2.297519,0.560432,65.172871
2,85440,624,65.221278,12.625339,58.544872,10.043554,0.437769,0.562207,0.682692,0.317308,...,45,1,0,86064,1.754215e-09,1.654920e-11,4.967949,2.297519,0.560432,65.172871
3,85440,624,65.221278,12.625339,58.544872,10.043554,0.437769,0.562207,0.682692,0.317308,...,45,1,1,86064,1.202959e-06,8.654378e-09,4.967949,2.297519,0.560432,65.172871
4,62174,326,70.672821,10.338272,66.030675,8.269556,0.442082,0.557886,0.668712,0.331288,...,55,0,0,62500,4.651786e-12,5.814733e-14,8.282209,3.004471,0.556704,70.648608
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,158881,1991,66.251326,5.392068,65.681065,5.469264,0.448166,0.551834,0.721246,0.278754,...,55,1,1,160872,4.439210e-28,1.008911e-29,4.068307,0.623108,0.548455,66.244269
8,91995,1050,70.136779,2.885819,70.085714,2.841912,0.469232,0.530768,0.739048,0.260952,...,65,0,0,93045,3.412946e-36,1.264054e-37,5.619048,0.942443,0.527723,70.136203
9,91995,1050,70.136779,2.885819,70.085714,2.841912,0.469232,0.530768,0.739048,0.260952,...,65,0,1,93045,6.376925e-19,7.872747e-21,5.619048,0.942443,0.527723,70.136203
10,91995,1050,70.136779,2.885819,70.085714,2.841912,0.469232,0.530768,0.739048,0.260952,...,65,1,0,93045,2.565139e-32,7.773150e-34,5.619048,0.942443,0.527723,70.136203


In [21]:
# NOTE(review): this call passes three arguments (ehr, predictor, outcome), but
# get_summary as defined above takes only (predictor, outcome). The recorded
# output below appears to come from an earlier version - confirm before re-running.
get_summary('ukb', 'prescription', 'binary')

(12016, 28)
(12016, 28)
(12016, 28)
(2112, 28)
ukb prescription binary 
 N 16599 186540 
 Female % 0.500210830672851 0.5703712579915274 
 Mean age 58.098693792099034 69.6107443931923 
 Coef 0.9974383569534021 1.0047918534092297 



In [8]:
# Load the four Sinai experiment tables: each predictor (binary exposure,
# prescription count) crossed with each outcome (binary, age of onset).
sinai_bin_binary = importdata('sinai', 'binary', 'binary')
sinai_rx_binary = importdata('sinai', 'prescription', 'binary')
sinai_bin_age = importdata('sinai', 'binary', 'age_onset')
sinai_rx_age = importdata('sinai', 'prescription', 'age_onset')

(5184, 28)
(5184, 28)
(5184, 28)
(2592, 28)
(5184, 28)
(5184, 28)
(5184, 28)
(2592, 28)
(4736, 28)
(4736, 28)
(4736, 28)
(2368, 28)
(4736, 28)
(4736, 28)
(4736, 28)
(2368, 28)


In [7]:
# Display the Sinai binary-predictor / binary-outcome table.
sinai_bin_binary

Unnamed: 0,control_N,opioid_N,control_AgeMean,control_AgeSD,opioid_AgeMean,opioid_AgeSD,control_male%,control_female%,opioid_male%,opioid_female%,...,hx_tobacco,hx_sud_covar,hx_MAT,bonferroni,bh_p,total_N,opi_percent_ncd,con_percent_ncd,total_female%,total_mean_age
0,137544,1979,64.895786,12.487435,63.212228,11.962694,0.440514,0.559436,0.325417,0.674583,...,0,0,0,0.000020,1.585663e-08,139523,2.981304,1.583493,0.561069,64.871906
2,137544,1979,64.895786,12.487435,63.212228,11.962694,0.440514,0.559436,0.325417,0.674583,...,0,1,0,0.000110,8.228837e-08,139523,2.981304,1.583493,0.561069,64.871906
4,137544,1979,64.895786,12.487435,63.212228,11.962694,0.440514,0.559436,0.325417,0.674583,...,1,0,0,0.000512,3.454060e-07,139523,2.981304,1.583493,0.561069,64.871906
6,137544,1979,64.895786,12.487435,63.212228,11.962694,0.440514,0.559436,0.325417,0.674583,...,1,1,0,0.001709,1.060986e-06,139523,2.981304,1.583493,0.561069,64.871906
8,137544,1979,64.895786,12.487435,63.212228,11.962694,0.440514,0.559436,0.325417,0.674583,...,0,0,0,0.000312,2.168458e-07,139523,2.981304,1.583493,0.561069,64.871906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
566,174391,1164,76.260661,7.927970,74.903780,7.712913,0.432780,0.567202,0.345361,0.654639,...,1,1,0,1.000000,7.634795e-04,175555,4.810997,2.923316,0.567782,76.251665
568,174391,1164,76.260661,7.927970,74.903780,7.712913,0.432780,0.567202,0.345361,0.654639,...,0,0,0,0.177131,8.253990e-05,175555,4.810997,2.923316,0.567782,76.251665
570,174391,1164,76.260661,7.927970,74.903780,7.712913,0.432780,0.567202,0.345361,0.654639,...,0,1,0,1.000000,4.555796e-04,175555,4.810997,2.923316,0.567782,76.251665
572,174391,1164,76.260661,7.927970,74.903780,7.712913,0.432780,0.567202,0.345361,0.654639,...,1,0,0,0.567220,2.555044e-04,175555,4.810997,2.923316,0.567782,76.251665
