In [4]:
from statsmodels.stats.meta_analysis import combine_effects
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import warnings

In [5]:
# Work from the Meta_Analysis project directory so the relative paths used in
# this notebook resolve. Set the ENDO_META_ANALYSIS_DIR environment variable to
# run against a different checkout; the default is the original location.
PROJECT_DIR = os.environ.get(
    'ENDO_META_ANALYSIS_DIR',
    '/project/ssverma_shared/projects/Endometriosis/Endo_Subtyping_Heterogeneity/Meta_Analysis/',
)
os.chdir(PROJECT_DIR)

In [6]:
# Dry-run (-n) the Snakemake `all` target in the current directory; the output
# reports whether the requested files are already present and up to date.
! snakemake -n all --quiet

[33mBuilding DAG of jobs...[0m
[33mNothing to be done (all requested files are present and up to date).[0m


In [7]:
# Phenotypes to meta-analyse: the overall endometriosis phenotype followed by
# one "cluster vs controls" phenotype for each of the clusters 0-4.
phenos = ['endometriosis']
phenos.extend(f'cluster_vs_controls_{cluster}' for cluster in range(5))

In [8]:
# Map every phenotype to the summary-statistics files found on disk:
#   pheno_ss          - all cohorts
#   pheno_ss_AFR_EUR  - AFR and EUR cohorts only
#   pheno_ss_EUR      - EUR cohorts only
pheno_ss = {p: [] for p in phenos}
pheno_ss_AFR_EUR = {p: [] for p in phenos}
pheno_ss_EUR = {p: [] for p in phenos}

for d in ['eMERGE', 'UKBB', 'PMBB']:
    for a in ['AFR', 'EUR', 'ASIAN']:
        # The eMERGE/ASIAN combination is not read.
        if d == 'eMERGE' and a == 'ASIAN':
            continue
        ss_dir = f'{d}_{a}/Sumstats/'
        # Sort the listing so the result does not depend on filesystem order.
        for f in sorted(os.listdir(ss_dir)):
            # The phenotype is the file name up to its first dot.
            file_pheno = f.split('.')[0]
            if file_pheno not in pheno_ss:
                # Hidden files (empty prefix) or files for phenotypes that are
                # not listed in `phenos` would otherwise raise a KeyError.
                continue
            ss_path = f'{ss_dir}{f}'
            pheno_ss[file_pheno].append(ss_path)
            if a != 'ASIAN':
                pheno_ss_AFR_EUR[file_pheno].append(ss_path)
            if a == 'EUR':
                pheno_ss_EUR[file_pheno].append(ss_path)

In [9]:
def _read_pmbb_id_map(map_path):
    """Read a tab-delimited ID map and return its PMBB_ID column indexed by ID."""
    id_table = pd.read_table(map_path, index_col=['ID'])
    return id_table['PMBB_ID']


# Series used to translate eMERGE / UKBB variant IDs into PMBB variant IDs.
eMERGE_id_map = _read_pmbb_id_map('SNP_Lists/eMERGE_to_PMBB_ID_map.txt')
UKBB_id_map = _read_pmbb_id_map('SNP_Lists/UKBB_to_PMBB_ID_map.txt')

In [10]:
os.makedirs('Meta_Input/', exist_ok=True)

for p in phenos:
    print(p)
    effs = pd.DataFrame()
    errs = pd.DataFrame()
    alleles = pd.DataFrame()

    # Load data
    for ss_file in pheno_ss[p]:
        ss_dataset = ss_file.split('_')[0]
        ss_cohort = ss_file.split('/')[0]
        temp = pd.read_table(ss_file, index_col='ID', nrows=None)
        temp = temp[temp['AF_Allele2'] >= 0.01]
        temp = temp[['BETA', 'Allele2', 'SE']].dropna()

        if ss_dataset == 'eMERGE':
            temp = temp[temp.index.isin(eMERGE_id_map.index)]
            temp = temp.rename(index=eMERGE_id_map)
        if ss_dataset == 'UKBB':
            temp = temp[temp.index.isin(UKBB_id_map.index)]
            temp = temp.rename(index=UKBB_id_map)

        effs[ss_cohort] = temp['BETA']
        errs[ss_cohort] = temp['SE']
        alleles[ss_cohort] = temp['Allele2']

    # Now we have data to subset for meta-analysis
    effs = effs[effs.count(axis=1) > 1]
    errs = errs.loc[effs.index]
    alleles = alleles.loc[effs.index]

    use_allele = alleles.mode(axis=1)[0]
    use_allele_2D = np.broadcast_to(use_allele.T, alleles.T.shape).T
    flip_coords = np.where(alleles.values != use_allele_2D)
    effs.values[flip_coords] = -effs.values[flip_coords]

    for cohort, effect_col in effs.iteritems():
        table = pd.concat([effect_col, errs[cohort], use_allele], axis=1)
        table.columns = ['BETA', 'SE', 'A1']
        table.index.name = 'SNP'
        table['OR'] = np.exp(table['BETA'])
        table = table.dropna().reset_index()
        table[['CHR', 'BP']] = table['SNP'].str.replace('chr', '').str.split(':', expand=True)[[0, 1]]
        table[['CHR', 'BP', 'SNP', 'OR', 'SE', 'A1']].to_csv(f'Meta_Input/{cohort}.{p}.txt', sep='\t', index=False)

    # Use different combinations of groups
    for group, ss_dict in zip(['ALL', 'AFR_EUR', 'EUR'], [pheno_ss, pheno_ss_AFR_EUR, pheno_ss_EUR]):
        print('\t', group)
        ss_group = ss_dict[p]
        cohorts = [f.split('/')[0] for f in ss_group]

        plink_cmd_parts = ['plink', '--meta-analysis']
        plink_cmd_parts.extend([f'Meta_Input/{cohort}.{p}.txt' for cohort in cohorts])
        plink_cmd_parts.extend(['--out', f'Meta_Output/{p}.{group}.PLINK'])
        cmd = ' '.join(plink_cmd_parts)

        ! module load plink/1.90Beta6.18; {cmd}



endometriosis
	 ALL
PLINK v1.90b6.18 64-bit (16 Jun 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Meta_Output/endometriosis.ALL.PLINK.log.
Options in effect:
  --meta-analysis Meta_Input/eMERGE_AFR.endometriosis.txt Meta_Input/eMERGE_EUR.endometriosis.txt Meta_Input/UKBB_AFR.endometriosis.txt Meta_Input/UKBB_EUR.endometriosis.txt Meta_Input/UKBB_ASIAN.endometriosis.txt Meta_Input/PMBB_AFR.endometriosis.txt Meta_Input/PMBB_EUR.endometriosis.txt Meta_Input/PMBB_ASIAN.endometriosis.txt
  --out Meta_Output/endometriosis.ALL.PLINK

128235 MB RAM detected; reserving 64117 MB for main workspace.
--meta-analysis: 748 variants processed; results written to
Meta_Output/endometriosis.ALL.PLINK.meta .
	 AFR_EUR
PLINK v1.90b6.18 64-bit (16 Jun 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Meta_Output/endomet

In [11]:
# For every phenotype and cohort grouping, run a per-variant fixed-effect and
# random-effects meta-analysis with statsmodels and write the results to
# Meta_Output/<phenotype>.<group>.{FE,RE}.csv.gz.
os.makedirs('Meta_Output/', exist_ok=True)

for p in phenos:
    print(p)
    if not pheno_ss[p]:
        print('\t', 'no summary statistics files; skipping')
        continue

    cohort_effs = {}
    cohort_vars = {}
    cohort_alleles = {}

    # Load data
    for ss_file in pheno_ss[p]:
        # Paths look like '<dataset>_<ancestry>/Sumstats/<file>'.
        ss_dataset = ss_file.split('_')[0]
        ss_cohort = ss_file.split('/')[0]
        temp = pd.read_table(ss_file, index_col='ID', nrows=None)
        # combine_effects() expects the variance of each effect estimate,
        # i.e. SE**2. Scaling it by the sample size (SE**2 * N) inflates every
        # confidence interval and distorts the inverse-variance weights.
        temp['VAR'] = temp['SE'] ** 2
        temp = temp[temp['AF_Allele2'] >= 0.01]
        temp = temp[['BETA', 'VAR', 'Allele2']].dropna()
        # Translate dataset-specific variant IDs to PMBB IDs; variants without
        # a mapping are dropped.
        if ss_dataset == 'eMERGE':
            temp = temp[temp.index.isin(eMERGE_id_map.index)]
            temp = temp.rename(index=eMERGE_id_map)
        if ss_dataset == 'UKBB':
            temp = temp[temp.index.isin(UKBB_id_map.index)]
            temp = temp.rename(index=UKBB_id_map)

        # Alignment across cohorts needs unique IDs; keep the first occurrence.
        temp = temp[~temp.index.duplicated(keep='first')]

        cohort_effs[ss_cohort] = temp['BETA']
        cohort_vars[ss_cohort] = temp['VAR']
        cohort_alleles[ss_cohort] = temp['Allele2']

    # Outer-join the cohorts on variant ID. Assigning the Series one by one
    # into an empty DataFrame would pin the index to the first cohort and
    # silently drop every variant that cohort lacks.
    effs = pd.concat(cohort_effs, axis=1)
    # Named `variances` rather than `vars` so the builtin is not shadowed.
    variances = pd.concat(cohort_vars, axis=1)
    alleles = pd.concat(cohort_alleles, axis=1)

    # Now we have data to subset for meta-analysis: keep variants reported by
    # at least two cohorts.
    effs = effs[effs.count(axis=1) > 1]
    variances = variances.loc[effs.index]
    alleles = alleles.loc[effs.index]

    result_cols = ['Effect_Allele', 'eff', 'sd_eff', 'ci_low', 'ci_upp', 'ID', 'N_Studies']

    # Use different combinations of groups
    for group, ss_dict in zip(['ALL', 'AFR_EUR', 'EUR'], [pheno_ss, pheno_ss_AFR_EUR, pheno_ss_EUR]):
        print('\t', group)
        ss_group = ss_dict[p]
        cohorts = [f.split('/')[0] for f in ss_group]
        use_effs, use_vars, use_alleles = effs[cohorts], variances[cohorts], alleles[cohorts]

        meta_results_fe_rows = []
        meta_results_re_rows = []

        for variant in use_effs.index:
            effects = use_effs.loc[variant].dropna()
            # A variant needs at least two studies within this group.
            if len(effects) < 2:
                continue
            # Select variances and alleles for exactly the studies that have
            # an effect so the three Series stay aligned.
            variance = use_vars.loc[variant].loc[effects.index]
            v_alleles = use_alleles.loc[variant].loc[effects.index]
            # The most common Allele2 is the effect allele; negate the effect
            # of studies that report a different Allele2.
            # NOTE(review): assumes a differing Allele2 is the other allele of
            # the same biallelic variant; Allele1 is not checked - confirm.
            use_allele = v_alleles.mode().iloc[0]
            flip = v_alleles != use_allele
            effects = effects.where(~flip, -effects)

            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', r'invalid value encountered in sqrt')
                warnings.filterwarnings('ignore', r'divide by zero encountered')
                results = combine_effects(effects, variance, method_re='chi2', row_names=effects.index).summary_frame()
            results['ID'] = variant
            results['N_Studies'] = len(effects)
            results['Effect_Allele'] = use_allele

            meta_results_re_rows.append(results.loc['random effect', result_cols])
            meta_results_fe_rows.append(results.loc['fixed effect', result_cols])

        if meta_results_re_rows:
            # Rows whose pooled standard error is missing are dropped.
            meta_results_re = pd.concat(meta_results_re_rows, axis=1).transpose().set_index('ID').dropna(subset=['sd_eff'])
            meta_results_fe = pd.concat(meta_results_fe_rows, axis=1).transpose().set_index('ID').dropna(subset=['sd_eff'])
        else:
            # pd.concat() raises on an empty list; write header-only results
            # when no variant has two or more studies in this group.
            meta_results_re = pd.DataFrame(columns=result_cols).set_index('ID')
            meta_results_fe = meta_results_re.copy()

        output_file_re = f'Meta_Output/{p}.{group}.RE.csv.gz'
        output_file_fe = f'Meta_Output/{p}.{group}.FE.csv.gz'

        meta_results_re.to_csv(output_file_re)
        meta_results_fe.to_csv(output_file_fe)

endometriosis
	 ALL
	 AFR_EUR
	 EUR
cluster_vs_controls_0
	 ALL
	 AFR_EUR
	 EUR
cluster_vs_controls_1
	 ALL
	 AFR_EUR
	 EUR
cluster_vs_controls_2
	 ALL
	 AFR_EUR
	 EUR
cluster_vs_controls_3
	 ALL
	 AFR_EUR
	 EUR
cluster_vs_controls_4
	 ALL
	 AFR_EUR
	 EUR
