In [9]:
from statsmodels.stats.meta_analysis import combine_effects
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import warnings

In [10]:
# All later cells use paths relative to the meta-analysis project directory,
# so switch the working directory there once, up front.
PROJECT_DIR = '/project/ssverma_shared/projects/Endometriosis/Endo_Subtyping_Heterogeneity/Meta_Analysis/'
os.chdir(PROJECT_DIR)

In [11]:
# Dry run (-n) of the Snakemake `all` target: reports whether any upstream
# pipeline outputs would need to be (re)built, without running any jobs.
! snakemake -n all --quiet

[33mBuilding DAG of jobs...[0m
[33mNothing to be done (all requested files are present and up to date).[0m


In [12]:
# Phenotypes to meta-analyse: the overall endometriosis phenotype followed by
# each of the five cluster-vs-controls comparisons (clusters 1 through 5).
phenos = ['endometriosis'] + [f'cluster_vs_controls_{cluster}' for cluster in range(1, 6)]

In [13]:
# Collect summary-statistics file paths per phenotype, once for every cohort
# and once for each ancestry grouping that is meta-analysed separately.
pheno_ss, pheno_ss_AFR_EUR, pheno_ss_EUR, pheno_ss_AFR = (
    {pheno: [] for pheno in phenos} for _ in range(4)
)

biobanks = ['eMERGE', 'UKBB', 'PMBB', 'AOU']
# biobanks = ['eMERGE', 'PMBB', 'AOU']

# Which ancestry-specific collections (besides the all-cohort one) a file
# from a given ancestry belongs to.
ancestry_groups = {
    'AFR': [pheno_ss_AFR_EUR, pheno_ss_AFR],
    'EUR': [pheno_ss_AFR_EUR, pheno_ss_EUR],
    'ASIAN': [],
}

for biobank in biobanks:
    for ancestry, extra_groups in ancestry_groups.items():
        # The eMERGE / ASIAN combination is skipped.
        if (biobank, ancestry) == ('eMERGE', 'ASIAN'):
            continue

        sumstats_dir = f'{biobank}_{ancestry}/Sumstats/'
        for fname in os.listdir(sumstats_dir):
            # The phenotype name is everything before the first dot.
            file_pheno = fname.split('.')[0]
            # Skip any phenotype whose name contains a '0'.
            if '0' in file_pheno:
                continue

            ss_path = f'{sumstats_dir}{fname}'
            for collection in [pheno_ss, *extra_groups]:
                collection[file_pheno].append(ss_path)

In [14]:
def _load_id_map(biobank):
    """Read the `<biobank>` -> PMBB variant-ID map as a Series indexed by the biobank's own ID."""
    id_table = pd.read_table(f'SNP_Lists/{biobank}_to_PMBB_ID_map.txt', index_col=['ID'])
    return id_table['PMBB_ID']


# Variant IDs from each non-PMBB biobank are translated to PMBB IDs with these.
eMERGE_id_map = _load_id_map('eMERGE')
UKBB_id_map = _load_id_map('UKBB')
AOU_id_map = _load_id_map('AOU')

In [15]:
# Reference allele per variant: the `Allele2` column of the PMBB_AFR
# endometriosis summary statistics, indexed by variant ID.
use_allele = (
    pd.read_table('PMBB_AFR/Sumstats/endometriosis.saige.gz', index_col='ID')
    .loc[:, 'Allele2']
)
use_allele

ID
chr1:21686033:C:T     T
chr1:21720782:C:T     T
chr1:21722171:C:T     T
chr1:21772447:C:T     T
chr1:21772448:C:T     T
                     ..
chr9:133370796:C:A    A
chr9:133372523:G:C    C
chr9:133402020:C:T    T
chr9:133405414:T:G    G
chr9:133405868:C:T    T
Name: Allele2, Length: 11425, dtype: object

In [16]:
os.makedirs('Meta_Input/', exist_ok=True)

for p in phenos:
    print(p)
    effs = pd.DataFrame()
    errs = pd.DataFrame()
    alleles = pd.DataFrame()

    # Load data
    for ss_file in pheno_ss[p]:
        ss_dataset = ss_file.split('_')[0]
        ss_cohort = ss_file.split('/')[0]
        temp = pd.read_table(ss_file, nrows=None)
        temp = temp.rename(columns={'MarkerID': 'ID', 'p.value': 'P'})
        temp = temp.set_index('ID')
        temp = temp[temp['AF_Allele2'] >= 0.01]
        temp = temp[['BETA', 'Allele2', 'SE']].dropna()

        if ss_dataset == 'eMERGE':
            temp = temp[temp.index.isin(eMERGE_id_map.index)]
            temp = temp.rename(index=eMERGE_id_map)
        if ss_dataset == 'UKBB':
            temp = temp[temp.index.isin(UKBB_id_map.index)]
            temp = temp.rename(index=UKBB_id_map)
        if ss_dataset == 'AOU':
            temp = temp[temp.index.isin(AOU_id_map.index)]
            temp = temp.rename(index=AOU_id_map)

        print(ss_cohort)

        effs[ss_cohort] = temp['BETA']
        errs[ss_cohort] = temp['SE']
        alleles[ss_cohort] = temp['Allele2']

    keep_vars = effs.index.intersection(use_allele.index)
    effs, errs, alleles = effs.loc[keep_vars], errs.loc[keep_vars], alleles.loc[keep_vars]

    # Now we have data to subset for meta-analysis
    effs = effs[effs.count(axis=1) > 1]
    errs = errs.loc[effs.index]
    alleles = alleles.loc[effs.index]

    use_allele_temp = use_allele.loc[alleles.index]
    use_allele_2D = np.broadcast_to(use_allele_temp.T, alleles.T.shape).T
    flip_coords = np.where(alleles.values != use_allele_2D)
    effs.values[flip_coords] = -effs.values[flip_coords]

    for cohort, effect_col in effs.items():
        table = pd.concat([effect_col, errs[cohort], use_allele], axis=1)
        table.columns = ['BETA', 'SE', 'A1']
        table.index.name = 'SNP'
        table['OR'] = np.exp(table['BETA'])
        table = table.dropna().reset_index()
        table[['CHR', 'BP']] = table['SNP'].str.replace('chr', '').str.split(':', expand=True)[[0, 1]]
        need_cols = ['CHR', 'BP', 'SNP', 'OR', 'SE', 'A1']
        table[need_cols].to_csv(f'Meta_Input/{cohort}.{p}.txt', sep='\t', index=False)

    # Use different combinations of groups
    for group, ss_dict in zip(['ALL', 'AFR_EUR', 'EUR', 'AFR'], [pheno_ss, pheno_ss_AFR_EUR, pheno_ss_EUR, pheno_ss_AFR]):
        print('\t', group)
        ss_group = ss_dict[p]
        cohorts = [f.split('/')[0] for f in ss_group]

        if len(cohorts) < 2:
            continue

        plink_cmd_parts = ['plink', '--meta-analysis']
        plink_cmd_parts.extend([f'Meta_Input/{cohort}.{p}.txt' for cohort in cohorts])
        plink_cmd_parts.extend(['--out', f'Meta_Output/{p}.{group}.PLINK'])
        cmd = ' '.join(plink_cmd_parts)

        ! module load plink/1.90Beta6.18; {cmd}



	 ALL
PLINK v1.90b6.18 64-bit (16 Jun 2020)          www.cog-genomics.org/plink/1.9/
(C) 2005-2020 Shaun Purcell, Christopher Chang   GNU General Public License v3
Logging to Meta_Output/cluster_vs_controls_3.ALL.PLINK.log.
Options in effect:
  --meta-analysis Meta_Input/eMERGE_AFR.cluster_vs_controls_3.txt Meta_Input/eMERGE_EUR.cluster_vs_controls_3.txt Meta_Input/UKBB_AFR.cluster_vs_controls_3.txt Meta_Input/UKBB_EUR.cluster_vs_controls_3.txt Meta_Input/UKBB_ASIAN.cluster_vs_controls_3.txt Meta_Input/PMBB_AFR.cluster_vs_controls_3.txt Meta_Input/PMBB_EUR.cluster_vs_controls_3.txt Meta_Input/AOU_AFR.cluster_vs_controls_3.txt Meta_Input/AOU_EUR.cluster_vs_controls_3.txt
  --out Meta_Output/cluster_vs_controls_3.ALL.PLINK

515273 MB RAM detected; reserving 257636 MB for main workspace.
--meta-analysis: 10495 variants processed; results written to
Meta_Output/cluster_vs_controls_3.ALL.PLINK.meta .
	 AFR_EUR
PLINK v1.90b6.18 64-bit (16 Jun 2020)          www.cog-genomics.org/pli