In [1]:
import os
import pandas as pd
import subprocess
import warnings
warnings.filterwarnings('ignore')

In [2]:
# HapMap3 variant list (tab-separated); coordinates are in GRCh37.
hapmap3 = pd.read_csv('hapmap3.all.txt', sep='\t')
hapmap3

Unnamed: 0,CHR,POS,SNPID
0,1,68082,rs367789441
1,1,77866,rs563593912
2,1,87409,rs139490478
3,1,87647,rs146836579
4,1,88144,rs541926283
...,...,...,...
1689632,22,50518815,rs11568172
1689633,22,50572406,rs3736686
1689634,22,50922470,rs2236032
1689635,22,50922771,rs35576574


In [3]:
# Variant table (.bim) of the imputed target data, read with explicit column names.
bim_file = pd.read_csv(
    './target_data/target.data.impute.bim',
    sep='\t',
    names=['CHR', 'SNP', 'No', 'POS', 'REF', 'ALT'],
)
bim_file

Unnamed: 0,CHR,SNP,No,POS,REF,ALT
0,1,rs201500819,0,769828,C,G
1,1,rs74879860,0,771265,C,A
2,1,rs74512038,0,778597,T,C
3,1,rs74707816,0,779059,A,G
4,1,rs3121393,0,784860,T,C
...,...,...,...,...,...,...
5605061,22,rs9616978,0,50781891,G,C
5605062,22,rs369304721,0,50782762,A,G
5605063,22,rs115055839,0,50783303,C,T
5605064,22,rs372062323,0,50783547,A,G


# Define functions:

In [3]:
def preprocess_gwas_PRSice2(path_gwas:str,
                             output_folder:str):
    """
    Preprocess GWAS summary statistics for PRSice-2.

    Keeps rows whose CHR is in 1-22 and whose SNPID starts with 'rs', then
    writes 'gwas.PRSice.txt' (tab-separated) with the columns
    ['CHR', 'SNP', 'A1', 'A2', 'BETA', 'P']. The input column 'Allele2' is
    written as A1 and 'Allele1' as A2.

    Args:
        path_gwas (str): Path to the tab-separated GWAS summary statistics file
            with columns 'CHR', 'SNPID', 'Allele1', 'Allele2', 'BETA', 'p.value'.
        output_folder (str): Existing folder in which 'gwas.PRSice.txt' is written.

    Returns:
        None: Saves the processed file in the specified output folder.
    """
    gwas = pd.read_table(path_gwas)[['CHR', 'SNPID', 'Allele2', 'Allele1', 'BETA', 'p.value']]
    gwas = gwas[gwas['CHR'].isin(range(1, 23))]
    # na=False: a missing SNPID counts as "not an rs ID"; without it the mask
    # contains NaN and boolean indexing raises ValueError.
    gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)]
    gwas = gwas.rename(columns={'SNPID': 'SNP', 'Allele2': 'A1', 'Allele1': 'A2', 'p.value': 'P'})
    gwas.to_csv(os.path.join(output_folder, 'gwas.PRSice.txt'), index=False, sep='\t')

In [None]:
def preprocess_gwas_BBJ(hapmap3: pd.DataFrame,
                        path_gwas:str,
                        output_folder:str,
                        n_gwas:int,
                        prsice2:bool):
    """
    Preprocess GWAS summary statistics for BBJ.

    The summary statistics are inner-joined to HapMap3 on ['CHR', 'POS'],
    restricted to single-character alleles and 'rs'-prefixed SNP IDs (first
    row kept per SNPID), and written as tab-separated files:
      - 'gwas.PRSCS.txt'   : SNP, A1, A2, BETA, SE
      - 'gwas.LDpred2.txt' : chr, pos, rsid, a1, a0, beta, beta_se, n_eff
      - 'gwas.PRSice.txt'  : only when prsice2 is True (via preprocess_gwas_PRSice2,
                             which re-reads path_gwas without the HapMap3 join)
    'Allele2' is written as A1/a1 and 'Allele1' as A2/a0.

    Args:
        hapmap3 (pd.DataFrame): HapMap3 data with columns ['CHR', 'POS', 'SNPID'].
        path_gwas (str): Path to the GWAS summary statistics file.
        output_folder (str): Folder to save the processed files.
        n_gwas (int): Effective sample size of the GWAS.
        prsice2 (bool): Whether to preprocess for PRSice2.

    Returns:
        None: Saves processed files in the specified output folder.
    """

    gwas = pd.read_table(path_gwas)[['CHR', 'POS', 'Allele2', 'Allele1', 'BETA', 'SE']]
    gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'POS'])
    gwas = gwas[(gwas['Allele1'].str.len() == 1) & (gwas['Allele2'].str.len() == 1)]
    # Only keep SNP IDs starting with 'rs'. na=False treats a missing SNPID as a
    # non-match; without it the mask contains NaN and indexing raises ValueError.
    gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

    # gwas for PRS-CS and PRS-CSx:
    gwas_PRS_CS = gwas[['SNPID', 'Allele2', 'Allele1', 'BETA', 'SE']].rename(
        columns={'SNPID': 'SNP', 'Allele2': 'A1', 'Allele1': 'A2'})
    gwas_PRS_CS.to_csv(os.path.join(output_folder, 'gwas.PRSCS.txt'), index=False, sep='\t')

    # gwas for LDpred2: rename/assign build a new frame instead of assigning a
    # column on a slice of `gwas` (chained assignment).
    gwas_LDpred2 = (
        gwas[['CHR', 'POS', 'SNPID', 'Allele2', 'Allele1', 'BETA', 'SE']]
        .rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'rsid',
                         'Allele2': 'a1', 'Allele1': 'a0',
                         'BETA': 'beta', 'SE': 'beta_se'})
        .assign(n_eff=n_gwas)
    )
    gwas_LDpred2.to_csv(os.path.join(output_folder, 'gwas.LDpred2.txt'), index=False, sep='\t')

    # gwas for PRSice2:
    if prsice2:
        preprocess_gwas_PRSice2(path_gwas, output_folder)

In [None]:
# output: chr	pos	SNP	A1	A2	Freq1.Hapmap	b	se	p	N
def process_gwas_shaPRS(hapmap3: pd.DataFrame,
                        path_gwas:str,
                        output_folder:str,
                        n_gwas:int,
                        columns=('CHR', 'POS', 'Allele2', 'Allele1', 'BETA', 'SE', 'p.value')):
    """
    Preprocess GWAS summary statistics for shaPRS.

    The selected columns are inner-joined to HapMap3 on ['CHR', 'POS'],
    restricted to single-character alleles and 'rs'-prefixed SNP IDs (first
    row kept per SNPID), and written to 'gwas.shaPRS.txt' (tab-separated) with
    columns chr, pos, SNP, A1, A2, Freq1.Hapmap, b, se, p, N.
    'Freq1.Hapmap' is filled with the placeholder 'X'.

    Args:
        hapmap3 (pd.DataFrame): HapMap3 data with columns ['CHR', 'POS', 'SNPID'].
        path_gwas (str): Path to the GWAS summary statistics file.
        output_folder (str): Folder to save the processed files.
        n_gwas (int): Effective sample size of the GWAS.
        columns (sequence of str): Seven input column names, in the order
            chromosome, position, A1, A2, beta, standard error, p-value.

    Returns:
        None: Saves processed files in the specified output folder.

    Raises:
        ValueError: If `columns` does not contain exactly seven names.
    """
    # Accept any sequence (list or tuple) without ever sharing a mutable default.
    columns = list(columns)
    if len(columns) != 7:
        raise ValueError(f'columns must contain exactly 7 names, got {len(columns)}')

    gwas = pd.read_table(path_gwas)[columns]
    gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'SE', 'P']
    gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'POS'])
    gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
    # Only keep SNP IDs starting with 'rs'. na=False treats a missing SNPID as a
    # non-match; without it the mask contains NaN and indexing raises ValueError.
    gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

    # rename/assign return new frames, so nothing is written into a filtered slice,
    # and renaming by name does not depend on the column order of `hapmap3`.
    gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                                'BETA': 'b', 'SE': 'se', 'P': 'p'})
    gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=n_gwas)
    output_columns = ['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']
    gwas[output_columns].to_csv(os.path.join(output_folder, 'gwas.shaPRS.txt'), index=False, sep='\t')

# 1. Breast Cancer

## a. EAS (BBJ)

In [None]:
# Breast cancer, EAS (BBJ): output folder, summary-statistics file and n_gwas.
output_folder_breast_cancer = 'sum_stats_data/breast_cancer/hum0197.v3.BBJ.BC.v1/'
gwas_path_breast_cancer = 'sum_stats_data/breast_cancer/hum0197.v3.BBJ.BC.v1/GWASsummary_BrC_Japanese_SakaueKanai2020.auto.txt'
n_gwas_breast_cancer = 79_550

In [None]:
# Breast cancer, EAS: write the PRS-CS, LDpred2 and PRSice-2 input files.
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_breast_cancer,
    output_folder=output_folder_breast_cancer,
    n_gwas=n_gwas_breast_cancer,
    prsice2=True,
)

In [None]:
# Breast cancer, EAS: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_breast_cancer,
    output_folder=output_folder_breast_cancer,
    n_gwas=n_gwas_breast_cancer,
)

## b. EUR

In [None]:
# Breast cancer, EUR: output folder, summary-statistics file and n_gwas.
# These names are reused from the EAS section and now point at the EUR data.
output_folder_breast_cancer = 'sum_stats_data/breast_cancer/hum0197.v3.EUR.BC.v1/'
gwas_path_breast_cancer = 'sum_stats_data/breast_cancer/hum0197.v3.EUR.BC.v1/GWASsummary_BrC_EUR_SakaueKanai2020.auto.txt'
n_gwas_breast_cancer = 257_730

In [None]:
# Breast cancer, EUR: write the PRS-CS and LDpred2 input files (no PRSice-2 file).
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_breast_cancer,
    output_folder=output_folder_breast_cancer,
    n_gwas=n_gwas_breast_cancer,
    prsice2=False,
)

In [None]:
# Breast cancer, EUR: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_breast_cancer,
    output_folder=output_folder_breast_cancer,
    n_gwas=n_gwas_breast_cancer,
)

# 2. Colorectal Cancer

## a. EAS (BBJ)

In [None]:
# Colorectal cancer, EAS (BBJ): output folder, summary-statistics file and n_gwas.
output_folder_colorectal_cancer = 'sum_stats_data/colorectal_cancer/hum0197.v3.BBJ.CC.v1/'
gwas_path_colorectal_cancer = 'sum_stats_data/colorectal_cancer/hum0197.v3.BBJ.CC.v1/GWASsummary_CRC_Japanese_SakaueKanai2020.auto.txt'
n_gwas_colorectal_cancer = 167_691

In [None]:
# Colorectal cancer, EAS: write the PRS-CS, LDpred2 and PRSice-2 input files.
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_colorectal_cancer,
    output_folder=output_folder_colorectal_cancer,
    n_gwas=n_gwas_colorectal_cancer,
    prsice2=True,
)

In [None]:
# Colorectal cancer, EAS: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_colorectal_cancer,
    output_folder=output_folder_colorectal_cancer,
    n_gwas=n_gwas_colorectal_cancer,
)

## b. EUR

In [None]:
# Colorectal cancer, EUR: output folder, summary-statistics file and n_gwas.
# These names are reused from the EAS section and now point at the EUR data.
output_folder_colorectal_cancer = 'sum_stats_data/colorectal_cancer/hum0197.v3.EUR.CC.v1/'
gwas_path_colorectal_cancer = 'sum_stats_data/colorectal_cancer/hum0197.v3.EUR.CC.v1/GWASsummary_CRC_EUR_SakaueKanai2020.auto.txt'
n_gwas_colorectal_cancer = 470_002

In [None]:
# Colorectal cancer, EUR: write the PRS-CS and LDpred2 input files (no PRSice-2 file).
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_colorectal_cancer,
    output_folder=output_folder_colorectal_cancer,
    n_gwas=n_gwas_colorectal_cancer,
    prsice2=False,
)

In [None]:
# Colorectal cancer, EUR: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_colorectal_cancer,
    output_folder=output_folder_colorectal_cancer,
    n_gwas=n_gwas_colorectal_cancer,
)

# 3. Gastric Cancer

## a. EAS (BBJ)

In [None]:
# Gastric cancer, EAS (BBJ): output folder, summary-statistics file and n_gwas.
output_folder_gastric_cancer = 'sum_stats_data/gastric_cancer/hum0197.v3.BBJ.GC.v1/'
gwas_path_gastric_cancer = 'sum_stats_data/gastric_cancer/hum0197.v3.BBJ.GC.v1/GWASsummary_GaC_Japanese_SakaueKanai2020.auto.txt'
n_gwas_gastric_cancer = 167_122

# PRS-CS, LDpred2 and PRSice-2 input files.
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_gastric_cancer,
    output_folder=output_folder_gastric_cancer,
    n_gwas=n_gwas_gastric_cancer,
    prsice2=True,
)

# shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_gastric_cancer,
    output_folder=output_folder_gastric_cancer,
    n_gwas=n_gwas_gastric_cancer,
)

## b. EUR

In [None]:
# Gastric cancer, EUR: summary-statistics file, output folder and n_gwas.
# These names are reused from the EAS section and now point at the EUR data.
gwas_path_gastric_cancer = 'sum_stats_data/gastric_cancer/hum0197.v3.EUR.GC.v1/GWASsummary_GaC_EUR_SakaueKanai2020.auto.txt'
# Fixed a stray double slash ('gastric_cancer//hum0197...') so the folder matches
# the directory part of gwas_path_gastric_cancer exactly.
output_folder_gastric_cancer = 'sum_stats_data/gastric_cancer/hum0197.v3.EUR.GC.v1/'
n_gwas_gastric_cancer = 476116

# PRS-CS and LDpred2 input files (no PRSice-2 file).
preprocess_gwas_BBJ(hapmap3=hapmap3,
                    path_gwas=gwas_path_gastric_cancer,
                    output_folder=output_folder_gastric_cancer,
                    n_gwas=n_gwas_gastric_cancer,
                    prsice2=False)

In [None]:
# Gastric cancer, EUR: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_gastric_cancer,
    output_folder=output_folder_gastric_cancer,
    n_gwas=n_gwas_gastric_cancer,
)

# 4. PD (Parkinson Disease)

## a. EAS (BBJ)

In [None]:
# n_gwas = 6050: sumstats from GCST90278092
gwas = pd.read_table('sum_stats_data/pd/gwas.catalog.EAS.GCST90278092/GCST90278092.h.tsv')
gwas = gwas[['chromosome', 'base_pair_location', 'rsid', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'POS', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# na=False treats a missing rsid as a non-match; without it the mask contains
# NaN and boolean indexing raises ValueError.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# PRSice-2 input: written before the HapMap3 merge, with the file's own CHR/POS.
gwas_PRSice = gwas[['CHR', 'POS', 'SNPID', 'A1', 'A2', 'BETA', 'P']].rename(columns={'SNPID': 'SNP'})
gwas_PRSice.to_csv(os.path.join('sum_stats_data/pd/gwas.catalog.EAS.GCST90278092/', 'gwas.PRSice.txt'), index=False, sep='\t')

# Restrict to HapMap3 variants by rsid; CHR and POS below come from hapmap3.
gwas = pd.merge(hapmap3, gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']], how='inner', on=['SNPID'])

# LDpred2 input: assign() builds a new frame instead of writing into a slice of `gwas`.
gwas_LDpred2 = gwas[['CHR', 'POS', 'SNPID', 'A1', 'A2', 'BETA', 'SE']].assign(n_eff=6050)
gwas_LDpred2.columns = ['chr', 'pos', 'rsid', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']

# PRS-CS input
gwas_PRS_CS = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_LDpred2.to_csv(os.path.join('sum_stats_data/pd/gwas.catalog.EAS.GCST90278092/', 'gwas.LDpred2.txt'), index=False, sep='\t')
gwas_PRS_CS.to_csv(os.path.join('sum_stats_data/pd/gwas.catalog.EAS.GCST90278092/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS:
gwas = pd.read_table('sum_stats_data/pd/gwas.catalog.EAS.GCST90278092/GCST90278092.h.tsv')
gwas = gwas[['chromosome', 'rsid', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# rename/assign return new frames, so nothing is written into a filtered slice.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=6050)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/pd/gwas.catalog.EAS.GCST90278092/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

## b. EUR

In [None]:
# PD, EUR: output folder, summary-statistics file and n_gwas.
output_folder_pd = 'sum_stats_data/pd/hum0197.v3.EUR.PD.v1/'
gwas_path_pd = 'sum_stats_data/pd/hum0197.v3.EUR.PD.v1/GWASsummary_Parkinsons_Disease_EUR_SakaueKanai2020.auto.txt'
n_gwas_pd = 480_018

In [None]:
# PD, EUR: write the PRS-CS and LDpred2 input files (no PRSice-2 file).
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_pd,
    output_folder=output_folder_pd,
    n_gwas=n_gwas_pd,
    prsice2=False,
)

In [None]:
# PD, EUR: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_pd,
    output_folder=output_folder_pd,
    n_gwas=n_gwas_pd,
)

# 5. CKD (Chronic Kidney Disease)

## a. EAS (BBJ)

In [None]:
# CKD, EAS (BBJ): output folder, summary-statistics file and n_gwas.
output_folder_ckd = 'sum_stats_data/ckd/hum0197.v3.BBJ.CRF.v1/'
gwas_path_ckd = 'sum_stats_data/ckd/hum0197.v3.BBJ.CRF.v1/GWASsummary_Chronic_Renal_Failure_Japanese_SakaueKanai2020.auto.txt'
n_gwas_ckd = 176_462

In [None]:
# CKD, EAS: write the PRS-CS, LDpred2 and PRSice-2 input files.
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_ckd,
    output_folder=output_folder_ckd,
    n_gwas=n_gwas_ckd,
    prsice2=True,
)

In [None]:
# CKD, EAS: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_ckd,
    output_folder=output_folder_ckd,
    n_gwas=n_gwas_ckd,
)

## b. EUR

In [None]:
# CKD, EUR: output folder, summary-statistics file and n_gwas.
# These names are reused from the EAS section and now point at the EUR data.
output_folder_ckd = 'sum_stats_data/ckd/hum0197.v3.EUR.CRF.v1/'
gwas_path_ckd = 'sum_stats_data/ckd/hum0197.v3.EUR.CRF.v1/GWASsummary_Chronic_Renal_Failure_EUR_SakaueKanai2020.auto.txt'
n_gwas_ckd = 482_858

In [None]:
# CKD, EUR: write the PRS-CS and LDpred2 input files (no PRSice-2 file).
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_ckd,
    output_folder=output_folder_ckd,
    n_gwas=n_gwas_ckd,
    prsice2=False,
)

In [None]:
# CKD, EUR: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_ckd,
    output_folder=output_folder_ckd,
    n_gwas=n_gwas_ckd,
)

# 6. Osteoporosis

## a. EAS (BBJ)

In [None]:
# Osteoporosis, EAS (BBJ): output folder, summary-statistics file and n_gwas.
output_folder_osteoporosis = 'sum_stats_data/osteoporosis/hum0197.v3.BBJ.OP.v1/'
gwas_path_osteoporosis = 'sum_stats_data/osteoporosis/hum0197.v3.BBJ.OP.v1/GWASsummary_Osteoporosis_Japanese_SakaueKanai2020.auto.txt'
n_gwas_osteoporosis = 178_726

In [None]:
# Osteoporosis, EAS: write the PRS-CS, LDpred2 and PRSice-2 input files.
preprocess_gwas_BBJ(
    hapmap3=hapmap3,
    path_gwas=gwas_path_osteoporosis,
    output_folder=output_folder_osteoporosis,
    n_gwas=n_gwas_osteoporosis,
    prsice2=True,
)

In [None]:
# Osteoporosis, EAS: write the shaPRS input file.
process_gwas_shaPRS(
    hapmap3=hapmap3,
    path_gwas=gwas_path_osteoporosis,
    output_folder=output_folder_osteoporosis,
    n_gwas=n_gwas_osteoporosis,
)

## b. EUR

In [None]:
# n_gwas = 456348
gwas = pd.read_table('sum_stats_data/osteoporosis/gwas.catalog.EUR.GCST90044600/34737426-GCST90044600-EFO_0003882.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE']
# Explicit join keys (previously implied by the shared column names CHR and SNPID).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# PRS-CS / PRS-CSx input
gwas_PRS_CSx = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_PRS_CSx.to_csv(os.path.join('sum_stats_data/osteoporosis/gwas.catalog.EUR.GCST90044600/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS:
gwas = pd.read_table('sum_stats_data/osteoporosis/gwas.catalog.EUR.GCST90044600/34737426-GCST90044600-EFO_0003882.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# rename/assign return new frames, so nothing is written into a filtered slice.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=456348)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/osteoporosis/gwas.catalog.EUR.GCST90044600/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

# 7. CAD (Coronary Artery Disease)

## a. EAS (BBJ)

In [None]:
# n_gwas = 212453
# Raw string for the regex separator: '\s' in a normal string is an invalid escape sequence.
gwas = pd.read_table('sum_stats_data/cad/gwas.BBJ.CAD/CAD.auto.rsq07.mac10.txt', sep=r'\s+')
gwas = gwas[['CHR', 'POS', 'Allele1', 'Allele2', 'N', 'BETA', 'SE', 'p.value']]
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['Allele1'].str.len() == 1) & (gwas['Allele2'].str.len() == 1)]

# Explicit join keys (previously implied by the shared column names CHR and POS).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'POS'])
# na=False treats a missing SNPID as a non-match instead of putting NaN in the mask.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# LDpred2 input: Allele2 is written as a1, Allele1 as a0; the per-variant N becomes n_eff.
gwas_LDpred2 = gwas[['CHR', 'POS', 'SNPID', 'Allele2', 'Allele1', 'BETA', 'SE', 'N']].rename(
    columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'rsid', 'Allele2': 'a1', 'Allele1': 'a0',
             'BETA': 'beta', 'SE': 'beta_se', 'N': 'n_eff'})

# PRS-CS input
gwas_PRS_CS = gwas[['SNPID', 'Allele2', 'Allele1', 'BETA', 'SE']].rename(
    columns={'SNPID': 'SNP', 'Allele2': 'A1', 'Allele1': 'A2'})

# save sumstats:
gwas_LDpred2.to_csv(os.path.join('sum_stats_data/cad/gwas.BBJ.CAD/', 'gwas.LDpred2.txt'), index=False, sep='\t')
gwas_PRS_CS.to_csv(os.path.join('sum_stats_data/cad/gwas.BBJ.CAD/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS
# Raw string for the regex separator: '\s' in a normal string is an invalid escape sequence.
gwas = pd.read_table('sum_stats_data/cad/gwas.BBJ.CAD/CAD.auto.rsq07.mac10.txt', sep=r'\s+')
gwas = gwas[['CHR', 'POS', 'Allele1', 'Allele2', 'N', 'BETA', 'SE', 'p.value']]
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['Allele1'].str.len() == 1) & (gwas['Allele2'].str.len() == 1)]

# Explicit join keys (previously implied by the shared column names CHR and POS).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'POS'])
# na=False treats a missing SNPID as a non-match instead of putting NaN in the mask.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# Rename by name (Allele2 -> A1, Allele1 -> A2); N is the per-variant value from the file.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'Allele1': 'A2', 'Allele2': 'A1',
                            'BETA': 'b', 'SE': 'se', 'p.value': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'})  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/cad/gwas.BBJ.CAD/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

In [None]:
# PRSice-2

# Load GWAS data
# Raw string for the regex separator: '\s' in a normal string is an invalid escape sequence.
gwas = pd.read_table('sum_stats_data/cad/gwas.BBJ.CAD/CAD.auto.rsq07.mac10.txt', sep=r'\s+')
gwas = gwas[['CHR', 'POS', 'Allele2', 'Allele1', 'BETA', 'p.value', 'AF_Allele2', 'SE']]
gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'P', 'AF_A1', 'SE']
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
# .copy() so the 'id' column is added to an independent frame, not to a filtered slice.
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)].copy()
gwas['id'] = gwas['CHR'].astype(str) + ':' + gwas['POS'].astype(str)

# Run CrossMap to convert coordinates
chain_file = './grch37_to_grch38.over.chain.gz'
input_bed = 'sum_stats_data/cad/gwas.BBJ.CAD/gwas.bed'
output_bed = 'sum_stats_data/cad/gwas.BBJ.CAD/gwas_hg38.bed'

# Save GWAS data to BED format (BED uses a 0-based start and a 1-based end)
gwas_bed = pd.DataFrame({
    '#chrom': gwas['CHR'],
    'start': gwas['POS'] - 1,
    'end': gwas['POS'],
    'id': gwas['id'],
})
gwas_bed.to_csv(input_bed, index=False, sep='\t')

# check=True: stop here if CrossMap fails instead of reading a stale or missing output file.
subprocess.run(['CrossMap', 'bed', chain_file, input_bed, output_bed], check=True)

# Read the converted BED file back into a DataFrame
gwas_hg38 = pd.read_csv(output_bed, sep='\t', names=['#chrom', 'start', 'end', 'id'])

# Keep only variants that stay on the same chromosome after liftover: the merge
# below re-uses the original CHR, which would be wrong for a variant that moved.
lifted_chrom = gwas_hg38['#chrom'].astype(str).str.replace('chr', '', regex=False)
source_chrom = gwas_hg38['id'].astype(str).str.split(':').str[0]
gwas_hg38 = gwas_hg38[lifted_chrom == source_chrom]

# Merge with original data: POS (old POS in hg37) to end (new POS in hg38)
gwas = pd.merge(gwas_hg38, gwas, on='id')[['CHR', 'end', 'A1', 'A2', 'BETA', 'P', 'AF_A1', 'SE']]
gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'P', 'AF_A1', 'SE']

# bim file in target data
bim_file_target = pd.read_table('./target_data/target.data.impute.bim', sep='\t', header=None)[[0, 1, 3]]
bim_file_target.columns = ['CHR', 'SNP', 'POS']

# Merge with target data; keep=False drops every SNP that matched more than one row.
gwas = pd.merge(bim_file_target, gwas, on=['CHR', 'POS'])
gwas = gwas.drop_duplicates(subset='SNP', keep=False)
print('GWAS shape:', gwas.shape)

# Save the final sumstats
gwas.to_csv(os.path.join('sum_stats_data/cad/gwas.BBJ.CAD/', 'gwas.BBJ.CAD.hg38.txt'), index=False, sep='\t')

2025-02-21 11:09:28 [INFO]  Read the chain file "../grch37_to_grch38.over.chain.gz" 


GWAS shape: (5197372, 9)


## b. EUR

In [None]:
# n_gwas = 352063
gwas = pd.read_table('sum_stats_data/cad/gwas.catalog.EUR.GCST90013864/34017140-GCST90013864-EFO_0001645.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# na=False treats a missing rsid as a non-match; without it the mask contains
# NaN and boolean indexing raises ValueError.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')
# Explicit join keys (previously implied by the shared column names CHR and SNPID).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])

# PRS-CS / PRS-CSx input
gwas_PRS_CSx = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_PRS_CSx.to_csv(os.path.join('sum_stats_data/cad/gwas.catalog.EUR.GCST90013864/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS
gwas = pd.read_table('sum_stats_data/cad/gwas.catalog.EUR.GCST90013864/34017140-GCST90013864-EFO_0001645.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# na=False treats a missing rsid as a non-match; without it the mask contains
# NaN and boolean indexing raises ValueError.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')
# Explicit join keys (previously implied by the shared column names CHR and SNPID).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])

gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=352063)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/cad/gwas.catalog.EUR.GCST90013864/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

# 8. Hyperlipidemia

## a. EAS (BBJ)

In [None]:
# n_gwas = 9714
gwas = pd.read_table('sum_stats_data/hyperlipidemia/gwas.catalog.EAS.GCST90090994/35046404-GCST90090994-MONDO_0021187.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# na=False treats a missing rsid as a non-match; without it the mask contains
# NaN and boolean indexing raises ValueError.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# PRSice-2 input: written before the HapMap3 merge.
gwas_PRSice2 = gwas[['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'P']].rename(columns={'SNPID': 'SNP'})
gwas_PRSice2.to_csv(os.path.join('sum_stats_data/hyperlipidemia/gwas.catalog.EAS.GCST90090994/', 'gwas.PRSice.txt'), index=False, sep='\t')

# Explicit join keys (previously implied by the shared column names CHR and SNPID).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])

# LDpred2 input: assign() builds a new frame instead of writing into a slice of `gwas`.
gwas_LDpred2 = gwas[['CHR', 'POS', 'SNPID', 'A1', 'A2', 'BETA', 'SE']].assign(n_eff=9714)
gwas_LDpred2.columns = ['chr', 'pos', 'rsid', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']

# PRS-CS input
gwas_PRS_CS = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_LDpred2.to_csv(os.path.join('sum_stats_data/hyperlipidemia/gwas.catalog.EAS.GCST90090994/', 'gwas.LDpred2.txt'), index=False, sep='\t')
gwas_PRS_CS.to_csv(os.path.join('sum_stats_data/hyperlipidemia/gwas.catalog.EAS.GCST90090994/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS
gwas = pd.read_table('sum_stats_data/hyperlipidemia/gwas.catalog.EAS.GCST90090994/35046404-GCST90090994-MONDO_0021187.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# rename/assign return new frames, so nothing is written into a filtered slice.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=9714)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/hyperlipidemia/gwas.catalog.EAS.GCST90090994/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

## b. EUR

In [None]:
# n_gwas = 349222
gwas = pd.read_table('sum_stats_data/hyperlipidemia/gwas.catalog.EUR.GCST90104006/34906840-GCST90104006-MONDO_0001336.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# na=False treats a missing rsid as a non-match; without it the mask contains
# NaN and boolean indexing raises ValueError.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')
# Explicit join keys (previously implied by the shared column names CHR and SNPID).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])

# PRS-CS input
gwas_PRS_CS = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_PRS_CS.to_csv(os.path.join('sum_stats_data/hyperlipidemia/gwas.catalog.EUR.GCST90104006/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS
gwas = pd.read_table('sum_stats_data/hyperlipidemia/gwas.catalog.EUR.GCST90104006/34906840-GCST90104006-MONDO_0001336.h.tsv')
gwas = gwas[['hm_chrom', 'hm_rsid', 'hm_effect_allele', 'hm_other_allele', 'hm_beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# rename/assign return new frames, so nothing is written into a filtered slice.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=349222)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/hyperlipidemia/gwas.catalog.EUR.GCST90104006/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

# 9. Osteoarthritis

## a. EAS (BBJ)

In [None]:
# n_gwas = 1704
gwas = pd.read_csv('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/GCST90134281_buildGRCh37.csv')
gwas = gwas[['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error']]
gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'SE']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]

# Explicit join keys (previously implied by the shared column names CHR and POS).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'POS'])
# na=False treats a missing SNPID as a non-match instead of putting NaN in the mask.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# LDpred2 input: assign() builds a new frame instead of writing into a slice of `gwas`.
gwas_LDpred2 = gwas[['CHR', 'POS', 'SNPID', 'A1', 'A2', 'BETA', 'SE']].assign(n_eff=1704)
gwas_LDpred2.columns = ['chr', 'pos', 'rsid', 'a1', 'a0', 'beta', 'beta_se', 'n_eff']

# PRS-CS input
gwas_PRS_CS = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_LDpred2.to_csv(os.path.join('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/', 'gwas.LDpred2.txt'), index=False, sep='\t')
gwas_PRS_CS.to_csv(os.path.join('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS
gwas = pd.read_csv('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/GCST90134281_buildGRCh37.csv')
gwas = gwas[['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'SE', 'P']
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'POS'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# rename/assign return new frames, so nothing is written into a filtered slice.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=1704)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')

In [None]:
# PRSice-2

# Load GWAS data
gwas = pd.read_csv('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/GCST90134281_buildGRCh37.csv')
gwas = gwas[['chromosome', 'base_pair_location', 'effect_allele', 'other_allele', 'beta', 'p_value', 'effect_allele_frequency', 'standard_error']]
gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'P', 'AF_A1', 'SE']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
# .copy() so the 'id' column is added to an independent frame, not to a filtered slice.
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)].copy()
gwas['id'] = gwas['CHR'].astype(str) + ':' + gwas['POS'].astype(str)

# Run CrossMap to convert coordinates
chain_file = './grch37_to_grch38.over.chain.gz'
input_bed = 'sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/gwas.bed'
output_bed = 'sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/gwas_hg38.bed'

# Save GWAS data to BED format (BED uses a 0-based start and a 1-based end)
gwas_bed = pd.DataFrame({
    '#chrom': gwas['CHR'],
    'start': gwas['POS'] - 1,
    'end': gwas['POS'],
    'id': gwas['id'],
})
gwas_bed.to_csv(input_bed, index=False, sep='\t')

# check=True: stop here if CrossMap fails instead of reading a stale or missing output file.
subprocess.run(['CrossMap', 'bed', chain_file, input_bed, output_bed], check=True)

# Read the converted BED file back into a DataFrame
gwas_hg38 = pd.read_csv(output_bed, sep='\t', names=['#chrom', 'start', 'end', 'id'])

# Keep only variants that stay on the same chromosome after liftover: the merge
# below re-uses the original CHR, which would be wrong for a variant that moved.
lifted_chrom = gwas_hg38['#chrom'].astype(str).str.replace('chr', '', regex=False)
source_chrom = gwas_hg38['id'].astype(str).str.split(':').str[0]
gwas_hg38 = gwas_hg38[lifted_chrom == source_chrom]

# Merge with original data: POS (old POS in hg37) to end (new POS in hg38)
gwas = pd.merge(gwas_hg38, gwas, on='id')[['CHR', 'end', 'A1', 'A2', 'BETA', 'P', 'AF_A1', 'SE']]
gwas.columns = ['CHR', 'POS', 'A1', 'A2', 'BETA', 'P', 'AF_A1', 'SE']

# bim file in target data
bim_file_target = pd.read_table('./target_data/target.data.impute.bim', sep='\t', header=None)[[0, 1, 3]]
bim_file_target.columns = ['CHR', 'SNP', 'POS']

# Merge with target data; keep=False drops every SNP that matched more than one row.
gwas = pd.merge(bim_file_target, gwas, on=['CHR', 'POS'])
gwas = gwas.drop_duplicates(subset='SNP', keep=False)
print('GWAS shape:', gwas.shape)

# Save the final sumstats
gwas.to_csv(os.path.join('sum_stats_data/osteoarthritis/gwas.catalog.EAS.GCST90134281/', 'gwas.BBJ.osteoarthritis.hg38.txt'), index=False, sep='\t')

2025-02-21 11:20:44 [INFO]  Read the chain file "../grch37_to_grch38.over.chain.gz" 


GWAS shape: (5399812, 9)


## b. EUR

In [None]:
# n_gwas = 413170
gwas = pd.read_table('sum_stats_data/osteoarthritis/gwas.catalog.EUR.GCST90134288/GCST90134288.h.tsv')
gwas = gwas[['chromosome', 'rsid', 'effect_allele', 'other_allele', 'beta', 'standard_error']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE']
# NOTE(review): isin(range(1, 23)) only matches integer-valued CHR entries --
# confirm the chromosome column is parsed as integers for this file.
gwas = gwas[gwas['CHR'].isin(range(1, 23))]
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Explicit join keys (previously implied by the shared column names CHR and SNPID).
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
# na=False treats a missing SNPID as a non-match instead of putting NaN in the mask.
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# PRS-CS input
gwas_PRS_CS = gwas[['SNPID', 'A1', 'A2', 'BETA', 'SE']].rename(columns={'SNPID': 'SNP'})

# save sumstats:
gwas_PRS_CS.to_csv(os.path.join('sum_stats_data/osteoarthritis/gwas.catalog.EUR.GCST90134288/', 'gwas.PRSCS.txt'), index=False, sep='\t')

In [None]:
# shaPRS
gwas = pd.read_table('sum_stats_data/osteoarthritis/gwas.catalog.EUR.GCST90134288/GCST90134288.h.tsv')
gwas = gwas[['chromosome', 'rsid', 'effect_allele', 'other_allele', 'beta', 'standard_error', 'p_value']]
gwas.columns = ['CHR', 'SNPID', 'A1', 'A2', 'BETA', 'SE', 'P']
gwas = pd.merge(hapmap3, gwas, how='inner', on=['CHR', 'SNPID'])
gwas = gwas[(gwas['A1'].str.len() == 1) & (gwas['A2'].str.len() == 1)]
# Only keep SNP IDs starting with 'rs'; na=False treats a missing ID as a
# non-match instead of putting NaN in the mask (which raises ValueError).
gwas = gwas[gwas['SNPID'].str.startswith('rs', na=False)].drop_duplicates(subset='SNPID')

# rename/assign return new frames, so nothing is written into a filtered slice.
gwas = gwas.rename(columns={'CHR': 'chr', 'POS': 'pos', 'SNPID': 'SNP',
                            'BETA': 'b', 'SE': 'se', 'P': 'p'})
gwas = gwas.assign(**{'Freq1.Hapmap': 'X'}, N=413170)  # 'X' is a placeholder frequency
gwas[['chr', 'pos', 'SNP', 'A1', 'A2', 'Freq1.Hapmap', 'b', 'se', 'p', 'N']].to_csv(
    os.path.join('sum_stats_data/osteoarthritis/gwas.catalog.EUR.GCST90134288/', 'gwas.shaPRS.txt'),
    index=False, sep='\t')