In [None]:
# For each chromosome, merge the REGENIE variant associations table with gene/variant annotations

In [2]:
import json
import os
import warnings

import pandas as pd

In [3]:
# Default locations used by merge(); override through its keyword arguments.
_REGENIE_ROOT = '/home/jupyter/gcs/ukb/regenie'
_ANNOT_DIR = '/home/jupyter/annot_by_chromosome/second_run'


def _phenotype_for_file(file_name, chrom, phenotypes):
    """Return the phenotype name carried by a step-2 file name, or None if no phenotype matches."""
    # NOTE(review): relies on the files being named `chr<chrom>_[...]<phenotype>.regenie`
    # (REGENIE's per-phenotype output convention) -- confirm for this run.
    stem = file_name[len(f'chr{chrom}_'):-len('.regenie')]
    hits = [p for p in phenotypes if stem == p or stem.endswith('_' + p)]
    # Prefer the longest match so a phenotype that is a suffix of another cannot win.
    return max(hits, key=len) if hits else None


def _expand_vep(annot):
    """Expand the JSON 'vep' column into one column per key, next to the variant id columns."""
    # Drop missing values BEFORE parsing (json.loads raises on NaN) and again
    # afterwards (a literal JSON `null` parses to None).
    parsed = annot['vep'].dropna().apply(json.loads).dropna()
    vep_cols = pd.DataFrame(parsed.tolist(), index=parsed.index)
    return pd.concat([annot[['locus', 'alleles', 'ID', 'cm_position']], vep_cols], axis=1)


def _first_consequence(value):
    """Unwrap a consequence list to its first element; leave non-list values untouched."""
    if isinstance(value, (list, tuple)):
        return value[0] if value else None
    return value


def merge(chrom, regenie_root=_REGENIE_ROOT, annot_dir=_ANNOT_DIR, out_dir=None):
    """Merge one chromosome's REGENIE step-2 associations with its variant annotations.

    For every step-2 `.regenie` file of the chromosome, the phenotype it belongs
    to is identified from the file name, the annotation table is filtered to
    rows whose VEP `symbol` equals that protein (phenotype name without the
    `_rint` suffix, upper-cased), and the two tables are inner-joined on `ID`.
    All per-protein joins are stacked and written as a tab-separated file
    `chr<chrom>_merged.txt`.

    Parameters
    ----------
    chrom : str or int
        Chromosome label, e.g. '1', 22 or 'X'. Chromosome 22 uses a different
        input directory layout than the others.
    regenie_root : str, optional
        Directory holding the phenotype headers and step-2 outputs.
    annot_dir : str, optional
        Directory holding `chr<chrom>_olink.txt` annotation tables.
    out_dir : str, optional
        Destination directory; defaults to `<regenie_root>/gwas_annotated`.

    Returns
    -------
    None
        The merged table is written to disk.

    Raises
    ------
    ValueError
        If no step-2 file could be matched to a phenotype.
    """
    # Normalise so that both 22 and '22' select the chr22 layout.
    chrom = str(chrom)

    if chrom == '22':
        pheno_path = os.path.join(regenie_root, 'chr22_phenos.txt')
        directory = os.path.join(regenie_root, 'gwas_chr22prots', 'step2')
    else:
        pheno_path = os.path.join(regenie_root, 'phenos', f'chr{chrom}_phenos.txt')
        directory = os.path.join(regenie_root, 'gwas_allOthers', 'step2', f'chr{chrom}prots')
    if out_dir is None:
        out_dir = os.path.join(regenie_root, 'gwas_annotated')

    # Phenotype names are the header fields after the first two columns.
    pheno_header = pd.read_table(pheno_path, nrows=1, header=None)
    phenotypes = [str(name) for name in pheno_header.iloc[0, 2:]]

    # Step-2 association files for this chromosome, sorted so the output row
    # order does not depend on the directory listing order.
    file_names = sorted(
        name for name in os.listdir(directory)
        if name.startswith(f'chr{chrom}_') and name.endswith('.regenie')
    )

    # Variant annotation table with the VEP key:value pairs expanded into columns.
    col_names = ['locus', 'alleles', 'ID', 'cm_position', 'vep', 'vep_proc_id']
    annot = pd.read_table(os.path.join(annot_dir, f'chr{chrom}_olink.txt'), names=col_names)
    annot_expd = _expand_vep(annot)

    # Pair every association file with its own protein by name (not by list
    # position), then join it with that protein's annotated variants.
    merged_parts = []
    matched = set()
    for file_name in file_names:
        phenotype = _phenotype_for_file(file_name, chrom, phenotypes)
        if phenotype is None:
            warnings.warn(
                f'chr{chrom}: {file_name} matches no phenotype in {pheno_path}; skipped',
                stacklevel=2,
            )
            continue
        protein = phenotype.removesuffix('_rint').upper()
        assoc = pd.read_csv(os.path.join(directory, file_name), sep=' ')
        gene_annot = annot_expd.loc[annot_expd['symbol'] == protein, :]
        merged_parts.append(pd.merge(assoc, gene_annot, how='inner', on='ID'))
        matched.add(phenotype)

    unmatched = [p for p in phenotypes if p not in matched]
    if unmatched:
        warnings.warn(
            f'chr{chrom}: no step2 file found for phenotypes {unmatched}',
            stacklevel=2,
        )
    if not merged_parts:
        raise ValueError(f'chr{chrom}: no .regenie file in {directory} matched a phenotype')

    chr_df = pd.concat(merged_parts, ignore_index=True)

    # Store 'consequence' as a plain value instead of a single-element list.
    chr_df['consequence'] = [_first_consequence(x) for x in chr_df['consequence']]

    # Export table (the index column is written, as before).
    os.makedirs(out_dir, exist_ok=True)
    chr_df.to_csv(os.path.join(out_dir, f'chr{chrom}_merged.txt'), sep='\t')


In [4]:
# Chromosome labels to process: 1 through 18, followed by 20 and X.
chroms = [*map(str, range(1, 19)), '20', 'X']
chroms

['1',
 '2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '8',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '20',
 'X']

In [5]:
# Build and export the merged table for each chromosome label in turn.
for chrom_label in chroms:
    merge(chrom_label)