In [1]:
from io import StringIO
import numpy as np
import pandas as pd
import subprocess

### Set parameters

In [None]:
# DNAnexus job tag and project ID
job_tag = 'wgs_042425' # DNAnexus job tag
proj_id = 'project-J07x038JgGYK8bKFq6Z117kk'

# for WGS analysis
dx_wgs_vcf_path = f"{proj_id}:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]"

# create vcf output folder
dx_vcf_out_path = f"{proj_id}:/Tian_folder/{job_tag}/"
cmd = f'dx mkdir -p "{dx_vcf_out_path}"'
subprocess.run(cmd, shell=True)

# DNAnexus resources
dx_resource_path = f"{proj_id}:/GRCh38_resources/"
diff_bed = 'GRCh38_alldifficultregions.bed.gz'
ref_genome = 'GRCh38_reference_genome.fa'


### List of gene symbols as input

In [48]:
genes = ['LDLR', 'ALB']

### Load pVCF block coordinates

In [49]:
# WGS blocks
def fill_blk_pos(df_wgs_blk, df_chrom_sizes):
    df_wgs_blk_out = pd.DataFrame()
    for chrom, df_wgs_blk_chrom in df_wgs_blk.groupby('chrom'):
        chrom_size = df_chrom_sizes.loc[df_chrom_sizes['chrom'] == chrom, 'size'].values[0]
        df_wgs_blk_chrom.sort_values(by='blk', inplace=True, ignore_index=True)

        start_pos = df_wgs_blk_chrom['start_pos'].copy()
        if pd.isna(start_pos.iloc[0]):
            start_pos[0] = 1
        next_start_pos = start_pos.shift(-1)
        end_pos = next_start_pos - 1
        end_pos[-1] = chrom_size

        start_pos = start_pos.ffill()
        end_pos = end_pos.bfill()
        df_wgs_blk_chrom['start_pos'] = start_pos.astype(int)
        df_wgs_blk_chrom['end_pos'] = end_pos.astype(int)

        df_wgs_blk_out = pd.concat([df_wgs_blk_out, df_wgs_blk_chrom], ignore_index=True)
    return df_wgs_blk_out


df_wgs_blk = pd.read_csv("./resources/WGS_dragen_pvcf_coordinates.csv")
df_wgs_blk.rename(columns={'chromosome': 'chrom', 'starting_position': 'start_pos'}, inplace=True)
chr_list = ['chr' + str(x) for x in range(1, 23)] + ['chrX', 'chrY']
df_chrom_sizes = pd.read_csv("./resources/hg38.chrom.sizes.txt", sep="\t", names=['chrom', 'size'])

# sort by block
df_wgs_blk['chrom'] = df_wgs_blk.apply(lambda row: row['filename'].split('_')[1].replace('c', 'chr'), axis=1)
df_wgs_blk['chrom'] = pd.Categorical(df_wgs_blk['chrom'], categories=chr_list, ordered=True)
df_wgs_blk['blk'] = df_wgs_blk.apply(lambda row: int(row['filename'].split('_')[2].replace('b', '')), axis=1)
df_wgs_blk.sort_values(by=['chrom', 'blk'], inplace=True, ignore_index=True)

# filter out NAs
df_wgs_blk = fill_blk_pos(df_wgs_blk, df_chrom_sizes)
df_wgs_blk

  for chrom, df_wgs_blk_chrom in df_wgs_blk.groupby('chrom'):


Unnamed: 0,filename,chrom,start_pos,blk,end_pos
0,ukb24310_c1_b0_v1.vcf.gz,chr1,10061,0,20018
1,ukb24310_c1_b1_v1.vcf.gz,chr1,20019,1,40008
2,ukb24310_c1_b2_v1.vcf.gz,chr1,40009,2,60001
3,ukb24310_c1_b3_v1.vcf.gz,chr1,60002,3,80000
4,ukb24310_c1_b4_v1.vcf.gz,chr1,80001,4,100000
...,...,...,...,...,...
154425,ukb24310_cY_b2857_v1.vcf.gz,chrY,56887879,2857,57227415
154426,ukb24310_cY_b2858_v1.vcf.gz,chrY,56887879,2858,57227415
154427,ukb24310_cY_b2859_v1.vcf.gz,chrY,56887879,2859,57227415
154428,ukb24310_cY_b2860_v1.vcf.gz,chrY,56887879,2860,57227415


### Regions to extract variants

In [None]:
# WGS analysis
def attach_blk_filename(df_blk, chrom, start_pos, end_pos):
    blk_filename = df_blk.loc[(df_blk['chrom'] == chrom) & 
                              ((df_blk['end_pos'] >= start_pos) & (df_blk['start_pos'] <= end_pos)),
                              'filename'
    ].to_list()
    return blk_filename

df_wgs_regions = pd.read_excel("/Users/tianyu/Downloads/encode_e2g_predictions_livercombined_ALB_LDLR.xlsx", sheet_name='nonoverlapping_coordinates')
df_wgs_regions.rename(columns={'#chr': 'chrom', 'TargetGene': 'gene_name'}, inplace=True)
df_wgs_regions['region'] = ((df_wgs_regions['chrom'] + ':').str.cat(df_wgs_regions['start'].astype(str)) + '-').str.cat(df_wgs_regions['end'].astype(str))

# attach block filenames
df_wgs_regions['filenames'] = df_wgs_regions.apply(lambda row: attach_blk_filename(df_wgs_blk, row['chrom'], row['start'], row['end']), axis=1)
df_wgs_regions['filenames2'] = df_wgs_regions.apply(lambda row: [f"{row['chrom']}/{x}" for x in row['filenames']], axis=1) # filanme with chrom path
df_wgs_regions


Unnamed: 0,chrom,start,end,class,gene_name,Length,region,filenames,filenames2
0,chr19,10960245,10962351,intergenic,LDLR,2107,chr19:10960245-10962351,[ukb24310_c19_b548_v1.vcf.gz],[chr19/ukb24310_c19_b548_v1.vcf.gz]
1,chr19,10986972,10987472,genic,LDLR,501,chr19:10986972-10987472,[ukb24310_c19_b549_v1.vcf.gz],[chr19/ukb24310_c19_b549_v1.vcf.gz]
2,chr19,10987813,10988313,genic,LDLR,501,chr19:10987813-10988313,[ukb24310_c19_b549_v1.vcf.gz],[chr19/ukb24310_c19_b549_v1.vcf.gz]
3,chr19,10988667,10989658,genic,LDLR,992,chr19:10988667-10989658,[ukb24310_c19_b549_v1.vcf.gz],[chr19/ukb24310_c19_b549_v1.vcf.gz]
4,chr19,10998025,10998525,genic,LDLR,501,chr19:10998025-10998525,[ukb24310_c19_b549_v1.vcf.gz],[chr19/ukb24310_c19_b549_v1.vcf.gz]
5,chr19,11016544,11017775,genic,LDLR,1232,chr19:11016544-11017775,[ukb24310_c19_b550_v1.vcf.gz],[chr19/ukb24310_c19_b550_v1.vcf.gz]
6,chr19,11017930,11018838,genic,LDLR,909,chr19:11017930-11018838,[ukb24310_c19_b550_v1.vcf.gz],[chr19/ukb24310_c19_b550_v1.vcf.gz]
7,chr19,11020048,11020548,genic,LDLR,501,chr19:11020048-11020548,[ukb24310_c19_b551_v1.vcf.gz],[chr19/ukb24310_c19_b551_v1.vcf.gz]
8,chr19,11020811,11021311,genic,LDLR,501,chr19:11020811-11021311,[ukb24310_c19_b551_v1.vcf.gz],[chr19/ukb24310_c19_b551_v1.vcf.gz]
9,chr19,11022267,11023130,genic,LDLR,864,chr19:11022267-11023130,[ukb24310_c19_b551_v1.vcf.gz],[chr19/ukb24310_c19_b551_v1.vcf.gz]


### Run Swiss-army-knife on DNAnexus
- get region info for each gene from block file
- bcftools command for step 2 finished
- bcftools command for step 3 & 4 TBD
- list genes not included in MANE set

In [51]:
genes_not_found = []
genes_found = []
known_large_genes = ["DSP", "TSC2", "TTN", "NCOA3"]

for gene in genes:
    df_gene = df_wgs_regions.loc[df_wgs_regions['gene_name'] == gene]

    if df_gene.shape[0] > 0:
        genes_found.append(gene)
        
        vcf_files = set(x for sublist in df_gene['filenames'] for x in sublist)
        vcf_files2 = set(x for sublist in df_gene['filenames2'] for x in sublist) # with chrom path
        vcf_str = ' '.join(vcf_files)
        region_str = ','.join(df_gene['region'].to_list())
        
        if ((len(vcf_files) > 1) or (gene in known_large_genes)):
            mem_level = "mem2_ssd1_v2_x16" # dynamically change memory level when submitting jobs on DNAnexus
        else:
            mem_level = "mem1_ssd1_v2_x4"
        
        vcf_outfile = gene + ".wgs_variants.vcf.gz"
        ssv_outfile = gene + ".hetero_carriers.ssv"

        # filtering variants
        bcftools_cmd1 = f"bcftools concat -Ou -a -r {region_str} {vcf_str}"
        bcftools_cmd2 = f"bcftools view -Ou --max-alleles 5 -T ^{diff_bed}"
        bcftools_cmd3 = "bcftools +fill-tags -Ou -- -t all"
        bcftools_cmd4 = f"bcftools norm -Ou -m - -f {ref_genome}" 
        bcftools_cmd5 = f"bcftools view -Oz -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(GT!=\\\"mis\\\")> 0.9' > {vcf_outfile}"
        
        # index vcf files
        bcftools_cmd6 = f"bcftools index -t {vcf_outfile}"

        # extract heterozygous carriers
        bcftools_cmd7 = "mkdir -p hetero_carriers"
        bcftools_cmd8 = f"bcftools query -i 'GT=\\\"RA\\\"|GT=\\\"AR\\\"' -f '%CHROM  %POS %REF %ALT %INFO/AF [%SAMPLE|]\n' {vcf_outfile} > hetero_carriers/{ssv_outfile}"
        
        # parsing command
        bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4, bcftools_cmd5])
        bcftools_command = " && ".join([bcftools_command, bcftools_cmd6, bcftools_cmd7, bcftools_cmd8])
        
        # parsing input
        dx_input_str = ' '.join([f'-iin="{dx_wgs_vcf_path}/{x}" -iin="{dx_wgs_vcf_path}/{x}.tbi"' for x in vcf_files2])
        dx_input_str = dx_input_str + f' -iin="{dx_resource_path}/{diff_bed}"'
        dx_input_str = dx_input_str + f' -iin="{dx_resource_path}/{ref_genome}"' + f' -iin="{dx_resource_path}/{ref_genome}.fai"'
        
        # final dx command
        dx_command = f'dx run swiss-army-knife --instance-type {mem_level} -y --brief {dx_input_str} -icmd="{bcftools_command}" --destination {dx_vcf_out_path} --tag "{job_tag}" --property gene={gene}'
        subprocess.run(dx_command, shell=True, check=True)
    else:
        genes_not_found.append(gene)
            
print('Genes not found in MANE database:')
print(genes_not_found)

job-J0864vjJgGY9KGYffJb6xYpY
job-J0864x0JgGYKZGq11kJF8k0g
Genes not found in MANE database:
[]


In [None]:
dx_wgs_vcf_path = f"{proj_id}:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]"
dx_input_str = f'-iin="{dx_wgs_vcf_path}/chr19/ukb24310_c19_b552_v1.vcf.gz"'
bcftools_command = f'bcftools view -h ukb24310_c19_b552_v1.vcf.gz > vcf_header.txt'

dx_command = f'dx run swiss-army-knife --instance-type {mem_level} -y --brief {dx_input_str} -icmd="{bcftools_command}" --destination {dx_vcf_out_path} --tag "{job_tag}" --property gene={gene}'
subprocess.run(dx_command, shell=True, check=True)

job-J085Vf0JgGYP9917Z1yK11j1


CompletedProcess(args='dx run swiss-army-knife --instance-type mem2_ssd1_v2_x16 -y --brief -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/chr19/ukb24310_c19_b552_v1.vcf.gz" -icmd="bcftools view -h ukb24310_c19_b552_v1.vcf.gz > vcf_header.txt" --destination project-J07x038JgGYK8bKFq6Z117kk:/Tian_folder/wgs_042425/ --tag "wgs_042425" --property gene=ALB', returncode=0)

job-J085xpjJgGYPkZ4b7qfQ3bj7


CompletedProcess(args='dx run swiss-army-knife --instance-type mem2_ssd1_v2_x16 -y --brief -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release//ukb23157_c2_b39_v1.vcf.gz" -icmd="bcftools view -h ukb23157_c2_b39_v1.vcf.gz > ex_vcf_header.txt" --destination project-J07x038JgGYK8bKFq6Z117kk:/Tian_folder/wgs_042425/ --tag "wgs_042425" --property gene=ALB', returncode=0)

In [23]:
dx_input_str

'-iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/ukb24310_c19_b552_v1.vcf.gz" -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/ukb24310_c19_b552_v1.vcf.gz.tbi" -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/ukb24310_c19_b556_v1.vcf.gz" -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/ukb24310_c19_b556_v1.vcf.gz.tbi" -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/ukb24310_c19_b548_v1.vcf.gz" -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS variants, pVCF format [500k release]/ukb24310_c19_b548_v1.vcf.gz.tbi" -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/DRAGEN WGS/DRAGEN population level WGS var

### Below is for testing purposes

In [None]:
gene = 'APOB'
df_gene = df.filter(pl.col('gene_name') == gene)
vcf_files = get_overlapping_UKB_vcfs(df_gene, df_blk)
vcf_str = ' '.join(vcf_files)
region_str = ','.join(df_gene['region'].to_list())
mem_level = len(vcf_files)

bcftools_cmd1 = "bcftools concat -Ou -a -r " + region_str + " " + vcf_str
bcftools_cmd2 = "bcftools view -Ou -T ^" + diff_bed
bcftools_cmd3 = "bcftools norm -Ou -m - -f " + ref_genome
bcftools_cmd4 = "bcftools annotate -Ou --set-id '%CHROM\_%POS\_%REF\_%ALT'"
bcftools_cmd5 = "bcftools view -Oz --threads " + str(number_of_threads) + " -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > " + gene + ".vcf.gz"
bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4, bcftools_cmd5])

dx_input_str = ' '.join(set('-iin="' + dx_vcf_path + x + '"' for x in vcf_files).union(set('-iin="' + dx_vcf_path + x + '.tbi"' for x in vcf_files)))
dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + diff_bed + '"'
dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '"'
dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '.fai"'

dx_command = 'dx run swiss-army-knife --instance-type mem' + str(mem_level) + '_ssd1_v2_x4 -y --brief ' + dx_input_str + ' -icmd="' + bcftools_command + '" --destination ' + dx_vcf_out_path + ' --tag "' + job_tag + '" --property gene=' + gene

print(bcftools_command)
print(dx_command)



bcftools concat -Ou -a -r chr2:21043859-21044078,chr2:21043508-21043556,chr2:21042356-21042481,chr2:21040933-21041088,chr2:21037953-21038116,chr2:21037095-21037260,chr2:21035579-21035713,chr2:21034811-21034906,chr2:21033294-21033523,chr2:21032349-21032586,chr2:21029893-21030020,chr2:21029634-21029790,chr2:21028322-21028543,chr2:21027823-21028070,chr2:21026783-21026969,chr2:21024928-21025129,chr2:21023520-21023697,chr2:21022826-21023047,chr2:21019718-21019910,chr2:21018987-21019118,chr2:21016434-21016654,chr2:21015365-21015550,chr2:21015068-21015265,chr2:21014443-21014598,chr2:21013155-21013538,chr2:21005075-21012656,chr2:21004556-21004680,chr2:21004264-21004457,chr2:21001424-21003339 ukb23157_c2_b3_v1.vcf.gz ukb23157_c2_b4_v1.vcf.gz | bcftools view -Ou -T ^GRCh38_alldifficultregions.bed.gz | bcftools norm -Ou -m - -f GRCh38_reference_genome.fa | bcftools annotate -Ou --set-id '%CHROM\_%POS\_%REF\_%ALT' | bcftools view -Oz --threads 4 -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(

In [44]:
!{dx_command}

job-GPG9KY8JqBj5gK412JXv3f8G
[0m