In [1]:
from io import StringIO
import numpy as np
import polars as pl
import pandas as pd
from subprocess import call
from gtfparse import read_gtf

### Set parameters

In [2]:
# --- Tunable parameters -----------------------------------------------------
exon_flank_nt = 5 # flanking nucleotides from the start and end of exons
number_of_threads = 4 # number of threads used in bcftools output compression
tag_str = 'my_tag' # DNAnexus job tag

# DNAnexus project prefix that every path below is built on
project_path = 'project-GGy3Bb0JqBj7zfxY8v4by61X:/'

# input vcf path (end with '/')
# Raw string: the backslash-escaped spaces are part of the path text handed to
# `dx`; a normal string literal would treat "\ " as an invalid escape sequence.
dx_vcf_path = project_path + r"Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ pVCF\ format\ -\ final\ release/"
# output vcf path (end with '/')
dx_vcf_out_path = project_path + "temp_vcf_out/"
# resource path (end with '/')
dx_resource_path = project_path + "GRCh38_resources/"
# difficult regions bed filename
diff_bed = 'GRCh38_alldifficultregions.bed.gz'
# reference genome filename
ref_genome = 'GRCh38_reference_genome.fa' # index filename is inferred

# Needs a valid DNAnexus login: the saved output of this cell shows the call
# failing with a 401 "ExpiredToken" error when the token has expired.
!dx mkdir -p {dx_vcf_out_path} # create gene.vcf output folder

Error while creating /temp_vcf_out in project-GGy3Bb0JqBj7zfxY8v4by61X
  The supplied authentication token has expired, code 401. Request Time=1677014317.689944, Request ID=unavailable
Details: {
    "reason": "ExpiredToken",
    "expired": 1676670908898,
    "timeNow": 1677014318469
}
[0m

### Helper functions

In [3]:
def get_overlapping_UKB_vcfs(df_gene, df_blk, vcf_prefix='ukb23157_', vcf_suffix='_v1.vcf.gz'):
    """Return the pVCF file names whose blocks overlap any region in ``df_gene``.

    Parameters
    ----------
    df_gene
        Table of regions, read row by row as ``df_gene[i, column]`` with the
        columns ``seqname``, ``exon_flank_start`` and ``exon_flank_end``.
    df_blk
        pandas DataFrame of pVCF blocks with the columns ``seqname``,
        ``start_pos``, ``end_pos`` and ``chr_blk_str``.
    vcf_prefix, vcf_suffix
        Text placed before / after each ``chr_blk_str`` value to form a file
        name. The defaults reproduce the previously hard-coded values.

    Returns
    -------
    set of str
        De-duplicated file names; empty when ``df_gene`` has no rows or no
        block overlaps.
    """
    blk_names = set()
    for i in range(df_gene.shape[0]):
        seqname = df_gene[i, 'seqname']
        region_start = df_gene[i, 'exon_flank_start']
        region_end = df_gene[i, 'exon_flank_end']

        # Two closed intervals overlap exactly when each one starts no later
        # than the other one ends. For start <= end intervals this single test
        # covers all four cases (region inside block, block inside region,
        # overlap at the block start, overlap at the block end).
        overlaps = ((df_blk['seqname'] == seqname) &
                    (df_blk['start_pos'] <= region_end) &
                    (df_blk['end_pos'] >= region_start))
        blk_names.update(df_blk.loc[overlaps, 'chr_blk_str'])

    return {vcf_prefix + name + vcf_suffix for name in blk_names}

### List of gene symbols as input

In [54]:
# Gene symbols to process. Example inputs used during development:
#   ['BRCA1', 'TWNK', 'TENT2', 'PRR26', 'AFDN', 'UPK3B', 'SELENOS', 'DENND11', 'APOB']
#   ['BRCA1', 'APOB', 'TWNK']
# The earlier assignments were overwritten immediately, so only this value was
# ever in effect. It is a placeholder: replace it with the symbols of interest.
# NOTE(review): the saved outputs further down look like they came from a run
# with a three-gene list rather than this placeholder — confirm before re-running.
genes = ['']

### Load pVCF block coordinates

In [36]:
# Read the headerless, tab-separated block table and name its five columns.
df_blk = pd.read_table(
    "./resources/pvcf_blocks.txt",
    sep = '\t',
    names = ['ind', 'chr', 'blk', 'start_pos', 'end_pos'],
)

# Build a 'chr'-prefixed sequence name, then rename chr23/chr24 to chrX/chrY.
df_blk['seqname'] = 'chr' + df_blk['chr'].map(str)
for numeric_name, letter_name in (('chr23', 'chrX'), ('chr24', 'chrY')):
    df_blk.loc[df_blk['seqname'] == numeric_name, 'seqname'] = letter_name

# Short block label such as 'c1_b0', later embedded in the pVCF file names.
blk_label = df_blk['blk'].map(str)
df_blk['chr_blk_str'] = df_blk['seqname'].str.replace('chr', 'c') + '_b' + blk_label
df_blk

Unnamed: 0,ind,chr,blk,start_pos,end_pos,seqname,chr_blk_str
0,1,1,0,1,1218130,chr1,c1_b0
1,2,1,1,1218131,1426969,chr1,c1_b1
2,3,1,2,1426970,1758871,chr1,c1_b2
3,4,1,3,1758872,2514221,chr1,c1_b3
4,5,1,4,2514222,3782130,chr1,c1_b4
...,...,...,...,...,...,...,...
972,973,23,20,135552245,141897932,chrX,cX_b20
973,974,23,21,141897933,152168662,chrX,cX_b21
974,975,23,22,152168663,153788223,chrX,cX_b22
975,976,23,23,153788224,156040895,chrX,cX_b23


### Load MANE transcript coordinates

In [37]:
# Load the MANE Select annotation and keep only the exon rows of the requested
# genes, reduced to the coordinate columns needed downstream.
df = (
    read_gtf("./resources/MANE.GRCh38.v1.0.select_ensembl_genomic.gtf.gz")
    .filter(pl.col('feature') == 'exon')
    .filter(pl.col('gene_name').is_in(genes))
    .select(pl.col(['seqname', 'start', 'end', 'gene_name']))
    # widen every exon by `exon_flank_nt` bases on both sides
    .with_columns([(pl.col('start') - exon_flank_nt).alias('exon_flank_start'),
                   (pl.col('end') + exon_flank_nt).alias('exon_flank_end')])
    # 'seqname:start-end' region string for bcftools. `with_columns` replaces the
    # deprecated `with_column`, and `pl.format` converts the integer columns to
    # text instead of relying on implicit string + integer addition; the
    # categorical `seqname` is cast to a string explicitly.
    .with_columns([pl.format('{}:{}-{}',
                             pl.col('seqname').cast(pl.Utf8),
                             pl.col('exon_flank_start'),
                             pl.col('exon_flank_end')).alias('region')])
)
df

Polars found a filename. Ensure you pass a path to the file instead of a python file object when possible for best performance.INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'transcript_id', 'transcript_type', 'transcript_name', 'tag', 'protein_id', 'db_xref', 'exon_number', 'exon_id']


seqname,start,end,gene_name,exon_flank_start,exon_flank_end,region
cat,i64,i64,str,i64,i64,str
"""chr2""",21043864,21044073,"""APOB""",21043859,21044078,"""chr2:21043859-..."
"""chr2""",21043513,21043551,"""APOB""",21043508,21043556,"""chr2:21043508-..."
"""chr2""",21042361,21042476,"""APOB""",21042356,21042481,"""chr2:21042356-..."
"""chr2""",21040938,21041083,"""APOB""",21040933,21041088,"""chr2:21040933-..."
"""chr2""",21037958,21038111,"""APOB""",21037953,21038116,"""chr2:21037953-..."
"""chr2""",21037100,21037255,"""APOB""",21037095,21037260,"""chr2:21037095-..."
"""chr2""",21035584,21035708,"""APOB""",21035579,21035713,"""chr2:21035579-..."
"""chr2""",21034816,21034901,"""APOB""",21034811,21034906,"""chr2:21034811-..."
"""chr2""",21033299,21033518,"""APOB""",21033294,21033523,"""chr2:21033294-..."
"""chr2""",21032354,21032581,"""APOB""",21032349,21032586,"""chr2:21032349-..."


### Run Swiss-army-knife on DNAnexus
- Get the region info (overlapping pVCF blocks) for each gene from the block file
- bcftools command for step 2: finished
- bcftools commands for steps 3 & 4: TBD
- List the genes that are not included in the MANE set

In [55]:
genes_not_found = []
genes_found = []

for gene in genes:
    df_gene = df.filter(pl.col('gene_name') == gene)
    if df_gene.shape[0] > 0:
        genes_found.append(gene)
        
        vcf_files = get_overlapping_UKB_vcfs(df_gene, df_blk)
        vcf_str = ' '.join(vcf_files)
        region_str = ','.join(df_gene['region'].to_list())
        mem_level = len(vcf_files) # dynamically change memory level when submitting jobs on DNAnexus

        bcftools_cmd1 = "bcftools concat -Ou -a -r " + region_str + " " + vcf_str
        bcftools_cmd2 = "bcftools view -Ou -T ^" + diff_bed
        bcftools_cmd3 = "bcftools norm -Ou -m - -f " + ref_genome
        bcftools_cmd4 = "bcftools annotate -Ou --set-id '%CHROM\_%POS\_%REF\_%ALT'"
        bcftools_cmd5 = "bcftools view -Oz --threads " + str(number_of_threads) + " -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > " + gene + ".vcf.gz"
        bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4, bcftools_cmd5])

        dx_input_str = ' '.join(set('-iin="' + dx_vcf_path + x + '"' for x in vcf_files).union(set('-iin="' + dx_vcf_path + x + '.tbi"' for x in vcf_files)))
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + diff_bed + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '.fai"'

        dx_command = 'dx run swiss-army-knife --instance-type mem' + str(mem_level) + '_ssd1_v2_x4 -y --brief ' + dx_input_str + ' -icmd="' + bcftools_command + '" --destination ' + dx_vcf_out_path + ' --tag "' + tag_str + '" --property gene=' + gene
        !{dx_command}
    else:
        genes_not_found.append(gene)
            
print('Genes not found in MANE database:')
print(genes_not_found)

job-GPGYKP8JqBj7XK92KqZ52Qpz
[0mjob-GPGYKPQJqBjPbv6Q5X7JVB7G
[0mjob-GPGYKPjJqBjJyXxpzJq3Xfv9
[0mGenes not found in MANE database:
[]


### Below is for testing purposes

In [53]:
# Build (but do not submit) the bcftools pipeline and the dx command for a
# single gene, and print both for inspection.
gene = 'APOB'
df_gene = df.filter(pl.col('gene_name') == gene)
# Sorted only so that the printed command text is reproducible between runs.
vcf_files = sorted(get_overlapping_UKB_vcfs(df_gene, df_blk))
vcf_str = ' '.join(vcf_files)
region_str = ','.join(df_gene['region'].to_list())
# Scale the memory tier with the number of input pVCF blocks.
# NOTE(review): DNAnexus appears to offer only mem1-mem3 in the ssd1_v2_x4
# family, so the tier is capped at 3 — confirm against the instance-type table.
mem_level = min(len(vcf_files), 3)

bcftools_cmd1 = "bcftools concat -Ou -a -r " + region_str + " " + vcf_str
bcftools_cmd2 = "bcftools view -Ou -T ^" + diff_bed
bcftools_cmd3 = "bcftools norm -Ou -m - -f " + ref_genome
# "\\_" keeps a literal backslash before each underscore in the bcftools format
# string (same text as before, without the invalid "\_" Python escape).
bcftools_cmd4 = "bcftools annotate -Ou --set-id '%CHROM\\_%POS\\_%REF\\_%ALT'"
bcftools_cmd5 = "bcftools view -Oz --threads " + str(number_of_threads) + " -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > " + gene + ".vcf.gz"
bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4, bcftools_cmd5])

# Every pVCF together with its .tbi index, then the BED file, the reference
# FASTA and its .fai index.
dx_inputs = [dx_vcf_path + x + ext for x in vcf_files for ext in ('', '.tbi')]
dx_inputs += [dx_resource_path + diff_bed,
              dx_resource_path + ref_genome,
              dx_resource_path + ref_genome + '.fai']
dx_input_str = ' '.join('-iin="' + path + '"' for path in dx_inputs)

dx_command = 'dx run swiss-army-knife --instance-type mem' + str(mem_level) + '_ssd1_v2_x4 -y --brief ' + dx_input_str + ' -icmd="' + bcftools_command + '" --destination ' + dx_vcf_out_path + ' --tag "' + tag_str + '" --property gene=' + gene

print(bcftools_command)
print(dx_command)



bcftools concat -Ou -a -r chr2:21043859-21044078,chr2:21043508-21043556,chr2:21042356-21042481,chr2:21040933-21041088,chr2:21037953-21038116,chr2:21037095-21037260,chr2:21035579-21035713,chr2:21034811-21034906,chr2:21033294-21033523,chr2:21032349-21032586,chr2:21029893-21030020,chr2:21029634-21029790,chr2:21028322-21028543,chr2:21027823-21028070,chr2:21026783-21026969,chr2:21024928-21025129,chr2:21023520-21023697,chr2:21022826-21023047,chr2:21019718-21019910,chr2:21018987-21019118,chr2:21016434-21016654,chr2:21015365-21015550,chr2:21015068-21015265,chr2:21014443-21014598,chr2:21013155-21013538,chr2:21005075-21012656,chr2:21004556-21004680,chr2:21004264-21004457,chr2:21001424-21003339 ukb23157_c2_b3_v1.vcf.gz ukb23157_c2_b4_v1.vcf.gz | bcftools view -Ou -T ^GRCh38_alldifficultregions.bed.gz | bcftools norm -Ou -m - -f GRCh38_reference_genome.fa | bcftools annotate -Ou --set-id '%CHROM\_%POS\_%REF\_%ALT' | bcftools view -Oz --threads 4 -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(

In [44]:
# Run whatever command string is currently held in `dx_command` through the
# shell; the saved output of this cell is the job id that `dx run` printed.
!{dx_command}

job-GPG9KY8JqBj5gK412JXv3f8G
[0m