In [316]:
from io import StringIO
import numpy as np
import polars as pl
import pandas as pd
from subprocess import call
from gtfparse import read_gtf

### Set parameters

In [338]:
exon_flank_nt = 5 # flanking nucleotides from the start and end of exons
number_of_threads = 4 # number of threads used in bcftools output compression
tag_str = 'my_tag' # DNAnexus job tag

project_path = 'project-GGy3Bb0JqBj7zfxY8v4by61X:/'

# input vcf path (end with '/')
dx_vcf_path = project_path + "Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ pVCF\ format\ -\ final\ release/"
# output vcf path (end with '/')
dx_vcf_out_path = project_path + "temp_vcf_out/"
# resource path (end with '/')
dx_resource_path = project_path + "GRCh38_resources/"
# difficult regions bed filename
diff_bed = 'GRCh38_alldifficultregions.bed.gz'
# reference genome filename
ref_genome = 'GRCh38_reference_genome.fa' # index filename is inferred

!dx mkdir -p {dx_vcf_out_path} # create gene.vcf output folder

[0m

### Helper functions

In [330]:
def get_overlapping_UKB_vcfs(df_gene, df_blk):
    """Return the pVCF file names whose blocks overlap any flanked exon of a gene.

    Parameters
    ----------
    df_gene
        Table with one row per exon. It must expose ``.shape`` and support
        ``df_gene[row, column]`` lookups for the columns ``'seqname'``,
        ``'exon_flank_start'`` and ``'exon_flank_end'``.
    df_blk
        pandas DataFrame with one row per pVCF block and the columns
        ``'seqname'``, ``'start_pos'``, ``'end_pos'`` and ``'chr_blk_str'``.

    Returns
    -------
    set of str
        File names of the form ``ukb23157_<chr_blk_str>_v1.vcf.gz``; empty when
        ``df_gene`` has no rows or nothing overlaps.

    Notes
    -----
    Both exon and block coordinates are treated as closed intervals with
    ``start <= end``. Two such intervals overlap exactly when each one starts
    no later than the other ends, which covers containment in either direction
    as well as partial overlap at either edge.
    """
    vcf_prefix = 'ukb23157_'
    vcf_suffix = '_v1.vcf.gz'

    # Collect block ids in a set: duplicates across exons collapse for free and
    # there is no repeated list concatenation.
    blk_ids = set()
    for i in range(df_gene.shape[0]):
        seqname = df_gene[i, 'seqname']
        exon_start = df_gene[i, 'exon_flank_start']
        exon_end = df_gene[i, 'exon_flank_end']

        overlaps = ((df_blk['seqname'] == seqname) &
                    (df_blk['start_pos'] <= exon_end) &
                    (df_blk['end_pos'] >= exon_start))
        blk_ids.update(df_blk.loc[overlaps, 'chr_blk_str'].tolist())

    return {vcf_prefix + blk_id + vcf_suffix for blk_id in blk_ids}

### List of gene symbols as input

In [300]:
# Gene symbols to extract; each one is looked up by `gene_name` in the MANE exon table.
# Symbols with no exon rows there are reported as "not found" by the submission loop.
# NOTE(review): 'FAKE_GENE' is presumably a deliberate negative control for that path — confirm.
genes = ['BRCA1', 'TWNK', 'TENT2', 'PRR26', 'AFDN', 'UPK3B', 'SELENOS', 'DENND11', 'FAKE_GENE']

### Load pVCF block coordinates

In [219]:
# Read the headerless, tab-separated pVCF block table.
df_blk = pd.read_table(
    "./resources/pvcf_blocks.txt",
    sep = '\t',
    names = ['ind', 'chr', 'blk', 'start_pos', 'end_pos'],
)

# 'chr<N>' sequence name; the numeric codes 23 and 24 are renamed to chrX and chrY.
df_blk['seqname'] = ('chr' + df_blk['chr'].map(str)).replace({'chr23': 'chrX', 'chr24': 'chrY'})

# Block identifier as used in the pVCF file names, e.g. 'c1_b0'.
blk_suffix = '_b' + df_blk['blk'].map(str)
df_blk['chr_blk_str'] = df_blk['seqname'].str.replace('chr', 'c') + blk_suffix
df_blk

Unnamed: 0,ind,chr,blk,start_pos,end_pos,seqname,chr_blk_str
0,1,1,0,1,1218130,chr1,c1_b0
1,2,1,1,1218131,1426969,chr1,c1_b1
2,3,1,2,1426970,1758871,chr1,c1_b2
3,4,1,3,1758872,2514221,chr1,c1_b3
4,5,1,4,2514222,3782130,chr1,c1_b4
...,...,...,...,...,...,...,...
972,973,23,20,135552245,141897932,chrX,cX_b20
973,974,23,21,141897933,152168662,chrX,cX_b21
974,975,23,22,152168663,153788223,chrX,cX_b22
975,976,23,23,153788224,156040895,chrX,cX_b23


### Load MANE transcript coordinates

In [280]:
# Load the MANE Select annotation and keep only the exon records of the requested genes.
df = read_gtf("./resources/MANE.GRCh38.v1.0.select_ensembl_genomic.gtf.gz")
df = df.filter(pl.col('feature') == 'exon')
df = df.filter(pl.col('gene_name').is_in(genes))
df = df.select(pl.col(['seqname', 'start', 'end', 'gene_name']))

# Widen every exon by `exon_flank_nt` on both sides.
df = df.with_columns([(pl.col('start') - exon_flank_nt).alias('exon_flank_start'),
                      (pl.col('end') + exon_flank_nt).alias('exon_flank_end')])

# Region string in the 'chr:start-end' form passed to `bcftools view -r`.
# `with_columns` replaces the deprecated `with_column`, and `pl.format` builds the
# string without relying on implicit categorical/int -> str coercion by `+`.
df = df.with_columns([
    pl.format('{}:{}-{}',
              pl.col('seqname').cast(pl.Utf8),
              pl.col('exon_flank_start'),
              pl.col('exon_flank_end')).alias('region')
])
df

Polars found a filename. Ensure you pass a path to the file instead of a python file object when possible for best performance.INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_name', 'transcript_id', 'transcript_type', 'transcript_name', 'tag', 'protein_id', 'db_xref', 'exon_number', 'exon_id']


seqname,start,end,gene_name,exon_flank_start,exon_flank_end,region
cat,i64,i64,str,i64,i64,str
"""chr5""",79612441,79613075,"""TENT2""",79612436,79613080,"""chr5:79612436-..."
"""chr5""",79619612,79619785,"""TENT2""",79619607,79619790,"""chr5:79619607-..."
"""chr5""",79619994,79620083,"""TENT2""",79619989,79620088,"""chr5:79619989-..."
"""chr5""",79623252,79623489,"""TENT2""",79623247,79623494,"""chr5:79623247-..."
"""chr5""",79640851,79640965,"""TENT2""",79640846,79640970,"""chr5:79640846-..."
"""chr5""",79641105,79641196,"""TENT2""",79641100,79641201,"""chr5:79641100-..."
"""chr5""",79642832,79642910,"""TENT2""",79642827,79642915,"""chr5:79642827-..."
"""chr5""",79645123,79645192,"""TENT2""",79645118,79645197,"""chr5:79645118-..."
"""chr5""",79648617,79648693,"""TENT2""",79648612,79648698,"""chr5:79648612-..."
"""chr5""",79649062,79649190,"""TENT2""",79649057,79649195,"""chr5:79649057-..."


### Run Swiss-army-knife on DNAnexus
- get region info for each gene from block file
- bcftools command for step 2 finished
- bcftools command for step 3 & 4 TBD
- list genes not included in MANE set

In [347]:
genes_not_found = []
genes_found = []

for gene in genes:
    df_gene = df.filter(pl.col('gene_name') == gene)
    if df_gene.shape[0] > 0:
        genes_found.append(gene)
        
        vcf_files = get_overlapping_UKB_vcfs(df_gene, df_blk)
        vcf_str = ' '.join(vcf_files)
        region_str = ','.join(df_gene['region'].to_list())

        bcftools_cmd1 = "bcftools view -Ou -r " + region_str + " -T ^" + diff_bed + " " + vcf_str
        bcftools_cmd2 = "bcftools norm -Ou -m - -f " + ref_genome
        bcftools_cmd3 = "bcftools annotate -Ou --set-id '%CHROM\_%POS\_%REF\_%ALT'"
        bcftools_cmd4 = "bcftools view -Oz --threads " + str(number_of_threads) + " -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > " + gene + ".vcf.gz"
        bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4])

        dx_input_str = ' '.join(set('-iin="' + dx_vcf_path + x + '"' for x in vcf_files).union(set('-iin="' + dx_vcf_path + x + '.tbi"' for x in vcf_files)))
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + diff_bed + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '"'
        dx_input_str = dx_input_str + ' -iin="' + dx_resource_path + ref_genome + '.fai"'

        dx_command = 'dx run swiss-army-knife -y --brief ' + dx_input_str + ' -icmd="' + bcftools_command + '" --destination ' + dx_vcf_out_path + ' --tag "' + tag_str + '" --property gene=' + gene
        !{dx_command}
    else:
        genes_not_found.append(gene)
            
print('Genes not found in MANE database:')
print(genes_not_found)

job-GP4q1GjJqBj12z5y2x15Gf90
[0mjob-GP4q1J0JqBj4fF5K657Zx6Yy
[0mjob-GP4q1J8JqBj0X1qg5v8G25xz
[0mjob-GP4q1JQJqBjK2K3x7Bb6QGb5
[0mjob-GP4q1K0JqBj7Vj105PBpzf4Q
[0mjob-GP4q1K8JqBjPVgxF8b9FV92j
[0mjob-GP4q1KQJqBj13ZG95vBy5b8k
[0mGenes not found in MANE database:
['PRR26', 'FAKE_GENE']


### Below is for testing purposes

In [346]:
# Build (but do not submit) the Swiss-army-knife command for a single gene so the
# generated command line can be inspected.
gene = 'BRCA1'
df_gene = df.filter(pl.col('gene_name') == gene)
# Sorted so the printed command is the same on every run (set order is not).
vcf_files = sorted(get_overlapping_UKB_vcfs(df_gene, df_blk))
vcf_str = ' '.join(vcf_files)
region_str = ','.join(df_gene['region'].to_list())

# NOTE(review): `bcftools view` is documented as taking a single input file.
# When a gene spans more than one pVCF block, confirm that the additional files
# in `vcf_str` are actually read (otherwise `bcftools concat` is needed).
bcftools_cmd1 = "bcftools view -Ou -r " + region_str + " -T ^" + diff_bed + " " + vcf_str
bcftools_cmd2 = "bcftools norm -Ou -m - -f " + ref_genome
# Raw string: bcftools expects the literal "\_" separators, and "\_" is not a
# valid Python escape sequence in a normal string.
bcftools_cmd3 = r"bcftools annotate -Ou --set-id '%CHROM\_%POS\_%REF\_%ALT'"
bcftools_cmd4 = "bcftools view -Oz --threads " + str(number_of_threads) + " -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > " + gene + ".vcf.gz"
bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4])

# Job inputs: every VCF, its .tbi index, the BED file, the reference and its .fai index.
dx_inputs = ['-iin="' + dx_vcf_path + name + '"' for name in vcf_files]
dx_inputs += ['-iin="' + dx_vcf_path + name + '.tbi"' for name in vcf_files]
dx_inputs += ['-iin="' + dx_resource_path + resource + '"'
              for resource in (diff_bed, ref_genome, ref_genome + '.fai')]
dx_input_str = ' '.join(dx_inputs)

dx_command = 'dx run swiss-army-knife -y --brief ' + dx_input_str + ' -icmd="' + bcftools_command + '" --destination ' + dx_vcf_out_path + ' --tag "' + tag_str + '" --property gene=' + gene


print(dx_command)


# dx mkdir -p $VCF_OUT_PATH
# dx run swiss-army-knife -iin="$VCF_PATH/${VCF_PREFIX}.vcf.gz" -iin="$VCF_PATH/${VCF_PREFIX}.vcf.gz.tbi" -iin="$BED_FILE" -icmd="$CMD_STRING" --destination $VCF_OUT_PATH -y

dx run swiss-army-knife -y --brief -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ pVCF\ format\ -\ final\ release/ukb23157_c17_b29_v1.vcf.gz" -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ pVCF\ format\ -\ final\ release/ukb23157_c17_b30_v1.vcf.gz" -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ pVCF\ format\ -\ final\ release/ukb23157_c17_b29_v1.vcf.gz.tbi" -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/Bulk/Exome\ sequences/Population\ level\ exome\ OQFE\ variants,\ pVCF\ format\ -\ final\ release/ukb23157_c17_b30_v1.vcf.gz.tbi" -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/GRCh38_resources/GRCh38_alldifficultregions.bed.gz" -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/GRCh38_resources/GRCh38_reference_genome.fa" -iin="project-GGy3Bb0JqBj7zfxY8v4by61X:/GRCh38_resources/GRCh38_reference_genome.fa.fai" -icmd="bcftools view -Ou -r chr

In [343]:
# Run whatever command string `dx_command` currently holds through the shell.
# It depends on kernel state, so re-run the cell that builds `dx_command` first.
!{dx_command}

job-GP4jb6QJqBjP39pF64PJJxG2
[0m

In [296]:
# NOTE(review): this looks like a leftover scratch cell. It overwrites the
# `dx_input_str` built earlier with a version that has no quoting, no .tbi index
# files and no resource inputs, and it inserts an extra '/' even though
# `dx_vcf_path` is documented to end with '/'. Consider removing it.
dx_input_str = ' '.join(set('-iin=' + dx_vcf_path + '/' + x for x in vcf_files))

'ukb23157_c17_b30_v1.vcf.gz ukb23157_c17_b29_v1.vcf.gz'