In [1]:
from io import StringIO
import numpy as np
import pandas as pd
import subprocess

### Set parameters

In [2]:
# DNAnexus job tag and project ID
job_tag = 'Vineel_exon_variants_051325' # DNAnexus job tag
proj_id = 'project-J07x038JgGYK8bKFq6Z117kk'

# for exome analysis
dx_ex_vcf_path =  f"{proj_id}:/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release"

# create vcf output folder
dx_vcf_out_path = f"{proj_id}:/Tian_folder/{job_tag}/"
cmd = f'dx mkdir -p "{dx_vcf_out_path}"'
subprocess.run(cmd, shell=True)

# DNAnexus resources
dx_resource_path = f"{proj_id}:/GRCh38_resources/"
diff_bed = 'GRCh38_alldifficultregions.bed.gz'
ref_genome = 'GRCh38_reference_genome.fa'



### List of gene symbols as input

In [3]:
# read gene list from txt
gene_list_path = "gene_list/Vineel_genes_051325.txt"

with open(gene_list_path, "r") as gene_file:
    genes = gene_file.readlines()
    genes = [l.replace("\n", "") for l in genes]

genes

['LDLR',
 'APOB',
 'PCSK9',
 'BRCA1',
 'BRCA2',
 'CHEK2',
 'ATM',
 'MSH2',
 'MSH6',
 'MLH1',
 'PMS2',
 'ACTC1',
 'MYBPC3',
 'MYH7',
 'MYL2',
 'TPM1',
 'TNNT2',
 'TTN',
 'DSC2',
 'DSG2',
 'DSP',
 'PKP2',
 'TMEM43',
 'SCN5A',
 'TP53',
 'HNF1A',
 'HNF1B',
 'HNF4A',
 'GCK',
 'KCNH2',
 'KCNQ1',
 'ACVRL1',
 'ENG',
 'BMPR1A',
 'SMAD4',
 'STK11',
 'PTEN',
 'HFE',
 'COL3A1',
 'RPE65',
 'WT1']

### Convert MANE gtf file into csv file

In [19]:
def parse_attributes(attr_str):
    attr_dict = {}
    for field in attr_str.strip().split(";"):
        if field.strip() == "":
            continue
        key, value = field.strip().split(" ", 1)
        attr_dict[key] = value.replace('"', '')
    return attr_dict

gtf_file = "resources/MANE.GRCh38.v1.4.ensembl_genomic.gtf.gz"

# read GTF file
df_gtf = pd.read_csv(gtf_file, sep="\t",
    comment="#",    # GTF files often start with '#' comments
    header=None,    # No header
    names=["chrom", "source", "feature", "start", "end", "score", "strand", "frame", "attribute"]
)

# Apply parsing
attributes_parsed = df_gtf["attribute"].apply(parse_attributes)
df_attr = pd.json_normalize(attributes_parsed)

# Merge back
df_gtf_parsed = pd.concat([df_gtf.drop(columns=["attribute"]), df_attr], axis=1)

df_gtf_parsed.to_csv("resources/MANE.GRCh38.v1.4.ensembl_genomic_parsed.csv.gz", index=False, compression='gzip')
df_gtf_parsed

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,gene_id,gene_type,gene_name,transcript_id,transcript_type,transcript_name,tag,protein_id,db_xref,exon_number,exon_id
0,chr1,ensembl_havana,gene,3069168,3438621,.,+,.,ENSG00000142611.17,protein_coding,PRDM16,,,,,,,,
1,chr1,ensembl_havana,transcript,3069203,3438621,.,+,.,ENSG00000142611.17,protein_coding,PRDM16,ENST00000270722.10,protein_coding,PRDM16-201,MANE_Select,ENSP00000270722.5,RefSeq:NM_022114.4,,
2,chr1,ensembl_havana,exon,3069203,3069296,.,+,.,ENSG00000142611.17,protein_coding,PRDM16,ENST00000270722.10,protein_coding,PRDM16-201,MANE_Select,ENSP00000270722.5,RefSeq:NM_022114.4,1,ENSE00003850248.1
3,chr1,ensembl_havana,CDS,3069260,3069296,.,+,0,ENSG00000142611.17,protein_coding,PRDM16,ENST00000270722.10,protein_coding,PRDM16-201,MANE_Select,ENSP00000270722.5,RefSeq:NP_071397.3,1,ENSE00003850248.1
4,chr1,ensembl_havana,start_codon,3069260,3069262,.,+,0,ENSG00000142611.17,protein_coding,PRDM16,ENST00000270722.10,protein_coding,PRDM16-201,MANE_Select,ENSP00000270722.5,RefSeq:NP_071397.3,1,ENSE00003850248.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
524608,chrX_MU273396v1_alt,havana,CDS,169184,169642,.,+,0,ENSG00000292243.1,protein_coding,UBE2NL,ENST00000710304.1,protein_coding,UBE2NL-202,MANE_Select,ENSP00000518187.1,RefSeq:NP_001013007.1,1,ENSE00004011314.1
524609,chrX_MU273396v1_alt,havana,start_codon,169184,169186,.,+,0,ENSG00000292243.1,protein_coding,UBE2NL,ENST00000710304.1,protein_coding,UBE2NL-202,MANE_Select,ENSP00000518187.1,RefSeq:NP_001013007.1,1,ENSE00004011314.1
524610,chrX_MU273396v1_alt,havana,stop_codon,169643,169645,.,+,0,ENSG00000292243.1,protein_coding,UBE2NL,ENST00000710304.1,protein_coding,UBE2NL-202,MANE_Select,ENSP00000518187.1,RefSeq:NP_001013007.1,1,ENSE00004011314.1
524611,chrX_MU273396v1_alt,havana,UTR,169154,169183,.,+,.,ENSG00000292243.1,protein_coding,UBE2NL,ENST00000710304.1,protein_coding,UBE2NL-202,MANE_Select,ENSP00000518187.1,,1,ENSE00004011314.1


### Load pVCF block coordinates

In [4]:
# exome blocks
df_ex_blk = pd.read_table("./resources/exome_pvcf_blocks.txt", sep = '\t', names = ['index', 'chrom', 'blk', 'start_pos', 'end_pos'])
df_ex_blk['chrom'] = df_ex_blk['chrom'].apply(lambda x: 'chrX' if x == 23 else ('chrY' if x == 24 else f'chr{x}'))
df_ex_blk['filename'] = df_ex_blk.apply(lambda row: f"ukb23157_c{row['chrom'].replace('chr', '')}_b{row['blk']}_v1.vcf.gz", axis=1)
df_ex_blk



Unnamed: 0,index,chrom,blk,start_pos,end_pos,filename
0,1,chr1,0,1,1218130,ukb23157_c1_b0_v1.vcf.gz
1,2,chr1,1,1218131,1426969,ukb23157_c1_b1_v1.vcf.gz
2,3,chr1,2,1426970,1758871,ukb23157_c1_b2_v1.vcf.gz
3,4,chr1,3,1758872,2514221,ukb23157_c1_b3_v1.vcf.gz
4,5,chr1,4,2514222,3782130,ukb23157_c1_b4_v1.vcf.gz
...,...,...,...,...,...,...
972,973,chrX,20,135552245,141897932,ukb23157_cX_b20_v1.vcf.gz
973,974,chrX,21,141897933,152168662,ukb23157_cX_b21_v1.vcf.gz
974,975,chrX,22,152168663,153788223,ukb23157_cX_b22_v1.vcf.gz
975,976,chrX,23,153788224,156040895,ukb23157_cX_b23_v1.vcf.gz


### region to extract variants

In [5]:
# exome analysis
def attach_blk_filename(df_blk, chrom, start_pos, end_pos):
    blk_filename = df_blk.loc[(df_blk['chrom'] == chrom) & 
                              ((df_blk['end_pos'] >= start_pos) & (df_blk['start_pos'] <= end_pos)),
                              'filename'
    ].to_list()
    return blk_filename

df_mane = pd.read_csv("resources/MANE.GRCh38.v1.4.ensembl_genomic_parsed.csv.gz")
exon_flank_nt = 5 # flanking nucleotides from the start and end of exons
df_exon_regions = df_mane.loc[
    (df_mane['feature'] == 'exon') & 
    (df_mane['gene_name'].isin(genes)), 
    ['chrom', 'start', 'end', 'gene_name']
].copy()
df_exon_regions['exon_flank_start'] = df_exon_regions['start'] - exon_flank_nt
df_exon_regions['exon_flank_end'] = df_exon_regions['end'] + exon_flank_nt
df_exon_regions['region'] = ((df_exon_regions['chrom'] + ':').str.cat(df_exon_regions['exon_flank_start'].astype(str)) + '-').str.cat(df_exon_regions['exon_flank_end'].astype(str))

# attach block filenames
df_exon_regions['filenames'] = df_exon_regions.apply(lambda row: attach_blk_filename(df_ex_blk, row['chrom'], row['start'], row['end']), axis=1)
df_exon_regions


Unnamed: 0,chrom,start,end,gene_name,exon_flank_start,exon_flank_end,region,filenames
47788,chr1,55039548,55040044,PCSK9,55039543,55040049,chr1:55039543-55040049,[ukb23157_c1_b36_v1.vcf.gz]
47791,chr1,55043843,55044034,PCSK9,55043838,55044039,chr1:55043838-55044039,[ukb23157_c1_b36_v1.vcf.gz]
47793,chr1,55046523,55046646,PCSK9,55046518,55046651,chr1:55046518-55046651,[ukb23157_c1_b36_v1.vcf.gz]
47795,chr1,55052278,55052411,PCSK9,55052273,55052416,chr1:55052273-55052416,[ukb23157_c1_b36_v1.vcf.gz]
47797,chr1,55052650,55052791,PCSK9,55052645,55052796,chr1:55052645-55052796,[ukb23157_c1_b36_v1.vcf.gz]
...,...,...,...,...,...,...,...,...
500311,chr22,28695710,28695873,CHEK2,28695705,28695878,chr22:28695705-28695878,[ukb23157_c22_b6_v1.vcf.gz]
500313,chr22,28695127,28695242,CHEK2,28695122,28695247,chr22:28695122-28695247,[ukb23157_c22_b6_v1.vcf.gz]
500315,chr22,28694032,28694117,CHEK2,28694027,28694122,chr22:28694027-28694122,[ukb23157_c22_b6_v1.vcf.gz]
500317,chr22,28689135,28689215,CHEK2,28689130,28689220,chr22:28689130-28689220,[ukb23157_c22_b6_v1.vcf.gz]


### Run Swiss-army-knife on DNAnexus
- get region info for each gene from block file
- bcftools command for step 2 finished
- bcftools command for step 3 & 4 TBD
- list genes not included in MANE set

In [6]:
genes_not_found = []
genes_found = []
known_large_genes = ["DSP", "TSC2", "TTN", "NCOA3", "TET2"]
AF_threshold = 0.01 # typically 0.001 (0.1%) for exome analysis

for gene in genes:
    df_gene = df_exon_regions.loc[df_exon_regions['gene_name'] == gene]

    if df_gene.shape[0] > 0:
        genes_found.append(gene)
        
        vcf_files = set(x for sublist in df_gene['filenames'] for x in sublist)
        vcf_str = ' '.join(vcf_files)
        region_str = ','.join(df_gene['region'].to_list())
        
        if ((len(vcf_files) > 1) or (gene in known_large_genes)):
            mem_level = "mem2_ssd1_v2_x16" # dynamically change memory level when submitting jobs on DNAnexus
        else:
            mem_level = "mem1_ssd1_v2_x4"
        
        vcf_outfile = gene + ".exon_variants.vcf.gz"

        # filtering variants
        bcftools_cmd1 = f"bcftools concat -Ou -a -r {region_str} {vcf_str}"
        bcftools_cmd2 = f"bcftools view -Ou --max-alleles 5 -T ^{diff_bed}"
        bcftools_cmd3 = "bcftools +fill-tags -Ou -- -t all"
        bcftools_cmd4 = f"bcftools norm -Ou -m - -f {ref_genome}" 
        bcftools_cmd5 = f"bcftools view -Oz -i 'AF<={AF_threshold} && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > {vcf_outfile}"
        
        # index vcf files
        bcftools_cmd6 = f"bcftools index -t {vcf_outfile}"

        # extract heterozygous carriers
        bcftools_cmd7 = "mkdir -p carriers"
        bcftools_cmd8 = f"bcftools query -i 'GT!=\\\"0/0\\\"' -f '%CHROM  %POS %REF %ALT %INFO/AF [%SAMPLE|]\n' {vcf_outfile} > carriers/{gene}.all_carriers.ssv"
        bcftools_cmd9 = f"bcftools query -i 'GT=\\\"0/1\\\"|GT=\\\"1/0\\\"' -f '%CHROM  %POS %REF %ALT %INFO/AF [%SAMPLE|]\n' {vcf_outfile} > carriers/{gene}.hetero_carriers.ssv"
        
        # parsing command
        bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4, bcftools_cmd5])
        bcftools_command = " && ".join([bcftools_command, bcftools_cmd6, bcftools_cmd7, bcftools_cmd8, bcftools_cmd9])
        
        # parsing input
        dx_input_str = ' '.join([f'-iin="{dx_ex_vcf_path}/{x}" -iin="{dx_ex_vcf_path}/{x}.tbi"' for x in vcf_files])
        dx_input_str = dx_input_str + f' -iin="{dx_resource_path}/{diff_bed}"'
        dx_input_str = dx_input_str + f' -iin="{dx_resource_path}/{ref_genome}"' + f' -iin="{dx_resource_path}/{ref_genome}.fai"'
        
        # final dx command
        dx_command = f'dx run swiss-army-knife --instance-type {mem_level} -y --brief {dx_input_str} -icmd="{bcftools_command}" --destination {dx_vcf_out_path} --tag "{job_tag}" --property gene={gene}'
        subprocess.run(dx_command, shell=True, check=True)
    else:
        genes_not_found.append(gene)
            
print('Genes not found in MANE database:')
print(genes_not_found)

job-J0VfJKjJgGY3qv0zJ2b7Q3PP
job-J0VfJP0JgGYBQffkFq7KZx7Z
job-J0VfJP8JgGY4Q6yYJx4z9yY0
job-J0VfJP8JgGY4Jv6z35P998b8
job-J0VfJPQJgGYBK76BXVYbbP48
job-J0VfJPjJgGY01pbX39J7p6vz
job-J0VfJQ0JgGY0qbJXXQ5pzF3Q
job-J0VfJQ8JgGY9V6Zj6BP3ZG20
job-J0VfJQQJgGY4q3zJy6jZ6bZP
job-J0VfJQQJgGYJb14v2PyPZ7XK
job-J0VfJQjJgGYPQgyJz5BYYPz4
job-J0VfJV0JgGYG1fqJp7vg83J6
job-J0VfJV8JgGY5V5Z1Z3g12kBb
job-J0VfJVQJgGYJYJy7zJPQqX06
job-J0VfJVjJgGY91k2zjbZBkVVP
job-J0VfJX0JgGY3fk18zjYkkJB4
job-J0VfJX8JgGYKQ5p4Z0625Zx9
job-J0VfJXQJgGY5GKyYpKz91X0j
job-J0VfJXjJgGYGPGgJV17055xq
job-J0VfJY8JgGY16fJ0BzvB1Yzk
job-J0VfJYQJgGY4Jv6z35P998bF
job-J0VfJYQJgGY9V6Zj6BP3ZG2F
job-J0VfJYjJgGYKQ5p4Z0625ZxF
job-J0VfJZ0JgGY3BXZzGpkzb1V5
job-J0VfJZ8JgGY1v106Vfyjg48F
job-J0VfJZQJgGY0jzqXgQZFp7qp
job-J0VfJZjJgGY5V5Z1Z3g12kBj
job-J0VfJb0JgGYJb14v2PyPZ7XX
job-J0VfJb8JgGY8y67P8G32jpKK
job-J0VfJbQJgGY0qbJXXQ5pzF3g
job-J0VfJbjJgGY5V5Z1Z3g12kBv
job-J0VfJf0JgGY3qv0zJ2b7Q3Q2
job-J0VfJf8JgGYF9y23V5VZq3jq
job-J0VfJfQJgGY5GKyYpKz91X0p
job-J0VfJfjJgG

### Below is for testing purposes

In [23]:
gene = "TET2"

df_gene = df_exon_regions.loc[df_exon_regions['gene_name'] == gene]

if df_gene.shape[0] > 0:
    genes_found.append(gene)
    
    vcf_files = set(x for sublist in df_gene['filenames'] for x in sublist)
    vcf_str = ' '.join(vcf_files)
    region_str = ','.join(df_gene['region'].to_list())
    
    if ((len(vcf_files) > 1) or (gene in known_large_genes)):
        mem_level = "mem2_ssd1_v2_x16" # dynamically change memory level when submitting jobs on DNAnexus
    else:
        mem_level = "mem1_ssd1_v2_x4"
    
    vcf_outfile = gene + ".exon_variants.vcf.gz"

    # filtering variants
    bcftools_cmd1 = f"bcftools concat -Ou -a -r {region_str} {vcf_str}"
    bcftools_cmd2 = f"bcftools view -Ou --max-alleles 5 -T ^{diff_bed}"
    bcftools_cmd3 = "bcftools +fill-tags -Ou -- -t all"
    bcftools_cmd4 = f"bcftools norm -Ou -m - -f {ref_genome}" 
    bcftools_cmd5 = f"bcftools view -Oz -i 'AF<=0.001 && MAC >=1 && F_MISSING<0.1 && F_PASS(DP>=10 & GT!=\\\"mis\\\")> 0.9' > {vcf_outfile}"
    
    # index vcf files
    bcftools_cmd6 = f"bcftools index -t {vcf_outfile}"

    # extract heterozygous carriers
    bcftools_cmd7 = "mkdir -p carriers"
    bcftools_cmd8 = f"bcftools query -i 'GT!=\\\"0/0\\\"' -f '%CHROM  %POS %REF %ALT %INFO/AF [%SAMPLE|]\n' {vcf_outfile} > carriers/{gene}.all_carriers.ssv"
    bcftools_cmd9 = f"bcftools query -i 'GT=\\\"0/1\\\"|GT=\\\"1/0\\\"' -f '%CHROM  %POS %REF %ALT %INFO/AF [%SAMPLE|]\n' {vcf_outfile} > carriers/{gene}.hetero_carriers.ssv"
    
    # parsing command
    bcftools_command = " | ".join([bcftools_cmd1, bcftools_cmd2, bcftools_cmd3, bcftools_cmd4, bcftools_cmd5])
    bcftools_command = " && ".join([bcftools_command, bcftools_cmd6, bcftools_cmd7, bcftools_cmd8, bcftools_cmd9])
    
    # parsing input
    dx_input_str = ' '.join([f'-iin="{dx_ex_vcf_path}/{x}" -iin="{dx_ex_vcf_path}/{x}.tbi"' for x in vcf_files])
    dx_input_str = dx_input_str + f' -iin="{dx_resource_path}/{diff_bed}"'
    dx_input_str = dx_input_str + f' -iin="{dx_resource_path}/{ref_genome}"' + f' -iin="{dx_resource_path}/{ref_genome}.fai"'
    
    # final dx command
    dx_command = f'dx run swiss-army-knife --instance-type {mem_level} -y --brief {dx_input_str} -icmd="{bcftools_command}" --destination {dx_vcf_out_path} --tag "{job_tag}" --property gene={gene}'
    subprocess.run(dx_command, shell=True, check=True)

job-J08PjG0JgGY0y7PJQF78vp14


In [16]:
dx_ex_vcf_path =  f"{proj_id}:/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release/"
dx_input_str = f'-iin="{dx_ex_vcf_path}/ukb23157_c2_b39_v1.vcf.gz"'
bcftools_command = f'bcftools view -H ukb23157_c2_b39_v1.vcf.gz | head -n 10 > ex_vcf_sample2.txt'

dx_command = f'dx run swiss-army-knife --instance-type {mem_level} -y --brief {dx_input_str} -icmd="{bcftools_command}" --destination {dx_vcf_out_path} --tag "{job_tag}" --property gene={gene}'
subprocess.run(dx_command, shell=True, check=True)

job-J08KvpjJgGY5x319ZKp02kX7


CompletedProcess(args='dx run swiss-army-knife --instance-type mem1_ssd1_v2_x4 -y --brief -iin="project-J07x038JgGYK8bKFq6Z117kk:/Bulk/Exome sequences/Population level exome OQFE variants, pVCF format - final release//ukb23157_c2_b39_v1.vcf.gz" -icmd="bcftools view -H ukb23157_c2_b39_v1.vcf.gz | head -n 10 > ex_vcf_sample2.txt" --destination project-J07x038JgGYK8bKFq6Z117kk:/Tian_folder/Albumin_genes_exon_variants_042425/ --tag "Albumin_genes_exon_variants_042425" --property gene=ABCA6', returncode=0)