### Filter and split vcf files
This notebook filters VCF files and splits them by sample. It is a starting point that is meant to be extended to subtracting SNPs and overlapping them with other features.  
Currently used tools:  
- vcf-subset
- vcffilter
- bcftools
- bedtools
- pandas

In [34]:
import datetime
import gzip
import os
import shlex
import subprocess

In [90]:
def comand_print(comand):
    """Announce the command that is about to be executed.

    Prints a fixed notice line followed by the command itself on the next line.
    """
    print(
        "Currently working on the following command. Please be patient.",
        comand,
        sep='\n',
    )

In [89]:
def get_vcf_sample_names(vcf_fn):
    """Return the sample names listed on the #CHROM header line of a VCF file.

    Input:
        vcf_fn is the path of the vcf file. A name ending in '.gz' is read through
        gzip, anything else is read as plain text.
    Output:
        A list with every tab-separated field of the #CHROM line from the tenth
        column onwards (empty if the line has no such columns).
    Raises:
        ValueError if the file contains no line starting with '#CHROM'.
    """
    header = None
    if vcf_fn.endswith('.gz'):
        with gzip.open(vcf_fn) as fn:
            for raw_line in fn:
                # Compare on bytes and decode only the one line that is needed.
                if raw_line.startswith(b'#CHROM'):
                    header = raw_line.decode("utf-8")
                    break
    else:
        with open(vcf_fn) as fn:
            for line in fn:
                if line.startswith('#CHROM'):
                    header = line
                    break
    if header is None:
        # Without this check a header-less file failed on an unbound local variable.
        raise ValueError("No #CHROM header line found in %s" % vcf_fn)
    # The first nine columns are the fixed VCF fields; the rest are sample names.
    return header.rstrip().split('\t')[9:]

In [None]:
def vcf_subset(in_fn, sample, OUTFOLDER):
    """Subset the input vcf file for one specific sample using the following command.
    vcf-subset --exclude-ref -c.
    Input:
        in_fn is the input vcf.
        sample is the sample to be subset on.
        OUTFOLDER is the folder where the new file is saved.
    Output:
        The function prints and runs the command and returns the name of the output
        file, which is the input file name with '.<sample>' inserted before '.vcf'.
    Raises:
        subprocess.CalledProcessError if the command exits with a non-zero status.
    """
    name = os.path.basename(in_fn)
    # The shell redirect below stores the tool's stdout as-is and nothing here
    # compresses it, so the output name must not carry a '.gz' suffix.
    if name.endswith('.gz'):
        name = name[:-len('.gz')]
    # Only touch the trailing extension; str.replace would also rewrite any
    # earlier '.vcf' inside the file name.
    if name.endswith('.vcf'):
        name = name[:-len('.vcf')]
    out = '%s.%s.vcf' % (name, sample)
    out_fn = os.path.join(OUTFOLDER, out)
    # Quote every interpolated value: the command runs through the shell, so
    # spaces or shell metacharacters would otherwise split or alter it.
    comand = 'vcf-subset --exclude-ref -c %s %s > %s' % (
        shlex.quote(sample), shlex.quote(in_fn), shlex.quote(out_fn))
    comand_print(comand)
    subprocess.check_output(comand, stderr=subprocess.STDOUT, shell=True)
    return out_fn

In [None]:
def vcffilter(in_fn, OUTFOLDER, filters={"DP": ">10", "QUAL": ">20"}):
    """Filter the input vcf file for a set of filter criteria using the following command.
    vcffilter -f .
    Input:
        in_fn is the input vcf.
        OUTFOLDER is the folder where the new file is saved.
        filters is a dictionary where the key is the filter name and the value is the
        comparison, i.e. an operator (>, = or <) followed by a number. In the output
        file name the operators are spelled g, e and s, respectively, and whitespace
        is dropped. The default dictionary is only read, never modified.
    Output:
        The function prints and runs the command and returns the name of the output
        file, which is the input file name with '.<filter tag>' inserted before '.vcf'.
    Raises:
        subprocess.CalledProcessError if the command exits with a non-zero status.
    """
    name_tags = []
    filter_args = []
    for key, value in filters.items():
        # Replace every operator character, not just the first kind found, so a
        # compound operator does not leave a raw '=' or '<' in the file name.
        tag = value.replace('>', 'g').replace('<', 's').replace('=', 'e')
        # Drop all whitespace so a value such as '> 10' does not put a space into the name.
        name_tags.append('%s%s' % (key, ''.join(tag.split())))
        # Each filter expression becomes one shell-quoted argument.
        filter_args.append('-f %s' % shlex.quote('%s %s' % (key, value)))
    appendix = ''.join(name_tags)

    name = os.path.basename(in_fn)
    # The shell redirect below stores the tool's stdout as-is and nothing here
    # compresses it, so the output name must not carry a '.gz' suffix.
    if name.endswith('.gz'):
        name = name[:-len('.gz')]
    # Only touch the trailing extension; str.replace would also rewrite any
    # earlier '.vcf' inside the file name.
    if name.endswith('.vcf'):
        name = name[:-len('.vcf')]
    out = '%s.%s.vcf' % (name, appendix)
    out_fn = os.path.join(OUTFOLDER, out)

    # Paths are quoted because the command is executed through the shell.
    comand = 'vcffilter %s %s > %s' % (
        ' '.join(filter_args), shlex.quote(in_fn), shlex.quote(out_fn))
    print(comand)
    subprocess.check_output(comand, stderr=subprocess.STDOUT, shell=True)
    return out_fn

In [47]:
### The program starts here ###
### Provide your filter values in the dictionary and your input folder ###
# `filters` maps a field name to a comparison string (operator followed by a number);
# it is used below to build the name of the output folder.
# `INFOLDER` is the directory that is scanned for '.vcf' / '.vcf.gz' input files.
# NOTE(review): INFOLDER is an absolute, machine-specific path - adjust before running elsewhere.
filters={"DP": ">10", "QUAL": ">20"}
INFOLDER = "/home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays"

In [86]:
# Build the name of the output folder for the filtered vcf files from the filter
# settings, e.g. {"DP": ">10", "QUAL": ">20"} gives "DPg10QUALg20" inside INFOLDER.
appendix = ''
for key, value in filters.items():
    # Spell the comparison operators as letters (> g, < s, = e). Every operator
    # character is replaced, so a compound operator leaves no raw '=' or '<' behind.
    tag = value.replace('>', 'g').replace('<', 's').replace('=', 'e')
    # Drop all whitespace so a value such as '> 10' does not put a space into the folder name.
    appendix = '%s%s%s' % (appendix, key, ''.join(tag.split()))
filtered_out_folder = os.path.join(INFOLDER, appendix)
# os.mkdir (not makedirs) on purpose: a missing INFOLDER should fail loudly here.
if not os.path.exists(filtered_out_folder):
    os.mkdir(filtered_out_folder)

In [88]:
# Collect every vcf file (plain or gzipped) in INFOLDER and filter each one
# according to the filter settings. Sorting makes the processing order reproducible.
vcfs = [os.path.join(INFOLDER, vcf) for vcf in sorted(os.listdir(INFOLDER))
        if vcf.endswith(('.vcf', '.vcf.gz'))]
filtered_vcfs = []
for vcf in vcfs:
    # Pass `filters` explicitly: the output folder is named after `filters`, so
    # relying on vcffilter's default would silently ignore edited settings.
    filtered_vcf = vcffilter(vcf, filtered_out_folder, filters=filters)
    filtered_vcfs.append(filtered_vcf)

vcffilter -f "QUAL >20" -f "DP >10"  /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayes.vcf > /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayes.QUALg20DPg10.vcf
vcffilter -f "QUAL >20" -f "DP >10"  /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/Pst_104E_v13_p_ctg.samples_14.03_14_19.freebayes.vcf > /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/Pst_104E_v13_p_ctg.samples_14.03_14_19.freebayes.QUALg20DPg10.vcf


In [91]:
# Print one sample name per line.
# NOTE(review): no earlier visible cell assigns `samples`; it is only assigned in
# later cells via get_vcf_sample_names, so this cell appears to rely on leftover
# kernel state - confirm and move it below the cell that defines `samples`.
for sample in samples:
    print(sample)

S008326
Pst79_folder5
S008327
S008329
S008321
S008323
S008320
S008328
DK0911_gDNA
SRR172674
S00832A
S00832B
S008322
S008325
S008324


In [None]:
# Split every filtered vcf file into one vcf per sample it contains.
# The per-sample files go into a 'split_vcf' folder next to the filtered vcf.
split_and_filtered_vcfs = []
for filtered_vcf in filtered_vcfs:
    samples = get_vcf_sample_names(filtered_vcf)
    split_out_folder = os.path.join(os.path.dirname(filtered_vcf), 'split_vcf')
    # exist_ok replaces the separate exists() check, so re-running the cell is safe.
    os.makedirs(split_out_folder, exist_ok=True)
    for sample in samples:
        print(sample)
        split_and_filtered_vcf = vcf_subset(filtered_vcf, sample, split_out_folder)
        split_and_filtered_vcfs.append(split_and_filtered_vcf)

In [93]:
# Split the filtered vcf files for a hand-picked pair of samples only
# (one vcf per filtered file and selected sample), written to a 'split_vcf'
# folder next to each filtered vcf.
split_and_filtered_vcfs = []
for filtered_vcf in filtered_vcfs:
    samples = get_vcf_sample_names(filtered_vcf)
    split_out_folder = os.path.join(os.path.dirname(filtered_vcf), 'split_vcf')
    # exist_ok replaces the separate exists() check, so re-running the cell is safe.
    os.makedirs(split_out_folder, exist_ok=True)
    for sample in ['Pst79_folder5', 'DK0911_gDNA']:
        # The filtered files do not necessarily share the same sample columns;
        # requesting an absent sample is expected to make vcf-subset fail and
        # abort the whole loop, so report it and move on instead.
        if sample not in samples:
            print('Skipping %s: not a sample column of %s' % (sample, filtered_vcf))
            continue
        print(sample)
        split_and_filtered_vcf = vcf_subset(filtered_vcf, sample, split_out_folder)
        split_and_filtered_vcfs.append(split_and_filtered_vcf)

Pst79_folder5
vcf-subset --exclude-ref -c Pst79_folder5 /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayes.QUALg20DPg10.vcf > /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/split_vcf/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayes.QUALg20DPg10.Pst79_folder5.vcf
DK0911_gDNA
vcf-subset --exclude-ref -c DK0911_gDNA /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayes.QUALg20DPg10.vcf > /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/split_vcf/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayes.QUALg20DPg10.DK0911_gDNA.vcf
Pst79_folder5
vcf-subset --exclude-ref -c Pst79_folder5 /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/Pst_104E_v13_p_ctg.samples_14.03_14_19.freebayes.QUALg20DPg10.vcf > /home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freeba

In [94]:
# Ad-hoc check: read the sample names of one vcf, subset it for the single
# sample 'S008326' into a 'split_vcf' folder next to it, then list all sample names.
# NOTE(review): `vcf_out` is not assigned in any visible cell, so this cell depends
# on leftover kernel state; its recorded output is a FileNotFoundError. Confirm which
# file it should point to, or remove the cell.
samples = get_vcf_sample_names(vcf_out)
split_out_folder = os.path.join(os.path.dirname(vcf_out), 'split_vcf')
if not os.path.exists(split_out_folder):
    os.mkdir(split_out_folder)
test_vcf_split = vcf_subset(vcf_out, 'S008326', split_out_folder)
for sample in samples:
    print(sample)

FileNotFoundError: [Errno 2] No such file or directory: '/home/benjamin/genome_assembly/Warrior/DK0911_v04/p_SNPs/freebays/QUALg20DPg10/DK_0911_v04_p_ctg.samples_15.03_06_19.freebayesQUALg20DPg10.vcf'