-
Notifications
You must be signed in to change notification settings - Fork 1
/
verify_variants.py
115 lines (87 loc) · 3.35 KB
/
verify_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/python
"""
This script will verify that the expected variants in the truth file are in the corresponding VCF file. If all the
variants in the truth file are found in the VCF file, then this VCF file passes validation. Otherwise, the VCF file
fails validation.
"""
import argparse
import re
parser = argparse.ArgumentParser()
parser.add_argument("-t", "--truth", help="The truth file")
parser.add_argument("-v", "--vcf", help="The vcf file")
args = parser.parse_args()
test_file = args.truth
vcf_file = args.vcf
def get_test_variants():
"""
This function returns a tuple of a set of tuples of the test variants and the chromosome.
:return: Set of tuples of (chrom, pos, ref alt, filter) and the chromosome
:rtype: tuple of set and str
"""
test_variant_set = set()
chromosome = ''
with open(test_file, 'r') as test_obj:
for line in test_obj:
if not re.search('^#', line):
line_items = line.strip().split('\t')
chrom = line_items[0]
pos = line_items[1]
ref = line_items[2]
alt = line_items[3]
vcf_filter = line_items[4]
chromosome = chrom + '\t'
var = (chrom, pos, ref, alt, vcf_filter)
test_variant_set.add(var)
else:
continue
return test_variant_set, chromosome
def get_vcf_variants(chromosome):
"""
This function returns a set of variants with the specified chromosome from the VCF file.
:param chromosome: The variants' chromosome
:type chromosome: str
:return: Set of tuples of (chrom, pos, ref alt, filter)
:rtype: set
"""
vcf_variant_set = set()
with open(vcf_file, 'r') as vcf_obj:
for line in vcf_obj:
if re.search('^' + chromosome, line) and not re.search('^#', line):
line_items = line.strip().split('\t')
vcf_filter = line_items[6]
if vcf_filter == 'PASS':
chrom = line_items[0]
pos = line_items[1]
ref = line_items[3]
alt = line_items[4]
var = (chrom, pos, ref, alt, vcf_filter)
vcf_variant_set.add(var)
else:
continue
else:
continue
return vcf_variant_set
########################################################################################################################
#
# MAIN
#
########################################################################################################################
def main():
"""
This is the main function. It prints 'Pass' to the screen if the test variants are found in the VCF variants;
otherwise, it prints 'Fail'.
:rtype: void
"""
test_variant_set, chromosome = get_test_variants()
vcf_variant_set = get_vcf_variants(chromosome)
print 'VCF File:', vcf_file.split('/')[-1]
print 'Truth File:', test_file.split('/')[-1]
if test_variant_set.issubset(vcf_variant_set):
print 'Truth Set: ', test_variant_set
print 'Intersect:', test_variant_set.intersection(vcf_variant_set)
print 'Pass\n'
else:
print 'Truth Set: ', test_variant_set
print 'Intersect:', test_variant_set.intersection(vcf_variant_set)
print 'Fail\n'
main()