-
Notifications
You must be signed in to change notification settings - Fork 1
/
excess-heterozygosity.py
executable file
·105 lines (56 loc) · 1.89 KB
/
excess-heterozygosity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import sys
chr = sys.argv[2]
region_start = int(sys.argv[3])
region_end = int(sys.argv[4].rstrip("\n"))
#use base 1 columns and python will put in base 0!!
to_extract = []
for i in sys.argv[5:]:
to_extract.append(int(i))
bin_start = region_start
bin_end = region_start + 99
#danger count tracks how many problematic het-rich sites are in a bin
danger_count = 0
danger="no"
with open(sys.argv[1]) as file:
for line in file:
het_count = 0
split_line = line.split("\t")
if line.startswith("#"):
continue
if line.startswith(chr):
#skip if line is smaller than bin start
if (split_line[1] < bin_start):
continue
#end the loop if we are past the region of interest
if (int(split_line[1]) > region_end):
#reset as we have to check at the end of the script
het_count = 0
break
#if the bin is finished, we need to update
while (int(split_line[1]) > bin_end & bin_end <= region_end):
print "updating...danger count: " + str(danger_count)
#we need to check how many danger sites were found
if (danger_count >= int(round((bin_end - bin_start + 1) / 20)) ):
print str(bin_start) + " " + str(bin_end)
danger="yes"
#reset danger counter for next bin
danger_count = 0
#move the bin forward
bin_start += 100
bin_end += 100
if (bin_end > region_end):
bin_end = region_end
#check if bin
if ((int(split_line[1]) >= bin_start) & (int(split_line[1]) <= bin_end)):
for ind in [split_line[i] for i in to_extract]:
if ind.startswith("0/1"):
het_count += 1
print "adding a het:" + str(het_count)
#check hets at the end of line
if (het_count >= (round(len(to_extract) / 3)) ):
danger_count += 1
if (danger_count >= int(round((bin_end - bin_start + 1) / 20)) ):
print str(bin_start) + " " + str(bin_end)
danger="yes"
if (danger=="yes"):
print "Too SNPpy region/s detected"