-
Notifications
You must be signed in to change notification settings - Fork 1
/
derived_sharing_haplo.py
executable file
·278 lines (166 loc) · 6.47 KB
/
derived_sharing_haplo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
from __future__ import division
import sys
import numpy as np
import random
def main():
#take samples to test
#going by order in sample file, outgroup in position one
#H1,H2,H3
SAMPLES = sys.argv[2].rstrip("\n").split(",")
#check if we are doing a 4/5 pop test
TEST=len(SAMPLES)+1
#print out the samples
H1 = int(SAMPLES[0])+2
H2 = int(SAMPLES[1])+2
H3 = int(SAMPLES[2])+2
if TEST == 5: H4 = int(SAMPLES[3])+2
#lists to store the ABBA BABA sites
ABBA = []
BABA = []
ABABA = []
BAABA = []
#bootstrapping
#Going with 5Mb regions
#assuming bootstrap regions are found at /home/kdaly/bootstrap.list
#create a list storing the bootstrap region information
BOOTSTRAPS = []
with open("/home/kdaly/bootstrap.list") as FILE:
for LINE in FILE:
LINE = LINE.rstrip("\n")
REGION = [0,0,0,0,0]
REGION[0] = int(LINE.split(":")[0])
REGION[1] = int(LINE.split(":")[1].split("-")[0])
REGION[2] = int(LINE.split(":")[1].split("-")[1])
BOOTSTRAPS.append(REGION)
ABBA.append(0)
BABA.append(0)
ABABA.append(0)
BAABA.append(0)
#go through each position in the haplo file
with open(sys.argv[1]) as FILE:
for LINE in FILE:
LINE = LINE.rstrip("\n")
SPLINE = LINE.split("\t")
#skip over header, printing the samples of interest
if LINE.startswith("chr"):
print "H1 is " + SPLINE[H1]
print "H2 is " + SPLINE[H2]
print "H3 is " + SPLINE[H3]
if TEST == 5: print "H4 is " + SPLINE[H4]
continue
#skip if missing ancestral information
#ancestral genome MUST be in position 1
if SPLINE[3] == "N":
continue
#get position and chromosome for later
CHR = int(SPLINE[0])
POS = int(SPLINE[1])
#keep bootstrap estimates
#create a counter to loop through each block
COUNTER = 0
#for each line, interate over the bs regions
for BOOTSTRAP in BOOTSTRAPS:
#if the position matches the current bs region
if CHR == BOOTSTRAP[0] and POS >= BOOTSTRAP[1] and POS <= BOOTSTRAP[2]:
#if we are doing a 4 pop test:
if TEST == 4:
#add to the current ABBA BABA count for the current bs region
BOOTSTRAPS[COUNTER][3],BOOTSTRAPS[COUNTER][4] = GET_ABBA_BABA(SPLINE,H1,H2,H3,BOOTSTRAPS[COUNTER][3],BOOTSTRAPS[COUNTER][4])
#also add to the ABBA BABA lists
ABBA[COUNTER] = BOOTSTRAPS[COUNTER][3]
BABA[COUNTER] = BOOTSTRAPS[COUNTER][4]
elif TEST == 5:
#add to ABABA BAABA counter
BOOTSTRAPS[COUNTER][3],BOOTSTRAPS[COUNTER][4] = GET_ABBA_BABA(SPLINE,H1,H2,H3,BOOTSTRAPS[COUNTER][3],BOOTSTRAPS[COUNTER][4],H4)
ABABA[COUNTER] = BOOTSTRAPS[COUNTER][3]
BAABA[COUNTER] = BOOTSTRAPS[COUNTER][4]
continue
#update the counter to keep track of the current bs region
COUNTER += 1
#new counter, for tracking bs regions as we filter out those with no data
COUNTER = 0
#we want to exclude bs regions with no information
TO_INCLUDE = []
for BLOCK in BOOTSTRAPS:
#if missing both ABBA or BABA information, skip
if BLOCK[3] != 0 or BLOCK[4] != 0:
TO_INCLUDE.append(COUNTER)
#update counter for new bs region
COUNTER += 1
#strip out ABBA / BABA information from bs regions we are keeping; also sum
ABBA_TO_INCLUDE = list(map(ABBA.__getitem__,TO_INCLUDE))
ABBA_TO_INCLUDE_SUM = sum(ABBA_TO_INCLUDE)
BABA_TO_INCLUDE = list(map(BABA.__getitem__,TO_INCLUDE))
BABA_TO_INCLUDE_SUM = sum(BABA_TO_INCLUDE)
BOOT_NUM = len(TO_INCLUDE)
#if we are doing a 5 pop test, do the same for ABABA BAABA
if TEST == 5:
ABABA_TO_INCLUDE = list(map(ABABA.__getitem__,TO_INCLUDE))
ABABA_TO_INCLUDE_SUM = sum(ABABA_TO_INCLUDE)
BAABA_TO_INCLUDE = list(map(BAABA.__getitem__,TO_INCLUDE))
BAABA_TO_INCLUDE_SUM = sum(BAABA_TO_INCLUDE)
#new counter for jk regions we are keeping
COUNTER = 0
#list for each bs estimate of D
BOOT_D = []
#if we are doing a four pop test
if TEST == 4:
#for each bs block
for BLOCK in ABBA_TO_INCLUDE:
ABBA_SUBSET = []
BABA_SUBSET = []
#generate a new set of regions by sampling with replacement
#1000 bootstrap replicates
for I in range(0,1000):
RANDOM_INDEX = random.randrange(0,BOOT_NUM)
ABBA_SUBSET.append(ABBA_TO_INCLUDE[RANDOM_INDEX])
BABA_SUBSET.append(BABA_TO_INCLUDE[RANDOM_INDEX])
#add the bs D estimate to the bs D list
BOOT_D.append(D(sum(ABBA_SUBSET), sum(BABA_SUBSET) ))
#update counter for each bs region
COUNTER += 1
#get the total D estimate
D_EST = D(ABBA_TO_INCLUDE_SUM, BABA_TO_INCLUDE_SUM)
#calculate the sd of the jk D estimates
BOOT_ERR = np.std(BOOT_D)
#calculate Z score of the D estimate
Z = D_EST / BOOT_ERR
#print out the total ABBA, BABA sites, D estimates, jk error, and the Z score
print " ".join(str(e) for e in [ABBA_TO_INCLUDE_SUM, BABA_TO_INCLUDE_SUM, D_EST, BOOT_ERR, Z])
if TEST == 5:
for BLOCK in ABBA_TO_INCLUDE:
ABABA_SUBSET = []
BAABA_SUBSET = []
#1000 bootstrap replicates
for I in range(0,1000):
RANDOM_INDEX = random.randrange(0,BOOT_NUM)
ABABA_SUBSET.append(ABABA_TO_INCLUDE[RANDOM_INDEX])
BAABA_SUBSET.append(BAABA_TO_INCLUDE[RANDOM_INDEX])
BOOT_D.append(D(sum(ABABA_SUBSET), sum(BAABA_SUBSET) ))
COUNTER += 1
D_EST = D(ABABA_TO_INCLUDE_SUM, BAABA_TO_INCLUDE_SUM)
BOOT_ERR = np.std(BOOT_D)
Z = D_EST / BOOT_ERR
print " ".join(str(e) for e in [ABABA_TO_INCLUDE_SUM, BAABA_TO_INCLUDE_SUM, D_EST, BOOT_ERR, Z])
def GET_ABBA_BABA(SPLINE_IN,H1_IN,H2_IN,H3_IN,NABBA_IN, NBABA_IN,H4_IN="none"):
" Function for taking in a .haplo row and determining if ABBA/BABA count should be added to"
ANCESTRAL_IN = SPLINE_IN[3]
DERIVED_IN = "".join(SPLINE_IN[4:]).replace(ANCESTRAL_IN,"").replace("N","")[0]
if H4_IN == "none":
#add to the current ABBA/BABA count when appropriate
if SPLINE_IN[H1_IN] == ANCESTRAL_IN and SPLINE_IN[H2_IN] == DERIVED_IN and SPLINE_IN[H2_IN] == SPLINE_IN[H3_IN]:
NABBA_IN += 1
elif SPLINE_IN[H2_IN] == ANCESTRAL_IN and SPLINE_IN[H1_IN] == DERIVED_IN and SPLINE_IN[H1_IN] == SPLINE_IN[H3_IN]:
NBABA_IN += 1
else:
if SPLINE_IN[H1_IN] == ANCESTRAL_IN and SPLINE_IN[H2_IN] == DERIVED_IN and SPLINE_IN[H1_IN] == SPLINE_IN[H3_IN] and SPLINE_IN[H2_IN] == SPLINE_IN[H4_IN]:
#NABBA is a stand in for ABABA
NABBA_IN += 1
if SPLINE_IN[H2_IN] == ANCESTRAL_IN and SPLINE_IN[H1_IN] == DERIVED_IN and SPLINE_IN[H2_IN] == SPLINE_IN[H3_IN] and SPLINE_IN[H1_IN] == SPLINE_IN[H4_IN]:
#again, NBABA is a stand in for BAABA
NBABA_IN += 1
return(NABBA_IN, NBABA_IN)
def D(ABBA, BABA):
" Function for calculating D statistic "
return( (ABBA - BABA) / (ABBA + BABA))
main()