-
Notifications
You must be signed in to change notification settings - Fork 1
/
Poisson_HMM.py
198 lines (169 loc) · 6.87 KB
/
Poisson_HMM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
from __future__ import division
import csv
import numpy as np
from scipy.stats import rankdata
from hmm import PoissonHMM
def is_outlier(out, thresh=3.5):
"""
only consider positions with variants larger than 2, otherwise will be driven by positionof 0/1/2 mutations (most case)
Returns a boolean array with True if points are outliers and False
otherwise.
Parameters:
-----------
points : An numobservations by numdimensions array of observations
thresh : The modified z-score to use as a threshold. Observations with
a modified z-score (based on the median absolute deviation) greater
than this value will be classified as outliers.
Returns:
--------
mask : A numobservations-length boolean array.
References:
----------
Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
Handle Outliers", The ASQC Basic References in Quality Control:
Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
"""
# median-absolute-deviation (MAD) test
points = np.copy(out)
if len(points.shape) == 1:
points = points[:,None]
median = np.median(points[points > 2,None], axis=0)
diff = np.sum((points[points > 2,None] - median)**2, axis=-1)
diff = np.sqrt(diff)
med_abs_deviation = np.median(diff)
if med_abs_deviation == 0:
med_abs_deviation = np.median(diff[diff > 0])
points[points <= 2] = median
diff = np.sum((points - median)**2, axis=-1)
diff = np.sqrt(diff)
modified_z_score = 0.6745 * diff / med_abs_deviation
return modified_z_score > thresh
def movingaverage(interval, window_size):
'''
Function that average the count in a interval
'''
window= np.ones(int(window_size))/float(window_size)
return np.convolve(interval, window, 'same')
def data_process(out,n):
'''
Function that first remove the outlier , replace with neighbour and moving average of size n
'''
length = out.shape[0]
pos = np.array(range(int(length)))
outlier_idx = is_outlier(out)
outlier_pos = pos[outlier_idx]
for p in outlier_pos:
previous = p - 1
while previous in outlier_pos:
previous -= 1
out[p] = out[previous]
out = movingaverage(out,n)
out = out * n
out = out.astype(int) # for Poisson HMM
return outlier_idx, pos, out
def mean_start(counts, percentage, windws_size):
'''
Function that determine the initial value of mean of poission distribution
'''
mean1 = sum(counts) * percentage / len(counts) * windws_size
mean3 = max(counts) #* windws_size
mean2 = (mean3 + mean1 ) / 2
return mean1, mean2, mean3
def main(ENST, gene, counts, threshod):
cover_thres = 0.50
if gene in threshod:
if threshod[gene] > 0:
cover_thres = threshod[gene]
windws_size = 10
out = np.copy(counts)
outlier_idx, pos, out = data_process(np.copy(out),windws_size)
means1_prior,mean2_prior, mean3_prior = mean_start(counts[-outlier_idx], cover_thres, windws_size)
###############################################################################
# Run Gaussian HMM
print "fitting to HMM and decoding ..."
n_components = 3
X = np.column_stack([out])
# make an HMM instance and execute fit
transprior = np.array([[0.99,0.05,0.05],[0.05,0.99,0.05],[0.05,0.05,0.99]])
means_prior = np.array([means1_prior, mean2_prior, mean3_prior])
model = PoissonHMM(n_components, n_iter = 50 ,transmat_prior = transprior, means_prior = means_prior, thresh = 1e-5)
model.fit([X])
# predict the optimal sequence of internal hidden state
hidden_states = model.predict(X)
print "done"
###############################################################################
# print trained parameters and plot
print "Transition matrix"
print model.transmat_
print ""
print "means and vars of each hidden state"
for i in xrange(n_components):
print "%dth hidden state" % i
print "mean = ", model.means_[i]
print ""
# mutatuions in state 2 and 3 no more than estimated percentage
rank = map(int,rankdata(model.means_))
if 2 not in rank or 3 not in rank: return ['', '', '']
recur_pos = pos[outlier_idx]
state2_idx = (hidden_states == rank.index(2) ) & -outlier_idx
state2_pos = pos[state2_idx]
state3_idx = (hidden_states == rank.index(3) ) & -outlier_idx
state3_pos = pos[state3_idx]
if sum(counts[state2_pos]) + sum(counts[state3_pos]) +sum(counts[recur_pos]) <= sum(counts) * cover_thres:
return ['\t'.join(map(str,recur_pos)), '\t'.join(map(str,state2_pos)), '\t'.join(map(str,state3_pos))]
# mutatuions in state 2 and 3 are more than estimated percentage, use posterior prob to filter
logprob, posteriors = model.score_samples(X)
if 1 not in rank: return ['', '', '']
po1 = posteriors[:,[rank.index(1)]]
po1 = po1.ravel()
in_hotspot_count = 0
driver_counts = sum(counts) * cover_thres - sum(counts[outlier_idx])
idx = np.argsort(po1)
i = 0
hotspot_pos = []
while in_hotspot_count <= driver_counts:
if idx[i] not in pos[outlier_idx]:
hotspot_pos.append(idx[i])
in_hotspot_count += counts[idx[i]]
i += 1
hotspot = ['\t'.join(map(str,recur_pos)), '\t'.join(map(str,hotspot_pos)), '\t'.join(map(str,[]))]
return hotspot
with open( 'src/COSMIC_counts.txt') as f:
observed ={}
head = f.readline()
for line in f:
_, gene , MIS, _, silent, _ = line.split()
if gene not in observed:
if int(silent) > 0 and int(MIS) > 0:
observed[gene] = [int(MIS), int(silent)]
else:
observed[gene][0] += int(MIS)
observed[gene][1] += int(silent)
with open('src/D_rate.csv','rU') as f:
r = csv.reader(f)
head = r.next()
LOF_rate = {}
for line in r:
info = dict(zip(head, line))
if float(info['SYN']) > 0:
LOF_rate[info['GENE']] = float(info['MIS']) / float(info['SYN'])
threshod = {}
for gene, counts in observed.items():
if gene in LOF_rate:
rate = float(counts[0] )/ counts[1]
MIS_simu = np.random.poisson(counts[0], 10000)
silent_simu = np.random.poisson(counts[1], 10000)
rate = MIS_simu / silent_simu
rate = np.percentile(rate,95)
rate = (rate - LOF_rate[gene])/rate
threshod[gene] = rate
with open( 'src/MIS_counts.txt') as f:
fw = open('result/hotspot_pos.txt','w')
for line in f:
line = line.strip('\n').split()
ENST = line[0]
gene = line[1].split(',')[0]
counts = np.array(map(int,line[2:]))
hotspot = map(str,main(ENST, gene, counts, threshod))
fw.write(';'.join([ENST, gene] + hotspot)+ '\n')
fw.close()