-
Notifications
You must be signed in to change notification settings - Fork 46
/
core.py
384 lines (343 loc) · 12.9 KB
/
core.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# Copyright(c) 2014, The scLVM developers (Florian Buettner, Paolo Francesco Casale, Oliver Stegle)
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import sys
sys.path.append('./..')
import limix
def versiontuple(v):
    """Convert a dotted version string (e.g. "0.7.3") into a tuple of ints.

    Tuples of ints compare element-wise, which makes version ordering
    (e.g. versiontuple(a) > versiontuple(b)) behave as expected.
    """
    return tuple(int(part) for part in v.split("."))
try:
limix.__version__
import limix.modules.panama as PANAMA
import limix.modules.varianceDecomposition as VAR
import limix.modules.qtl as QTL
if versiontuple(limix.__version__)>versiontuple('0.7.3'):
import limix.deprecated as limix
except:
import limix.deprecated as limix
import limix.deprecated.modules.panama as PANAMA
import limix.deprecated.modules.varianceDecomposition as VAR
import limix.deprecated.modules.qtl as QTL
#print 'Limix version', limix.__version__
import scipy as SP
import scipy.linalg
import scipy.stats
import pdb
import h5py
import time
import copy
import warnings
import os
from utils.misc import dumpDictHdf5
from utils.misc import PCA
from utils.misc import warning_on_one_line
from gp_clvm import gpCLVM
class scLVM:
    """
    Single Cell Latent Variable Model module (scLVM).

    This class takes care of fitting and interpreting latent variable models
    to account for confounders (e.g. cell cycle) in single-cell RNA-Seq data.
    This module requires LIMIX.
    """

    def __init__(self, Y, geneID=None, tech_noise=None):
        """
        Args:
            Y:          gene expression matrix [N, G]
            geneID:     G vector of geneIDs
            tech_noise: G vector of technical-noise estimates (one per gene)
        """
        # store dimensions
        self.N = Y.shape[0]
        self.G = Y.shape[1]
        # set data
        self.Y = Y
        self.geneID = geneID
        self.tech_noise = None
        self.var = None
        # 'is not None' (not '!= None'): tech_noise may be a numpy array, for
        # which '!= None' is an element-wise comparison and the 'if' would
        # raise an ambiguity error on modern numpy
        if tech_noise is not None:
            self.set_tech_noise(tech_noise)

    def fitFactor(self, idx=None, X0=None, k=1, standardize=False, use_ard=False, interaction=True, initMethod='fast'):
        """
        Fit a GPLVM to the genes in idx and return the inferred confounder(s).

        Args:
            idx: index of the genes involved
                 (e.g., for capturing cell cycle, index of cell cycle genes)
            X0: known factor(s) on which to condition when fitting the GPLVM
            k: number of latent factors
            standardize: if True, rescale gene expression by std prior to fitting GPLVM
                         (data are always mean-centered)
            use_ard: use automatic relevance determination (switch off unimportant factors)
            interaction: if True (and X0 is given), also model an interaction covariance
            initMethod: initialization method forwarded to gpCLVM.initParams
        Returns:
            X: hidden variable(s)
            Kconf: similarity matrix based on the confounding effect (XX.T)
            Kint: interaction covariance (None unless X0 is given and interaction=True)
            varGPLVM: variance contributions of latent factors and residual biological noise
        """
        assert idx is not None, 'scLVM:: specify idx'
        if use_ard and k < 2:
            warnings.formatwarning = warning_on_one_line
            warnings.warn('when using ARD consider choosing k>1')
        if X0 is not None:
            assert use_ard == False, 'scLVM:: when fitting conditional GPLVM, use_ard has to be False'
        # mean-center the selected genes
        Yconf = self.Y[:, idx]
        Yconf -= Yconf.mean(0)
        Kint = None
        if X0 is None:
            # no known factor: fit an unconditional GPLVM with PANAMA
            panama = PANAMA.PANAMA(Y=Yconf, use_Kpop=False, standardize=standardize)
            panama.train(rank=k, LinearARD=use_ard)
            X = panama.get_Xpanama()
            Kconf = panama.get_Kpanama()
            var = panama.get_varianceComps()
            if use_ard == False:
                varGPLVM = {'K': var['Kpanama'], 'noise': var['noise']}
            else:
                varGPLVM = {'X_ARD': var['LinearARD'], 'noise': var['noise']}
        else:
            # condition on the known factor(s) X0 with gpCLVM
            gp = gpCLVM(Y=Yconf, X0=X0, k=k, standardize=standardize, interaction=interaction)
            params0 = gp.initParams(method=initMethod)
            conv = gp.optimize(params0)
            X = gp.getX()
            Kconf = gp.getK()
            if interaction:
                Kint = gp.getKi()
            varGPLVM = gp.getVarianceComps()
        return X, Kconf, Kint, varGPLVM

    def set_tech_noise(self, tech_noise):
        """
        Args:
            tech_noise: G vector of technical noise
        """
        assert tech_noise.shape[0] == self.G, 'scLVM:: tech_noise dimension mismatch'
        self.tech_noise = tech_noise

    def varianceDecomposition(self, K=None, tech_noise=None, idx=None, i0=None, i1=None, max_iter=10, verbose=False, cache=True):
        """
        Gene-by-gene variance decomposition into the random effects in K,
        biological noise and (fixed) technical noise.

        Args:
            K: list of random effects to be considered in the analysis
            tech_noise: G vector of technical noise (overrides any stored value)
            idx: indices of the genes to be considered in the analysis
            i0: gene index from which the analysis starts
            i1: gene index at which the analysis stops
            max_iter: maximum number of random restarts
            verbose: if True, print progress
            cache: if True, store results on the object; otherwise return (var, var_info)
        """
        if tech_noise is not None:
            self.set_tech_noise(tech_noise)
        # identity checks, not '!=': tech_noise/K may be numpy arrays
        assert self.tech_noise is not None, 'scLVM:: specify technical noise'
        assert K is not None, 'scLVM:: specify K'
        if type(K) != list:
            K = [K]
        for k in K:
            assert k.shape[0] == self.N, 'scLVM:: K dimension mismatch'
            assert k.shape[1] == self.N, 'scLVM:: K dimension mismatch'
        if idx is None:
            if i0 is None or i1 is None:
                i0 = 0
                i1 = self.G
            idx = SP.arange(i0, i1)
        elif type(idx) != SP.ndarray:
            idx = SP.array(idx)
        # only makes sense if gene is expressed in at least one cell
        idx = SP.intersect1d(SP.array(idx), SP.where(self.Y.std(0) > 0)[0])
        _G = len(idx)
        var = SP.zeros((_G, len(K) + 2))
        geneID = SP.zeros(_G, dtype=str)
        conv = SP.zeros(_G) == 1  # all-False boolean vector
        Ystar = [SP.zeros((self.N, _G)) for i in range(len(K))]
        count = 0
        # standardize the considered genes
        Yidx = self.Y[:, idx]
        Ystd = Yidx - Yidx.mean(0)
        Ystd /= Yidx.std(0)  # delta optimization might be more efficient
        # rescale technical noise to the standardized scale
        tech_noise = self.tech_noise[idx] / SP.array(Yidx.std(0))**2
        for ids in range(_G):
            if verbose:
                print('.. fitting gene %d' % ids)
            # extract a single gene
            y = Ystd[:, ids:ids + 1]
            # build and fit variance decomposition model
            vc = VAR.VarianceDecomposition(y)
            vc.addFixedEffect()
            for k in K:
                vc.addRandomEffect(k)
            vc.addRandomEffect(SP.eye(self.N))  # biological noise
            vc.addRandomEffect(SP.eye(self.N))  # technical noise
            # fix the technical-noise scale (param mask zero => not optimized)
            vc.vd.getTerm(len(K) + 1).getKcf().setParamMask(SP.zeros(1))
            for iter_i in range(max_iter):
                # random restart; technical-noise scale pinned to its estimate
                scales0 = y.std() * SP.randn(len(K) + 2)
                scales0[len(K) + 1] = SP.sqrt(tech_noise[ids])
                _conv = vc.optimize(scales0=scales0)
                if _conv:
                    break
            conv[count] = _conv
            if not _conv:
                # fall back: everything not technical is biological noise
                var[count, -2] = SP.maximum(0, y.var() - tech_noise[ids])
                var[count, -1] = tech_noise[ids]
                count += 1
                continue
            _var = vc.getVarianceComps()[0, :]
            KiY = vc.gp.agetKEffInvYCache().ravel()
            # BLUP-style predictions for each random effect
            for ki in range(len(K)):
                Ystar[ki][:, count] = _var[ki] * SP.dot(K[ki], KiY)
            var[count, :] = _var
            count += 1
        if self.geneID is not None:
            geneID = SP.array(self.geneID)[idx]
        col_header = ['hidden_%d' % i for i in range(len(K))]
        col_header.append('biol_noise')
        col_header.append('tech_noise')
        col_header = SP.array(col_header)
        # annotate columns and rows of var and Ystar
        var_info = {'gene_idx': idx, 'col_header': col_header, 'conv': conv}
        if geneID is not None:
            var_info['geneID'] = SP.array(geneID)
        Ystar_info = {'gene_idx': idx, 'conv': conv}
        if geneID is not None:
            Ystar_info['geneID'] = SP.array(geneID)
        # cache stuff
        if cache:
            self.var = var
            self.Ystar = Ystar
            self.var_info = var_info
            self.Ystar_info = Ystar_info
        else:
            return var, var_info

    def getVarianceComponents(self, normalize=False):
        """
        Args:
            normalize: if True, variance components are normalized to sum up to 1 (default False)
        Returns:
            var: variance component matrix [G_0, m]
                 (m = number of variance components = len(K)+2)
                 (G_0 = genes which were considered)
            var_info: dictionary annotating rows and columns of var
                gene_idx: index of the gene in the full matrix
                col_header: labels of variance components in the matrix
                conv: boolean vector marking genes for which variance decomposition has converged
                geneID: annotation of the rows of the variance component matrix
        """
        # 'is not None': self.var is an array once set, so '!= None' would be
        # an element-wise comparison and the assert would raise
        assert self.var is not None, 'scLVM:: use varianceDecomposition method before'
        if normalize:
            # [:, None] == [:, newaxis]: broadcast row sums for normalization
            var = self.var / self.var.sum(1)[:, None]
        else:
            var = self.var
        return var, self.var_info

    def getPredictions(self):
        """
        Returns:
            Ystar: predictions [N, G_0]
                   (G_0 = genes which were considered in the analysis)
                   Remark: if more than 1 random effect is considered, Ystar is
                   a list of prediction matrices, one per random effect
            Ystar_info: annotates rows and columns of Ystar
                gene_idx: index of the gene in the full matrix
                conv: boolean vector marking genes for which variance decomposition has converged
                geneID: annotation of the rows of the prediction matrix
        """
        assert self.var is not None, 'scLVM:: use varianceDecomposition method before'
        if len(self.Ystar) == 1:
            Ystar = self.Ystar[0]
        else:
            Ystar = self.Ystar
        return Ystar, self.Ystar_info

    def getCorrectedExpression(self, rand_eff_ids=None):
        """
        Args:
            rand_eff_ids: index (or list of indices) of the random effects to
                          consider for the correction; defaults to all of them
        Returns:
            Ycorr: corrected expression levels, on the original scale
        """
        assert self.var is not None, 'scLVM:: use varianceDecomposition method before'
        # check rand_eff_ids
        if rand_eff_ids is None:
            rand_eff_ids = range(len(self.Ystar))
        elif type(rand_eff_ids) != list:
            rand_eff_ids = [rand_eff_ids]
        # predictive means were calculated for standardized expression, so
        # standardize, subtract predictions, then rescale to the original scale
        idx = self.Ystar_info['gene_idx']
        Yidx = self.Y[:, idx]
        Ystd = Yidx - Yidx.mean(0)
        Ystd /= Yidx.std(0)  # delta optimization might be more efficient
        Ycorr = Ystd
        for i in rand_eff_ids:
            Ycorr -= self.Ystar[i]
        Ycorr *= self.Y[:, self.Ystar_info['gene_idx']].std(0)  # bring back to original scale
        Ycorr += self.Y[:, self.Ystar_info['gene_idx']].mean(0)
        return Ycorr

    def fitLMM(self, expr=None, K=None, tech_noise=None, idx=None, i0=None, i1=None, verbose=False, recalc=True, standardize=True):
        """
        Gene-by-gene LMM association tests between expression and 'expr'.

        Args:
            expr: correlations are calculated between the gene expression data
                  (self.Y) and these measures provided in expr. If None, self.Y is used.
            K: list of random effects to be considered in the analysis;
               if K is None, no random effect is considered
            tech_noise: G vector of technical noise
                        # NOTE(review): accepted but not used in this method —
                        # kept for interface compatibility; confirm intent
            idx: indices of the genes to be considered in the analysis
            i0: gene index from which the analysis starts
            i1: gene index at which the analysis stops
            verbose: if True, print progress
            recalc: if True, re-do variance decomposition
            standardize: if True, standardize also expression
        Returns:
            pv: matrix of pvalues
            beta: matrix of correlations
            info: dictionary annotating pv and beta rows and columns, containing
                gene_idx_row: index of the genes in rows
                conv: boolean vector marking genes for which variance decomposition has converged
                gene_row: annotation of the rows of the matrices
        """
        if idx is None:
            if i0 is None or i1 is None:
                i0 = 0
                i1 = self.G
            idx = SP.arange(i0, i1)
        elif type(idx) != SP.ndarray:
            idx = SP.array(idx)
        # only makes sense if gene is expressed in at least one cell
        idx = SP.intersect1d(idx, SP.where(self.Y.std(0) > 0)[0])
        if K is not None:
            if type(K) != list:
                K = [K]
            if (recalc and len(K) > 1) or (recalc and self.var is None):
                print('performing variance decomposition first...')
                var_raw, var_info = self.varianceDecomposition(K=K, idx=idx, cache=False)
                var = var_raw / var_raw.sum(1)[:, None]
            elif recalc == False and len(K) > 1:
                assert self.var is not None, 'scLVM:: when multiple hidden factors are considered, varianceDecomposition decomposition must be used prior to this method'
                warnings.warn('scLVM:: recalc should only be set to False by advanced users: scLVM then assumes that the random effects are the same as those for which the variance decomposition was performed earlier.')
                var_raw = self.var
                var_info = self.var_info
                var = var_raw / var_raw.sum(1)[:, None]
        lmm_params = {'covs': SP.ones([self.N, 1]), 'NumIntervalsDeltaAlt': 100, 'NumIntervalsDelta0': 100, 'searchDelta': True}
        # standardize the considered genes
        Yidx = self.Y[:, idx]
        Ystd = Yidx - Yidx.mean(0)
        Ystd /= Yidx.std(0)  # delta optimization might be more efficient
        if expr is None:
            expr = Ystd
        elif standardize:
            exprStd = expr - expr.mean(0)
            exprStd /= expr.std(0)
            expr = exprStd
        _G1 = idx.shape[0]
        _G2 = expr.shape[1]
        geneID = SP.zeros(_G1, dtype=str)
        beta = SP.zeros((_G1, _G2))
        pv = SP.zeros((_G1, _G2))
        count = 0
        for ids in range(_G1):
            if verbose:
                print('.. fitting gene %d' % ids)
            # assemble the (optional) random-effect covariance for this gene
            if K is not None:
                if len(K) > 1:
                    if var_info['conv'][count] == True:
                        # weight each random effect by its variance component
                        _K = SP.sum([var[count, i] * K[i] for i in range(len(K))], 0)
                        _K /= _K.diagonal().mean()
                    else:
                        # no converged decomposition for this gene: no random effect
                        _K = None
                else:
                    _K = K[0]
            else:
                _K = None
            lm = QTL.test_lmm(expr, Ystd[:, ids:ids + 1], K=_K, verbose=False, **lmm_params)
            pv[count, :] = lm.getPv()[0, :]
            beta[count, :] = lm.getBetaSNP()[0, :]
            count += 1
        if self.geneID is not None:
            geneID = SP.array(self.geneID)[idx]
        if recalc and K is not None and len(K) > 1:
            info = {'conv': var_info['conv'], 'gene_idx_row': idx}
        else:
            info = {'gene_idx_row': idx}
        if geneID is not None:
            info['gene_row'] = geneID
        return pv, beta, info