/
__init__.py
1499 lines (1317 loc) · 55.5 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# __init__.py
#
# Copyright 2018 Chichau Miau <zmiao@ebi.ac.uk>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
# MA 02110-1301, USA.
#
#
import pandas as pd
import numpy as np
from scipy.sparse import issparse
from pandas.api.types import is_categorical_dtype
from collections import defaultdict
import louvain
import scipy
import seaborn as sns
import patsy
from scanpy import settings as sett
from scanpy import logging as logg
import scanpy as sc
# for color
from scanpy.plotting.palettes import *
import matplotlib.pyplot as plt
from matplotlib import rcParams
# for reading/saving clf model
from sklearn.mixture import BayesianGaussianMixture
from sklearn import metrics
from sklearn.metrics.pairwise import euclidean_distances, pairwise_distances
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate, train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
# Extended categorical palette (124 hex colors) for plots with many clusters;
# the first 20 entries are a common "visually distinct" color set, the rest
# extend it for very large numbers of categories.
color_long = ['#e6194b','#3cb44b','#ffe119','#0082c8','#f58231','#911eb4',\
              '#46f0f0','#f032e6','#d2f53c','#fabebe','#008080','#e6beff',\
              '#aa6e28','#800000','#aaffc3','#808000','#ffd8b1','#000080',\
              '#808080','#000000', "#FFFF00", "#1CE6FF", "#FF34FF", "#FF4A46", \
              "#008941", "#006FA6", "#A30059","#FFDBE5", "#7A4900", "#0000A6", \
              "#63FFAC", "#B79762", "#004D43", "#8FB0FF", "#997D87","#5A0007", \
              "#809693", "#6A3A4C", "#1B4400", "#4FC601", "#3B5DFF","#4A3B53", \
              "#FF2F80","#61615A", "#BA0900", "#6B7900", "#00C2A0", "#FFAA92", "#FF90C9", "#B903AA", "#D16100",\
              "#DDEFFF", "#000035", "#7B4F4B", "#A1C299", "#300018", "#0AA6D8", "#013349", "#00846F",\
              "#372101", "#FFB500", "#C2FFED", "#A079BF", "#CC0744", "#C0B9B2", "#C2FF99", "#001E09",\
              "#00489C", "#6F0062", "#0CBD66", "#EEC3FF", "#456D75", "#B77B68", "#7A87A1", "#788D66",\
              "#885578", "#FAD09F", "#FF8A9A", "#D157A0", "#BEC459", "#456648", "#0086ED", "#886F4C",\
              "#34362D", "#B4A8BD", "#00A6AA", "#452C2C", "#636375", "#A3C8C9", "#FF913F", "#938A81",\
              "#575329", "#00FECF", "#B05B6F", "#8CD0FF", "#3B9700", "#04F757", "#C8A1A1", "#1E6E00",\
              "#7900D7", "#A77500", "#6367A9", "#A05837", "#6B002C", "#772600", "#D790FF", "#9B9700",\
              "#549E79", "#FFF69F", "#201625", "#72418F", "#BC23FF", "#99ADC0", "#3A2465", "#922329",\
              "#5B4534", "#FDE8DC", "#404E55", "#0089A3", "#CB7E98", "#A4E804", "#324E72"]
def run_BayesianGaussianMixture(Y, K):
    """
    Cluster cells with a Bayesian Gaussian mixture model
    (used as the K-means-like clustering step).

    Input
    -----
    Y: the expression matrix (cells x features)
    K: number of mixture components (clusters)

    return
    -----
    clusters assigned to each cell.
    """
    # scikit-learn >= 1.0 enforces keyword-only estimator parameters;
    # passing K positionally raises a TypeError there.
    gmm = BayesianGaussianMixture(n_components=K, max_iter=1000)
    gmm.fit(Y)
    return gmm.predict(Y)
def bhattacharyya_distance(repr1, repr2):
    """Calculates Bhattacharyya distance (https://en.wikipedia.org/wiki/Bhattacharyya_distance)."""
    overlap = np.sum([np.sqrt(p * q) for (p, q) in zip(repr1, repr2)])
    sim = -np.log(overlap)
    assert not np.isnan(sim), 'Error: Similarity is nan.'
    if np.isinf(sim):
        # the similarity is -inf if no term in the review is in the vocabulary
        return 0
    return sim
def bhattacharyya_matrix(prob, flags=None):
    """
    Pairwise negative Bhattacharyya distances between the columns of `prob`.

    If `flags` (a boolean mask over rows) is given, only the flagged rows
    contribute to each pairwise distance. The result is symmetric with a
    zero diagonal.
    """
    n_cols = prob.shape[1]
    mat = np.zeros((n_cols, n_cols))
    for i in np.arange(n_cols):
        for j in np.arange(i + 1, n_cols):
            if flags is None:
                val = -bhattacharyya_distance(prob[:, i], prob[:, j])
            else:
                pair = pd.DataFrame({'i': prob[:, i], 'j': prob[:, j], 'f': flags})
                pair = pair[pair['f']]
                val = -bhattacharyya_distance(pair['i'], pair['j'])
            mat[i, j] = val
            mat[j, i] = val
    return mat
def binary_accuracy(X, y, clf):
    """Element-wise correctness of `clf` predictions on X against labels y."""
    predictions = clf.predict(X)
    return y == predictions
def normalize_confmat1(cmat, mod='1'):
    """
    Normalize the confusion matrix based on the total number of cells in each class
    x(i,j) = max(cmat(i,j)/diagnol(i),cmat(j,i)/diagnol(j))
    confusion rate between i and j is defined by the maximum ratio i is confused as j or j is confused as i.

    Input
    -----
    cmat: the confusion matrix (pandas DataFrame; the diagonal holds the
        correctly-classified counts)
    mod: `string` optional (default: '1')
        '1' divides each off-diagonal count by the *other* class's diagonal;
        any other value divides by the row's own diagonal.

    return
    -----
    the normalized confusion matrix (symmetric numpy array, zero diagonal)
    """
    dmat = cmat.values
    smat = np.diag(dmat)
    dim = cmat.shape[0]
    xmat = np.zeros([dim, dim])
    for i in range(dim):
        for j in range(i + 1, dim):
            # use `==` for the mode flag: `mod is '1'` compared string
            # identity, which is implementation-dependent and a
            # SyntaxWarning on Python >= 3.8
            if mod == '1':
                xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[j], dmat[j, i] / smat[i])
            else:
                xmat[i, j] = xmat[j, i] = max(dmat[i, j] / smat[i], dmat[j, i] / smat[j])
    return xmat
def normalize_confmat2(cmat):
    """
    Normalize the confusion matrix by the total number of cells.

    The confusion rate between clusters i and j is the count of i predicted
    as j plus the count of j predicted as i, divided by the total number of
    cells; the diagonal is zeroed.

    Input
    -----
    cmat: the confusion matrix

    return
    -----
    the normalized, symmetric confusion matrix
    """
    total = np.sum(cmat)
    sym = np.copy(cmat)
    sym = sym + sym.T
    np.fill_diagonal(sym, 0)
    return sym * 1.0 / total
def cluster_adjmat(xmat,
                   resolution=1,
                   cutoff=0.1):
    """
    Cluster the groups based on the adjacency matrix.

    The matrix is binarized at `cutoff`, turned into a graph, and the graph
    is partitioned with louvain clustering at the given resolution. Since
    the graph is binary, resolution defaults to 1.

    Input
    -----
    xmat: `numpy.array` or sparse matrix
        the reference matrix/normalized confusion matrix
    cutoff: `float` optional (default: 0.1)
        threshold used to binarize the reference matrix
    resolution: `float` optional (default: 1.0)
        resolution parameter for louvain clustering

    return
    -----
    new group names (one integer label per node).
    """
    adjacency = (xmat > cutoff).astype(int)
    graph = sc._utils.get_igraph_from_adjacency(adjacency, directed=False)
    print(graph)
    partition = louvain.find_partition(graph, louvain.RBConfigurationVertexPartition,
                                       resolution_parameter=resolution)
    return np.array(partition.membership)
def msample(x, n, frac):
    """
    Sample rows of `x` either by count or by fraction.

    When the data is larger than n/frac rows, a fixed number `n` is drawn;
    otherwise a fraction `frac` is drawn (bumped to 0.9 for very small
    groups of fewer than 10 rows).

    Input
    -----
    x: the matrix (DataFrame) to be sampled
    n: number of rows to be sampled
    frac: fraction of the rows to be sampled

    return
    -----
    sampled selection.
    """
    limit = np.floor(n / frac)
    if len(x) > limit:
        return x.sample(n=n)
    if len(x) < 10:
        frac = 0.9
    return x.sample(frac=frac)
def train_test_split_per_type(X, y, n=100, frac=0.8):
    """
    Like train_test_split, but samples the training set per class, either
    by a maximum count or by a fraction (see `msample`).

    Input
    -----
    X: `numpy.array` or sparse matrix
        the feature matrix
    y: `list of string/int`
        the class assignments
    n: `int` optional (default: 100)
        maximum number sampled in each label
    frac: `float` optional (default: 0.8)
        fraction of each class included in the training set, when the class
        has fewer than n/frac members.

    return
    -----
    X_train, X_test, y_train, y_test
    """
    classes = pd.DataFrame(y)
    classes.index = np.arange(len(y))
    classes.columns = ['class']
    sampled = classes.groupby('class').apply(lambda grp: msample(grp, n=n, frac=frac))
    train_idx = sampled.index.get_level_values(None)
    test_mask = ~np.isin(np.arange(len(y)), train_idx)
    return X[train_idx, :], X[test_mask, :], y[train_idx], y[test_mask]
# functions for SCCAF
def SCCAF_assessment(*args, **kwargs):
    """
    Assessment of clustering reliability using self-projection.
    It is the same as the self_projection function.

    All positional and keyword arguments are forwarded unchanged to
    `self_projection`; see that function for the full parameter list
    and return values.
    """
    return self_projection(*args, **kwargs)
# need to check number of cells in each cluster of the training set.
def self_projection(X,
                    cell_types,
                    classifier="LR",
                    penalty='l1',
                    sparsity=0.5,
                    fraction=0.5,
                    random_state=1,
                    solver='liblinear',
                    n=0,
                    cv=5,
                    whole=False,
                    n_jobs=None):
    # n = 100 should be good.
    """
    This is the core function for running self-projection.

    Input
    -----
    X: `numpy.array` or sparse matrix
        the expression matrix, e.g. ad.raw.X.
    cell_types: `list of String/int`
        the cell clustering assignment
    classifier: `String` optional (default: 'LR')
        a machine learning model in "LR" (logistic regression),
        "RF" (Random Forest), "GNB" (Gaussian Naive Bayes),
        "GPC" (Gaussian Process), "SVM" (Support Vector Machine),
        "SH" (squared-hinge SGD), "PCP" (perceptron SGD) and "DT" (Decision Tree).
    penalty: `String` optional (default: 'l1')
        the regularization mode of logistic regression. Use 'l1' or 'l2'.
    sparsity: `float` optional (default: 0.5)
        The sparsity parameter (C in sklearn.linear_model.LogisticRegression) for the logistic regression model.
    fraction: `float` optional (default: 0.5)
        Fraction of data included in the training set. 0.5 means use half of the data for training,
        if half of the data is fewer than maximum number of cells (n).
    random_state: `int` optional (default: 1)
        random_state parameter for logistic regression.
    solver: `String` optional (default: 'liblinear')
        solver passed to sklearn.linear_model.LogisticRegression.
    n: `int` optional (default: 0)
        Maximum number of cells included in the training set for each cluster of cells.
        Only `fraction` is used to split the dataset if n is 0.
    cv: `int` optional (default: 5)
        fold for cross-validation on the training set.
        0 means no cross-validation.
    whole: `bool` optional (default: False)
        if measure the performance on the whole dataset (include training and test).
    n_jobs: `int` optional, number of threads to use with the different classifiers (default: None - unlimited).

    return
    -----
    y_prob, y_pred, y_test, clf, cvsm, accuracy_test
    y_prob: `matrix of float`
        prediction probability (None for the SGD-based classifiers)
    y_pred: `list of string/int`
        predicted clustering of the test set
    y_test: `list of string/int`
        real clustering of the test set
    clf: the fitted classifier model.
    cvsm: mean cross-validation accuracy (0 if cv == 0).
    accuracy_test: accuracy on the hold-out set.

    Raises
    ------
    ValueError if `classifier` is not one of the supported names.
    """
    # split the data into training and testing
    if n > 0:
        X_train, X_test, y_train, y_test = \
            train_test_split_per_type(X, cell_types, n=n, frac=(1 - fraction))
    else:
        X_train, X_test, y_train, y_test = \
            train_test_split(X, cell_types,
                             stratify=cell_types, test_size=fraction)  # fraction means test size
    # set the classifier
    if classifier == 'LR':
        # pass random_state through (it was previously accepted but ignored;
        # the default of 1 keeps the historical behavior)
        clf = LogisticRegression(random_state=random_state, penalty=penalty, C=sparsity,
                                 multi_class="ovr", solver=solver)
    elif classifier == 'RF':
        clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs)
    elif classifier == 'GNB':
        clf = GaussianNB()
    elif classifier == 'GPC':
        clf = GaussianProcessClassifier(n_jobs=n_jobs)
    elif classifier == 'SVM':
        clf = SVC(probability=True)
    elif classifier == 'SH':
        clf = SGDClassifier(loss='squared_hinge', n_jobs=n_jobs)
    elif classifier == 'PCP':
        clf = SGDClassifier(loss='perceptron', n_jobs=n_jobs)
    elif classifier == 'DT':
        clf = DecisionTreeClassifier()
    else:
        # previously an unknown name fell through and raised a confusing
        # NameError on `clf` below; fail early with a clear message instead
        raise ValueError("Unknown classifier: %s" % classifier)
    # mean cross validation score
    cvsm = 0
    if cv > 0:
        cvs = cross_val_score(clf, X_train, np.array(y_train), cv=cv, scoring='accuracy', n_jobs=n_jobs)
        cvsm = cvs.mean()
        print("Mean CV accuracy: %.4f" % cvsm)
    # accuracy on cross validation and on test set
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_train, y_train)
    print("Accuracy on the training set: %.4f" % accuracy)
    accuracy_test = clf.score(X_test, y_test)
    print("Accuracy on the hold-out set: %.4f" % accuracy_test)
    # accuracy of the whole dataset
    if whole:
        accuracy = clf.score(X, cell_types)
        print("Accuracy on the whole set: %.4f" % accuracy)
    # get predicted probability on the test set
    # (the SGD losses 'squared_hinge'/'perceptron' do not expose predict_proba)
    y_prob = None
    if classifier not in ['SH', 'PCP']:
        y_prob = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)
    return y_prob, y_pred, y_test, clf, cvsm, accuracy_test
def make_unique(dup_list):
    """
    Append "_<k>" to repeated names so every entry is unique, matching R's
    make.unique: the first occurrence is left untouched, later duplicates
    become name_1, name_2, ...

    Input
    -----
    dup_list: a list

    return
    -----
    a unique list with the same length as the input.
    """
    from collections import Counter
    seen = Counter()
    deduped = []
    for item in dup_list:
        if seen[item]:
            deduped.append("%s_%s" % (item, seen[item]))
        else:
            deduped.append(item)
        seen.update({item: 1})
    return deduped
def confusion_matrix(y_test, y_pred, clf, labels=None):
    """
    Confusion matrix of the test set as a labelled DataFrame.

    Input
    -----
    y_test, y_pred, clf: same as in self_projection
    labels: row/column order; defaults to clf.classes_

    return
    -----
    the confusion matrix (rows = real, columns = predicted)
    """
    if labels is None:
        labels = clf.classes_
    cm = pd.DataFrame.from_records(metrics.confusion_matrix(y_test, y_pred, labels=labels))
    cm.index = labels
    cm.columns = labels
    cm.index.name = 'Real'
    cm.columns.name = 'Predicted'
    return cm
def per_cluster_accuracy(mtx, ad=None, clstr_name='louvain'):
    """
    Measure the accuracy of each cluster and put into a metadata slot.
    So the reliability of each cluster can be visualized.

    Input
    -----
    mtx: `pandas.dataframe`
        the confusion matrix (rows = real, columns = predicted)
    ad: `AnnData`
        anndata object; if None, nothing is computed and None is returned
    clstr_name: `String`
        the name of the clustering

    return
    -----
    per-cluster accuracy (diagonal / column sums) as a pandas Series,
    or None when `ad` is None.
    """
    ds = None
    if ad is not None:
        # fraction of each predicted cluster that is correctly assigned
        ds = (np.diag(mtx.values) / mtx.sum(0)).fillna(0)
        rel_name = '%s_reliability' % clstr_name
        ad.obs[rel_name] = ad.obs[clstr_name].astype('category')
        # NOTE(review): assumes `ds` is ordered like the cluster categories —
        # confirm against the caller. rename_categories replaces the direct
        # assignment to `.cat.categories`, which was removed in pandas 2.0;
        # duplicate accuracies are disambiguated by make_unique and the
        # suffix is stripped again below.
        ad.obs[rel_name] = ad.obs[rel_name].cat.rename_categories(make_unique(list(ds)))
        ad.obs[rel_name] = ad.obs[rel_name].astype(str).str.split("_").str[0]
        ad.obs[rel_name] = ad.obs[rel_name].astype(float)
    return ds
def per_cell_accuracy(X, cell_types, clf):
    """
    Per-cell prediction confidence relative to the cell's best class.

    For each cell, take the classifier probability assigned to the cell's
    own cluster and divide it by that cell's maximum probability across all
    classes — so a cell whose assigned cluster is also the classifier's top
    prediction scores 1.0.

    Input
    -----
    X: feature matrix accepted by clf.predict_proba
    cell_types: cluster assignment of each cell (used as the row index)
    clf: fitted classifier exposing predict_proba and classes_

    return
    -----
    `numpy.array` of scaled probabilities, ordered by sorted cell type
    (NOT the original cell order — cells are regrouped by sort_index()).
    """
    y_prob = clf.predict_proba(X)
    # after .T: rows = classes (clf.classes_), columns = cells sorted and
    # labelled by their assigned type
    df = pd.DataFrame(y_prob, index=cell_types, columns=clf.classes_).sort_index().T
    df.index.name = 'Predicted'
    dy = np.empty([0])
    for cell in df.index:
        # duplicate-label indexing: for class `cell`, select the predicted
        # probabilities of exactly the cells assigned to that class
        x = np.array(df.loc[cell][cell].values)
        dy = np.concatenate((dy, x))
    # np.max(df.values, 0) is each cell's top probability across classes;
    # relies on the column order matching the concatenation order above
    return dy/np.max(df.values, 0)
def get_topmarkers(clf, names, topn=10):
    """
    Extract the top weighted features (genes) per class from a fitted
    linear model.

    Input
    -----
    clf: the logistic regression classifier (uses coef_ and classes_)
    names: `list of Strings`
        the names of the features (the gene names).
    topn: `int`
        number of top weighted features returned per class.

    return
    -----
    `pandas.DataFrame` with columns cell_type/gene/weight; positive
    weights only, sorted by cell_type then by descending weight.
    """
    best_class = clf.classes_[clf.coef_.argmax(0)]
    best_weight = clf.coef_.max(0)
    markers = pd.DataFrame({'cell_type': best_class,
                            'gene': names,
                            'weight': best_weight})
    markers = markers[markers['weight'] > 0.]
    markers = markers.sort_values('weight', ascending=False)
    markers = markers.groupby('cell_type').head(topn)
    return markers.sort_values(['cell_type', 'weight'], ascending=[True, False])
def eu_distance(X, gp1, gp2, cell):
    """
    Euclidean distances from two groups of cells to a third group.

    Input
    -----
    X: `np.array` or `sparse matrix`
        the total expression matrix
    gp1: `bool list`
        group1 of cells
    gp2: `bool list`
        group2 of cells
    cell: `bool list`
        group3 of cells, the group to be compared with gp1 and gp2.

    return
    -----
    `pandas.DataFrame` with a 'distance' column (first column of each
    distance matrix) and a 'type' column ('cell' for gp1, 'cell_pred'
    for gp2). The two mean distances are printed as a side effect.
    """
    d1 = euclidean_distances(X[gp1 & (~cell), :], X[cell, :])
    d2 = euclidean_distances(X[gp2 & (~cell), :], X[cell, :])
    part1 = pd.DataFrame(d1[:, 0], columns=['distance'])
    part1['type'] = 'cell'
    part2 = pd.DataFrame(d2[:, 0], columns=['distance'])
    part2['type'] = 'cell_pred'
    combined = pd.concat([part1, part2])
    print('%f - %f' % (d1.mean(), d2.mean()))
    return combined
def get_distance_matrix(X, clusters, labels=None, metric='euclidean'):
    """
    Mean-center distance matrix between all clusters.

    Input
    -----
    X: `np.array` or `sparse matrix`
        the total expression matrix
    clusters: `string list`
        the assignment of the clusters
    labels: `string list`
        the unique labels of the clusters (defaults to np.unique(clusters))
    metric: `string` (optional, default: euclidean)
        distance metrics, see (http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html)

    return
    -----
    `np.array`
        the all-cluster to all-cluster distance matrix.
    """
    if labels is None:
        labels = np.unique(clusters)
    sparse_input = scipy.sparse.issparse(X)
    centers = []
    for cl in labels:
        members = np.where(clusters == cl)[0]
        center = np.array(X[members, :].mean(0))
        if sparse_input:
            # sparse .mean(0) yields a 1 x n matrix; take its only row
            center = center[0, :]
        centers.append(center)
    return pairwise_distances(np.array(centers), metric=metric)
def merge_cluster(ad, old_id, new_id, groups):
    """
    Write a merged clustering into `ad.obs[new_id]`.

    Each category of `ad.obs[old_id]` (in category order) is mapped to the
    corresponding group id in `groups`; clusters sharing a group id are
    merged. make_unique keeps the intermediate category names unique and
    its "_k" suffix is stripped again afterwards.

    Input
    -----
    ad: `AnnData` object (modified in place)
    old_id: obs key of the clustering to merge
    new_id: obs key that receives the merged clustering
    groups: array of group ids, one per category of ad.obs[old_id]

    return
    -----
    the modified AnnData object.
    """
    ad.obs[new_id] = ad.obs[old_id].astype('category')
    # rename_categories replaces the direct assignment to `.cat.categories`,
    # which was removed in pandas 2.0
    ad.obs[new_id] = ad.obs[new_id].cat.rename_categories(make_unique(groups.astype(str)))
    ad.obs[new_id] = ad.obs[new_id].str.split('_').str[0]
    return ad
def find_high_resolution(ad, resolution=4, n=100):
    """
    Run leiden clustering at decreasing resolutions until every cluster
    holds more than `n` cells (or the resolution drops to 0.5).

    Each attempt is stored in ad.obs['leiden_res<r>'] in addition to
    ad.obs['leiden'].
    """
    res = resolution
    while res > 0.5:
        print("clustering with resolution: %.1f" % res)
        sc.tl.leiden(ad, resolution=res)
        ad.obs['leiden_res%.1f' % res] = ad.obs['leiden']
        if ad.obs['leiden'].value_counts().min() > n:
            break
        res -= 0.5
def get_connection_matrix(ad_obs, key1, key2):
    """
    Binary adjacency between `key2` clusters that share a `key1` cluster.

    Two high-resolution clusters (key2) are connected when each draws more
    than 10% of its cells from the same low-resolution cluster (key1), and
    that low-resolution cluster overlaps at least two high-resolution
    clusters this way.

    Input
    -----
    ad_obs: `pandas.DataFrame` (e.g. ad.obs) with columns key1 and key2
    key1: low-resolution clustering key
    key2: high-resolution clustering key

    return
    -----
    symmetric 0/1 `pandas.DataFrame` indexed by the key2 cluster ids (as str).
    """
    import itertools
    links = ad_obs.groupby([key1, key2]).size().to_frame().reset_index()
    links.columns = [key1, key2, 'counts']
    cluster_sizes = ad_obs[key2].value_counts()
    links['size'] = cluster_sizes[links[key2].tolist()].tolist()
    links['percent'] = links['counts'] / links['size']
    links = links[links['percent'] > 0.1]
    shared = links.groupby(key1).size()
    shared = shared[shared > 1]
    links = links[links[key1].isin(shared.index)]
    dim = len(ad_obs[key2].unique())
    mat = pd.DataFrame(np.zeros([dim, dim])).astype(int)
    mat.columns = mat.columns.astype(str)
    mat.index = mat.index.astype(str)
    for _, members in links.groupby(key1):
        ids = members[key2].tolist()
        for i, j in itertools.combinations(ids, 2):
            mat.loc[i, j] = mat.loc[j, i] = 1
    return mat
def SCCAF_optimize_all(ad,
                       min_acc=0.9,
                       R1norm_cutoff=0.5,
                       R2norm_cutoff=0.05,
                       R1norm_step=0.01,
                       R2norm_step=0.001,
                       prefix='L1',
                       min_i=3,
                       start=None,
                       start_iter=0,
                       *args, **kwargs):
    """
    Run rounds of `SCCAF_optimize`, tightening the confusion cutoffs after
    each round, until the accuracy target or convergence is reached.

    ad: `AnnData`
        The AnnData object of the expression profile.
    min_acc: `float` optional (default: 0.9)
        The minimum self-projection accuracy to be optimized for.
        e.g., 0.9 means the clustering optimization (merging process)
        will not stop until the self-projection accuracy is above 90%.
    R1norm_cutoff: `float` optional (default: 0.5)
        The start cutoff for R1norm of confusion matrix.
        e.g., 0.5 means use 0.5 as a cutoff to discretize the confusion matrix after R1norm.
        the discretized matrix is used to construct the connection graph for clustering optimization.
    R2norm_cutoff: `float` optional (default: 0.05)
        The start cutoff for R2norm of confusion matrix.
    R1norm_step: `float` optional (default: 0.01)
        The reduce step for minimum R1norm value.
        Each round of optimization calls the function `SCCAF_optimize`.
        The start of the next round of optimization is based on a new
        cutoff for the R1norm of the confusion matrix. This cutoff is
        determined by the minimum R1norm value in the previous round minus the R1norm_step value.
    R2norm_step: `float` optional (default: 0.001)
        The reduce step for minimum R2norm value.
    prefix: `String` optional (default: 'L1')
        name of the optimization; round keys are stored as '<prefix>_Round<i>'.
    min_i: `int` optional (default: 3)
        minimum number of rounds before the cluster-count convergence
        criterion may stop the loop.
    start: `String` optional
        obs key holding the initial clustering; defaults to
        '<prefix>_Round<start_iter>'.
    start_iter: `int` optional (default: 0)
        round number to start from.
    Remaining *args/**kwargs are forwarded to `SCCAF_optimize`.
    """
    acc = 0
    #'start_iter = -1
    if start is None:
        start = '%s_Round%d'%(prefix, start_iter)
        if not start in ad.obs.keys():
            raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start))
    else:
        if not start in ad.obs.keys():
            raise ValueError("`adata.obs['%s']` doesn't exist. Please assign the initial clustering first."%(start))
        # seed the round-0 slot from the user-supplied starting clustering
        ad.obs['%s_Round%d'%(prefix, start_iter)] = ad.obs[start]
    # number of clusters before optimization, used as convergence reference
    clstr_old = len(ad.obs['%s_Round%d'%(prefix, start_iter)].unique())
    #'while acc < min_acc:
    for i in range(10):  # hard cap of 10 outer rounds
        if start_iter >0:
            print("start_iter: %d" % start_iter)
        print("R1norm_cutoff: %f" % R1norm_cutoff)
        print("R2norm_cutoff: %f" % R2norm_cutoff)
        print("Accuracy: %f" % acc)
        print("======================")
        ad, m1, m2, acc, start_iter = SCCAF_optimize(ad=ad,
                                                     R1norm_cutoff=R1norm_cutoff,
                                                     R2norm_cutoff=R2norm_cutoff,
                                                     start_iter=start_iter,
                                                     min_acc=min_acc,
                                                     prefix=prefix,
                                                     *args, **kwargs)
        print("m1: %f" % m1)
        print("m2: %f" % m2)
        print("Accuracy: %f" % acc)
        # tighten the cutoffs below the smallest confusion observed this round
        R1norm_cutoff = m1 - R1norm_step
        R2norm_cutoff = m2 - R2norm_step
        clstr_new = len(ad.obs['%s_result'%prefix].unique())
        # stop when merging no longer reduces the cluster count
        # (only after min_i rounds), or when the accuracy target is met
        if clstr_new >= clstr_old and i >= min_i:
            print("converged SCCAF_all!")
            break
        if acc >=min_acc:
            break
def SCCAF_optimize(ad,
                   prefix='L1',
                   use='raw',
                   use_projection=False,
                   R1norm_only=False,
                   R2norm_only=False,
                   dist_only=False,
                   dist_not=True,
                   plot=True,
                   basis='umap',
                   plot_dist=False,
                   plot_cmat=False,
                   mod='1',
                   low_res=None,
                   c_iter=3,
                   n_iter=10,
                   n_jobs=None,
                   start_iter=0,
                   sparsity=0.5,
                   n=100,
                   fraction=0.5,
                   R1norm_cutoff=0.1,
                   R2norm_cutoff=1,
                   dist_cutoff=8,
                   classifier="LR",
                   mplotlib_backend=None,
                   min_acc=1):
    """
    This is a self-projection confusion matrix directed cluster optimization function.

    Input
    -----
    ad: `AnnData`
        The AnnData object of the expression profile.
    prefix: `String`, optional (default: 'L1')
        The name of the optimization, which set as a prefix.
        e.g., the prefix = 'L1', the start round of optimization clustering is based on
        'L1_Round0'. So we need to assign an over-clustering state as a start point.
        e.g., ad.obs['L1_Round0'] = ad.obs['louvain']
    use: `String`, optional (default: 'raw')
        Use what features to train the classifier. Three choices:
        'raw' uses all the features;
        'hvg' uses the highly variable genes in the anndata object ad.var_names slot;
        'pca' uses the PCA data in the anndata object ad.obsm['X_pca'] slot.
    use_projection: `bool` optional (default: False)
        if True, later confusion-matrix iterations are computed on the
        self-projected labels rather than the original clustering.
    R1norm_only: `bool` optional (default: False)
        If only use the confusion matrix(R1norm) for clustering optimization.
    R2norm_only: `bool` optional (default: False)
        If only use the confusion matrix(R2norm) for clustering optimization.
    dist_only: `bool` optional (default: False)
        If only use the distance matrix for clustering optimization.
        (not used in the visible code path)
    dist_not: `bool` optional (default: True)
        If not use the distance matrix for clustering optimization.
        (not used in the visible code path)
    plot: `bool` optional (default: True)
        If plot the self-projectioin results, ROC curves and confusion matrices,
        during the optimization.
    basis: `String` optional (default: 'umap')
        embedding basis used for the scatter plots.
    plot_dist: `bool` optional (default: False)
        If make a scatter plot of the distance compared with the confusion rate for each of the cluster.
    plot_cmat: `bool` optional (default: False)
        plot the confusion matrix or not.
    mod: `string` optional (default: '1')
        two directions of normalization of confusion matrix for R1norm.
    low_res: `str` optional
        the clustering boundary for under-clustering. Set a low resolution in louvain/leiden
        clustering and give the key as the underclustering boundary.
    c_iter: `int` optional (default: 3)
        Number of iterations of sampling for the confusion matrix.
        The minimum value of confusion rate in all the iterations is used as the confusion rate between two clusters.
    n_iter: `int` optional (default: 10)
        Maximum number of iterations(Rounds) for the clustering optimization.
    n_jobs: `int` number of jobs/threads to use (default: None - unlimited).
    start_iter: `int` optional (default: 0)
        The start round of the optimization. e.g., start_iter = 3,
        the optimization will start from ad.obs['%s_3'%prefix].
    sparsity: `float` optional (default: 0.5)
        The sparsity parameter (C in sklearn.linear_model.LogisticRegression) for the logistic regression model.
    n: `int` optional (default: 100)
        Maximum number of cell included in the training set for each cluster of cells.
    fraction: `float` optional (default: 0.5)
        Fraction of data included in the training set. 0.5 means use half of the data for training,
        if half of the data is fewer than maximum number of cells (n).
    R1norm_cutoff: `float` optional (default: 0.1)
        The cutoff for the confusion rate (R1norm) between two clusters.
        0.1 means we allow maximum 10% of the one cluster confused as another cluster.
    R2norm_cutoff: `float` optional (default: 1.0)
        The cutoff for the confusion rate (R2norm) between two clusters.
        1.0 means the confusion between any two cluster should not exceed 1% of the total number of cells.
    dist_cutoff: `float` optional (default: 8.0)
        The cutoff for the euclidean distance between two clusters of cells.
        8.0 means the euclidean distance between two cell types should be greater than 8.0.
        (not used in the visible code path)
    classifier: `String` optional (default: 'LR')
        a machine learning model in "LR" (logistic regression),
        "RF" (Random Forest), "GNB"(Gaussion Naive Bayes), "SVM" (Support Vector Machine) and "DT"(Decision Tree).
    mplotlib_backend: `matplotlib.backends.backend_pdf` optional
        MatPlotLib multi-page backend object instance, previously initialised (currently the only type supported is
        PdfPages).
    min_acc: `float`
        the minimum total accuracy to be achieved. Above this threshold, the optimization will stop.

    return
    -----
    ad, m1, m2, min_acc_observed, i — the modified anndata object (with a
    slot "%s_result"%prefix assigned as the clustering optimization result),
    the maxima of the R1/R2 normalized confusion matrices of the last round,
    the minimum self-projection accuracy of the last round, and the last
    round index.
    """
    # the space to use
    X = None
    if use == 'raw':
        X = ad.raw.X
    elif use == 'pca':
        if 'X_pca' not in ad.obsm.keys():
            raise ValueError("`adata.obsm['X_pca']` doesn't exist. Run `sc.pp.pca` first.")
        X = ad.obsm['X_pca']
    # NOTE(review): `ad.obsm.dtype.fields` is the pre-0.7 AnnData API —
    # confirm against the pinned anndata version before upgrading.
    elif 'X_%s'%use in ad.obsm.dtype.fields:
        X = ad.obsm['X_%s'%use]
    else:
        X = ad[:,ad.var['highly_variable']].X
    for i in range(start_iter, start_iter + n_iter):
        print("Round%d ..." % (i + 1))
        old_id = '%s_Round%d' % (prefix, i)
        new_id = '%s_Round%d' % (prefix, i + 1)
        # assumes cluster names are integer-like strings — sorts numerically
        labels = np.sort(ad.obs[old_id].unique().astype(int)).astype(str)
        # optimize: first self-projection round (with cross-validation)
        y_prob, y_pred, y_test, clf, cvsm, acc = \
            self_projection(X, ad.obs[old_id], sparsity=sparsity, n=n,
                            fraction=fraction, classifier=classifier, n_jobs=n_jobs)
        accs = [acc]
        ad.obs['%s_self-projection' % old_id] = clf.predict(X)
        if plot:
            aucs = plot_roc(y_prob, y_test, clf, cvsm=cvsm, acc=acc, title="Self-project ROC {}".format(old_id))
            if mplotlib_backend:
                mplotlib_backend.savefig()
                plt.clf()
            else:
                plt.show()
                plt.close()
            sc.pl.scatter(ad, basis=basis, color=['%s_self-projection' % old_id], show=(mplotlib_backend is None),
                          color_map="RdYlBu_r", legend_loc='on data', frameon=False)
            if mplotlib_backend:
                mplotlib_backend.savefig()
                plt.clf()
        cmat = confusion_matrix(y_test, y_pred, clf, labels=labels)
        xmat = normalize_confmat1(cmat, mod)
        xmats = [xmat]
        cmats = [np.array(cmat)]
        old_id1 = old_id
        if use_projection:
            old_id1 = '%s_self-projection' % old_id
        # repeat the self-projection (without CV) to stabilize the
        # confusion estimates; the element-wise minimum over iterations
        # is used as the confusion rate
        for j in range(c_iter - 1):
            y_prob, y_pred, y_test, clf, _, acc = self_projection(X, ad.obs[old_id1], sparsity=sparsity, n=n,
                                                                  fraction=fraction, classifier=classifier, cv=0,
                                                                  n_jobs=n_jobs)
            accs.append(acc)
            cmat = confusion_matrix(y_test, y_pred, clf, labels=labels)
            xmat = normalize_confmat1(cmat, mod)
            xmats.append(xmat)
            cmats.append(np.array(cmat))
        R1mat = np.minimum.reduce(xmats)
        R2mat = normalize_confmat2(np.minimum.reduce(cmats))
        m1 = np.max(R1mat)
        if np.isnan(m1):
            m1 = 1.
        m2 = np.max(R2mat)
        print("Max R1mat: %f" % m1)
        print("Max R2mat: %f" % m2)
        # stop when every sampling iteration already exceeds the accuracy goal
        if np.min(accs) > min_acc:
            ad.obs['%s_result' % prefix] = ad.obs[old_id]
            print("Converge SCCAF_optimize min_acc!")
            break
        print("min_acc: %f" % np.min(accs))
        if plot:
            if plot_cmat:
                plot_heatmap_gray(cmat, 'Confusion Matrix', mplotlib_backend=mplotlib_backend)
            plot_heatmap_gray(R1mat, 'Normalized Confusion Matrix (R1norm) - %s' % old_id, mplotlib_backend=mplotlib_backend)
            plot_heatmap_gray(R2mat, 'Normalized Confusion Matrix (R2norm) - %s' % old_id, mplotlib_backend=mplotlib_backend)
        # build the merge graph from the confusion matrices (and, when
        # low_res is given, restrict merges to clusters that share a
        # low-resolution parent cluster)
        if R1norm_only:
            groups = cluster_adjmat(R1mat, cutoff=R1norm_cutoff)
        elif R2norm_only:
            groups = cluster_adjmat(R2mat, cutoff=R2norm_cutoff)
        else:
            if not low_res is None:
                conn_mat = get_connection_matrix(ad_obs = ad.obs, key1 = low_res, key2 = old_id)
                zmat = np.minimum.reduce([(R1mat > R1norm_cutoff), conn_mat.values])
                groups = cluster_adjmat(zmat, cutoff=0)
            else:
                zmat = np.maximum.reduce([(R1mat > R1norm_cutoff), (R2mat > R2norm_cutoff)])
                groups = cluster_adjmat(zmat, cutoff=0)
        # no pair of clusters was merged: converged
        if len(np.unique(groups)) == len(ad.obs[old_id].unique()):
            ad.obs['%s_result' % prefix] = ad.obs[old_id]
            print("Converge SCCAF_optimize no. cluster!")
            break
        merge_cluster(ad, old_id1, new_id, groups)
        if plot:
            sc.pl.scatter(ad, basis=basis, color=[new_id], color_map="RdYlBu_r", legend_loc='on data',
                          show=(mplotlib_backend is None))
            if mplotlib_backend:
                mplotlib_backend.savefig()
                plt.clf()
        # everything collapsed into a single cluster: give up
        if len(np.unique(groups)) <= 1:
            ad.obs['%s_result' % prefix] = ad.obs[new_id]
            print("no clustering!")
            break
    return ad, m1, m2, np.min(accs), i
######################## PLOT FUNCTIONS ################################
def plot_link(ad, ymat, old_id, basis='tsne', ax=None, line_color='#ffa500', line_weight=10, plot_name=False,
              legend_fontsize=12):
    """
    Draw lines between cluster centroids whose pairwise value in `ymat` is
    positive; line width scales with the value.

    Input
    -----
    ad: `AnnData` object with coordinates in ad.obsm['X_<basis>']
    ymat: `pandas.DataFrame`
        square cluster-by-cluster matrix; only the upper triangle
        (including the diagonal) is considered.
    old_id: obs key of the categorical clustering
    basis: embedding name (default: 'tsne')
    ax: matplotlib axes to draw on
    line_color, line_weight: styling of the links
    plot_name: if True, also write the cluster name at each centroid
    legend_fontsize: font size of the cluster names

    return
    -----
    the axes object.
    """
    centroids = {}
    Y = ad.obsm['X_%s' % basis]
    for c in ad.obs[old_id].cat.categories:
        Y_mask = Y[ad.obs[old_id] == c, :]
        centroids[c] = np.median(Y_mask, axis=0)
    if plot_name:
        for c in centroids.keys():
            ax.text(centroids[c][0], centroids[c][1], c,
                    verticalalignment='center',
                    horizontalalignment='center',
                    fontsize=legend_fontsize)
    df = ymat.copy()
    # keep the upper triangle only; the `np.bool` alias was removed in
    # NumPy 1.24, so use the builtin `bool` as the mask dtype
    df = df.where(np.triu(np.ones(df.shape)).astype(bool))
    df = df.stack().reset_index()
    df.columns = ['i', 'j', 'val']
    for k in np.arange(df.shape[0]):
        val = df.iloc[k]['val']
        if df.iloc[k]['val'] > 0:
            i = df.iloc[k]['i']
            j = df.iloc[k]['j']
            ax.plot([centroids[i][0], centroids[j][0]], [centroids[i][1], centroids[j][1]],
                    linewidth=val * line_weight, color=line_color)
    return ax
def plot_center(ad, groupby, ax, basis='tsne', size=20):
    """
    Overlay a translucent white dot at the median position of every
    category of `groupby` on the '<basis>' embedding.

    return: the axes object.
    """
    coords = ad.obsm['X_%s' % basis]
    for cat in ad.obs[groupby].cat.categories:
        members = coords[ad.obs[groupby] == cat, :]
        med = np.median(members, axis=0)
        ax.plot(med[0], med[1], 'wo', markersize=size, alpha=0.5)
    return ax
def plot_link_scatter(ad, ymat, basis='pca', group='cell', title=''):
    """
    Scatter the embedding colored by `group`, with centroid links from
    `ymat` overlaid (see plot_link). Returns the axes.
    """
    _, axis = plt.subplots()
    axis = plot_link(ad, ymat=ymat, old_id=group, basis=basis, ax=axis)
    sc.pl.scatter(ad, basis=basis, color=[group], color_map="RdYlBu_r", legend_loc='on data',
                  ax=axis, legend_fontsize=20, frameon=False, title=title)
    return axis
def plot_markers(top_markers, topn=10, save=None, mplotlib_backend=None):
    """
    Render the top marker genes per cell type as a grid of text panels.

    Input
    -----
    top_markers: `pandas dataframe`
        top weighted features in a machine learning model
        (columns cell_type, gene, weight — see get_topmarkers).
    topn: `int`
        number of features shown per panel.
    save: `str` or None.
        if given, save the figure under this file name.
    mplotlib_backend: multi-page backend used when `save` is None.

    return
    -----
    None
    """
    cell_types = top_markers.cell_type.unique()
    n_types = len(cell_types)
    nrow = int(np.floor(np.sqrt(n_types)))
    ncol = int(np.ceil(n_types / nrow))
    for panel, m in enumerate(cell_types):
        plt.subplot(nrow, ncol, panel + 1)
        g = top_markers.query('cell_type == @m')
        plt.title(m, size=12, weight='bold')
        for row, gene_name in enumerate(g.gene):
            plt.annotate(gene_name, (0, 0.2 * row))
        plt.axis('off')
        plt.ylim(topn * 0.2, -0.2)
    plt.tight_layout()
    if save:
        plt.savefig(save)
    elif mplotlib_backend:
        mplotlib_backend.savefig()
    else:
        plt.show()
def plot_distance_jitter(df):
    """Jittered strip plot of `distance` by `type` (as produced by eu_distance)."""
    sns.stripplot(x="type", y="distance", data=df, jitter=True)
def sc_pl_scatter(ad, basis='tsne', color='cell'):
    """
    Seaborn-based scatter of the '<basis>' embedding colored by the
    categorical obs column `color` (an sc.pl.scatter alternative).

    return
    -----
    the plotted `pandas.DataFrame` (embedding coordinates plus `color`).
    """
    df = pd.DataFrame(ad.obsm['X_%s' % basis])
    df.columns = ['%s%d' % (basis, i + 1) for i in range(df.shape[1])]
    df[color] = ad.obs[color].tolist()
    df[color] = df[color].astype('category')
    # rename_categories replaces the direct assignment to `.cat.categories`,
    # which was removed in pandas 2.0 (positional rename, same semantics)
    df[color] = df[color].cat.rename_categories(ad.obs[color].cat.categories)
    # seaborn >= 0.12 requires x/y/data as keyword arguments
    sns.lmplot(x='%s1' % basis,  # Horizontal axis
               y='%s2' % basis,  # Vertical axis
               data=df,  # Data source
               fit_reg=False,  # Don't fix a regression line
               hue=color,  # Set color
               scatter_kws={"marker": "o",  # Set marker style
                            "s": 10}, palette=default_20)
    return df
def plot_heatmap_gray(X, title='', save=None, mplotlib_backend=None):
plt.clf()