In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


def select_from_cluster(arr, num_selected):
    """
    Algorithm MinMax for selecting points from cluster.

    Parameters
    ----------
    arr: np.ndarray
        distance matrix for points that needs to be selected if func_distance is None.
        Otherwise, treated as coordinates array.
    num_selected: int
        number of molecules that need to be selected
    cluster_ids: np.ndarray
        indices of molecules that form a cluster

    Returns
    -------
    selected: list
        list of ids of selected molecules
    """
    arr_dist = arr

    # choosing initial point as the medoid
    selected = [np.argmin(np.sum(arr_dist, axis=0))]
    while len(selected) < num_selected:
        min_distances = np.min(arr_dist[selected], axis=0)
        new_id = np.argmax(min_distances)
        selected.append(new_id)
        print(len(selected))
    return selected

In [2]:
data = pd.read_csv(r'../B3clf_new_data_2022_June/BBB_data_all_new_20220710.csv')
names = data.Name
X = data.iloc[:, 1:]#.astype('float')
X.iloc[np.where(X.isin([np.inf]))]

Unnamed: 0,ATS0s,ATS1s,AATS0s,AATS1s,ATSC2s,AATSC2s,sumI,meanI,gmax,gmin,...,sumI.1,meanI.1,gmax.1,gmin.1,ATS0s.1,AATS0s.1,sumI.2,meanI.2,gmax.2,gmin.2
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf
23,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf


There are some infinite values in the dataset. The good news is that there are only 40 samples. So the good idea might be to drop them out

In [3]:
weird_x = np.unique(np.where(X.isin([np.inf, -np.inf, np.nan]))[0])
print(weird_x)

X = X.drop(index=weird_x).astype(float).reset_index(drop=True)
names = names.drop(index=weird_x).reset_index(drop=True)

[  11   23   63   81   86   97  105  170  189  197  222  527  536  541
  542  543  544  546  547  548  551  597  641  682  683  692  722  741
  747  806  833 2240 2265 3129 4439 5861 6433 6488 7261 7308]


In [4]:
X_norm = StandardScaler().fit_transform(X)

In [5]:
from sklearn.metrics import pairwise_distances

X_dists = pairwise_distances(X_norm)
best_5 = select_from_cluster(X_dists, int(0.05*X_dists.shape[0]))

names.iloc[best_5].to_csv('diverse_5_upd_new.csv')

data_95 = X.drop(index=best_5)
names_95 = names.drop(index=best_5)
X_95 = data_95.iloc[:, 1:].astype('float')
X_norm_95 = StandardScaler().fit_transform(X_95)

X_dists_95 = pairwise_distances(X_norm_95)
for percent in [100]:
    selected = select_from_cluster(X_dists_95,
                                   X_dists_95.shape[0])
    print(selected)
    names_95.iloc[selected].to_csv(f'diverse_{percent}_from_95_upd_new.csv')

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
27

1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789


3230
3231
3232
3233
3234
3235
3236
3237
3238
3239
3240
3241
3242
3243
3244
3245
3246
3247
3248
3249
3250
3251
3252
3253
3254
3255
3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
3266
3267
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
3298
3299
3300
3301
3302
3303
3304
3305
3306
3307
3308
3309
3310
3311
3312
3313
3314
3315
3316
3317
3318
3319
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
3335
3336
3337
3338
3339
3340
3341
3342
3343
3344
3345
3346
3347
3348
3349
3350
3351
3352
3353
3354
3355
3356
3357
3358
3359
3360
3361
3362
3363
3364
3365
3366
3367
3368
3369
3370
3371
3372
3373
3374
3375
3376
3377
3378
3379
3380
3381
3382
3383
3384
3385
3386
3387
3388
3389
3390
3391
3392
3393
3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
3410
3411
3412
3413
3414
3415
3416
3417
3418
3419
3420
3421
3422
3423
3424
3425
3426
3427
3428
3429


4870
4871
4872
4873
4874
4875
4876
4877
4878
4879
4880
4881
4882
4883
4884
4885
4886
4887
4888
4889
4890
4891
4892
4893
4894
4895
4896
4897
4898
4899
4900
4901
4902
4903
4904
4905
4906
4907
4908
4909
4910
4911
4912
4913
4914
4915
4916
4917
4918
4919
4920
4921
4922
4923
4924
4925
4926
4927
4928
4929
4930
4931
4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
4947
4948
4949
4950
4951
4952
4953
4954
4955
4956
4957
4958
4959
4960
4961
4962
4963
4964
4965
4966
4967
4968
4969
4970
4971
4972
4973
4974
4975
4976
4977
4978
4979
4980
4981
4982
4983
4984
4985
4986
4987
4988
4989
4990
4991
4992
4993
4994
4995
4996
4997
4998
4999
5000
5001
5002
5003
5004
5005
5006
5007
5008
5009
5010
5011
5012
5013
5014
5015
5016
5017
5018
5019
5020
5021
5022
5023
5024
5025
5026
5027
5028
5029
5030
5031
5032
5033
5034
5035
5036
5037
5038
5039
5040
5041
5042
5043
5044
5045
5046
5047
5048
5049
5050
5051
5052
5053
5054
5055
5056
5057
5058
5059
5060
5061
5062
5063
5064
5065
5066
5067
5068
5069


6510
6511
6512
6513
6514
6515
6516
6517
6518
6519
6520
6521
6522
6523
6524
6525
6526
6527
6528
6529
6530
6531
6532
6533
6534
6535
6536
6537
6538
6539
6540
6541
6542
6543
6544
6545
6546
6547
6548
6549
6550
6551
6552
6553
6554
6555
6556
6557
6558
6559
6560
6561
6562
6563
6564
6565
6566
6567
6568
6569
6570
6571
6572
6573
6574
6575
6576
6577
6578
6579
6580
6581
6582
6583
6584
6585
6586
6587
6588
6589
6590
6591
6592
6593
6594
6595
6596
6597
6598
6599
6600
6601
6602
6603
6604
6605
6606
6607
6608
6609
6610
6611
6612
6613
6614
6615
6616
6617
6618
6619
6620
6621
6622
6623
6624
6625
6626
6627
6628
6629
6630
6631
6632
6633
6634
6635
6636
6637
6638
6639
6640
6641
6642
6643
6644
6645
6646
6647
6648
6649
6650
6651
6652
6653
6654
6655
6656
6657
6658
6659
6660
6661
6662
6663
6664
6665
6666
6667
6668
6669
6670
6671
6672
6673
6674
6675
6676
6677
6678
6679
6680
6681
6682
6683
6684
6685
6686
6687
6688
6689
6690
6691
6692
6693
6694
6695
6696
6697
6698
6699
6700
6701
6702
6703
6704
6705
6706
6707
6708
6709


In [148]:
x, y = np.where(X_dists_95 == 0)

weird_names = []
for pair in zip(x[x != y], y[x!= y]):
    weird_names.append(data_95.iloc[[pair[0], pair[1]]].Name.values)
    
weird = np.array(weird_names)
np.save('weird_names.npy', weird)

In [156]:
data.iloc[[11, 2737]]

Unnamed: 0,Name,category,nAcid,ALogP,ALogp2,AMR,naAromAtom,nH,nN,nO,...,Du,P1m,E1m,E2m,E3m,Dm,E1v,E2v,E3v,Dv
11,bbb_1011,1,0.0,0.202,0.040804,31.7974,0.0,16.0,0.0,0.0,...,1.499125,0.558838,0.310295,0.272591,0.029382,0.612269,0.402035,0.374265,0.119954,0.896254
2737,bbb_3557,1,0.0,0.202,0.040804,31.7974,0.0,16.0,0.0,0.0,...,1.499125,0.558838,0.310295,0.272591,0.029382,0.612269,0.402035,0.374265,0.119954,0.896254


In [158]:
weird.shape

(274, 2)