In [1]:
import pandas as pd
import useful_rdkit_utils as uru
import numpy as np
from rdkit import SimDivFilters
from tqdm.auto import tqdm
from div_isim import diversity
from clust_isim import hierarchical_clustering

In [23]:
# Hook tqdm into pandas so the `progress_apply` calls used later in this
# notebook are available and report progress.
tqdm.pandas()

Read 10K molecules from ChEMBL

In [25]:
# Load the molecule table. The file carries no header row, so the column
# labels are supplied here; the last expression reports the row count.
column_names = ["SMILES", "Name"]
df = pd.read_csv("test.csv", header=None, names=column_names)
df.shape[0]

10000

Add fingerprints to the dataframe

In [28]:
# Derive two fingerprint columns from the SMILES strings, each computed by a
# helper from useful_rdkit_utils and tracked with a progress bar.
smiles = df["SMILES"]
df["morgan_fp"] = smiles.progress_apply(uru.smi2morgan_fp)
df["numpy_fp"] = smiles.progress_apply(uru.smi2numpy_fp)

  0%|          | 0/10000 [00:00<?, ?it/s]

  0%|          | 0/10000 [00:00<?, ?it/s]

Convert fingerprints to a numpy array

In [32]:
# Stack the per-molecule fingerprint arrays along a new leading axis so that
# X[i] is the fingerprint of dataframe row i.
X = np.stack(df["numpy_fp"].to_numpy())

Use the RDKit MaxMin picker to perform a diversity pick

In [18]:
%%time
mmp = SimDivFilters.MaxMinPicker()
maxmin_picks = mmp.LazyBitVectorPick(df.morgan_fp,len(df),len(df),[])
maxmin_list = np.empty(len(df),dtype=int)
maxmin_list.fill(-1)
for i,p in enumerate(maxmin_picks):
    maxmin_list[i] = p
df['maxmin'] = maxmin_list

CPU times: user 5.87 s, sys: 25.6 ms, total: 5.9 s
Wall time: 5.9 s


Use iSIM to perform a diversity pick

In [33]:
%%time 
# Time the iSIM-based diversity selection on the stacked fingerprint array X.
# NOTE(review): the meaning of the second argument (1) is defined inside
# div_isim.diversity, which is not shown here -- confirm what it selects
# before comparing this timing with the RDKit MaxMin run above.
diversity_picks = diversity(X,1)

CPU times: user 53.3 s, sys: 173 ms, total: 53.5 s
Wall time: 53.6 s


Use the RDKit to cluster 10K fingerprints

In [35]:
%%time
# Time Taylor-Butina clustering of every Morgan fingerprint in the dataframe,
# using the helper from useful_rdkit_utils. Only the fingerprints are passed,
# so any clustering parameters are left at the helper's defaults.
cluster_list = uru.taylor_butina_clustering(df.morgan_fp.values)

CPU times: user 12 s, sys: 778 ms, total: 12.8 s
Wall time: 13.2 s


Cluster with iSIM. Clustering all 10K molecules took too long, so only the first 500 fingerprints are used here.

In [38]:
%%time
# Time iSIM hierarchical clustering on the first 500 rows of X only, so this
# timing is not directly comparable with the 10K-molecule runs above.
# NOTE(review): the countdown printed in the output appears to come from
# hierarchical_clustering itself -- confirm, and silence it if possible.
clusters = hierarchical_clustering(X[:500])

499
498
497
496
495
494
493
492
491
490
489
488
487
486
485
484
483
482
481
480
479
478
477
476
475
474
473
472
471
470
469
468
467
466
465
464
463
462
461
460
459
458
457
456
455
454
453
452
451
450
449
448
447
446
445
444
443
442
441
440
439
438
437
436
435
434
433
432
431
430
429
428
427
426
425
424
423
422
421
420
419
418
417
416
415
414
413
412
411
410
409
408
407
406
405
404
403
402
401
400
399
398
397
396
395
394
393
392
391
390
389
388
387
386
385
384
383
382
381
380
379
378
377
376
375
374
373
372
371
370
369
368
367
366
365
364
363
362
361
360
359
358
357
356
355
354
353
352
351
350
349
348
347
346
345
344
343
342
341
340
339
338
337
336
335
334
333
332
331
330
329
328
327
326
325
324
323
322
321
320
319
318
317
316
315
314
313
312
311
310
309
308
307
306
305
304
303
302
301
300
299
298
297
296
295
294
293
292
291
290
289
288
287
286
285
284
283
282
281
280
279
278
277
276
275
274
273
272
271
270
269
268
267
266
265
264
263
262
261
260
259
258
257
256
255
254
253
252
251
250
