In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
import networkx as nx

In [2]:
import sys
# Extend the module search path with the relative '../../code' directory.
# The two imports below presumably resolve from there — confirm against the
# repository layout. The path is relative, so it depends on the working
# directory the kernel was started in.
sys.path.append('../../code')

import min_vertex_k_cut
from splits import get_hi_split

# DRD2

In [3]:
# Read the DRD2 CSV; its first column is used as the row index.
drd2_hi = pd.read_csv('../../data/raw/drd2_hi.csv', index_col=0)
# Bare expression so the frame is rendered as the cell output.
drd2_hi

Unnamed: 0,smiles,value
0,Brc1ccc(-[n+]2cc[n+](Cc3ccccc3)cc2)c2cc[nH]c12,True
1,Brc1ccc(CNCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1,False
2,Brc1ccc(N2CCN(Cc3ccccc3)CC2)c2cc[nH]c12,True
3,Brc1ccc(NCCN2CCN(CCc3c[nH]c4ccccc34)CC2)cc1,True
4,Brc1ccc(NCCN2CCN(Cc3cc4ccccc4[nH]3)CC2)cc1,True
...,...,...
6262,c1cnc(N2CCN(CCCOc3ccc(-c4nc5ccccc5o4)cc3)CC2)nc1,True
6263,c1cnc(N2CCN(CCCSc3nc4ccccc4s3)CC2)nc1,True
6264,c1cnc(N2CCN(Cc3c[nH]c4ncccc34)CC2)nc1,False
6265,c1cncc(CN[C@H]2C3C4CC5C6C4CC3C6C52)c1,False


## Greedy

In [4]:
# Greedy split of the DRD2 frame into train / test parts (third value kept as sim_1).
train_1, test_1, sim_1 = get_hi_split(
    drd2_hi,
    threshold=0.4,
    seed=322,
    cutoff=0.55,
)
# Rows of the input that ended up in neither part.
print('Lost molecules:', len(drd2_hi) - (len(train_1) + len(test_1)))

Lost molecules: 1066


In [5]:
# Sizes of the train and test parts, one per line.
print(len(train_1), len(test_1), sep='\n')

5013
188


## ILP

In [6]:
# Inputs for the graph construction.
smiles = drd2_hi['smiles'].to_list()
threshold = 0.4

# Build the neighborhood graph from the SMILES list and the threshold, then
# separate its main component from the remaining small components.
neighborhood_graph = min_vertex_k_cut.get_neighborhood_graph(smiles, threshold)
main_component, small_components = min_vertex_k_cut.get_main_component(neighborhood_graph)

# Renumber the main component's nodes as 0..n-1 (in node iteration order),
# keeping the mapping in both directions.
old_nodes_to_new = {node: idx for idx, node in enumerate(main_component.nodes())}
new_nodes_to_old = dict(enumerate(old_nodes_to_new))
main_component = nx.relabel_nodes(main_component, old_nodes_to_new)

In [7]:
# Coarsen the relabelled main component; the call returns the coarsened graph
# together with node_to_cluster.
# NOTE(review): the meaning of the second argument (0.7) is not visible here —
# presumably a clustering threshold; confirm in min_vertex_k_cut.coarse_graph.
coarsed_main_component, node_to_cluster = min_vertex_k_cut.coarse_graph(main_component, 0.7)

In [10]:
# Solve the train/test split on the coarsened graph. The minimum train and test
# sizes are given as fractions of the molecule count (see the printed
# "Min train size" / "Min test size" lines in the solver log).
model = min_vertex_k_cut.train_test_split_connected_graph(
    coarsed_main_component,
    train_min_fraq=0.85,
    test_min_fraq=0.13,
    max_mip_gap=0.05,
)

Total molecules: 6082
Min train size 5169
Min test size 790
Starting solution of the Linear programming relaxation problem using Primal Simplex

Coin0506I Presolve 66257 (-3926) rows, 3926 (0) columns and 136436 (-7852) elements
Clp0030I 13 infeas 9.3146535, obj 6081.5745 - mu 0.0410119, its 52, 3926 interior
Clp0030I 27 infeas 0.48770189, obj 6081.1649 - mu 0.00016868888, its 105, 3926 interior
Clp0030I 39 infeas 0.0074515276, obj 6081.069 - mu 2.0817459e-06, its 105, 3926 interior
Clp0030I 51 infeas 0.00010423538, obj 6081.0668 - mu 2.5690288e-08, its 105, 3926 interior
Clp1000I sum of infeasibilities 1.44334e-05 - average 2.17839e-10, 0 fixed columns
Coin0506I Presolve 66257 (0) rows, 3926 (0) columns and 136436 (0) elements
Clp0006I 0  Obj 6081.0668 Dual inf 1128086 (3926)
Clp0029I End of values pass after 3926 iterations
Clp0014I Perturbing problem by 0.001% of 0.60175391 - largest nonzero change 2.9999878e-05 ( 0.0022565436%) - largest zero change 0
Clp0006I 4482  Obj 6082.0225 D

In [11]:
# Summarise the solved split (prints train / test / lost molecule counts).
# The return value is bound to `_` so it is not echoed as cell output.
_ = min_vertex_k_cut.process_bisect_results(
    model,
    coarsed_main_component,
    main_component,
    node_to_cluster,
)

Molecules in train: 5193
Molecules in test: 792
Molecules lost: 97


# HIV

In [3]:
# Load the HIV CSV, copy the 'HIV_active' column into a new 'value' column and
# drop the original 'activity' / 'HIV_active' columns.
hiv = (
    pd.read_csv('../../data/raw/HIV.csv')
    .assign(value=lambda frame: frame['HIV_active'])
    .drop(columns=['activity', 'HIV_active'])
)
# Bare expression so the frame is rendered as the cell output.
hiv

Unnamed: 0,smiles,value
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,0
3,Nc1ccc(C=Cc2ccc(N)cc2S(=O)(=O)O)c(S(=O)(=O)O)c1,0
4,O=S(=O)(O)CCS(=O)(=O)O,0
...,...,...
41122,CCC1CCC2c3c([nH]c4ccc(C)cc34)C3C(=O)N(N(C)C)C(...,0
41123,Cc1ccc2[nH]c3c(c2c1)C1CCC(C(C)(C)C)CC1C1C(=O)N...,0
41124,Cc1ccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)C...,0
41125,Cc1cccc(N2C(=O)C3c4[nH]c5ccccc5c4C4CCC(C(C)(C)...,0


## Greedy

In [15]:
# Greedy split of the HIV frame into train / test parts (third value kept as sim_1).
train_1, test_1, sim_1 = get_hi_split(
    hiv,
    threshold=0.4,
    seed=322,
    cutoff=0.55,
)
# Rows of the input that ended up in neither part.
print('Lost molecules:', len(hiv) - (len(train_1) + len(test_1)))



Lost molecules: 5851


## ILP

In [4]:
# Inputs for the graph construction.
smiles = hiv['smiles'].to_list()
threshold = 0.4

# Build the neighborhood graph from the SMILES list and the threshold, then
# separate its main component from the remaining small components.
neighborhood_graph = min_vertex_k_cut.get_neighborhood_graph(smiles, threshold)
main_component, small_components = min_vertex_k_cut.get_main_component(neighborhood_graph)

# Renumber the main component's nodes as 0..n-1 (in node iteration order),
# keeping the mapping in both directions.
old_nodes_to_new = {node: idx for idx, node in enumerate(main_component.nodes())}
new_nodes_to_old = dict(enumerate(old_nodes_to_new))
main_component = nx.relabel_nodes(main_component, old_nodes_to_new)



In [5]:
# Coarsen the relabelled main component; the call returns the coarsened graph
# together with node_to_cluster.
# NOTE(review): the meaning of the second argument (0.4) is not visible here —
# presumably a clustering threshold; confirm in min_vertex_k_cut.coarse_graph.
coarsed_main_component, node_to_cluster = min_vertex_k_cut.coarse_graph(main_component, 0.4)

In [6]:
# Solve the train/test split on the coarsened graph. The minimum train and test
# sizes are given as fractions of the molecule count (see the printed
# "Min train size" / "Min test size" lines in the solver log).
model = min_vertex_k_cut.train_test_split_connected_graph(
    coarsed_main_component,
    train_min_fraq=0.85,
    test_min_fraq=0.12,
    max_mip_gap=0.3,
)

Total molecules: 31969
Min train size 27173
Min test size 3836
Welcome to the CBC MILP Solver 
Version: Trunk
Build Date: Oct 24 2021 

Starting solution of the Linear programming relaxation problem using Primal Simplex

Coin0506I Presolve 47611 (-12242) rows, 12242 (0) columns and 107460 (-24484) elements
Clp0030I 13 infeas 446.26431, obj 31475.57 - mu 0.0098469573, its 105, 11010 interior
Clp0030I 22 infeas 5.1986853, obj 31715.582 - mu 0.00036459272, its 105, 11913 interior
Clp0030I 32 infeas 0.1332499, obj 31724.394 - mu 4.4993445e-06, its 105, 12242 interior
Clp0030I 43 infeas 0.0022409046, obj 31724.609 - mu 1.665924e-07, its 105, 12242 interior
Clp1000I sum of infeasibilities 1.07727e-05 - average 2.26265e-10, 0 fixed columns
Coin0506I Presolve 47611 (0) rows, 12242 (0) columns and 107460 (0) elements
Clp0006I 0  Obj 31724.619 Dual inf 21895567 (12242)
Clp0029I End of values pass after 12242 iterations
Clp0014I Perturbing problem by 0.001% of 0.72733462 - largest nonzero change 

In [None]:
# Post-process the solved split for the HIV graph.
# The return value is bound to `_` so it is not echoed as cell output.
_ = min_vertex_k_cut.process_bisect_results(
    model,
    coarsed_main_component,
    main_component,
    node_to_cluster,
)