In [69]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.feature_selection import VarianceThreshold
from MIL_functions import data_encoding,data_splitting,model_building

In [88]:
int_results_raw = {
    'SEC':      pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_SEC.pk1'),
    'LHS' :     pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_LSH.pk1'),
    'scaffold': pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_Scaffold.pk1'),
    'random':   pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_results.pk1')
}
ext_results_raw = {
    'SEC':      pd.read_pickle('model_results\external\ext_val_results_SEC_stratified.pk1'),
    'LHS':      pd.read_pickle('model_results\external\ext_val_results_LSH_stratified.pk1'),
    'scaffold': pd.read_pickle('model_results\external\ext_val_results_scaffold_stratified.pk1'),
    'random':   pd.read_pickle('model_results\external\ext_val_results.pk1')
}

MORGAN_int_results_raw = {
    'SEC':      pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_SEC_MORGAN.pk1'),
    'LHS' :     pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_LSH_MORGAN.pk1'),
    'scaffold': pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_Scaffold_MORGAN.pk1'),
    'random':   pd.read_pickle('model_results/internal/MIL_aromatic_amine_cv_results_MORGAN.pk1')
}
MORGAN_ext_results_raw = {
    'SEC':      pd.read_pickle('model_results\external\ext_val_results_SEC_stratified_MORGAN.pk1'),
    'LHS':      pd.read_pickle('model_results\external\ext_val_results_LSH_stratified_MORGAN.pk1'),
    'scaffold': pd.read_pickle('model_results\external\ext_val_results_scaffold_stratified_MORGAN.pk1'),
    'random':   pd.read_pickle('model_results\external\ext_val_results_MORGAN.pk1')
}

hansen_raw =  pd.read_pickle('model_results/hansen/rscv_random_hansen_results.pk1')


## Aromatic amine dataset

In [64]:
splitting_function = {
    'SEC':          data_splitting.SEC,
    'LHS' :         data_splitting.LSH,
    'scaffold':     data_splitting.scaffold_split,
    'random':       data_splitting.random_split  
}

for internal_set,external_set,encoding in [(int_results_raw,ext_results_raw,'MACCS'),(MORGAN_int_results_raw,MORGAN_ext_results_raw,'Morgan')]:
    data = data_encoding.load_compressed_pickle("data/encoded/encoded_data.dat")
    if encoding == 'Morgan':
        data = model_building.remove_zero_variance(data,encoding='Morgan')
    for splitting_method in internal_set:
        internal_data = False; external_data = []

        training_data,test_data = data_splitting.split_data(data,splitting_function[splitting_method])
        rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=6234794)
        for fold,[train_index, validation_index] in enumerate(rskf.split(training_data, training_data["Ames"])):
            for mdl in internal_set[splitting_method]['model'].unique():
                t = training_data.iloc[validation_index].reset_index(drop=True).reset_index()
                t['model'] = mdl;   t['fold'] = fold%10; t["iteration"] = fold//10
                if type(internal_data) == bool and not internal_data:
                    internal_data   =   t
                    print(validation_index)
                else:
                    print(fold)
                    internal_data = pd.concat([internal_data,t])
        external_data = test_data

    external_set[splitting_method]['smiles'] = external_set[splitting_method]['index'].apply(lambda x: external_data['smiles'].iloc[x])
    internal_set[splitting_method]['smiles'] = pd.merge(internal_set[splitting_method],internal_data,how='left', on=['iteration','fold','index','model'])['smiles']
    break

[  1  30  31  32  36  44  53  57  59  61  67  86 117 124 142 170 177 178
 179 202 204 210 221 224 242 276 287 292 293 312 320 327 328 345 349 353
 365]
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
17
17
17
17
17
17
17
17
17
17
17
17
17
17
17
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
22
22
22
2

[16:51:36] Running LargestFragmentChooser
[16:51:36] Fragment: Nc1ccc2c(c1)oc1ccccc12
[16:51:36] New largest fragment: Nc1ccc2c(c1)oc1ccccc12 (23)
[16:51:36] Running LargestFragmentChooser
[16:51:36] Fragment: Nc1ccc([N+](=O)[O-])c(N)c1
[16:51:36] New largest fragment: Nc1ccc([N+](=O)[O-])c(N)c1 (18)
[16:51:36] Running LargestFragmentChooser
[16:51:36] Fragment: Nc1snc2c(Cl)cc(Cl)cc12
[16:51:36] New largest fragment: Nc1snc2c(Cl)cc(Cl)cc12 (16)
[16:51:36] Running LargestFragmentChooser
[16:51:36] Fragment: Nc1ccc(Oc2cccc(Oc3ccc(N)cc3)c2)cc1
[16:51:36] New largest fragment: Nc1ccc(Oc2cccc(Oc3ccc(N)cc3)c2)cc1 (38)
[16:51:36] Running LargestFragmentChooser
[16:51:36] Fragment: Nc1ncnc2c1ncn2Cc1ccc([N+](=O)[O-])cc1
[16:51:36] New largest fragment: Nc1ncnc2c1ncn2Cc1ccc([N+](=O)[O-])cc1 (30)
[16:51:36] Running LargestFragmentChooser
[16:51:36] Fragment: Nc1ccc2ccccc2c1N=Nc1ccc([N+](=O)[O-])cc1
[16:51:36] New largest fragment: Nc1ccc2ccccc2c1N=Nc1ccc([N+](=O)[O-])cc1 (34)
[16:51:36] Running L

[  1  30  31  32  36  44  53  57  59  61  67  86 117 124 142 170 177 178
 179 202 204 210 221 224 242 276 287 292 293 312 320 327 328 345 349 353
 365]
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3
4
4
4
4
4
4
4
4
4
4
4
4
4
4
4
5
5
5
5
5
5
5
5
5
5
5
5
5
5
5
6
6
6
6
6
6
6
6
6
6
6
6
6
6
6
7
7
7
7
7
7
7
7
7
7
7
7
7
7
7
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
9
9
9
9
9
9
9
9
9
9
9
9
9
9
9
10
10
10
10
10
10
10
10
10
10
10
10
10
10
10
11
11
11
11
11
11
11
11
11
11
11
11
11
11
11
12
12
12
12
12
12
12
12
12
12
12
12
12
12
12
13
13
13
13
13
13
13
13
13
13
13
13
13
13
13
14
14
14
14
14
14
14
14
14
14
14
14
14
14
14
15
15
15
15
15
15
15
15
15
15
15
15
15
15
15
16
16
16
16
16
16
16
16
16
16
16
16
16
16
16
17
17
17
17
17
17
17
17
17
17
17
17
17
17
17
18
18
18
18
18
18
18
18
18
18
18
18
18
18
18
19
19
19
19
19
19
19
19
19
19
19
19
19
19
19
20
20
20
20
20
20
20
20
20
20
20
20
20
20
20
21
21
21
21
21
21
21
21
21
21
21
21
21
21
21
22
22
22
2

## Hansen dataset

In [89]:
def remove_zero_variance(inp):
    df = inp.copy()
    all_data = [lst for lists in df['Morgan_MIL'].to_list() for lst in lists]
    constant_filter = VarianceThreshold(threshold=0)
    constant_filter.fit(all_data)
    # df['Morgan'] = df['Morgan'].apply(lambda x: constant_filter.transform(np.array(x).reshape(1, -1)))
    df['Morgan_MIL'] = df['Morgan_MIL'].apply(lambda x: constant_filter.transform(x))
    return df

data = {}

data['MACCS'] = data_encoding.load_compressed_pickle("data/encoded/encoded_data_hansen.dat")
data['Morgan'] = data_encoding.load_compressed_pickle("data/encoded/encoded_data_hansen.dat"); data['Morgan'] = model_building.clean_data(data['Morgan']); data['Morgan'] = remove_zero_variance(data['Morgan'])
rskf = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=6234794)

internal_data = False
for kernel in ['linear', 'polynomial']:
    for encoding in ["MACCS",'Morgan']:
        dataset = data[encoding]
        for fold,[train_index, validation_index] in enumerate(rskf.split(dataset, dataset["Ames"])):
            t = dataset.iloc[validation_index].reset_index(drop=True).reset_index()
            t['encoding'] = encoding; t['model'] = 'NSK '+kernel;   t['fold'] = fold%10; t["iteration"] = fold//10
            if type(internal_data) == bool and not internal_data:
                internal_data   =   t
            else:
                internal_data = pd.concat([internal_data,t])

hansen_raw['smiles'] = pd.merge(hansen_raw,internal_data,how='left', on=['iteration','fold','index','model'])['smiles']


## Saving the results

In [93]:
for split in int_results_raw:
    int_results_raw[split].to_csv('model_results/internal/MIL_aromatic_amine_cv_'+split+'.csv')

for split in ext_results_raw:
    ext_results_raw[split].to_csv('model_results/external/ext_val_results_'+split+'.csv')

for split in MORGAN_int_results_raw:
    MORGAN_int_results_raw[split].to_csv('model_results/internal/MIL_aromatic_amine_cv_'+split+'_MORGAN.csv')

for split in ext_results_raw:
    ext_results_raw[split].to_csv('model_results/external/ext_val_results_'+split+'_MORGAN.csv')

hansen_raw.to_csv('model_results/hansen/rscv_random_hansen_results.csv')