In [1]:
import os
import pandas as pd
import numpy as np

# Data Preparation

In [2]:
# Directory holding the input ADMET property tables (note the trailing slash).
DB_DIR = 'database/output_files_OLD/'
ANODYNE_FILE = 'anodyne_1-1000-ADMET-properties.csv'
FDA_FILE = 'fda-ADMET-properties.csv'
WNFDA_FILE = 'world-not-fda-ADMET-properties.csv'
# File name under which the prepared dataset is written (inside DB_DIR).
DATASET_FILE = 'dataset.csv'

# Values stored in the 'Label' column for the two classes.
MAJORITY_CLASS = 'anodyne'
MINORITY_CLASS = 'fda'

### Import data

In [3]:
# Read the three ADMET property tables located in DB_DIR.
anodyne_df, fda, wnfda = (
    pd.read_csv(DB_DIR + csv_name)
    for csv_name in (ANODYNE_FILE, FDA_FILE, WNFDA_FILE)
)

### Merge FDA and World-Not-FDA

In [4]:
# Stack the two tables row-wise. Each input keeps its own row labels, so the
# combined index is not renumbered.
fda_df = pd.concat([fda, wnfda])

### Add labels

In [5]:
# Tag every row of each table with the name of its class.
fda_df = fda_df.assign(Label = MINORITY_CLASS)
anodyne_df = anodyne_df.assign(Label = MAJORITY_CLASS)

### Merge FDA and Anodyne

In [6]:
# Stack both labelled tables. ignore_index gives every row a unique label: the
# inputs each carry their own row numbering, and duplicated labels would make
# later index-aligned operations (such as the column-wise concat that re-attaches
# 'Label' after one-hot encoding) ambiguous.
df = pd.concat([fda_df, anodyne_df], ignore_index = True)

### Remove 'Molecule' column

In [7]:
# The 'Molecule' column is not used from here on.
df = df.drop(columns = 'Molecule')

### Show rows with at least one missing value

In [8]:
# Rows in which at least one column is null.
df.loc[df.isna().any(axis = 'columns')]

Unnamed: 0,Canonical SMILES,Formula,MW,#Heavy atoms,#Aromatic heavy atoms,Fraction Csp3,#Rotatable bonds,#H-bond acceptors,#H-bond donors,MR,...,Ghose #violations,Veber #violations,Egan #violations,Muegge #violations,Bioavailability Score,PAINS #alerts,Brenk #alerts,Leadlikeness #violations,Synthetic Accessibility,Label


### Delete duplicate rows based on SMILES

In [9]:
# Keep the first row for every distinct SMILES string.
df = df.drop_duplicates(subset = 'Canonical SMILES')

### Delete duplicate rows based on all columns except SMILES

In [10]:
# Compare rows on every column other than the SMILES string and keep the first
# of each group of identical rows.
df = df.drop_duplicates(
    subset = [column for column in df.columns if column != 'Canonical SMILES']
)

### Remove 'Canonical SMILES' and 'Formula' columns

In [11]:
# Both identifier columns are removed in a single call.
df = df.drop(columns = ['Canonical SMILES', 'Formula'])

### Convert categorical data using One-Hot-Encoding

In [12]:
# Set the target aside so it is not one-hot encoded together with the features.
labels = df['Label']
df = df.drop('Label', axis = 1)

# Every object-dtype column is treated as categorical and expanded into
# indicator columns named '<column>_<value>'.
caterogical_columns = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns = caterogical_columns, prefix = caterogical_columns)

# Re-attach the target as the last column. This is a column-wise concat, so rows
# are matched through the index that df and labels share.
df = pd.concat([df, labels.rename('Label')], axis = 1)

### Dump csv file function

In [13]:
def dump_df(path, f_name, df):
    """Write ``df`` to ``<path>/<f_name>`` as CSV, creating ``path`` if needed.

    Parameters
    ----------
    path : str
        Target directory. An empty string means the current working directory.
    f_name : str
        Name of the CSV file.
    df : pandas.DataFrame
        Frame to save; its index is not written.
    """
    # os.makedirs('') raises, so only create a directory when one was named.
    # exist_ok replaces the racy "check for existence, then create" sequence.
    if path:
        os.makedirs(path, exist_ok = True)
    df.to_csv(os.path.join(path, f_name), index = False)

### Load csv file function

In [14]:
def load_df(path, f_name):
    """Read the CSV file ``f_name`` found in directory ``path`` into a DataFrame."""
    csv_path = os.path.join(path, f_name)
    return pd.read_csv(csv_path)

### Save prepared data

In [15]:
# Persist the prepared table as DB_DIR/DATASET_FILE (the index is not written).
dump_df(DB_DIR, DATASET_FILE, df)

# Train-Test-Validation splitting

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

In [17]:
# Directory and file names used for the train/test split written below.
DATASET_DIR = 'dataset'
TRAIN_F_NAME = 'train.csv'
TEST_F_NAME = 'test.csv'

### Load dataset

In [18]:
# Reload the prepared dataset and separate the target from the features.
df = load_df(DB_DIR, DATASET_FILE)
X, y = df.drop(columns = 'Label'), df['Label']

### Get feature names

In [19]:
# Column names of the feature matrix, in column order.
feature_names = X.columns.tolist()

### Fit label encoder

In [20]:
# Learn the class-name <-> integer mapping from the whole dataset.
le = LabelEncoder()
le.fit(df['Label'])

### Train-Test splitting

In [21]:
# Fixed seed so the train/test partition (and the files written below) is the
# same on every run; without it each run produces a different split.
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify = y, test_size = 0.2, random_state = SPLIT_SEED
)

# Discard the shuffled original row labels and renumber each part from 0.
# Reassigning (instead of inplace = True) avoids mutating the split results in place.
X_train, X_test, y_train, y_test = (
    part.reset_index(drop = True) for part in (X_train, X_test, y_train, y_test)
)

# Re-attach the target as the last column of each split.
train_df = pd.concat([X_train, y_train.rename('Label')], axis = 1)
test_df = pd.concat([X_test, y_test.rename('Label')], axis = 1)

dump_df(DATASET_DIR, TRAIN_F_NAME, train_df)
dump_df(DATASET_DIR, TEST_F_NAME, test_df)

### Stratified K-fold splitting for cross-validation

In [22]:
# 10 stratified folds; shuffle is left at its default.
skf = StratifiedKFold(n_splits = 10)

# Class balancing

In [23]:
from sklearn.preprocessing import QuantileTransformer
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from joblib import dump, load

In [24]:
# Directory layout for cross-validation artefacts. Every path derived from
# FOLDS_DIR contains a '{}' placeholder that is filled with the fold number
# through str.format before use.
VALIDATION_DIR = os.path.join(DATASET_DIR, 'validation')
FOLDS_DIR = os.path.join(VALIDATION_DIR, 'fold{}')

VALIDATION_DATA_DIR = os.path.join(FOLDS_DIR, 'data')
VALIDATION_MODELS_DIR = os.path.join(FOLDS_DIR, 'models')

# Balanced training subsets, one sub-directory per balancing method.
CLUSTER_BAL_SETS_DIR = os.path.join(VALIDATION_DATA_DIR, 'cluster_bal')
RANDOM_BAL_SETS_DIR = os.path.join(VALIDATION_DATA_DIR, 'random_bal')

# Fitted classifiers, one sub-directory per balancing method.
CLUSTER_BAL_MODELS_DIR = os.path.join(VALIDATION_MODELS_DIR, 'cluster_bal')
RANDOM_BAL_MODELS_DIR = os.path.join(VALIDATION_MODELS_DIR, 'random_bal')

# Values accepted by the `method` arguments; ALL selects both balancing methods.
CLUSTER_BAL = 'cluster_balancing'
RANDOM_BAL = 'random_balancing'
ALL = 'all'

# File-name templates; '{}' is filled with the subset / classifier index.
BALANCED_SET_F_NAME = 'train{}.csv'
MODEL_F_NAME = 'clf{}.joblib'

# Names of the voting rules understood by Ensemble.predict.
MAX = 'max'
SUM = 'sum'
PRODUCT = 'product'
MAJORITY_VOTE = 'majority_vote'
VOTE_METHODS = (MAX, SUM, PRODUCT, MAJORITY_VOTE)

### Cluster based majority class splitting

In [25]:
def get_cluster_splits(majority_df, minority_df, K, random_state = None):
    """Split the majority class into K subsets that each draw from every k-means
    cluster, then add the whole minority class to each subset.

    Parameters
    ----------
    majority_df : pandas.DataFrame
        Majority-class rows with a 'Label' column. The remaining columns are fed
        to QuantileTransformer / KMeans, so they are presumably all numeric.
        The frame is not modified.
    minority_df : pandas.DataFrame
        Minority-class rows, appended unchanged to every subset.
    K : int
        Number of clusters and number of subsets returned.
    random_state : int or None, optional
        Seed forwarded to QuantileTransformer and KMeans; None keeps the
        unseeded behaviour.

    Returns
    -------
    list of pandas.DataFrame
        K frames with a fresh 0..n-1 index: majority rows first, minority rows last.
    """
    features = majority_df.drop(columns = 'Label')

    # Scale features and cluster the majority class on the scaled values.
    scaled = QuantileTransformer(
        output_distribution = 'normal', random_state = random_state
    ).fit_transform(features)
    cluster_ids = KMeans(n_clusters = K, random_state = random_state).fit(scaled).labels_

    # Put the label back as the last column, then group rows by cluster id.
    labelled = features.assign(Label = MAJORITY_CLASS)
    clusters = [cluster for _, cluster in labelled.groupby(cluster_ids)]

    split_dfs = []
    for i in range(K):
        # Subset i takes rows i, i + K, i + 2K, ... of every cluster, so each
        # cluster is spread evenly over the K subsets.
        pieces = [cluster.iloc[i::K] for cluster in clusters]
        pieces = [piece for piece in pieces if len(piece)]
        # One pd.concat per subset: DataFrame.append no longer exists in
        # pandas >= 2.0 and appending inside a loop copies the data repeatedly.
        split_dfs.append(pd.concat(pieces + [minority_df], ignore_index = True))

    return split_dfs

### Random based majority class splitting

In [26]:
def get_random_splits(majority_df, minority_df, K, random_state = None):
    """Shuffle the majority class, cut it into K nearly equal parts and add the
    whole minority class to each part.

    Parameters
    ----------
    majority_df : pandas.DataFrame
        Majority-class rows; not modified.
    minority_df : pandas.DataFrame
        Minority-class rows, appended unchanged to every part.
    K : int
        Number of parts. The first ``len(majority_df) % K`` parts get one extra row.
    random_state : int or None, optional
        Seed for the shuffle; None keeps the unseeded behaviour.

    Returns
    -------
    list of pandas.DataFrame
        K frames with a fresh 0..n-1 index: majority rows first, minority rows last.
    """
    shuffled = majority_df.sample(frac = 1, random_state = random_state)

    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame is deprecated in recent pandas releases.
    position_chunks = np.array_split(np.arange(shuffled.shape[0]), K)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    return [
        pd.concat([shuffled.iloc[positions], minority_df], ignore_index = True)
        for positions in position_chunks
    ]

### Balanced datasets creation

In [27]:
def create_balanced_datasets(df, method):
    """Build balanced training subsets from an imbalanced labelled frame.

    The majority class is divided into K subsets, K being the rounded
    majority/minority size ratio (at least 1), and the whole minority class is
    added to each subset.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Label' column holding MAJORITY_CLASS / MINORITY_CLASS values.
    method : str
        CLUSTER_BAL or RANDOM_BAL.

    Returns
    -------
    list of pandas.DataFrame
        The K balanced subsets.

    Raises
    ------
    ValueError
        If ``method`` is not a known balancing method or one class has no rows.
    """
    splitters = {CLUSTER_BAL: get_cluster_splits, RANDOM_BAL: get_random_splits}
    if method not in splitters:
        # Previously this fell through to an UnboundLocalError on return.
        raise ValueError('Unknown balancing method: {!r}'.format(method))

    # Boolean selection + reset_index yields fresh frames, so neither df nor a
    # groupby slice is mutated in place.
    majority_df = df[df['Label'] == MAJORITY_CLASS].reset_index(drop = True)
    minority_df = df[df['Label'] == MINORITY_CLASS].reset_index(drop = True)

    if majority_df.empty or minority_df.empty:
        raise ValueError('Both classes must be present to build balanced datasets')

    # round() can give 0 when the "majority" class is the smaller one; a split
    # count below 1 is meaningless for both splitters.
    K = max(1, round(majority_df.shape[0] / minority_df.shape[0]))

    return splitters[method](majority_df, minority_df, K)

### K-fold balanced datasets creation and dumping

In [28]:
def dump_balanced_datasets(df, skf, method = ALL):
    """Write, for every cross-validation fold, the held-out rows and the balanced
    training subsets built from the remaining rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Labelled frame with a 'Label' column.
    skf : object
        Splitter exposing ``split(X, y)`` that yields positional
        (train_index, test_index) pairs.
    method : str, optional
        CLUSTER_BAL, RANDOM_BAL or ALL (both). Any other value writes only the
        held-out rows.
    """
    y = df['Label']
    X = df.drop('Label', axis = 1)

    # Balancing methods to run, each with the directory template it writes to.
    targets = [
        (bal_method, sets_template)
        for bal_method, sets_template in (
            (CLUSTER_BAL, CLUSTER_BAL_SETS_DIR),
            (RANDOM_BAL, RANDOM_BAL_SETS_DIR),
        )
        if (method == bal_method) or (method == ALL)
    ]

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):

        print('============================= Fold {} =============================\n'.format(fold))

        # Index the frame that was passed in. Reading a notebook-level variable
        # here would silently use the wrong rows for any other caller.
        validation_train_df = df.iloc[train_index]
        validation_test_df = df.iloc[test_index]

        print('Saving validation test dataset...')

        dump_df(VALIDATION_DATA_DIR.format(fold), TEST_F_NAME, validation_test_df)

        print('Done!')

        print('Creating and saving balanced subsets...')

        for bal_method, sets_template in targets:
            balanced_dfs = create_balanced_datasets(validation_train_df, bal_method)
            for i, balanced_df in enumerate(balanced_dfs):
                dump_df(sets_template.format(fold), BALANCED_SET_F_NAME.format(i), balanced_df)

        print('Done!\n')

### Ensemble Implementation

In [29]:
class Ensemble:
    """Set of random forests, one per training subset, combined by a voting rule.

    Predictions are decoded with the label encoder given at construction.
    NOTE(review): this assumes the column order of each classifier's
    predict_proba output matches the encoder's class order - confirm that
    every training subset contains all classes.
    """

    def __init__(self, le):
        """Store the fitted label encoder and start with no classifiers."""
        self.le = le
        self.clfs = []

    def fit(self, df_list):
        """Train one RandomForestClassifier per frame in ``df_list``.

        Each frame must have a 'Label' column; all other columns are features.
        Classifiers are appended, so calling fit again extends the ensemble.
        """
        for df in df_list:

            y = df['Label']
            X = df.drop('Label', axis = 1)

            clf = RandomForestClassifier(n_estimators = 100, n_jobs = -1)
            clf.fit(X, y)

            self.clfs.append(clf)

    def predict(self, X, vote_method = MAX):
        """Predict decoded labels for ``X`` with the chosen voting rule.

        Raises
        ------
        ValueError
            If ``vote_method`` is not one of MAX, SUM, PRODUCT, MAJORITY_VOTE,
            or if the ensemble holds no classifiers.
        """
        voters = {
            MAX: self.max_proba,
            SUM: self.sum_proba,
            PRODUCT: self.prod_proba,
            MAJORITY_VOTE: self.majority_vote,
        }
        if vote_method not in voters:
            # Previously this fell through to an UnboundLocalError on return.
            raise ValueError('Unknown vote method: {!r}'.format(vote_method))
        if not self.clfs:
            raise ValueError('Ensemble has no classifiers; call fit() or load_clfs() first')

        # Stacked as (classifier, sample, class).
        p = np.asarray([clf.predict_proba(X) for clf in self.clfs])

        return voters[vote_method](p)

    def max_proba(self, p1):
        """Pick, per sample, the class with the highest probability given by any classifier."""
        p2 = p1.max(axis = 0)
        return self.le.inverse_transform(p2.argmax(axis = 1))

    def sum_proba(self, p1):
        """Pick, per sample, the class with the highest probability summed over classifiers."""
        p2 = p1.sum(axis = 0)
        return self.le.inverse_transform(p2.argmax(axis = 1))

    def prod_proba(self, p1):
        """Pick, per sample, the class with the highest product of probabilities."""
        p2 = p1.prod(axis = 0)
        return self.le.inverse_transform(p2.argmax(axis = 1))

    def majority_vote(self, p1):
        """Pick, per sample, the class that most classifiers rank first.

        A classifier votes for every class whose probability equals its row
        maximum, so a tie gives a vote to each tied class. For two classes this
        matches the pairwise ``>=`` comparison; the vectorised form also works
        for any number of classes.
        """
        votes = (p1 == p1.max(axis = 2, keepdims = True)).sum(axis = 0)
        return self.le.inverse_transform(votes.argmax(axis = 1))

    def get_feature_importances(self):
        """Return the feature importances averaged over all classifiers."""
        return np.average(np.asarray([clf.feature_importances_ for clf in self.clfs]), axis = 0)

    def save_clfs(self, clfs_dir, f_name):
        """Dump every classifier to ``clfs_dir``; ``f_name`` is a template whose
        '{}' is filled with the classifier index."""
        os.makedirs(clfs_dir, exist_ok = True)
        for i, clf in enumerate(self.clfs):
            dump(clf, os.path.join(clfs_dir, f_name.format(i)))

    def load_clfs(self, clfs_dir):
        """Replace the classifiers with those stored in ``clfs_dir``.

        Every entry of the directory is loaded, so it must contain nothing but
        classifier dumps. Entries are read in sorted name order so the result
        does not depend on the file system's listing order.
        """
        # SECURITY: joblib.load unpickles arbitrary objects; only load
        # directories written by this notebook.
        self.clfs = [load(os.path.join(clfs_dir, f_name)) for f_name in sorted(os.listdir(clfs_dir))]

    def get_clfs(self):
        """Return the list of classifiers."""
        return self.clfs

### Load training dataset

In [30]:
# Reload the training split saved as DATASET_DIR/TRAIN_F_NAME.
train_df = load_df(DATASET_DIR, TRAIN_F_NAME)

### Create and dump balanced datasets

In [31]:
# `method` is left at its default (ALL), so both cluster- and random-balanced
# subsets are written for every fold.
dump_balanced_datasets(train_df, skf)


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!



### Ensemble training on balanced data implementation

In [32]:
def train_ensemble(n_folds, le, method = ALL):
    """Fit and save one ensemble per fold for each requested balancing method.

    For every fold the balanced subsets are read from the method's data
    directory, an Ensemble is fitted on them and its classifiers are saved to
    the method's models directory.

    Parameters
    ----------
    n_folds : int
        Number of folds to process (fold numbers 0 .. n_folds - 1).
    le : object
        Label encoder handed to each Ensemble.
    method : str, optional
        CLUSTER_BAL, RANDOM_BAL or ALL (both).
    """
    # (balancing method, subsets directory template, models directory template),
    # listed in training order.
    layouts = (
        (CLUSTER_BAL, CLUSTER_BAL_SETS_DIR, CLUSTER_BAL_MODELS_DIR),
        (RANDOM_BAL, RANDOM_BAL_SETS_DIR, RANDOM_BAL_MODELS_DIR),
    )

    for fold in range(n_folds):

        print('============================= Fold {} =============================\n'.format(fold))

        print('Training and saving models...')

        for bal_method, sets_template, models_template in layouts:

            if not ((method == bal_method) or (method == ALL)):
                continue

            sets_dir = sets_template.format(fold)
            subsets = [load_df(sets_dir, f_name) for f_name in os.listdir(sets_dir)]

            ensemble = Ensemble(le)
            ensemble.fit(subsets)
            ensemble.save_clfs(models_template.format(fold), MODEL_F_NAME)

        print('Done!\n')

### Train ensembles on balanced data

In [33]:
# `method` is left at its default (ALL): for every fold an ensemble is trained
# and saved for both balancing methods.
train_ensemble(skf.get_n_splits(), le)


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!



### Evaluation function implementation

In [34]:
def evaluate():
    """Print cross-validated accuracy and MCC for every vote method.

    For each fold the held-out rows and the saved cluster- and random-balanced
    ensembles are loaded, predictions are scored for each vote method, and the
    scores are averaged over the folds. Uses the notebook-level ``skf`` and ``le``.
    """
    n = skf.get_n_splits()

    # (models directory template, accuracy line, MCC line), in reporting order.
    variants = (
        (CLUSTER_BAL_MODELS_DIR, 'Cluster split accuracy: {:.4f}', 'Cluster split MCC: {:.4f}\n'),
        (RANDOM_BAL_MODELS_DIR, 'Random split accuracy: {:.4f}', 'Random split MCC: {:.4f}\n'),
    )

    # Running totals, one dict per variant, keyed by vote method.
    accuracy_totals = [dict.fromkeys(VOTE_METHODS, 0) for _ in variants]
    mcc_totals = [dict.fromkeys(VOTE_METHODS, 0) for _ in variants]

    for fold in range(n):

        print('============================= Fold {} =============================\n'.format(fold))

        print('Loading validation test dataset...')

        validation_test_df = load_df(VALIDATION_DATA_DIR.format(fold), TEST_F_NAME)
        X_test = validation_test_df.drop('Label', axis = 1)
        y_test = validation_test_df['Label']

        print('Done!')

        print('Loading trained models...')

        ensembles = []
        for models_template, _, _ in variants:
            ensemble = Ensemble(le)
            ensemble.load_clfs(models_template.format(fold))
            ensembles.append(ensemble)

        print('Done!')

        print('Evaluating predictions...')

        for vote_method in VOTE_METHODS:
            for idx, ensemble in enumerate(ensembles):
                y_pred = ensemble.predict(X_test, vote_method)
                accuracy_totals[idx][vote_method] += accuracy_score(y_test, y_pred)
                mcc_totals[idx][vote_method] += matthews_corrcoef(y_test, y_pred)

        print('Done!\n')

    print('============================= Results =============================\n')

    for vote_method in VOTE_METHODS:

        print('-------------------------- Vote: {} -------------------------\n'.format(vote_method))

        for idx, (_, accuracy_line, mcc_line) in enumerate(variants):
            # Turn the fold totals into fold means before printing.
            print(accuracy_line.format(accuracy_totals[idx][vote_method] / n))
            print(mcc_line.format(mcc_totals[idx][vote_method] / n))

### Evaluate

In [35]:
# Scores the ensembles currently saved on disk against each fold's held-out rows.
evaluate()


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
D

# Feature selection

### Get feature importance based on previously trained models

In [36]:
# Collect the per-fold importances of the cluster-balanced ensembles.
per_fold_importances = []

for fold in range(skf.get_n_splits()):

    ensemble_cluster = Ensemble(le)
    ensemble_cluster.load_clfs(CLUSTER_BAL_MODELS_DIR.format(fold))

    per_fold_importances.append(ensemble_cluster.get_feature_importances())

# Average over folds, label each value with its feature name and rank them.
fis = pd.DataFrame(
    np.asarray(per_fold_importances).mean(axis = 0),
    index = feature_names,
    columns = ['Importance'],
).sort_values('Importance', ascending = False)

### Show the 20 most important features

In [37]:
# fis is sorted by descending importance, so these are the top-ranked features.
fis.head(20)

Unnamed: 0,Importance
MW,0.102498
#Heavy atoms,0.056426
Synthetic Accessibility,0.049842
MR,0.049327
CYP1A2 inhibitor_Yes,0.036068
CYP1A2 inhibitor_No,0.035597
Fraction Csp3,0.034876
CYP2C19 inhibitor_Yes,0.034626
CYP2C19 inhibitor_No,0.034277
TPSA,0.028809


### Select the 10 most important features in the training set

In [38]:
# Top-10 feature names plus the target column; the training split is reloaded
# and reduced to exactly these columns.
selected_features = fis.head(10).index.tolist() + ['Label']
train_df = load_df(DATASET_DIR, TRAIN_F_NAME)[selected_features]

### Create and dump balanced datasets

In [39]:
# Same directories and file names as the earlier dump: files with the same
# name are overwritten, now holding only the selected columns.
dump_balanced_datasets(train_df, skf)


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!


Saving validation test dataset...
Done!
Creating and saving balanced subsets...
Done!



### Train ensemble on balanced data

In [40]:
# Retrain on the subsets currently on disk; classifier files with the same
# name in the models directories are overwritten.
train_ensemble(skf.get_n_splits(), le)


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!


Training and saving models...
Done!



### Evaluate

In [41]:
# Scores the ensembles currently saved on disk against each fold's held-out rows.
evaluate()


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
Done!
Loading trained models...
Done!
Evaluating predictions...
Done!


Loading validation test dataset...
D

# Testing 

### Testing function implementation

In [44]:
def test_model(train_df, test_df):
    """Train cluster- and random-balanced ensembles on ``train_df`` and print
    their accuracy and MCC on ``test_df`` for every vote method.

    Both frames must have a 'Label' column. Uses the notebook-level ``le``.
    """
    # (balancing method, accuracy line, MCC line), in processing/reporting order.
    strategies = (
        (CLUSTER_BAL, 'Cluster split accuracy: {:.4f}', 'Cluster split MCC: {:.4f}\n'),
        (RANDOM_BAL, 'Random split accuracy: {:.4f}', 'Random split MCC: {:.4f}\n'),
    )

    print('Creating balanced datasets...')

    balanced_sets = [create_balanced_datasets(train_df, bal_method) for bal_method, _, _ in strategies]

    print('Done!')

    print('Training ensembles...')

    ensembles = []
    for subsets in balanced_sets:
        ensemble = Ensemble(le)
        ensemble.fit(subsets)
        ensembles.append(ensemble)

    print('Done!')

    X_test = test_df.drop('Label', axis = 1)
    y_test = test_df['Label']

    print('Evaluating predictions...')

    # scores[(vote method, strategy position)] -> (accuracy, MCC)
    scores = {}
    for vote_method in VOTE_METHODS:
        for idx, ensemble in enumerate(ensembles):
            y_pred = ensemble.predict(X_test, vote_method)
            scores[(vote_method, idx)] = (
                accuracy_score(y_test, y_pred),
                matthews_corrcoef(y_test, y_pred),
            )

    print('Done!\n')

    print('============================= Results =============================\n')

    for vote_method in VOTE_METHODS:

        print('-------------------------- Vote: {} -------------------------\n'.format(vote_method))

        for idx, (_, accuracy_line, mcc_line) in enumerate(strategies):
            accuracy, mcc = scores[(vote_method, idx)]
            print(accuracy_line.format(accuracy))
            print(mcc_line.format(mcc))

### Testing - All features selected

In [45]:
# Reload the train and test splits with all feature columns.
train_df = load_df(DATASET_DIR, TRAIN_F_NAME)
test_df = load_df(DATASET_DIR, TEST_F_NAME)

test_model(train_df, test_df)

Creating balanced datasets...
Done!
Training ensembles...
Done!
Evaluating predictions...
Done!


-------------------------- Vote: max -------------------------

Cluster split accuracy: 0.9357
Cluster split MCC: 0.6055

Random split accuracy: 0.9360
Random split MCC: 0.6059

-------------------------- Vote: sum -------------------------

Cluster split accuracy: 0.9245
Cluster split MCC: 0.5722

Random split accuracy: 0.9249
Random split MCC: 0.5716

-------------------------- Vote: product -------------------------

Cluster split accuracy: 0.9249
Cluster split MCC: 0.5733

Random split accuracy: 0.9252
Random split MCC: 0.5726

-------------------------- Vote: majority_vote -------------------------

Cluster split accuracy: 0.9242
Cluster split MCC: 0.5702

Random split accuracy: 0.9235
Random split MCC: 0.5682



### Testing - Most important features selected

In [46]:
# Reload both splits, keeping only the selected feature columns plus 'Label'.
train_df = load_df(DATASET_DIR, TRAIN_F_NAME)[selected_features]
test_df = load_df(DATASET_DIR, TEST_F_NAME)[selected_features]

test_model(train_df, test_df)

Creating balanced datasets...
Done!
Training ensembles...
Done!
Evaluating predictions...
Done!


-------------------------- Vote: max -------------------------

Cluster split accuracy: 0.9148
Cluster split MCC: 0.5394

Random split accuracy: 0.9136
Random split MCC: 0.5351

-------------------------- Vote: sum -------------------------

Cluster split accuracy: 0.9012
Cluster split MCC: 0.5081

Random split accuracy: 0.9013
Random split MCC: 0.5087

-------------------------- Vote: product -------------------------

Cluster split accuracy: 0.9021
Cluster split MCC: 0.5102

Random split accuracy: 0.9015
Random split MCC: 0.5093

-------------------------- Vote: majority_vote -------------------------

Cluster split accuracy: 0.8991
Cluster split MCC: 0.5024

Random split accuracy: 0.9006
Random split MCC: 0.5059

