# Compute (Predictive and Uncertainty) Performance Tables

<hr/>

#### Imports

In [1]:
import copy

import numpy as np
import pandas as pd

from scipy.stats import spearmanr

from sklearn.metrics import r2_score

#### Constants

In [2]:
N_REPETITIONS = 100
N_SPLITS = 2
DATA_FOLDER = '../../../data/featurized/'
RESULT_FOLDER = '../../../data/results/kfold_ensembles/'
ALLOWED_RANGE_FACTOR = .5

#### Settings

In [3]:
algorithms = ['rf', 'xgb', 'svm_rbf', 'shallow', 'dropout']
algorithm_names = ['RF', 'XGB', 'SVM', 'Shallow', 'Deep']
algorithm_to_name = dict(zip(algorithms, algorithm_names))

descriptors = ['maccs', 'ecfc', 'rdkit', 'cddd']
descriptor_names = ['MACCS', 'ECFC', 'RDKit', 'CDDD']
descriptor_to_name = dict(zip(descriptors, descriptor_names))

PLOT_FOLDER = f'./plots/'

# Names of the datasets that were already run
dataset_names = [
    'tetrahymena',
    'freesolv',
    'esol',
    'MMP2',
    'IL4',
    'F7',
    'O60674',
    'O14965',
    'P03372',
    'P04150',
    'P06401',
    'P11229',
    'P12931',
    'P16581',
    'P17252',
    'P18089',
    'P19327',
    'P21554',
    'P24530',
    'P25929',
    'P28335',
    'P28482',
    'P35968',
    'P41594',
    'P42345',
    'P47871',
    'P49146',
    'P61169',
    'Q05397',
    'Q16602',
    'P24941',
    'Q92731'
]

#### Functions

In [4]:
def load_results_file(name, algorithm, descriptor, n_repetitions=N_REPETITIONS, n_splits=N_SPLITS):
    """Loads results from disk for a given combination of ML settings."""
    path = f'{RESULT_FOLDER}{name}/{algorithm}/{descriptor}/single_predictions/test.csv'
    results = pd.read_csv(path, sep=';').set_index('id')
    single_tables = list()
    for i in range(n_repetitions):
        partial_results = results[[f'rep{i}_split{j}' for j in range(n_splits)]]
        single_tables.append(partial_results)
    return single_tables


def load_true(name):
    """Loads dependent variables of a given dataset."""
    path = f'{DATA_FOLDER}{name}/{name}_y.csv'
    y = pd.read_csv(path, sep=';').set_index('id')
    return y


def clean_member(member_raw, y):
    """Removes predictions that are outside the allowed y-range"""
    member = copy.deepcopy(member_raw)
    for column in member.columns:
        # To the current column, those indeces
        # that are nan belong to training samples
        train_y = y.loc[member[~member[column].isnull()].index]
        train_y_range = train_y.max() - train_y.min()
        boundary = ALLOWED_RANGE_FACTOR * train_y_range
        allowed_min = (train_y.min() - boundary).values[0]
        allowed_max = (train_y.max() + boundary).values[0]
        exclusion = ((member[column] < allowed_min) | (member[column] > allowed_max))
        member.loc[exclusion] = np.nan
    return member


def get_values_for_one_evaluation(single_tables, y, clean=True):
    """Computes predictive and uncertainty performance for one ensemble evaluation."""
    members = pd.DataFrame(index=y.index)
    for i, pre_member in enumerate(single_tables):
        # Collect members
        if clean:
            member = clean_member(pre_member, y)
        else:
            member = pre_member
        members = pd.concat([members, member], axis=1)
    mean = members.mean(axis=1)
    sdev = members.std(axis=1)
    r2 = r2_score(y, mean)
    resd = y['y'] - mean
    uq = spearmanr(resd.abs(), sdev)[0]
    return r2, uq


def get_values_for_whole_dataset(name, clean=True):
    """Computes all performance and uncertainty values for all featurization/algorithm combinations of a single dataset."""
    y = load_true(name)
    r2_matrix = np.zeros((len(descriptors), len(algorithms)))
    uq_matrix = np.zeros((len(descriptors), len(algorithms)))
    for i, descriptor in enumerate(descriptors):
        for j, algorithm in enumerate(algorithms):
            single_tables = load_results_file(name, algorithm, descriptor)
            r2, uq = get_values_for_one_evaluation(single_tables, y, clean=clean)
            r2_matrix[i][j] = r2
            uq_matrix[i][j] = uq
    r2_matrix_df = make_df(r2_matrix)
    uq_matrix_df = make_df(uq_matrix)
    return r2_matrix_df, uq_matrix_df


def make_df(matrix):
    """Puts index and column names to evaluation matrix."""
    matrix_df = pd.DataFrame(matrix, columns=algorithm_names)
    matrix_df.index = descriptor_names
    matrix_df.index.name = 'descriptor'
    return matrix_df

<hr/>

Quick check difference between raw and clean

In [5]:
mmp2_r2, mmp2_uq = get_values_for_whole_dataset('MMP2', clean=False)
mmp2_r2_clean, mmp2_uq_clean = get_values_for_whole_dataset('MMP2', clean=True)

In [6]:
mmp2_r2

Unnamed: 0_level_0,RF,XGB,SVM,Shallow,Deep
descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MACCS,0.424884,0.474591,0.146657,0.180418,0.148106
ECFC,0.462778,0.529387,0.39868,0.577507,0.538089
RDKit,0.441466,0.483171,0.389704,-0.337834,0.459375
CDDD,0.442398,0.465654,0.452958,0.378977,0.399054


In [7]:
mmp2_r2_clean

Unnamed: 0_level_0,RF,XGB,SVM,Shallow,Deep
descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
MACCS,0.424884,0.474591,0.146657,0.180418,0.148106
ECFC,0.462778,0.529387,0.39868,0.577507,0.538089
RDKit,0.441466,0.483171,0.389704,-0.231477,0.459375
CDDD,0.442398,0.465654,0.452958,0.379104,0.399054


#### Compute Performance Tables

In [8]:
for i, dataset_name in enumerate(dataset_names):
    print(f'At {i+1} out of {len(dataset_names)} ({dataset_name}).')
    r2_matrix_df, uq_matrix_df = get_values_for_whole_dataset(dataset_name)
    r2_matrix_df.to_csv(f'../../../data/generated_by_notebooks/predictive_performances/{dataset_name}.csv', sep=';')
    uq_matrix_df.to_csv(f'../../../data/generated_by_notebooks/uncertainty_performances/{dataset_name}.csv', sep=';')
print('Done.')

At 1 out of 32 (tetrahymena).
At 2 out of 32 (freesolv).
At 3 out of 32 (esol).
At 4 out of 32 (MMP2).
At 5 out of 32 (IL4).
At 6 out of 32 (F7).
At 7 out of 32 (O60674).
At 8 out of 32 (O14965).
At 9 out of 32 (P03372).
At 10 out of 32 (P04150).
At 11 out of 32 (P06401).
At 12 out of 32 (P11229).
At 13 out of 32 (P12931).
At 14 out of 32 (P16581).
At 15 out of 32 (P17252).
At 16 out of 32 (P18089).
At 17 out of 32 (P19327).
At 18 out of 32 (P21554).
At 19 out of 32 (P24530).
At 20 out of 32 (P25929).
At 21 out of 32 (P28335).
At 22 out of 32 (P28482).
At 23 out of 32 (P35968).
At 24 out of 32 (P41594).
At 25 out of 32 (P42345).
At 26 out of 32 (P47871).
At 27 out of 32 (P49146).
At 28 out of 32 (P61169).
At 29 out of 32 (Q05397).
At 30 out of 32 (Q16602).
At 31 out of 32 (P24941).
At 32 out of 32 (Q92731).
Done.
