# Analyze Differences Between Datasets and Within Datasets

<hr/>

#### Imports

In [1]:
import collections

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from scipy.stats import f_oneway, kruskal

#### Constants

In [2]:
# Root folder of the notebook-generated data, given relative to the working directory.
# load_evaluation_matrix() reads '<evaluation>_performances/<name>.csv' below this folder.
DATA_FOLDER = '../../../data/generated_by_notebooks/'

#### Settings

In [3]:
# Default resolution (dots per inch) for every matplotlib figure created in this notebook.
mpl.rcParams['figure.dpi'] = 200

# Internal algorithm key -> display name; insertion order is the canonical order.
algorithm_to_name = {
    'rf': 'RF',
    'xgb': 'XGB',
    'svm_rbf': 'SVM',
    'shallow': 'Shallow',
    'dropout': 'Deep',
}
algorithms = list(algorithm_to_name)
algorithm_names = list(algorithm_to_name.values())

# Internal descriptor key -> display name; insertion order is the canonical order.
descriptor_to_name = {
    'maccs': 'MACCS',
    'ecfc': 'MFC',
    'rdkit': 'RDKit',
    'cddd': 'CDDD',
}
descriptors = list(descriptor_to_name)
descriptor_names = list(descriptor_to_name.values())

# Names of the datasets that were already run.
# Each name is used as the CSV file stem when loading the performance tables,
# and the loading cell iterates over this list in the order given here.
dataset_names = [
    'tetrahymena',
    'freesolv',
    'esol',
    'MMP2',
    'IL4',
    'F7',
    'O60674',
    'O14965',
    'P03372',
    'P04150',
    'P06401',
    'P11229',
    'P12931',
    'P16581',
    'P17252',
    'P18089',
    'P19327',
    'P21554',
    'P24530',
    'P25929',
    'P28335',
    'P28482',
    'P35968',
    'P41594',
    'P42345',
    'P47871',
    'P49146',
    'P61169',
    'Q05397',
    'Q16602',
    'P24941',
    'Q92731'
]

#### Functions

In [4]:
def load_evaluation_matrix(name, evaluation='predictive', to_matrix=True):
    """Reads the performance table of one dataset from disk.

    The file '<DATA_FOLDER>/<evaluation>_performances/<name>.csv' is parsed as a
    semicolon-separated table and indexed by its 'descriptor' column.

    Returns the raw numpy array when 'to_matrix' is true, otherwise the dataframe.
    """
    path = f'{DATA_FOLDER}/{evaluation}_performances/{name}.csv'
    table = pd.read_csv(path, sep=';').set_index('descriptor')
    return table.to_numpy() if to_matrix else table
    
    
def fetch(performance_dfs, labels, transpose=False):
    """Collects performance values per label, with one column per dataset.

    Parameters
    ----------
    performance_dfs : dict
        Maps a dataset name to its performance dataframe.
    labels : list
        Column names to extract from every dataframe (after the optional transpose).
    transpose : bool
        If True, every dataframe is transposed first, so that 'labels' are matched
        against its index instead of its columns.

    Returns
    -------
    list of pd.DataFrame
        One dataframe per entry of 'labels', in the same order. Each has one column
        per dataset, named after the dataset and ordered like 'performance_dfs'.

    Raises
    ------
    KeyError
        If a label is missing from one of the dataframes.
    """
    collected = [pd.DataFrame() for _ in labels]
    # Iterate over the mapping that was passed in rather than over a notebook-level
    # list, so the function only depends on its own arguments.
    for dataset_name, df in performance_dfs.items():
        if transpose:
            df = df.T
        for frame, label in zip(collected, labels):
            frame[dataset_name] = df[label]
    return collected


def make_y_range(yrange, interval=.1):
    """Creates a range between yrange[0] and yrange[1] with a given interval.

    Parameters
    ----------
    yrange : sequence of two numbers
        Lower and upper bound. Each bound is truncated towards zero to a multiple
        of 'interval'.
    interval : float
        Spacing between consecutive values.

    Returns
    -------
    list
        The values ylow, ylow + interval, ..., yhigh, rounded to the number of
        decimals that 'interval' itself has (at least one).
    """
    from decimal import Decimal  # only needed here; keeps the helper self-contained

    factor = 1./interval
    # Round away float noise before truncating: e.g. 100 * 0.57 evaluates to
    # 56.99999999999999, which would otherwise silently drop the upper bound.
    ylow = int(round(factor * yrange[0], 9))
    yhigh = int(round(factor * yrange[1], 9))
    # Use as many decimals as the interval has; a fixed single decimal would turn
    # steps such as 0.05 or 0.01 into repeated or wrong values.
    decimals = max(1, -Decimal(str(interval)).as_tuple().exponent)
    return [np.round(val*interval, decimals) for val in range(ylow, yhigh+1)]

    
def my_formatting(number):
    """Formats a number scientifically when its magnitude is < .01 and 'normally' otherwise.

    Parameters
    ----------
    number : int or float
        Value to format.

    Returns
    -------
    str
        'number' with two decimals, in scientific notation (e.g. '2.13e-07') if
        abs(number) < 0.01 and in fixed-point notation (e.g. '11.58') otherwise.
        Zero is below the threshold and is therefore rendered as '0.00e+00'.
    """
    # Compare the magnitude: the sign alone must not force scientific notation
    # (otherwise -5.3 would be rendered as '-5.30e+00').
    if abs(number) < 0.01:
        return f'{number:.2e}'
    return f'{number:.2f}'

<hr/>

#### Load data

In [5]:
# Performance tables keyed by dataset name.
predictive_performances = {}
uncertainty_performances = {}

# Row label 'ECFC' is renamed to 'MFC', the display name listed in descriptor_names.
ecfc_to_mfc = {'ECFC': 'MFC'}

for dataset_name in dataset_names:
    pred_df = load_evaluation_matrix(dataset_name, 'predictive', to_matrix=False)
    predictive_performances[dataset_name] = pred_df.rename(index=ecfc_to_mfc)
    uq_df = load_evaluation_matrix(dataset_name, 'uncertainty', to_matrix=False)
    uncertainty_performances[dataset_name] = uq_df.rename(index=ecfc_to_mfc)

#### Prepare

In [6]:
# Algorithm-wise: one frame per algorithm name, taken from the table columns.
pred_algorithm_wise, uq_algorithm_wise = (
    fetch(tables, algorithm_names)
    for tables in (predictive_performances, uncertainty_performances)
)

# Descriptor-wise: tables are transposed first so descriptor names become columns.
pred_descriptor_wise, uq_descriptor_wise = (
    fetch(tables, descriptor_names, transpose=True)
    for tables in (predictive_performances, uncertainty_performances)
)

#### Reshape

Predictive

In [7]:
# Flatten every group of predictive values into a 1-D array, keyed by group name.
pred_descriptor_wise_flat = {
    key: frame.values.ravel()
    for key, frame in zip(descriptors, pred_descriptor_wise)
}
pred_algorithm_wise_flat = {
    key: frame.values.ravel()
    for key, frame in zip(algorithms, pred_algorithm_wise)
}
pred_dataset_wise_flat = {
    key: predictive_performances[key].values.ravel()
    for key in dataset_names
}

# Same arrays as plain lists, ready to be unpacked into the statistical tests.
pred_descriptor_wise_flat_list = [pred_descriptor_wise_flat[key] for key in descriptors]
pred_algorithm_wise_flat_list = [pred_algorithm_wise_flat[key] for key in algorithms]
pred_dataset_wise_flat_list = [pred_dataset_wise_flat[key] for key in dataset_names]

Uncertainty quantification (UQ)

In [8]:
# Flatten every group of uncertainty values into a 1-D array, keyed by group name.
uq_descriptor_wise_flat = {
    key: frame.values.ravel()
    for key, frame in zip(descriptors, uq_descriptor_wise)
}
uq_algorithm_wise_flat = {
    key: frame.values.ravel()
    for key, frame in zip(algorithms, uq_algorithm_wise)
}
uq_dataset_wise_flat = {
    key: uncertainty_performances[key].values.ravel()
    for key in dataset_names
}

# Same arrays as plain lists, ready to be unpacked into the statistical tests.
uq_descriptor_wise_flat_list = [uq_descriptor_wise_flat[key] for key in descriptors]
uq_algorithm_wise_flat_list = [uq_algorithm_wise_flat[key] for key in algorithms]
uq_dataset_wise_flat_list = [uq_dataset_wise_flat[key] for key in dataset_names]

<hr/>

#### Analyze Predictive

In [9]:
# Result table for the predictive tests: one row per test; the cells below add one column per comparison.
pred_tests_df = pd.DataFrame(index=['ANOVA', 'Kruskal–Wallis'])

Predictive, inter-descriptor

In [10]:
# One-way ANOVA with one group per descriptor; each group holds that descriptor's flattened predictive values.
anova_stat, anova_pvalue = f_oneway(*pred_descriptor_wise_flat_list)

In [11]:
# Kruskal-Wallis H-test (rank-based) on the same per-descriptor groups.
kruskal_stat, kruskal_pvalue = kruskal(*pred_descriptor_wise_flat_list)

In [12]:
# Row order follows the frame index: ANOVA first, then Kruskal-Wallis.
pred_tests_df['Inter-featurization'] = [
    f'Statistic = {my_formatting(stat)}; p-value = {my_formatting(pvalue)}'
    for stat, pvalue in (
        (anova_stat, anova_pvalue),
        (kruskal_stat, kruskal_pvalue),
    )
]

Predictive, inter-algorithm

In [13]:
# One-way ANOVA with one group per algorithm; each group holds that algorithm's flattened predictive values.
anova_stat, anova_pvalue = f_oneway(*pred_algorithm_wise_flat_list)

In [14]:
# Kruskal-Wallis H-test (rank-based) on the same per-algorithm groups.
kruskal_stat, kruskal_pvalue = kruskal(*pred_algorithm_wise_flat_list)

In [15]:
# Row order follows the frame index: ANOVA first, then Kruskal-Wallis.
pred_tests_df['Inter-technique'] = [
    f'Statistic = {my_formatting(stat)}; p-value = {my_formatting(pvalue)}'
    for stat, pvalue in (
        (anova_stat, anova_pvalue),
        (kruskal_stat, kruskal_pvalue),
    )
]

Predictive, inter-dataset

In [16]:
# One-way ANOVA with one group per dataset; each group is that dataset's whole predictive table, flattened.
anova_stat, anova_pvalue = f_oneway(*pred_dataset_wise_flat_list)

In [17]:
# Kruskal-Wallis H-test (rank-based) on the same per-dataset groups.
kruskal_stat, kruskal_pvalue = kruskal(*pred_dataset_wise_flat_list)

In [18]:
# Row order follows the frame index: ANOVA first, then Kruskal-Wallis.
pred_tests_df['Inter-dataset'] = [
    f'Statistic = {my_formatting(stat)}; p-value = {my_formatting(pvalue)}'
    for stat, pvalue in (
        (anova_stat, anova_pvalue),
        (kruskal_stat, kruskal_pvalue),
    )
]

In [19]:
# Transpose so that comparisons become rows and tests become columns, then print the LaTeX source of the table.
print(pred_tests_df.T.to_latex())

\begin{tabular}{lll}
\toprule
{} &                                   ANOVA &                          Kruskal–Wallis \\
\midrule
Inter-featurization &   Statistic = 11.58; p-value = 2.13e-07 &   Statistic = 28.04; p-value = 3.57e-06 \\
Inter-technique     &   Statistic = 11.05; p-value = 1.15e-08 &   Statistic = 38.16; p-value = 1.04e-07 \\
Inter-dataset       &  Statistic = 40.91; p-value = 2.54e-127 &  Statistic = 482.58; p-value = 1.81e-82 \\
\bottomrule
\end{tabular}



<hr/>

#### Analyze UQ

In [20]:
# Result table for the uncertainty tests: one row per test; the cells below add one column per comparison.
uq_tests_df = pd.DataFrame(index=['ANOVA', 'Kruskal–Wallis'])

UQ, inter-descriptor

In [21]:
# One-way ANOVA with one group per descriptor; each group holds that descriptor's flattened uncertainty values.
anova_stat, anova_pvalue = f_oneway(*uq_descriptor_wise_flat_list)

In [22]:
# Kruskal-Wallis H-test (rank-based) on the same per-descriptor groups.
kruskal_stat, kruskal_pvalue = kruskal(*uq_descriptor_wise_flat_list)

In [23]:
# Row order follows the frame index: ANOVA first, then Kruskal-Wallis.
uq_tests_df['Inter-featurization'] = [
    f'Statistic = {my_formatting(stat)}; p-value = {my_formatting(pvalue)}'
    for stat, pvalue in (
        (anova_stat, anova_pvalue),
        (kruskal_stat, kruskal_pvalue),
    )
]

UQ, inter-algorithm

In [24]:
# One-way ANOVA with one group per algorithm; each group holds that algorithm's flattened uncertainty values.
anova_stat, anova_pvalue = f_oneway(*uq_algorithm_wise_flat_list)

In [25]:
# Kruskal-Wallis H-test (rank-based) on the same per-algorithm groups.
kruskal_stat, kruskal_pvalue = kruskal(*uq_algorithm_wise_flat_list)

In [26]:
# Row order follows the frame index: ANOVA first, then Kruskal-Wallis.
uq_tests_df['Inter-technique'] = [
    f'Statistic = {my_formatting(stat)}; p-value = {my_formatting(pvalue)}'
    for stat, pvalue in (
        (anova_stat, anova_pvalue),
        (kruskal_stat, kruskal_pvalue),
    )
]

UQ, inter-dataset

In [27]:
# One-way ANOVA with one group per dataset; each group is that dataset's whole uncertainty table, flattened.
anova_stat, anova_pvalue = f_oneway(*uq_dataset_wise_flat_list)

In [28]:
# Kruskal-Wallis H-test (rank-based) on the same per-dataset groups.
kruskal_stat, kruskal_pvalue = kruskal(*uq_dataset_wise_flat_list)

In [29]:
# Row order follows the frame index: ANOVA first, then Kruskal-Wallis.
uq_tests_df['Inter-dataset'] = [
    f'Statistic = {my_formatting(stat)}; p-value = {my_formatting(pvalue)}'
    for stat, pvalue in (
        (anova_stat, anova_pvalue),
        (kruskal_stat, kruskal_pvalue),
    )
]

In [30]:
# Show the table as built: tests as rows, comparisons as columns.
uq_tests_df

Unnamed: 0,Inter-featurization,Inter-technique,Inter-dataset
ANOVA,Statistic = 2.66; p-value = 0.05,Statistic = 18.63; p-value = 1.72e-14,Statistic = 31.56; p-value = 9.93e-106
Kruskal–Wallis,Statistic = 8.82; p-value = 0.03,Statistic = 79.60; p-value = 2.12e-16,Statistic = 355.65; p-value = 8.10e-57


In [31]:
# Transpose so that comparisons become rows and tests become columns, then print the LaTeX source of the table.
print(uq_tests_df.T.to_latex())

\begin{tabular}{lll}
\toprule
{} &                                   ANOVA &                          Kruskal–Wallis \\
\midrule
Inter-featurization &        Statistic = 2.66; p-value = 0.05 &        Statistic = 8.82; p-value = 0.03 \\
Inter-technique     &   Statistic = 18.63; p-value = 1.72e-14 &   Statistic = 79.60; p-value = 2.12e-16 \\
Inter-dataset       &  Statistic = 31.56; p-value = 9.93e-106 &  Statistic = 355.65; p-value = 8.10e-57 \\
\bottomrule
\end{tabular}

