# ML Results Analysis

## Setup

In [None]:
import logging
from itertools import combinations, permutations
from pathlib import Path

import numpy as np
import pandas as pd
from sqlite3 import connect

from matplotlib import pyplot as plt
import seaborn as sns

# Named logger used by the analysis cells for their warnings
logger = logging.getLogger("ResultAnalysis")

# Seed NumPy's global random number generator with a fixed value
np.random.seed(707260)

### Data Loading

In [None]:
# Connects to a database, and loads all contents into a single dataframe
def parse_results_db(sqlite_path: Path) -> pd.DataFrame:
    """Load every results table of an SQLite database into one DataFrame.

    Each results table is expected to be named ``<study>__<model>__<dataset>``;
    those three components are appended to the table's rows as the ``study``,
    ``model`` and ``dataset`` columns.

    Tables which cannot be read (e.g. corrupted because a job was terminated
    mid-write, such as a SLURM job cancellation) and tables whose name does not
    follow the naming scheme are skipped with a warning.

    Args:
        sqlite_path: Path to the SQLite database file.

    Returns:
        The row-wise concatenation of every table that was loaded. Each table
        keeps its own row index, so index labels may repeat across tables.

    Raises:
        ValueError: If no results table could be loaded from the database.
    """
    # Same named logger the rest of the notebook uses (getLogger returns the shared instance)
    log = logging.getLogger("ResultAnalysis")

    # Establish a "connection" (file I/O) to the database
    db_con = connect(sqlite_path)
    try:
        # sqlite_master also lists indexes, views and triggers; keep only real,
        # user-created tables (names starting with "sqlite_" are SQLite-internal)
        table_rows = db_con.execute(
            "SELECT name FROM sqlite_master "
            "WHERE type = 'table' AND name NOT LIKE 'sqlite^_%' ESCAPE '^'"
        ).fetchall()

        # Read the contents of each results table into a DataFrame
        result_dfs = []
        for (t,) in table_rows:
            parts = t.split('__')
            if len(parts) != 3:
                log.warning(f"Table '{t}' is not named '<study>__<model>__<dataset>', ignoring it")
                continue
            study, model, dataset = parts

            # Quote the identifier so names with characters such as '-' still parse
            quoted_name = '"' + t.replace('"', '""') + '"'
            try:
                df = pd.read_sql(f"SELECT * FROM {quoted_name}", con=db_con)
            # Deliberately best-effort, but never swallow KeyboardInterrupt/SystemExit
            except Exception as err:
                log.warning(f"Failed to read table '{t}', ignoring it ({err})")
                continue

            # Add columns tracking the study, model, and dataset
            result_dfs.append(df.assign(study=study, model=model, dataset=dataset))
    finally:
        # Always release the file handle, even if a query failed
        db_con.close()

    if not result_dfs:
        raise ValueError(f"No results tables could be read from '{sqlite_path}'")

    # Concatenate the results into one large dataframe and return it
    return pd.concat(result_dfs)

In [None]:
# Pull every results table of the classic-ML results database into one DataFrame
results_df = parse_results_db(
    sqlite_path=Path("../step3_run_analysis/results/dcm_classic_ml.db"),
)

### Analysis Stratification

In [None]:
# Names of the columns that the components of a parsed dataset label are stored under
data_indices = [
    'feature_type',
    'feature_set',
    'scope',
    'mri_type',
    'algorithm',
    'data_prep',
]

# Sub-index helpers
def get_feature_type(dataset_components: list[str]):
    """Return the feature type: the first component of a split dataset label."""
    feature_type = dataset_components[0]
    return feature_type

# Unique to datasets containing only clinical metrics
def get_prep_clin(dataset_components: list[str]):
    """Return the data-prep label: every component after the first, re-joined with '_'."""
    prep_parts = dataset_components[1:]
    return '_'.join(prep_parts)

# Unique to datasets w/ MRI-containing metrics
def get_feature_set(dataset_components: list[str]):
    """Return the feature set: the component at position 2 of a split dataset label."""
    feature_set = dataset_components[2]
    return feature_set

def get_scope(dataset_components: list[str]):
    """Return the scope: the components at positions 3 and 4, joined with '_'."""
    scope_parts = dataset_components[3:5]
    return '_'.join(scope_parts)

def get_mri_type(dataset_components: list[str]):
    """Return the MRI type: the components at positions 5 and 6, joined with '_'."""
    mri_parts = dataset_components[5:7]
    return '_'.join(mri_parts)

def get_algorithm(dataset_components: list[str]):
    """Return the algorithm: the component at position 7 of a split dataset label."""
    algorithm = dataset_components[7]
    return algorithm

def get_prep_img(dataset_components: list[str]):
    """Return the data-prep label: every component from position 8 onward, joined with '_'."""
    prep_parts = dataset_components[8:]
    return '_'.join(prep_parts)

# Helper function to make dataframe application easier
def parse_dataset(dataset_label: str) -> np.ndarray:
    """Split a dataset label into the six values described by ``data_indices``.

    The label is split on '_' and its first component is the feature type.
    For 'clinical' labels every remaining component forms the data-prep value
    and the MRI-related fields are filled with 'N/A'. For any other feature
    type the remaining fields are read from fixed positions of the split label
    (the component at position 1 is not used).

    Args:
        dataset_label: The underscore-delimited dataset label.

    Returns:
        A 1-D array of six strings, ordered as
        (feature_type, feature_set, scope, mri_type, algorithm, data_prep).
        Both branches return the same type so the rows stack uniformly.

    Raises:
        IndexError: If a non-clinical label has fewer than eight components.
    """
    dataset_components = dataset_label.split('_')
    feature_type = get_feature_type(dataset_components)

    if feature_type == 'clinical':
        # Clinical-only datasets carry no MRI-related components
        prep = get_prep_clin(dataset_components)
        return np.array([feature_type, 'N/A', 'N/A', 'N/A', 'N/A', prep])

    feature_set = get_feature_set(dataset_components)
    scope = get_scope(dataset_components)
    mri_type = get_mri_type(dataset_components)
    algorithm = get_algorithm(dataset_components)
    prep = get_prep_img(dataset_components)
    return np.array([feature_type, feature_set, scope, mri_type, algorithm, prep])

# Build the "analysis" columns: parse each dataset label and stack the rows into one 2-D array
new_vals = np.stack([parse_dataset(label) for label in results_df['dataset']])
results_df.loc[:, data_indices] = new_vals

In [None]:
# Show the freshly parsed analysis columns
results_df.loc[:, data_indices]

### Other

In [None]:
# Analysis index: the columns which together identify one "methodology".
# Grouping on them gathers the samples that share the same methodology.
analysis_idx = ["study", "model"] + data_indices
# The same index extended with the replicate column (which allows variation to be calculated)
analysis_idx_w_replicates = analysis_idx + ["replicate"]

## Best Performance Across Trials

### Setup

In [None]:
def get_values_at_other_optima(grouping_cols: list[str], other_cols: list[str], ascending: list[bool], df: pd.DataFrame, n=1):
    """Return, for each group, the ``n`` rows ranked last by ``other_cols``.

    The caller picks the sort direction of every column in ``other_cols`` (via
    ``ascending``) such that the optimum ends up at the bottom; the last ``n``
    rows of each group are then kept.

    Args:
        grouping_cols: Columns defining the groups.
        other_cols: Columns to sort by, in order of priority.
        ascending: Sort direction for each entry of ``other_cols``.
        df: The DataFrame to search; it is not modified.
        n: Number of rows to keep per group.

    Returns:
        The selected rows, in sorted order, with their original index labels.
    """
    return (
        df.sort_values(by=other_cols, ascending=ascending)
        .groupby(grouping_cols)
        .tail(n)
    )

In [None]:
def build_metric_report(target_col: str, df: pd.DataFrame):
    """Summarise ``target_col`` per analysis as a mean / standard-deviation table.

    Args:
        target_col: Name of the metric column to summarise.
        df: Results containing the ``analysis_idx`` columns and ``target_col``.

    Returns:
        A DataFrame indexed by the ``analysis_idx`` columns with "MEAN" and
        "STD" columns, sorted by ascending "MEAN".
    """
    # Cast to floating point, to suppress "cannot apply to object type" errors
    indexed_df = df.set_index(analysis_idx)
    indexed_df = indexed_df.astype({target_col: 'float32'})

    # One group per analysis; the replicates within it provide the spread
    per_analysis = indexed_df.groupby(analysis_idx)[target_col]

    # Gather both statistics into a single table for easier management
    report_df = pd.DataFrame({
        "MEAN": per_analysis.mean(),
        "STD": per_analysis.std(),
    })

    # Worst mean first, best mean last
    return report_df.sort_values("MEAN")

### Testing Balanced Accuracy @ Peak Validation Balanced Accuracy

In [None]:
# Rank by validation balanced accuracy (ascending, so the peak lands last); ties are
# broken by validation log-loss (descending, so the lower loss lands last)
sorting_cols = ['balanced_accuracy (validate)', 'log_loss (validate)']
validation_optima_dir = [True, False]
target_metric = 'balanced_accuracy (test)'

# Keep, for every replicate of every analysis, the row at the validation optimum
bacc_validation_optima_df = get_values_at_other_optima(
    grouping_cols=analysis_idx_w_replicates,
    other_cols=sorting_cols,
    ascending=validation_optima_dir,
    df=results_df,
)

build_metric_report(target_col=target_metric, df=bacc_validation_optima_df)

### Testing Balanced Accuracy @ Minimum Log-Loss 

In [None]:
# Rank by validation log-loss (descending, so the minimum lands last); ties are
# broken by validation balanced accuracy (ascending, so the higher accuracy lands last)
sorting_cols = ['log_loss (validate)', 'balanced_accuracy (validate)']
validation_optima_dir = [False, True]
target_metric = 'balanced_accuracy (test)'

# Keep, for every replicate of every analysis, the row at the validation optimum
log_loss_validation_optima_df = get_values_at_other_optima(
    grouping_cols=analysis_idx_w_replicates,
    other_cols=sorting_cols,
    ascending=validation_optima_dir,
    df=results_df,
)

build_metric_report(target_col=target_metric, df=log_loss_validation_optima_df)

## Statistical Comparisons

### Setup

In [None]:
from itertools import permutations

from scipy.stats import ranksums, kruskal, false_discovery_control

# Comparison symbol used to label a test, keyed by its `alternative` hypothesis name
alt_keys = dict(zip(
    ('two-sided', 'greater', 'less'),
    ('!=', '>', '<'),
))

In [None]:
def paired_rankedsum(df: pd.DataFrame, query: str, target: str, alternative: str = 'two-sided'):
    """Run a Wilcoxon rank-sum test between every ordered pair of groups.

    The rows of ``df`` are grouped by their value in the ``query`` column, and
    the ``target`` values of each group are tested against those of every
    other group. Both orderings of each pair are tested, which matters for the
    one-sided alternatives.

    Args:
        df: The results to compare; it is not modified.
        query: Name of the (single) column whose distinct values define the groups.
        target: Name of the numeric column holding the values being compared.
        alternative: Alternative hypothesis passed to ``ranksums``; must be a
            key of ``alt_keys`` ('two-sided', 'greater' or 'less').

    Returns:
        A DataFrame with one 'p' column, indexed by a 'Comparison' label of the
        form ``"<v1> <symbol> <v2> [<query>]"``. It is empty when ``query``
        holds fewer than two distinct values.
    """
    # Distinct, non-missing values in order of first appearance (deterministic, unlike a set)
    query_values = df[query].dropna().unique()

    # Select each group's samples once with a boolean mask; unlike a string-built
    # `df.query`, this is safe for column names with spaces, values containing
    # quotes, and non-string values
    samples = {
        v: df.loc[df[query] == v, target].astype(float)
        for v in query_values
    }

    # Calculate the rank-sum p-value of each ordered pair under the requested alternative
    labels = []
    pvals = []
    for v1, v2 in permutations(query_values, 2):
        p = ranksums(samples[v1], samples[v2], alternative=alternative).pvalue
        labels.append(f"{v1} {alt_keys[alternative]} {v2} [{query}]")
        pvals.append(p)

    # Save the results as a dataframe (well-formed even when there are no pairs)
    return_df = pd.DataFrame({'p': pd.Series(pvals, index=labels, dtype=float)})
    return_df.index.name = 'Comparison'
    return return_df

In [None]:
# Calculate the p-values for whether one experimental permutation has greater average balanced accuracy (testing) than another
sub_dfs = []
target = 'balanced_accuracy (test)'
for k in analysis_idx:
    # Check the DataFrame that is actually compared below, not the unfiltered results
    if bacc_validation_optima_df[k].nunique() < 2:
        logger.warning(f"Column '{k}' was homogenous, cannot split for statistical comparisons!")
        continue
    tmp_df = paired_rankedsum(bacc_validation_optima_df, k, target, alternative='greater')
    sub_dfs.append(tmp_df)

if sub_dfs:
    # Most significant comparisons first
    sig_test_at_peak_valid_df = pd.concat(sub_dfs).sort_values('p')
else:
    # pd.concat raises on an empty list; fall back to an empty table of the same layout
    sig_test_at_peak_valid_df = pd.DataFrame(columns=['p'], index=pd.Index([], name='Comparison'))

In [None]:
# Show the 20 comparisons with the smallest p-values
sig_test_at_peak_valid_df.iloc[:20]