In [1]:
import os.path
import numpy as np
import pickle

In [2]:
# Parameters
# Key used to pull the score out of every loaded results dictionary.
metric = 'auc_ovo'

# Per-dataset experiment settings. They select the method whose results are
# reported ('best_method') and are embedded in the names of the results files
# (n_runs, n_imputations, epochs, alpha).
params = {
    'IRIS': dict(n_runs=200, best_method='SINKHORN', n_imputations=20, epochs=50, alpha=1.),
    'AUSTRALIAN': dict(n_runs=200, best_method='MISSFOREST', n_imputations=20, epochs=20, alpha=1.),
    'WINE': dict(n_runs=200, best_method='SINKHORN', n_imputations=20, epochs=50, alpha=1.),
    'PIMA': dict(n_runs=200, best_method='SINKHORN', n_imputations=20, epochs=50, alpha=1.),
    'ABALONE': dict(n_runs=200, best_method='MISSFOREST', n_imputations=20, epochs=50, alpha=.3),
}

# Dataset names in declaration order; this is also the row-group order of the table.
datasets = list(params)

# Short dataset labels printed in the table.
clean_datasets = dict(
    IRIS='IRIS',
    AUSTRALIAN='STAT',
    WINE='WINE',
    PIMA='PIMA',
    ABALONE='ABAL',
)

# Short method labels (not referenced by the cells visible here).
clean_methods = dict(
    MISSFOREST='MISS',
    SOFTIMPUTE='SOFT',
    GAIN='GAIN',
    MIDA='MIDA',
    SINKHORN='SINK',
    MICE='MICE',
)

# Missing-data grid: every (mode, rate) pair yields one table row per dataset.
missing_modes = ['MCAR', 'MAR', 'MNAR']
missing_rates = [.1, .15, .25]
num_lines = len(missing_rates) * len(missing_modes)

# Approaches compared against the MICE baseline, in column order.
approaches = ['SINGLE', 'MULTIPLE', 'SINGLE-HOT-PATCH', 'HOT-PATCH']

In [3]:
def get_ranks(array):
    """Rank the values of a 1-D sequence in descending order, averaging ties.

    The largest value gets rank 1.0. Values that occur more than once all
    receive the mean of the ranks their positions would otherwise occupy
    (e.g. two values tied for ranks 1 and 2 both get 1.5).

    Args:
        array: 1-D sequence of numeric scores.

    Returns:
        A float numpy array of the same length holding the rank of each entry.
    """
    _, group_of, group_size = np.unique(array, return_inverse=True, return_counts=True)

    # Ordinal ranks first: argsort of the argsort of the negated scores gives
    # each entry's position in descending order (0-based), then shift to 1-based.
    descending_order = (-np.array(array)).argsort()
    ranks = descending_order.argsort() * 1. + 1.

    # Replace the arbitrary ordinal ranks inside each tie group with their mean.
    for tied_group in np.where(group_size > 1)[0]:
        members = np.where(group_of == tied_group)[0]
        ranks[members] = np.mean(ranks[members])
    return ranks

# Build the LaTeX results table: one row per (dataset, missing mode, missing rate),
# with the mean score and the rank of MICE and of every approach, followed by a
# final row with the average rank of each column.
# all_ranks collects one rank vector per printed row; rows whose result files
# are missing are skipped and contribute nothing.
all_ranks = []
for dataset in datasets:
    best_method = params[dataset]['best_method']
    n_imputations = params[dataset]['n_imputations']
    n_runs = params[dataset]['n_runs']
    epochs = params[dataset]['epochs']
    alpha = params[dataset]['alpha']

    # The MICE baseline file depends only on the dataset settings, so it is read
    # once per dataset instead of once per table row.
    # NOTE: pickle.load can execute arbitrary code; only load trusted result files.
    mice_path = f'results/MICE_{dataset}_r{n_runs}_e{epochs}.pickle'
    mice_metrics = None
    if os.path.isfile(mice_path):
        with open(mice_path, 'rb') as mice_file:
            mice_metrics = pickle.load(mice_file)

    for i, missing_mode in enumerate(missing_modes):
        if i > 0:
            # Horizontal rule between missing-mode groups inside one dataset.
            print(f'\\cline{{2-{len(approaches) * 2 + 5}}}')
        for j, missing_rate in enumerate(missing_rates):
            if mice_metrics is None:
                continue

            # File names encode the rate as its two decimals (0.15 -> "15")
            # and alpha with "_" in place of the decimal point (0.3 -> "0_3").
            rate_tag = f"{missing_rate:.02f}".split(".")[-1]
            alpha_tag = str(alpha).replace(".", "_")
            exp_name = f'{dataset}_{missing_mode}_{rate_tag}_' + \
                       f'n{n_imputations}_r{n_runs}_e{epochs}_a{alpha_tag}'
            exp_path = f'results/{exp_name}.pickle'
            if not os.path.isfile(exp_path):
                continue
            with open(exp_path, 'rb') as exp_file:
                metrics = pickle.load(exp_file)

            # Leading cells: dataset label (first row of the dataset only),
            # missing mode (first row of the mode only) and the missing rate.
            line = f'\\multirow{{{num_lines}}}{{*}}{{{clean_datasets[dataset]}}} &' if i + j == 0 else '&'
            line += f' \\multirow{{{len(missing_rates)}}}{{*}}{{{missing_mode}}} &' if j == 0 else ' &'
            line += f' {round(missing_rate * 100.)}\\% &'

            # Mean score per column, divided by 100 and rounded to 7 decimals.
            v = {'MICE': round(np.mean(mice_metrics[missing_mode][missing_rate][metric]) / 100., 7)}
            values = [v['MICE']]
            for approach in approaches:
                v[approach] = round(np.mean(metrics[best_method][approach][metric]) / 100., 7)
                values.append(v[approach])
            ranks = get_ranks(values)
            all_ranks.append(ranks)

            best_value = max(values)
            for k, approach in enumerate(['MICE'] + approaches):
                bold = v[approach] == best_value
                line += f' $\\mathbf{{{v[approach]:.07f}}}$ &' if bold else f' ${v[approach]:.07f}$ &'
                # Whole ranks are printed without a decimal part; tied (averaged) ranks keep it.
                line += f' ({int(ranks[k]) if ranks[k].is_integer() else ranks[k]})'
                if k < len(approaches):
                    line += ' &'
            print(line + ' \\\\')
    print('\\hline')

# Final row: average rank of every column over all printed rows.
line = f'\\multicolumn{{3}}{{|l|}}{{Average rank}}'
for r in np.array(all_ranks).mean(axis=0):
    line += f' & \\multicolumn{{2}}{{c|}}{{{r:.04f}}}'
print(line + ' \\\\')
print('\\hline')

\multirow{9}{*}{IRIS} & \multirow{3}{*}{MCAR} & 10\% & $0.9964543$ & (4) & $0.9958323$ & (5) & $\mathbf{0.9973218}$ & (1) & $0.9968544$ & (3) & $0.9972525$ & (2) \\
& & 15\% & $0.9916368$ & (5) & $0.9947010$ & (4) & $0.9967011$ & (2) & $0.9961452$ & (3) & $\mathbf{0.9967655}$ & (1) \\
& & 25\% & $0.9878213$ & (5) & $0.9879101$ & (4) & $0.9931490$ & (2) & $0.9920251$ & (3) & $\mathbf{0.9933714}$ & (1) \\
\cline{2-13}
& \multirow{3}{*}{MAR} & 10\% & $0.9963453$ & (5) & $0.9977831$ & (4) & $\mathbf{0.9983718}$ & (1) & $0.9980598$ & (3) & $0.9983512$ & (2) \\
& & 15\% & $0.9968731$ & (3) & $0.9964883$ & (5) & $0.9969672$ & (2) & $0.9966728$ & (4) & $\mathbf{0.9970707}$ & (1) \\
& & 25\% & $\mathbf{0.9972507}$ & (1) & $0.9934700$ & (5) & $0.9940230$ & (3) & $0.9937325$ & (4) & $0.9940345$ & (2) \\
\cline{2-13}
& \multirow{3}{*}{MNAR} & 10\% & $0.9973377$ & (3) & $0.9968828$ & (5) & $0.9975337$ & (2) & $0.9969866$ & (4) & $\mathbf{0.9975402}$ & (1) \\
& & 15\% & $0.9923837$ & (4) & $0.992218

In [4]:
def chi_2(N, k, ranks):
    s = 0
    for rank in ranks:
        s += rank**2
    first = (12 * N) / (k * (k + 1))
    second = s - (k * (k + 1)**2 / 4)
    return first * second

def FF(N, k, chi2):
    """Compute (N - 1) * chi2 / (N * (k - 1) - chi2).

    This has the form of the Iman-Davenport correction of a Friedman
    chi-square statistic.

    Args:
        N: number of ranked rows.
        k: number of compared columns.
        chi2: statistic returned by ``chi_2`` for the same N and k.

    Returns:
        The corrected statistic. Division by zero occurs when
        chi2 equals N * (k - 1).
    """
    numerator = (N - 1) * chi2
    denominator = (N * (k - 1)) - chi2
    return numerator / denominator

# Significance statistics over the rank table built in the previous cell.
# N is the number of rows that were actually ranked. Rows whose result files
# were missing are skipped when the table is built, so len(all_ranks) is used
# rather than len(datasets) * num_lines; the two agree when no file is missing.
N = len(all_ranks)
k = len(approaches) + 1 # 4 approaches + 1 imputation method (MICE)
ranks = np.array(all_ranks).mean(axis=0)

chi2 = chi_2(N, k, ranks)
print(f'chi2: {chi2}')

ff = FF(N, k, chi2)
print(f'ff: {ff}')

# Critical difference: q_alpha * sqrt(k * (k + 1) / (6 * N)).
# NOTE(review): 2.728 looks like the Nemenyi critical value for k = 5 at
# alpha = 0.05 -- confirm, and update it if the number of columns changes.
q_alpha = 2.728
print(f'CD: {q_alpha * ((k * (k + 1)) / (6 * N))**.5}')

chi2: 117.01333333333342
ff: 81.74089754445403
CD: 0.9093333333333333
