In [1]:
import os
import pickle
import numpy as np
import scipy.stats

In [2]:
def t_test(ours, theirs, p_value=.05):
    """Compare two samples with an independent two-sample t-test.

    Args:
        ours: Sample of scores obtained by our method.
        theirs: Sample of scores obtained by the competing method.
        p_value: Significance threshold; the difference only counts when the
            p-value of the test is less than or equal to it.

    Returns:
        'better' when the difference is significant and the t statistic is
        positive, 'worse' when it is significant and the statistic is not
        positive, and 'even' when the difference is not significant.
    """
    statistic, significance = scipy.stats.ttest_ind(ours, theirs)
    # An undefined (NaN) p-value fails this comparison and is reported as 'even'.
    if not significance <= p_value:
        return 'even'
    return 'better' if statistic > 0 else 'worse'


# LaTeX marker printed for each possible outcome of `t_test`.
t_test_symbols = dict(
    better='$\\bullet$',
    worse='$\\circ$',
    even='$\\equiv$',
)


# Methods grouped by experimental setting. The order of the settings and of the
# methods inside each setting is the order in which rows are iterated later.
methods_per_settings = {
    'Single Best': ['NN-A', 'DAN-A', 'DANN-A', 'DSAN-A', 'FT-A', 'MCD-A'],
    'Source Combine': ['NN-B', 'DANN-B', 'DAN-B', 'DSAN-B', 'FT-B', 'MCD-B'],
    'Multi Source': ['MDAN', 'MFSAN', 'M3SDA', 'ABMSDA', 'OURS', 'OURS-beta'],
}


# Metric key as stored in the result files -> label shown in the table.
clean_metrics = dict(
    bal_acc='bACC',
    auc='AUC',
    f1='F1',
)

# Benchmark whose result table is generated; enable exactly one of these lines.
# dataset = 'DIGITS'
# dataset = 'DOMAINNET'
dataset = 'COVID'

# Number of runs; it is part of the result file names.
runs = 5

# Per-dataset settings:
#   clean_domains - domain key in the result files -> column title in the table
#   batch_size    - batch size encoded in the result file names
#   epochs        - number of epochs encoded in the result file names
#   py            - prefix inserted before the method name in the file names
#   ours_t_test   - method used as the reference side of every t-test
if dataset == 'DIGITS':
    clean_domains = {
        'mnist': 'MNIST',
        'mnistm': 'MNIST-M',
        'svhn': 'SVHN',
        'syn': 'SYN',
        'usps': 'USPS'
    }
    batch_size = 128
    epochs = 150
    py = 'PYHIDIFF_'
    ours_t_test = 'OURS-beta'
elif dataset == 'DOMAINNET':
    clean_domains = {
        'clipart': 'CLIP',
        'infograph': 'INFO',
        'painting': 'PAINT',
        'real': 'REAL',
        'sketch': 'SKETCH',
        'quickdraw': 'QUICK',
    }
    batch_size = 50
    epochs = 200
    py = ''
    ours_t_test = 'OURS-beta'
elif dataset == 'COVID':
    clean_domains = {
        '1': '1',
        '2': '2',
        '3': '3',
        '4': '4',
        '5': '5',
    }
    batch_size = 50
    epochs = 100
    py = ''
    ours_t_test = 'OURS'
else:
    # Fail here instead of silently reusing settings left in the kernel by a
    # previous run, or hitting a NameError in a later cell.
    raise ValueError(
        f"Unknown dataset {dataset!r}; expected 'DIGITS', 'DOMAINNET' or 'COVID'"
    )

In [4]:
# Print the body of a LaTeX results table: one row per (method, metric), two
# columns per domain (mean +/- std, significance marker) plus an average pair.
end_line = ' \\\\'


def _load_results(method):
    """Load the pickled results of `method`, or return None when the file is missing."""
    path = f'results/{dataset}_{py}{method.replace("-", "_")}_r{runs}_e{epochs}_b{batch_size}.pickle'
    if not os.path.exists(path):
        return None
    # NOTE(review): pickle can execute arbitrary code while loading; only read
    # result files that were produced locally.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _column_means(metrics, metric):
    """Return {column: mean of `metric`} for every domain plus the pooled 'avg' column."""
    means, pooled = {}, []
    for domain in clean_domains:
        values = metrics[domain][metric]
        pooled.extend(values)
        means[domain] = np.mean(values)
    means['avg'] = np.mean(pooled)
    return means


def _per_run_average(metrics, metric):
    """Average `metric` over all domains separately for each run index.

    Assumes metrics[domain][metric] holds one value per run, in run order
    (TODO confirm against the code that writes the result files).
    """
    return np.array([
        np.mean([metrics[domain][metric][run] for domain in clean_domains])
        for run in range(runs)
    ])


def _fmt(value, metric):
    """Format a number: AUC is rescaled by 1/100 and printed without the leading zero."""
    if metric == 'auc':
        return f'{value / 100.:.04f}'.lstrip('0')
    return f'{value:.02f}'


def _emphasis(value, column, metric):
    """Return the opening/closing LaTeX tags that highlight best and second-best cells."""
    if value == bests[column][metric]:
        return '\\mathbf{', '}'
    if value == second_bests[column][metric]:
        return '\\underline{', '}'
    return '', ''


# ---- Table header ---------------------------------------------------------
print('\\hline')
line = 'Setting & Method & Metric &'
last_domain = len(clean_domains) - 1
for i, clean_domain in enumerate(clean_domains.values()):
    # The last domain gets a double rule to separate it from the average.
    rule = '||' if i == last_domain else '|'
    line += f' \\multicolumn{{2}}{{l{rule}}}{{{clean_domain}}} &'
line += ' \\multicolumn{2}{l|}{Avg}'
print(line + end_line)
print('\\hline')

# ---- Load every result file exactly once ----------------------------------
results_by_method = {
    method: _load_results(method)
    for methods in methods_per_settings.values()
    for method in methods
}
# Reference results for the t-tests; None when that result file is missing.
ours = results_by_method.get(ours_t_test)
metrics_ours = {
    method: metrics
    for method, metrics in results_by_method.items()
    if 'OURS' in method and metrics is not None
}
means_by_method = {
    method: {metric: _column_means(metrics, metric) for metric in clean_metrics}
    for method, metrics in results_by_method.items()
    if metrics is not None
}

# ---- Best and second-best mean per (column, metric) -----------------------
columns = list(clean_domains.keys()) + ['avg']
bests = {column: {metric: 0 for metric in clean_metrics} for column in columns}
second_bests = {column: {metric: 0 for metric in clean_metrics} for column in columns}
for per_metric in means_by_method.values():
    for metric, per_column in per_metric.items():
        for column, value in per_column.items():
            if value >= bests[column][metric]:
                bests[column][metric] = value
# The second best is the largest mean strictly below the best one.
for per_metric in means_by_method.values():
    for metric, per_column in per_metric.items():
        for column, value in per_column.items():
            if bests[column][metric] > value >= second_bests[column][metric]:
                second_bests[column][metric] = value

# ---- Table body -----------------------------------------------------------
for setting, methods in methods_per_settings.items():
    line = f'\\multirow{{{len(methods) * len(clean_metrics)}}}{{*}}{{\\parbox{{1.4cm}}{{{setting}}}}} '
    for i, method in enumerate(methods):
        line += f'& \\multirow{{{len(clean_metrics)}}}{{*}}{{{method}}} &'\
            .replace('OURS', 'WMSSDA').replace('beta', '$\\beta$').replace('-A', '').replace('-B', '')
        metrics = results_by_method[method]
        # Our own rows carry no marker; neither does any row when the
        # reference results are unavailable.
        compare = ours is not None and 'OURS' not in method
        for metric, clean_metric in clean_metrics.items():
            line += f' {clean_metric} &'
            if metrics is not None:
                means = means_by_method[method][metric]
                for domain in clean_domains:
                    metric_values = metrics[domain][metric]
                    opening, closing = _emphasis(means[domain], domain, metric)
                    cell = f'{_fmt(means[domain], metric)} \\pm {_fmt(np.std(metric_values), metric)}'
                    line += f' ${opening}{cell}{closing}$ &'
                    if compare:
                        ## Run t-test between current cell and corresponding cell in our method
                        line += f' {t_test_symbols[t_test(ours[domain][metric], metric_values)]}'
                    line += ' &'
                opening, closing = _emphasis(means['avg'], 'avg', metric)
                line += f' ${opening}{_fmt(means["avg"], metric)}{closing}$ &'
                if compare:
                    ## Run t-test between ours averages and theirs, one
                    ## cross-domain average per run so the samples keep their
                    ## run-to-run variance.
                    outcome = t_test(_per_run_average(ours, metric),
                                     _per_run_average(metrics, metric))
                    line += f' {t_test_symbols[outcome]}'
            else:
                # Keep the row as wide as a filled one: two cells per domain
                # plus the average value cell.
                line += ' &' * (2 * len(clean_domains) + 1)
            print(line + end_line)
            line = '& &'
        if i < len(methods) - 1:
            print(f'\\hhline{{~|{"-" * (4 + len(clean_domains) * 2)}}}')
        line = ''
    print('\\hline')

\hline
Setting & Method & Metric & \multicolumn{2}{l|}{1} & \multicolumn{2}{l|}{2} & \multicolumn{2}{l|}{3} & \multicolumn{2}{l|}{4} & \multicolumn{2}{l||}{5} & \multicolumn{2}{l|}{Avg} \\
\hline
\multirow{18}{*}{\parbox{1.4cm}{Single Best}} & \multirow{3}{*}{NN} & bACC & $86.15 \pm 1.15$ & $\bullet$ & $85.57 \pm 0.46$ & $\bullet$ & $85.41 \pm 0.37$ & $\bullet$ & $83.47 \pm 0.30$ & $\bullet$ & $85.91 \pm 2.92$ & $\bullet$ & $85.30$ & $\bullet$ \\
& & AUC & $.9257 \pm .0083$ & $\bullet$ & $.9145 \pm .0042$ & $\bullet$ & $.8983 \pm .0042$ & $\bullet$ & $.8878 \pm .0066$ & $\bullet$ & $.9621 \pm .0048$ & $\bullet$ & $.9177$ & $\bullet$ \\
& & F1 & $81.52 \pm 1.14$ & $\bullet$ & $82.06 \pm 0.83$ & $\bullet$ & $80.79 \pm 1.00$ & $\bullet$ & $78.80 \pm 0.68$ & $\bullet$ & $81.79 \pm 3.55$ & $\bullet$ & $80.99$ & $\bullet$ \\
\hhline{~|--------------}
& \multirow{3}{*}{DAN} & bACC & $87.09 \pm 1.01$ & $\bullet$ & $80.96 \pm 7.17$ & $\equiv$ & $79.50 \pm 4.82$ & $\bullet$ & $82.36 \pm 1.14$ & 

  t, p = scipy.stats.ttest_ind(ours, theirs)
