# Analysing and comparing statistics of different generation methods

In [15]:
import numpy as np
import torch
import matplotlib.pyplot as plt
import pickle as pkl
from tabulate import tabulate

In [8]:
# Load the statistics of the three generation methods (random, step, mcts).
# For every method four pickles are read from its statistics folder:
# efficiencies, counterfactual lengths, original lengths and start points.
#
# The paths are raw strings so that the Windows backslashes are never parsed as
# escape sequences ("\.", "\d", "\s", "\e" and "\l" are invalid escapes and
# trigger a warning on recent Python versions). The resulting path values are
# identical to the ones used before, including the on-disk file name
# "effiencies.pkl".


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file afterwards."""
    with open(path, 'rb') as f:
        return pkl.load(f)


path_random = r'..\..\datasets\100random\100\statistics'
random_efficiencies = _load_pickle(path_random + r"\effiencies.pkl")
length_cf_random = _load_pickle(path_random + r"\lengths_cf.pkl")
length_org_random = _load_pickle(path_random + r"\lengths_org.pkl")
start_points_random = _load_pickle(path_random + r"\start_points.pkl")

# Only the first 100 entries of each step statistic are kept
# (presumably to match the sample count of the other methods - TODO confirm).
path_step = r'..\..\datasets\100step\100\statistics'
step_efficiencies = _load_pickle(path_step + r"\effiencies.pkl")[:100]
length_cf_step = _load_pickle(path_step + r"\lengths_cf.pkl")[:100]
length_org_step = _load_pickle(path_step + r"\lengths_org.pkl")[:100]
start_points_step = _load_pickle(path_step + r"\start_points.pkl")[:100]

path_mcts = r'..\..\datasets\100mcts\100\statistics'
mcts_efficiencies = _load_pickle(path_mcts + r"\effiencies.pkl")
length_cf_mcts = _load_pickle(path_mcts + r"\lengths_cf.pkl")
length_org_mcts = _load_pickle(path_mcts + r"\lengths_org.pkl")
start_points_mcts = _load_pickle(path_mcts + r"\start_points.pkl")

In [24]:
# Weight of each quality criterion in the combined quality-criteria (qc) score.
weight = {'validity': 1, 'proximity': 1, 'critical_state': 0.5, 'diversity': 0.5, 'realisticness': 0.2, 'sparsity': 0.5}

# Normalisation constants, indexed with the same keys as `weight`.
# Raw strings keep the Windows backslashes from being parsed as escape sequences.
with open(r'..\..\interpretability\normalisation_values.pkl', 'rb') as f:
    normalisation = pkl.load(f)

# One list of weighted, normalised scores per (method, criterion) pair.
mcts_prox, step_prox, random_prox = [], [], []
mcts_val, step_val, random_val = [], [], []
mcts_div, step_div, random_div = [], [], []
mcts_crit, step_crit, random_crit = [], [], []
mcts_real, step_real, random_real = [], [], []
mcts_spar, step_spar, random_spar = [], [], []
# Not filled in this cell; kept because other cells may reference these names.
mcts_qc, step_qc, random_qc = [], [], []

# (keyword searched for in the log line, key into weight/normalisation, sign,
#  target lists in the column order mcts, step, random).
# The order of the entries matters: the first keyword found in a line wins.
# Proximity is negated before it is normalised and weighted.
_criteria = [
    ('validity', 'validity', 1, (mcts_val, step_val, random_val)),
    ('diversity', 'diversity', 1, (mcts_div, step_div, random_div)),
    ('proximity', 'proximity', -1, (mcts_prox, step_prox, random_prox)),
    ('critical', 'critical_state', 1, (mcts_crit, step_crit, random_crit)),
    ('realistic', 'realisticness', 1, (mcts_real, step_real, random_real)),
    ('sparsity', 'sparsity', 1, (mcts_spar, step_spar, random_spar)),
]

with open(r'..\..\interpretability\logs\qc_comparison.txt', 'r') as f:
    for line in f:
        parts = line.split(' ')
        for keyword, key, sign, targets in _criteria:
            if keyword not in line:
                continue
            # Fields 1, 2 and 3 of the line hold the mcts, step and random values.
            for column, target in enumerate(targets, start=1):
                target.append(sign * float(parts[column]) / normalisation[key] * weight[key])
            break

# Combined qc score per method: sum of the mean weighted criteria.
qc_mcts = (
    np.mean(mcts_prox) + np.mean(mcts_val) + np.mean(mcts_div)
    + np.mean(mcts_crit) + np.mean(mcts_real) + np.mean(mcts_spar)
)
qc_step = (
    np.mean(step_prox) + np.mean(step_val) + np.mean(step_div)
    + np.mean(step_crit) + np.mean(step_real) + np.mean(step_spar)
)
qc_random = (
    np.mean(random_prox) + np.mean(random_val) + np.mean(random_div)
    + np.mean(random_crit) + np.mean(random_real) + np.mean(random_spar)
)

In [28]:
def _mean_row(label, *series):
    """Return a table row: *label* followed by the mean of each series, rounded to 2 decimals."""
    return [label] + [round(np.mean(values), 2) for values in series]


# Summary table: one column per generation method, one row per statistic.
# The first row is the header; the dashed row visually separates the
# generation statistics from the quality-criteria scores.
_separator = ['----------'] * 4

table = [
    ['statistics', 'mcts', 'step', 'random'],
    _mean_row('efficiency', mcts_efficiencies, step_efficiencies, random_efficiencies),
    _mean_row('length cf', length_cf_mcts, length_cf_step, length_cf_random),
    _mean_row('length org', length_org_mcts, length_org_step, length_org_random),
    _mean_row('start points', start_points_mcts, start_points_step, start_points_random),
    _separator,
    _mean_row('validity', mcts_val, step_val, random_val),
    _mean_row('proximity', mcts_prox, step_prox, random_prox),
    _mean_row('diversity', mcts_div, step_div, random_div),
    _mean_row('critical', mcts_crit, step_crit, random_crit),
    _mean_row('realistic', mcts_real, step_real, random_real),
    _mean_row('sparsity', mcts_spar, step_spar, random_spar),
    # The qc totals are already scalars, so they are only rounded.
    ['qc', round(qc_mcts, 2), round(qc_step, 2), round(qc_random, 2)],
]

print(tabulate(table, headers='firstrow'))

### Observations

MCTS is by far the slowest. This makes sense, because it considers many more options.

MCTS produces much shorter trajectories (basically always of minimum length). This indicates that
- diversity is not properly optimised for
- the quality criteria are not traded off well against each other

The lengths of the MCTS org and cf trajectories are the same (this is built into the method), while step and random have different lengths.
- These differences come from when I decide to end the trajectories. A good ablation would try to reduce this effect, e.g. by ending the originals of step at the same timestep as the cf.

For step, the counterfactual trajectories are longer than the originals. Maybe this is built into the method? But I think it's more likely a byproduct of some quality criteria incentivising this difference in lengths.

MCTS starts slightly later than step. Random starts much later because it has uniformly distributed starting points.


