In [3]:
import os

# Define paths
# All three are relative paths, so they resolve against the current working
# directory of the process running this script.
# Baseline results file; parsed below without the outlier-technique column.
results_path = "results.txt"
# Results file parsed below with with_outliers=True (extra technique column).
outliers_path = "results_with_outliers.txt"
# Directory scanned below for per-dataset log files ending in "_outliers.txt".
logs_folder = "outlier_logs"

# Helper functions
def parse_results_file(filepath, with_outliers=False, *, min_acc=0.3, max_acc=0.99):
    """Parse a comma-separated results file into a dict keyed by (dataset, model).

    Each usable line has the form ``dataset,model,accuracy`` or, when
    *with_outliers* is true, ``dataset,model,outlier_technique,accuracy``.
    Whitespace around every field is ignored, so ``"a, mlp, 0.9"`` and
    ``"a,mlp,0.9"`` produce the same key.

    A line is silently skipped (never raised on) when it:
      * is blank or has the wrong number of fields,
      * contains a field equal to ``ERROR``,
      * has a non-numeric accuracy (this is also what drops a header row),
      * has an accuracy outside ``[min_acc, max_acc]``.

    Args:
        filepath: Path of the text file to read.
        with_outliers: Whether lines carry the extra outlier-technique column.
        min_acc: Lowest accuracy that is kept (inclusive).
        max_acc: Highest accuracy that is kept (inclusive).

    Returns:
        ``{(dataset, model): accuracy}``, or with *with_outliers*
        ``{(dataset, model): (outlier_technique, accuracy)}``. If the same
        (dataset, model) pair appears on several lines, the last one wins.

    Raises:
        OSError: If *filepath* cannot be opened.
    """
    expected_fields = 4 if with_outliers else 3
    results = {}
    with open(filepath, 'r') as f:
        for line in f:
            # Strip per field (not just per line) so stray spaces after the
            # commas cannot leak into the lookup keys.
            parts = [part.strip() for part in line.split(',')]
            if 'ERROR' in parts or len(parts) != expected_fields:
                continue
            try:
                acc = float(parts[-1])
            except ValueError:
                # Non-numeric accuracy column, e.g. a header row.
                continue
            if not (min_acc <= acc <= max_acc):
                continue
            key = (parts[0], parts[1])
            results[key] = (parts[2], acc) if with_outliers else acc
    return results

def parse_outlier_log(filepath):
    """Parse an outlier log into ``{(split, technique): percent_detected}``.

    The file is read as blocks separated by blank lines. A block is used
    only if it has exactly three lines shaped like::

        <dataset> - <split> (<technique>):
        <any label>: <detected count>
        <any label>: <total count>

    The percentage stored is ``detected / total * 100``. The dataset name in
    the header is not used; callers key the result by file name instead.

    Blocks that do not match this shape (missing `` - `` separator, missing
    ``:``, non-integer counts, more than one ``(`` in the header) and blocks
    whose total is zero are skipped, so one bad block does not discard the
    rest of the file.

    Args:
        filepath: Path of the log file to read.

    Returns:
        Dict mapping ``(split, technique)`` to the detected percentage as a
        float. Empty if no block is usable.

    Raises:
        OSError: If *filepath* cannot be opened.
    """
    data = {}
    with open(filepath, 'r') as f:
        blocks = f.read().strip().split("\n\n")
    for block in blocks:
        block_lines = block.strip().split("\n")
        if len(block_lines) != 3:
            continue
        try:
            _dataset, rest = block_lines[0].strip().split(" - ")
            # "train (iforest):" -> "train (iforest" -> ["train ", "iforest"]
            split, technique = rest.strip("():").split("(")
            detected = int(block_lines[1].split(":")[1])
            total = int(block_lines[2].split(":")[1])
        except (ValueError, IndexError):
            # Malformed header or count line: skip this block only.
            continue
        if total == 0:
            # Percentage is undefined for an empty split.
            continue
        data[(split.strip(), technique.strip())] = detected / total * 100
    return data

# Load both result tables: baseline accuracies, and accuracies obtained with
# an outlier technique applied.
results = parse_results_file(results_path)
results_with_outliers = parse_results_file(outliers_path, with_outliers=True)

# Build {dataset: parsed log} from every "*_outliers.txt" file in the logs
# folder; the dataset name is the file name with that marker removed.
log_data = {
    log_name.replace("_outliers.txt", ""): parse_outlier_log(
        os.path.join(logs_folder, log_name)
    )
    for log_name in os.listdir(logs_folder)
    if log_name.endswith("_outliers.txt")
}

# Report every (dataset, model) pair present in both result tables whose
# accuracy with the outlier technique is strictly higher than the baseline.
# NOTE(review): the header line below is comma-separated column names, while
# each row is printed as "label: value" pairs — confirm which format is wanted.
print("dataset, model, outlier_technique, accuracy_no_outliers, accuracy_with_outliers, diff, train_outliers_percent, test_outliers_percent")
for key, acc_no_outliers in results.items():
    with_outliers_entry = results_with_outliers.get(key)
    if with_outliers_entry is None:
        continue

    outlier_technique, acc_with_outliers = with_outliers_entry
    if not (acc_with_outliers > acc_no_outliers):
        continue

    dataset, model = key
    # Outlier percentages default to 0.0 when the dataset has no log or the
    # log has no entry for this split/technique.
    dataset_log = log_data.get(dataset, {})
    test_pct = dataset_log.get(('test', outlier_technique), 0.0)
    train_pct = dataset_log.get(('train', outlier_technique), 0.0)
    diff = acc_with_outliers - acc_no_outliers

    row = (
        f"dataset: {dataset}, "
        f"model: {model}, "
        f"outlier_technique: {outlier_technique}, "
        f"accuracy_no_outliers: {acc_no_outliers:.6f}, "
        f"accuracy_with_outliers: {acc_with_outliers:.6f}, "
        f"diff: {diff:.6f}, "
        f"train_outliers_percent: {train_pct:.2f}, "
        f"test_outliers_percent: {test_pct:.2f}"
    )
    print(row)


dataset,model,acc_no_outliers,acc_with_outliers,diff(%),train_outliers(%),test_outliers(%)
electricity,mlp,0.837155,0.837677,0.000522,5.00,4.99
vehicle,mlp,0.796863,0.753725,-0.043137,5.19,2.35
kc1,mlp,0.853712,0.832070,-0.021643,33.43,33.89
phoneme,mlp,0.846438,0.825285,-0.021153,22.67,23.03
pol,mlp,0.984234,0.569063,-0.415171,87.90,89.64
eye_movements,mlp,0.595034,0.587508,-0.007526,4.99,4.98
cmc,mlp,0.533333,0.537853,0.004520,4.99,5.08
electricity,RandomForest,0.859237,0.860451,0.001214,5.00,4.99
vehicle,RandomForest,0.721176,0.714902,-0.006275,5.19,2.35
kc1,RandomForest,0.869984,0.844550,-0.025434,33.43,33.89
phoneme,RandomForest,0.886710,0.856491,-0.030219,22.67,23.03
pol,RandomForest,0.974946,0.691720,-0.283226,87.90,89.64
eye_movements,RandomForest,0.648690,0.644302,-0.004388,4.99,4.98
cmc,RandomForest,0.544859,0.540339,-0.004520,4.99,5.08
electricity,xgboost,0.922013,0.919320,-0.002692,5.00,4.99
vehicle,xgboost,0.754902,0.733333,-0.021569,5.19,2.35
kc1,xgboost,0.862875,0.843286