In [1]:
import os

# Define paths (all relative to the current working directory)
# Comma-separated baseline results; parsed below as "dataset,model,accuracy" lines.
results_path = "results.txt"
# Comma-separated results parsed with with_outliers=True, i.e. as
# "dataset,model,outlier_technique,accuracy" lines.
outliers_path = "results_with_outliers.txt"
# Folder scanned for files whose names end in "_outliers.txt"; the part of the
# file name before that suffix is used as the dataset name.
logs_folder = "outlier_logs"

# Helper functions
def parse_results_file(filepath, with_outliers=False):
    """Read a comma-separated results file into a dict keyed by (dataset, model).

    Each line is expected to be ``dataset,model,accuracy`` or, when
    ``with_outliers`` is true, ``dataset,model,outlier_technique,accuracy``.

    A line is ignored when any of the following holds:
      * it has fewer fields than expected, or one field is exactly ``ERROR``;
      * it has more fields than expected, or the accuracy is not a float;
      * the accuracy lies outside the closed range [0.3, 0.99].

    When several lines share the same (dataset, model) pair, the last
    accepted line wins.

    Args:
        filepath: Path of the results file to read.
        with_outliers: Whether lines carry the extra outlier-technique field.

    Returns:
        ``{(dataset, model): accuracy}`` when ``with_outliers`` is false,
        otherwise ``{(dataset, model): (outlier_technique, accuracy)}``.
    """
    min_fields = 4 if with_outliers else 3
    parsed = {}
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            fields = raw_line.strip().split(',')
            if len(fields) < min_fields or 'ERROR' in fields:
                continue
            try:
                # Unpacking raises ValueError on surplus fields, float() on
                # non-numeric accuracies; both mean "skip this line".
                if with_outliers:
                    dataset, model, technique, score = fields
                else:
                    dataset, model, score = fields
                score = float(score)
            except ValueError:
                continue
            # Chained comparison is False for NaN, so NaN is rejected too.
            if 0.3 <= score <= 0.99:
                parsed[(dataset, model)] = (technique, score) if with_outliers else score
    return parsed

def parse_outlier_log(filepath):
    """Parse an outlier log into ``{(split, technique): percent_detected}``.

    The file is read as blocks separated by a blank line. A usable block has
    exactly three lines:

      1. a header of the form ``<dataset> - <split> (<technique>):``
      2. a line whose text after the first ``:`` is the detected count (int)
      3. a line whose text after the first ``:`` is the total count (int)

    Blocks that do not match this shape (wrong number of lines, header that
    cannot be split, missing ``:`` or non-integer counts) are skipped rather
    than aborting the whole file, in line with how wrong-length blocks were
    already ignored.

    Args:
        filepath: Path of the log file to read.

    Returns:
        Dict mapping ``(split, technique)`` to ``detected / total * 100``.
        A block whose total is 0 maps to 0.0 instead of raising
        ``ZeroDivisionError``.
    """
    data = {}
    with open(filepath, 'r') as f:
        blocks = f.read().strip().split("\n\n")
    for block in blocks:
        lines_block = block.strip().split("\n")
        if len(lines_block) != 3:
            continue
        try:
            header = lines_block[0].strip()
            # The dataset part of the header is not needed for the key.
            _dataset, rest = header.split(" - ")
            # "train (IQR):" -> "train (IQR" -> ["train ", "IQR"]
            split, technique = rest.strip("():").split("(")
            detected = int(lines_block[1].split(":")[1])
            total = int(lines_block[2].split(":")[1])
        except (ValueError, IndexError):
            # Malformed block: unpacking failed, no ":" present, or a count
            # was not an integer.
            continue
        # An empty split has no meaningful percentage; report 0.0.
        pct = detected / total * 100 if total else 0.0
        data[(split.strip(), technique.strip())] = pct
    return data

# Load both accuracy tables: baseline runs and runs that used an outlier technique
results = parse_results_file(results_path)
results_with_outliers = parse_results_file(outliers_path, with_outliers=True)

# Collect outlier percentages per dataset from every "<dataset>_outliers.txt" log
log_data = {}
for fname in os.listdir(logs_folder):
    if not fname.endswith("_outliers.txt"):
        continue
    dataset = fname.replace("_outliers.txt", "")
    full_path = os.path.join(logs_folder, fname)
    log_data[dataset] = parse_outlier_log(full_path)

# Report every (dataset, model) pair whose accuracy is strictly higher in the
# outlier-technique results than in the baseline results
print("dataset, model, outlier_technique, accuracy_no_outliers, accuracy_with_outliers, diff, train_outliers_percent, test_outliers_percent")
for key, acc_no_outliers in results.items():
    if key not in results_with_outliers:
        continue
    outlier_technique, acc_with_outliers = results_with_outliers[key]
    if not (acc_with_outliers > acc_no_outliers):
        continue
    dataset, model = key
    diff = acc_with_outliers - acc_no_outliers
    # Percentages fall back to 0.0 when the dataset has no log or the log has
    # no entry for this split/technique combination.
    log = log_data.get(dataset, {})
    train_pct = log.get(('train', outlier_technique), 0.0)
    test_pct = log.get(('test', outlier_technique), 0.0)
    row = (
        f"dataset: {dataset}, "
        f"model: {model}, "
        f"outlier_technique: {outlier_technique}, "
        f"accuracy_no_outliers: {acc_no_outliers:.6f}, "
        f"accuracy_with_outliers: {acc_with_outliers:.6f}, "
        f"diff: {diff:.6f}, "
        f"train_outliers_percent: {train_pct:.2f}, "
        f"test_outliers_percent: {test_pct:.2f}"
    )
    print(row)


dataset, model, outlier_technique, accuracy_no_outliers, accuracy_with_outliers, diff, train_outliers_percent, test_outliers_percent
dataset: cmc, model: mlp, outlier_technique: IQR, accuracy_no_outliers: 0.533333, accuracy_with_outliers: 0.551638, diff: 0.018305, train_outliers_percent: 6.16, test_outliers_percent: 1.36
dataset: cmc, model: xgboost, outlier_technique: IQR, accuracy_no_outliers: 0.569492, accuracy_with_outliers: 0.569718, diff: 0.000226, train_outliers_percent: 6.16, test_outliers_percent: 1.36
