In [9]:
import os
import pandas as pd
import re
from scipy.stats import pearsonr

# ---- CONFIGURATION ----
# Datasets iterated when building task5.csv.
datasets_task5 = [
    "cmc",
    "connect-4",
    "electricity",
    "eye",
    "kc1",
    "phoneme",
    "pol",
    "splice",
    "vehicle",
]
# Datasets iterated when building task6.csv and task7.csv.
datasets_task6and7 = [
    "Diamonds",
    "2dplanes",
    "1000-Cameras-Dataset",
    "Abalone_reg",
    "Brazillian_houses_reproduced",
    "Data_science_Salaries",
]
# Outlier-handling technique names; each becomes one column of task7.csv.
techniques = [
    "IsolationForest",
    "LocalOutlierFactor",
    "OneClassSVM",
    "ZScore",
    "ModifiedZScore",
    "IQR",
]
# Results file parsed as three comma-separated fields per line.
results_file = "results.txt"
# Results file parsed as four comma-separated fields per line (adds a technique).
results_with_outliers_file = "results_with_outliers.txt"
# Folder searched for per-dataset "<dataset>_outliers.txt" logs.
outlier_log_folder = "outlier_logs"

# ---- HELPERS ----

def parse_results_file(filepath, with_outliers=False):
    """Parse a comma-separated results file into nested dictionaries.

    Each usable line has the form ``dataset,model,score`` or, when
    ``with_outliers`` is true, ``dataset,model,technique,score``.
    Whitespace around every field is ignored, so ``"cmc, mlp, 0.5"`` is
    stored under the key ``"mlp"`` rather than ``" mlp"``.

    Lines are skipped silently when they have the wrong number of fields,
    when the score field is the literal ``"ERROR"``, or when the score is
    not parseable as a float. If the same key appears more than once, the
    last occurrence wins.

    Args:
        filepath: Path of the text file to read.
        with_outliers: Expect four fields per line (with a technique
            column) instead of three.

    Returns:
        ``{dataset: {model: score}}`` when ``with_outliers`` is false,
        otherwise ``{dataset: {model: {technique: score}}}``.

    Raises:
        OSError: If the file cannot be opened.
    """
    expected_fields = 4 if with_outliers else 3
    data = {}
    with open(filepath, 'r') as f:
        for line in f:
            # Strip every field, not just the line ends, so that padding
            # around the commas cannot leak into dictionary keys or hide
            # the "ERROR" marker.
            parts = [field.strip() for field in line.split(',')]
            if len(parts) != expected_fields or parts[-1] == "ERROR":
                continue
            try:
                acc = float(parts[-1])
            except ValueError:
                continue
            if with_outliers:
                dataset, model, technique = parts[:3]
                data.setdefault(dataset, {}).setdefault(model, {})[technique] = acc
            else:
                dataset, model = parts[:2]
                data.setdefault(dataset, {})[model] = acc
    return data

def parse_outlier_log(filepath):
    """Return the pooled outlier fraction per technique found in a log file.

    The log is scanned for entries of the form
    ``<name> - <train|val|test> (<technique>):`` followed by an
    ``Outliers detected: <int>`` and a ``Total samples: <int>`` field.
    Counts from all matching entries of a technique are summed before
    dividing, so the result is outliers / samples over every split found.

    Args:
        filepath: Path of the outlier log to read.

    Returns:
        Dict mapping technique name to its outlier fraction; a technique
        whose summed sample count is zero maps to ``0``.
    """
    with open(filepath, 'r') as handle:
        text = handle.read()

    entry_re = re.compile(
        r"(.*?) - (train|val|test) \((.*?)\):\s+Outliers detected: (\d+)\s+Total samples:\s+(\d+)"
    )

    # Running (outliers, samples) totals, keyed by technique name.
    totals = {}
    for entry in entry_re.finditer(text):
        name = entry.group(3)
        flagged, seen = totals.get(name, (0, 0))
        totals[name] = (flagged + int(entry.group(4)), seen + int(entry.group(5)))

    return {
        name: (flagged / seen if seen > 0 else 0)
        for name, (flagged, seen) in totals.items()
    }

def get_outlier_data(dataset):
    """Return parsed outlier fractions for ``dataset``, or ``{}`` if it has no log.

    The log is expected at ``<outlier_log_folder>/<dataset>_outliers.txt``.
    """
    log_name = f"{dataset}_outliers.txt"
    log_path = os.path.join(outlier_log_folder, log_name)
    if os.path.exists(log_path):
        return parse_outlier_log(log_path)
    # No log on disk: callers treat every technique as missing.
    return {}

# ---- LOAD DATA ----
# Baseline scores, keyed as results[dataset][model].
results = parse_results_file(results_file)
# Scores per outlier technique, keyed as results_with_outliers[dataset][model][technique].
results_with_outliers = parse_results_file(results_with_outliers_file, with_outliers=True)

# ---- TASK 5 ----
# For each Task 5 dataset: the MLP and XGBoost scores, their difference, and
# the ZScore / IsolationForest outlier fractions. A final row holds the Pearson
# correlation between the score difference and each outlier fraction.
rows = []
zscore_diffs, zscore_percs = [], []
iso_diffs, iso_percs = [], []

for dataset in datasets_task5:
    scores = results.get(dataset, {})
    mlp_acc = scores.get("mlp")
    xgb_acc = scores.get("xgboost")
    if mlp_acc is None or xgb_acc is None:
        diff = None
    else:
        diff = mlp_acc - xgb_acc

    outlier_info = get_outlier_data(dataset)
    zscore = outlier_info.get("ZScore")
    iso = outlier_info.get("IsolationForest")

    # A dataset contributes to a correlation only when both the score
    # difference and that technique's outlier fraction are available.
    if diff is not None and zscore is not None:
        zscore_diffs.append(diff)
        zscore_percs.append(zscore)
    if diff is not None and iso is not None:
        iso_diffs.append(diff)
        iso_percs.append(iso)

    row = {"Dataset": dataset, "MLP": mlp_acc, "XGBoost": xgb_acc}
    row["Difference"] = diff
    row["ZScore %"] = zscore
    row["IsolationForest %"] = iso
    rows.append(row)

# Correlation row: left as None when fewer than two paired values exist.
z_corr = None
if len(zscore_diffs) >= 2:
    z_corr = pearsonr(zscore_diffs, zscore_percs)[0]

iso_corr = None
if len(iso_diffs) >= 2:
    iso_corr = pearsonr(iso_diffs, iso_percs)[0]

correlation_row = {"Dataset": "Correlation", "MLP": "", "XGBoost": "", "Difference": ""}
correlation_row["ZScore %"] = z_corr
correlation_row["IsolationForest %"] = iso_corr
rows.append(correlation_row)

task5_table = pd.DataFrame(rows)
task5_table.to_csv("task5.csv", index=False)

# ---- TASK 6 ----
# For each Task 6/7 dataset: the LinearRegression, MLP and XGBoost scores,
# the gap of MLP and XGBoost to LinearRegression, and the ZScore /
# IsolationForest outlier fractions.
rows = []
for dataset in datasets_task6and7:
    scores = results.get(dataset, {})
    lr = scores.get("LinearRegression")
    mlp = scores.get("mlp")
    xgb = scores.get("xgboost")

    # A gap needs both the model's score and the LinearRegression score.
    diff_mlp = diff_xgb = None
    if lr is not None:
        if mlp is not None:
            diff_mlp = mlp - lr
        if xgb is not None:
            diff_xgb = xgb - lr

    outlier_info = get_outlier_data(dataset)

    row = {"Dataset": dataset, "LinearRegression": lr, "MLP": mlp, "XGBoost": xgb}
    row["MLP-LR Diff"] = diff_mlp
    row["XGB-LR Diff"] = diff_xgb
    row["ZScore %"] = outlier_info.get("ZScore")
    row["IsolationForest %"] = outlier_info.get("IsolationForest")
    rows.append(row)

task6_table = pd.DataFrame(rows)
task6_table.to_csv("task6.csv", index=False)

# ---- TASK 7 ----
# For each Task 6/7 dataset: the baseline LinearRegression score followed by
# the LinearRegression score recorded for every outlier technique.
rows = []

for dataset in datasets_task6and7:
    row = {
        "Dataset": dataset,
        "LinearRegression": results.get(dataset, {}).get("LinearRegression"),
    }

    # Look the per-technique scores up once; absent techniques become None.
    lr_by_technique = results_with_outliers.get(dataset, {}).get("LinearRegression", {})
    row.update((technique, lr_by_technique.get(technique)) for technique in techniques)
    rows.append(row)

task7_table = pd.DataFrame(rows)
task7_table.to_csv("task7.csv", index=False)

print("All CSV files created: task5.csv, task6.csv, task7.csv")


All CSV files created: task5.csv, task6.csv, task7.csv
