In [8]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import defaultdict


In [10]:
# ------------------- CONFIG -------------------
# Directory the raw per-method evaluation CSVs are read from.
ROOT = Path("/root/workspace/New_experiments/Evaluation/data/outputs")

# Output directories, created as siblings of ROOT.
RELIN_DIR = ROOT.parent / "relative_improvement"   # per-seed / per-dataset improvement CSVs
FINAL_DIR = ROOT.parent / "Final_results"          # final gain and time-ratio tables

# parents=True on both calls so a missing parent directory is handled the same
# way for each output location instead of raising for only one of them.
RELIN_DIR.mkdir(parents=True, exist_ok=True)
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# Experiment grid: every (dataset, seed) pair is looked up under ROOT.
DATASETS = ["adultsample", "australian", "contraceptive", "credit", "imdb"]
SEEDS = [94, 584, 1234]

# Null percentages iterated when computing the gain / time-ratio tables.
PERCENTAGES = [5, 10, 20, 40]
# The single percentage whose per-dataset rows are gathered into the global summary.
ONE_PERCENTAGE = 10

# File-name templates of the raw CSVs; formatted with `dat` and `seed`.
PATTERNS = {
    "sent":    "SENT-I_evaluations_{dat}_{seed}.csv",
    "fixed":   "IPM_evaluations_fixed_{dat}_{seed}.csv",
    "retrain": "IPM_evaluations_Retraining_{dat}_{seed}.csv",
}


In [12]:
# ------------------- PART 1: GENERATE RELATIVE IMPROVEMENTS -------------------
# For every (dataset, seed) pair: load the three raw evaluation CSVs, join them
# on (null column, start_index, end_index), derive the relative improvements of
# SENT-I over both IPM variants, and write one detailed CSV per pair.
all_results = []      # (dataset, merged DataFrame) tuples, one per processed pair
null_col_name = None  # null column name of the first pair that passed validation

# Metric columns expected in each raw file (loop-invariant, so defined once).
sim_s, time_s = "avg_semantic_sim_SENTI", "total_time_SENTI"
sim_f, time_f = "avg_semantic_sim_IPM_fixed", "total_time_IPM_fixed"
sim_r, time_r = "avg_semantic_sim_IPM_70_30_Retraining", "total_time_IPM_70_30_Retraining"

for dat in DATASETS:
    for seed in SEEDS:
        # Build file paths
        path_sent    = ROOT / PATTERNS["sent"].format(dat=dat, seed=seed)
        path_fixed   = ROOT / PATTERNS["fixed"].format(dat=dat, seed=seed)
        path_retrain = ROOT / PATTERNS["retrain"].format(dat=dat, seed=seed)

        # All three files are needed for a comparison; skip the pair otherwise.
        if not (path_sent.exists() and path_fixed.exists() and path_retrain.exists()):
            print(f"[WARN] Missing raw CSV for {dat}, seed {seed}")
            continue

        # Load DataFrames
        df_sent    = pd.read_csv(path_sent)
        df_fixed   = pd.read_csv(path_fixed)
        df_retrain = pd.read_csv(path_retrain)

        # Detect the null column: first column whose name contains "nulls".
        cols_sent    = [c for c in df_sent.columns if "nulls" in c.lower()]
        cols_fixed   = [c for c in df_fixed.columns if "nulls" in c.lower()]
        cols_retrain = [c for c in df_retrain.columns if "nulls" in c.lower()]
        if not (cols_sent and cols_fixed and cols_retrain):
            print(f"[ERROR] Null column missing for {dat}, seed {seed}")
            continue
        # The three files must agree on the column name because it is a join key.
        if len({cols_sent[0], cols_fixed[0], cols_retrain[0]}) != 1:
            print(f"[ERROR] Inconsistent null columns for {dat}, seed {seed}")
            continue

        null_col = cols_sent[0]
        key_cols = [null_col, "start_index", "end_index"]

        # Validate the schema up front so one malformed file is reported and
        # skipped like the other error cases instead of aborting the whole run.
        missing = {}
        for path, frame, metrics in (
            (path_sent, df_sent, [sim_s, time_s]),
            (path_fixed, df_fixed, [sim_f, time_f]),
            (path_retrain, df_retrain, [sim_r, time_r]),
        ):
            absent = [c for c in key_cols + metrics if c not in frame.columns]
            if absent:
                missing[path.name] = absent
        if missing:
            print(f"[ERROR] Missing columns for {dat}, seed {seed}: {missing}")
            continue

        if null_col_name is None:
            null_col_name = null_col

        # Select and merge (inner joins on the shared key columns)
        df1 = df_sent[key_cols + [sim_s, time_s]]
        df2 = df_fixed[key_cols + [sim_f, time_f]]
        df3 = df_retrain[key_cols + [sim_r, time_r]]
        df = df1.merge(df2, on=key_cols).merge(df3, on=key_cols)

        # An inner join silently drops unmatched rows (and multiplies duplicated
        # keys); surface any row-count mismatch rather than hiding it.
        if not (len(df) == len(df1) == len(df2) == len(df3)):
            print(
                f"[WARN] Row counts differ after merge for {dat}, seed {seed}: "
                f"sent={len(df1)}, fixed={len(df2)}, retrain={len(df3)}, merged={len(df)}"
            )

        # Similarity gain of SENT-I relative to each IPM variant, in percent.
        df["Over_IPM_fixed"]     = (df[sim_s] - df[sim_f]) / df[sim_f] * 100
        df["Over_IPM_retrained"] = (df[sim_s] - df[sim_r]) / df[sim_r] * 100
        # Time saved by SENT-I relative to each IPM variant, in percent.
        df["OverTime_IPM_fixed"]     = (df[time_f] - df[time_s]) / df[time_f] * 100
        df["OverTime_IPM_retrained"] = (df[time_r] - df[time_s]) / df[time_r] * 100
        # IPM time divided by SENT-I time.
        df["FracTime_IPM_fixed"]     = df[time_f] / df[time_s]
        df["FracTime_IPM_retrained"] = df[time_r] / df[time_s]

        # Add context (inserted at position 0, so the order is dataset, seed, ...)
        df.insert(0, "seed", seed)
        df.insert(0, "dataset", dat)

        # Save per-seed CSV
        out_path = RELIN_DIR / f"{dat}_{seed}_relative_improvement.csv"
        df.to_csv(out_path, index=False)
        print(f"[INFO] Saved detailed improvements: {out_path}")

        all_results.append((dat, df))

# Concatenate and summary
# Per dataset: stack the per-seed frames into one CSV, then average every
# improvement metric per null level and write that summary to its own CSV.
if all_results and null_col_name:
    grouped = defaultdict(list)
    for dat, df in all_results:
        grouped[dat].append(df)

    concat_results = {}
    for dat, dfs in grouped.items():
        df_concat = pd.concat(dfs, ignore_index=True)
        concat_path = RELIN_DIR / f"{dat}_relative_improvement.csv"
        df_concat.to_csv(concat_path, index=False)
        print(f"[INFO] Saved concatenated file for {dat}: {concat_path}")
        concat_results[dat] = df_concat

    # Improvement column -> header used in the averaged summary.
    summary_renames = {
        "Over_IPM_fixed":            "avg_over_sim_IPM_fixed(%)",
        "Over_IPM_retrained":        "avg_over_sim_IPM_retrained(%)",
        "OverTime_IPM_fixed":        "avg_over_time_IPM_fixed(%)",
        "OverTime_IPM_retrained":    "avg_over_time_IPM_retrained(%)",
        "FracTime_IPM_fixed":        "avg_frac_time_IPM_fixed",
        "FracTime_IPM_retrained":    "avg_frac_time_IPM_retrained",
    }

    for dat, df_concat in concat_results.items():
        # null_col_name comes from the first processed pair only; the column is
        # detected per file, so another dataset may name it differently.
        # Resolve it per dataset instead of failing with a KeyError in groupby.
        if null_col_name in df_concat.columns:
            dat_null_col = null_col_name
        else:
            dat_null_col = next(
                (c for c in df_concat.columns if "nulls" in c.lower()), None
            )
        if dat_null_col is None:
            print(f"[ERROR] Null column missing in concatenated data for {dat}")
            continue

        summary = (
            df_concat.groupby(["dataset", dat_null_col])
            .agg({col: "mean" for col in summary_renames})
            .reset_index()
            .rename(columns={dat_null_col: "pct_nulls", **summary_renames})
            .sort_values(["pct_nulls"])
        )

        summary_path = RELIN_DIR / f"average_relative_improvement_{dat}.csv"
        summary.to_csv(summary_path, index=False)
        print(f"[INFO] Saved averaged summary for {dat}: {summary_path}")
else:
    print("[INFO] No data to summarize or null column not detected.")

# -------------------GAIN & TIME RATIOS-------------------
# Re-reads the per-seed CSVs from RELIN_DIR and, for every dataset and null
# percentage, averages SENT-I's similarity gain (%) and the IPM/SENT-I time
# ratio over all rows of all seeds. Writes one table per dataset and collects
# the ONE_PERCENTAGE rows in `global_records`.
#
# NOTE: the "SENTI/IPM ... time ratio (x)" headers hold IPM time divided by
# SENT-I time; the header text is kept unchanged so existing CSV consumers
# keep working.


def _mean_or_nan(values):
    """Return the mean of `values`, or NaN for an empty list.

    np.mean([]) also yields NaN but emits a RuntimeWarning; being explicit
    keeps the notebook output clean when a dataset/percentage has no rows.
    """
    return np.mean(values) if values else np.nan


# Columns a per-seed CSV must provide (besides the null column).
_required_cols = [
    "start_index", "end_index",
    "avg_semantic_sim_SENTI", "avg_semantic_sim_IPM_70_30_Retraining",
    "avg_semantic_sim_IPM_fixed",
    "total_time_SENTI", "total_time_IPM_70_30_Retraining", "total_time_IPM_fixed",
]
# Averaged columns of the output tables, in output order.
_metric_cols = [
    "SENTI over IPM accuracy (%)",
    "SENTI over IPM_fixed accuracy (%)",
    "SENTI/IPM time ratio (x)",
    "SENTI/IPM_fixed time ratio (x)",
]

global_records = []
for dataset in DATASETS:
    # Load each seed's file once per dataset instead of once per percentage.
    seed_frames = []  # (null column name, DataFrame) per usable seed file
    for seed in SEEDS:
        fp = RELIN_DIR / f"{dataset}_{seed}_relative_improvement.csv"
        if not fp.exists():
            continue
        seed_df = pd.read_csv(fp)

        # Prefer the globally detected null column, but fall back to detecting
        # it in this file: null_col_name may be None or belong to another
        # dataset whose files name the column differently.
        if null_col_name in seed_df.columns:
            pct_col = null_col_name
        else:
            pct_col = next((c for c in seed_df.columns if "nulls" in c.lower()), None)
        if pct_col is None:
            print(f"[WARN] No null column in {fp.name}; skipping")
            continue

        # Report schema problems instead of silently dropping the file.
        missing = [c for c in _required_cols if c not in seed_df.columns]
        if missing:
            print(f"[WARN] {fp.name} lacks columns {missing}; skipping")
            continue

        seed_frames.append((pct_col, seed_df))

    summary_records = []
    for pct in PERCENTAGES:
        gains_ipm, gains_fixed = [], []
        time_ipm, time_fixed = [], []

        for pct_col, seed_df in seed_frames:
            # Filter by null percentage
            df = seed_df[seed_df[pct_col] == pct]
            if df.shape[0] < 2:
                continue
            # Use the second row's window length as the reference chunk size
            # and keep only windows at least that long.
            # NOTE(review): presumably this drops a shorter trailing chunk;
            # confirm why the second row rather than the first is the reference.
            chunk = df.iloc[1]['end_index'] - df.iloc[1]['start_index']
            df = df[df['end_index'] - df['start_index'] >= chunk]
            if df.empty:
                continue

            senti = df['avg_semantic_sim_SENTI'].values
            ipm = df['avg_semantic_sim_IPM_70_30_Retraining'].values
            fixed = df['avg_semantic_sim_IPM_fixed'].values
            senti_t = df['total_time_SENTI'].values
            ipm_t = df['total_time_IPM_70_30_Retraining'].values
            fixed_t = df['total_time_IPM_fixed'].values

            # Per-row similarity gains (%) and IPM/SENT-I time ratios.
            gains_ipm.extend(((senti - ipm) / ipm) * 100)
            gains_fixed.extend(((senti - fixed) / fixed) * 100)
            time_ipm.extend(ipm_t / senti_t)
            time_fixed.extend(fixed_t / senti_t)

        summary_records.append({
            "Dataset": dataset,
            "Null %": pct,
            "SENTI over IPM accuracy (%)": _mean_or_nan(gains_ipm),
            "SENTI over IPM_fixed accuracy (%)": _mean_or_nan(gains_fixed),
            "SENTI/IPM time ratio (x)": _mean_or_nan(time_ipm),
            "SENTI/IPM_fixed time ratio (x)": _mean_or_nan(time_fixed),
        })
        if pct == ONE_PERCENTAGE:
            global_records.append(summary_records[-1])

    # Per-dataset "Overall" row: mean of the per-percentage means
    # (Series.mean skips NaN entries from percentages without data).
    df_sum = pd.DataFrame(summary_records)
    overall_row = {"Dataset": dataset, "Null %": "Overall"}
    overall_row.update({col: df_sum[col].mean() for col in _metric_cols})
    summary_records.append(overall_row)

    # Save per-dataset final CSV
    df_out = pd.DataFrame(summary_records)
    out_fp = FINAL_DIR / f"{dataset}_avg_gains_and_time_ratios.csv"
    df_out.to_csv(out_fp, index=False)
    print(f"Saved: {out_fp}")

# Global summary for ONE_PERCENTAGE
# Tabulate the rows collected in `global_records`, append an "Overall" row
# holding the column-wise mean of the four metrics, and write the table to
# FINAL_DIR. Nothing is written when no rows were collected.
global_df = pd.DataFrame(global_records)
if not global_df.empty:
    averaged_columns = [
        "SENTI over IPM accuracy (%)",
        "SENTI over IPM_fixed accuracy (%)",
        "SENTI/IPM time ratio (x)",
        "SENTI/IPM_fixed time ratio (x)",
    ]

    # Identification fields first, then one mean per metric column.
    global_overall = {"Dataset": "Overall", "Null %": ONE_PERCENTAGE}
    for column in averaged_columns:
        global_overall[column] = global_df[column].mean()

    # Append the overall row at the next positional label.
    global_df.loc[len(global_df)] = global_overall

    final_global_fp = FINAL_DIR / f"average_gains_and_time_ratios_{ONE_PERCENTAGE}pct.csv"
    global_df.to_csv(final_global_fp, index=False)
    print(f" Saved: {final_global_fp}")


[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/adultsample_94_relative_improvement.csv
[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/adultsample_584_relative_improvement.csv
[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/adultsample_1234_relative_improvement.csv
[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/australian_94_relative_improvement.csv
[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/australian_584_relative_improvement.csv
[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/australian_1234_relative_improvement.csv
[INFO] Saved detailed improvements: /root/workspace/New_experiments/Evaluation/data/relative_improvement/contraceptive_94_relat