In [28]:
import re
from pathlib import Path
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.ticker as mticker
import pandas as pd
from pathlib import Path


In [None]:
# ============================ CONFIG ============================
# Directory containing the per-run result files, named
# "<dataset>_<seed>_relative_improvement.csv" by the analysis cell.
FOLDER = "/root/workspace/DDI_results/relative_improvement_with_time_frac"

# Dataset names used to build the result file names.
DATASETS = [
    "adultsample",
    "australian",
    "contraceptive",
    "credit",
    "imdb",
]

# Seeds used to build the result file names (one file per dataset/seed pair).
SEEDS = [94, 584, 1234]

# Percentages of nulls to evaluate (matched against the 'pct_nulls' column).
PCTS = [5, 10, 20, 40]

In [38]:

# --------------------------- ANALYSIS ---------------------------
# For every null percentage in PCTS, read each
# "<dataset>_<seed>_relative_improvement.csv" under FOLDER, keep the rows at
# that percentage, and print (overall and per dataset) the mean of:
#   * the relative gain of SENTI over each IPM variant, in percent, computed
#     from the avg_semantic_sim_* columns (reported as "accuracy gain"), and
#   * the time ratio IPM / SENTI computed from the total_time_* columns
#     (a value above 1 means the IPM variant took longer than SENTI).
# Files that are missing, or that lack a required column, are reported and
# skipped so a single bad file does not abort the whole run.
for PERCENTAGE in PCTS:
    print(f"\n=== Null Percentage: {PERCENTAGE}% ===")

    # Reset accumulators (pooled over every dataset and seed)
    all_gains_senti_ipm         = []
    all_gains_senti_ipm_fixed   = []
    all_time_ratios_senti_ipm   = []
    all_time_ratios_senti_ipm_fixed = []

    # Same four metrics, kept separately per dataset.
    # NOTE: the TIME_* lists hold IPM / SENTI ratios (see computation below).
    dataset_gains = {
        dataset: {
            'SENTI_IPM': [],
            'SENTI_IPM_FIXED': [],
            'TIME_SENTI_IPM': [],
            'TIME_SENTI_IPM_FIXED': []
        } for dataset in DATASETS
    }

    for dataset in DATASETS:
        for seed in SEEDS:
            file_path = Path(FOLDER) / f"{dataset}_{seed}_relative_improvement.csv"
            if not file_path.exists():
                print(f"[WARN] File not found: {file_path}")
                continue

            df = pd.read_csv(file_path)
            if 'pct_nulls' not in df.columns:
                print(f"[WARN] Missing 'pct_nulls' column in {file_path.name}")
                continue

            # Force float comparison to be safe
            df['pct_nulls'] = df['pct_nulls'].astype(float)
            df_filtered = df[np.isclose(df['pct_nulls'], float(PERCENTAGE))]

            if len(df_filtered) < 2:
                continue

            # Every column access below sits inside the try so that a file
            # missing 'start_index'/'end_index' is reported and skipped the
            # same way as a file missing a metric column.
            try:
                # Reference chunk size taken from the second row of the full
                # DataFrame (df has at least two rows here, because the
                # filtered subset does). Rows spanning fewer indices than
                # this are dropped.
                # NOTE(review): presumably this removes a shorter trailing
                # chunk — confirm against the code that wrote the CSVs.
                CHUNK_SIZE = df.iloc[1]['end_index'] - df.iloc[1]['start_index']
                df_filtered = df_filtered[df_filtered['end_index'] - df_filtered['start_index'] >= CHUNK_SIZE]

                if df_filtered.empty:
                    continue

                senti_scores      = df_filtered['avg_semantic_sim_SENTI'].values
                ipm_scores        = df_filtered['avg_semantic_sim_IPM_70_30_Retraining'].values
                ipm_fixed_scores  = df_filtered['avg_semantic_sim_IPM_70_30_fixed'].values
                senti_times       = df_filtered['total_time_SENTI'].values
                ipm_times         = df_filtered['total_time_IPM_70_30_Retraining'].values
                ipm_fixed_times   = df_filtered['total_time_IPM_70_30_fixed'].values
            except KeyError as e:
                print(f"[ERROR] Missing column in {file_path.name}: {e}")
                continue

            # Compute accuracy gains (percent, relative to the IPM score) and
            # time ratios (IPM time divided by SENTI time).
            gain_over_ipm         = ((senti_scores - ipm_scores) / ipm_scores) * 100
            gain_over_ipm_fixed   = ((senti_scores - ipm_fixed_scores) / ipm_fixed_scores) * 100
            time_ratio_ipm        = ipm_times / senti_times
            time_ratio_ipm_fixed  = ipm_fixed_times / senti_times

            # Accumulate overall
            all_gains_senti_ipm.extend(gain_over_ipm)
            all_gains_senti_ipm_fixed.extend(gain_over_ipm_fixed)
            all_time_ratios_senti_ipm.extend(time_ratio_ipm)
            all_time_ratios_senti_ipm_fixed.extend(time_ratio_ipm_fixed)

            # Accumulate per dataset
            dataset_gains[dataset]['SENTI_IPM'].extend(gain_over_ipm)
            dataset_gains[dataset]['SENTI_IPM_FIXED'].extend(gain_over_ipm_fixed)
            dataset_gains[dataset]['TIME_SENTI_IPM'].extend(time_ratio_ipm)
            dataset_gains[dataset]['TIME_SENTI_IPM_FIXED'].extend(time_ratio_ipm_fixed)

    # ------------------------ PRINT OVERALL AVERAGES ------------------------
    # All four accumulators are extended together, so checking one is enough.
    if all_gains_senti_ipm:
        avg_gain_senti_ipm        = np.mean(all_gains_senti_ipm)
        avg_gain_senti_ipm_fixed  = np.mean(all_gains_senti_ipm_fixed)
        avg_time_ratio_senti_ipm  = np.mean(all_time_ratios_senti_ipm)
        avg_time_ratio_senti_ipm_fixed = np.mean(all_time_ratios_senti_ipm_fixed)

        # The ratio labels read IPM/SENTI because that is what is computed
        # above (ipm_times / senti_times).
        print(f"\n[OVERALL] Average accuracy gain SENTI over IPM       : {avg_gain_senti_ipm:.2f}%")
        print(f"[OVERALL] Average accuracy gain SENTI over IPM_fixed : {avg_gain_senti_ipm_fixed:.2f}%")
        print(f"[OVERALL] Average time ratio IPM/SENTI               : {avg_time_ratio_senti_ipm:.2f}x")
        print(f"[OVERALL] Average time ratio IPM_fixed/SENTI         : {avg_time_ratio_senti_ipm_fixed:.2f}x")

        # ------------------------ PRINT PER DATASET AVERAGES ------------------------
        print("\n[BY DATASET] Average accuracy gains and time ratios:")
        for dataset in DATASETS:
            d = dataset_gains[dataset]
            # Datasets with no usable rows at this percentage are omitted.
            if not d['SENTI_IPM']:
                continue
            avg_dataset_gain_ipm        = np.mean(d['SENTI_IPM'])
            avg_dataset_gain_ipm_fixed  = np.mean(d['SENTI_IPM_FIXED'])
            avg_dataset_time_ratio_ipm  = np.mean(d['TIME_SENTI_IPM'])
            avg_dataset_time_ratio_ipm_fixed = np.mean(d['TIME_SENTI_IPM_FIXED'])

            print(f"\n{dataset}:")
            print(f"  SENTI over IPM accuracy gain       : {avg_dataset_gain_ipm:.2f}%")
            print(f"  SENTI over IPM_fixed accuracy gain : {avg_dataset_gain_ipm_fixed:.2f}%")
            print(f"  IPM/SENTI time ratio               : {avg_dataset_time_ratio_ipm:.2f}x")
            print(f"  IPM_fixed/SENTI time ratio         : {avg_dataset_time_ratio_ipm_fixed:.2f}x")
    else:
        print("[WARN] No data available for this percentage.")



=== Null Percentage: 5% ===

[OVERALL] Average accuracy gain SENTI over IPM       : 5.91%
[OVERALL] Average accuracy gain SENTI over IPM_fixed : 6.94%
[OVERALL] Average time ratio SENTI/IPM               : 150.03x
[OVERALL] Average time ratio SENTI/IPM_fixed         : 84.95x

[BY DATASET] Average accuracy gains and time ratios:

adultsample:
  SENTI over IPM accuracy gain       : 4.40%
  SENTI over IPM_fixed accuracy gain : 5.97%
  SENTI/IPM time ratio               : 154.09x
  SENTI/IPM_fixed time ratio         : 82.61x

australian:
  SENTI over IPM accuracy gain       : 3.25%
  SENTI over IPM_fixed accuracy gain : 3.07%
  SENTI/IPM time ratio               : 163.94x
  SENTI/IPM_fixed time ratio         : 95.37x

contraceptive:
  SENTI over IPM accuracy gain       : 8.66%
  SENTI over IPM_fixed accuracy gain : 9.01%
  SENTI/IPM time ratio               : 145.05x
  SENTI/IPM_fixed time ratio         : 83.59x

credit:
  SENTI over IPM accuracy gain       : 0.93%
  SENTI over IPM_fixed 