In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Directory holding the "<dataset>_<seed>_relative_improvement.csv" result files.
# Set the RELATIVE_IMPROVEMENT_DIR environment variable to run the notebook on
# another machine; the original location is kept as the default.
FOLDER = os.environ.get(
    "RELATIVE_IMPROVEMENT_DIR",
    "/Users/lucio/Downloads/relative_improvement",
)
# Dataset names and seeds are combined to build the result file names.
DATASETS = ['adultsample', 'australian', 'contraceptive', 'credit', 'imdb']
SEEDS = ['94', '584', '1234']

# Only rows whose `pct_nulls` column equals this value are analysed.
PERCENTAGE = 10

In [3]:
# Inner keys of `dataset_gains`, in the order the metrics are reported.
_METRIC_KEYS = ('SENTI_IPM', 'SENTI_IPM_FIXED', 'TIME_SENTI_IPM', 'TIME_SENTI_IPM_FIXED')


def _drop_non_finite(values):
    """Return `values` as a float array without NaN/inf entries, plus the number removed."""
    arr = np.asarray(values, dtype=float)
    keep = np.isfinite(arr)
    return arr[keep], int(arr.size - keep.sum())


def _file_metrics(df, percentage):
    """Compute per-row accuracy gains and time ratios for one results file.

    Parameters
    ----------
    df : pandas.DataFrame
        Contents of one ``*_relative_improvement.csv`` file. The columns read
        are ``pct_nulls``, ``start_index``, ``end_index`` and the
        ``avg_semantic_sim_*`` / ``total_time_*`` columns for SENTI,
        IPM_70_30_Retraining and IPM_70_30_fixed.
    percentage : int or float
        Only rows whose ``pct_nulls`` equals this value are used.

    Returns
    -------
    (dict, int)
        A dict keyed like the inner dicts of ``dataset_gains`` holding the
        finite per-row values, and the number of non-finite values dropped.

    Raises
    ------
    IndexError
        If the file has fewer than two rows (the reference chunk length is
        read from row 1).
    """
    # The reference chunk length comes from the second row of the *unfiltered*
    # file. NOTE(review): presumably row 1 is a full-size chunk -- confirm.
    chunk_size = df.iloc[1]['end_index'] - df.iloc[1]['start_index']

    # Keep the requested null percentage, then drop chunks shorter than the reference.
    rows = df[df['pct_nulls'] == percentage]
    rows = rows[rows['end_index'] - rows['start_index'] >= chunk_size]

    senti_scores = rows['avg_semantic_sim_SENTI'].values
    ipm_scores = rows['avg_semantic_sim_IPM_70_30_Retraining'].values
    ipm_fixed_scores = rows['avg_semantic_sim_IPM_70_30_fixed'].values

    senti_times = rows['total_time_SENTI'].values
    ipm_times = rows['total_time_IPM_70_30_Retraining'].values
    ipm_fixed_times = rows['total_time_IPM_70_30_fixed'].values

    # Zero denominators are handled explicitly below, so silence NumPy's own warnings.
    with np.errstate(divide='ignore', invalid='ignore'):
        raw = {
            # Relative improvement of SENTI over each baseline, in percent.
            'SENTI_IPM': ((senti_scores - ipm_scores) / ipm_scores) * 100,
            'SENTI_IPM_FIXED': ((senti_scores - ipm_fixed_scores) / ipm_fixed_scores) * 100,
            # Baseline time divided by SENTI time (values > 1: the baseline took longer).
            'TIME_SENTI_IPM': ipm_times / senti_times,
            'TIME_SENTI_IPM_FIXED': ipm_fixed_times / senti_times,
        }

    # A single inf/NaN would turn every average it participates in into inf/NaN,
    # so such values are removed here and counted for the caller to report.
    metrics = {}
    dropped = 0
    for key, values in raw.items():
        metrics[key], n_removed = _drop_non_finite(values)
        dropped += n_removed
    return metrics, dropped


def _mean_or_nan(values):
    """Mean of `values`, or NaN for an empty sequence (avoids NumPy's empty-slice warning)."""
    return np.mean(values) if len(values) else np.nan


all_gains_senti_ipm = []
all_gains_senti_ipm_fixed = []
all_time_ratios_senti_ipm = []
all_time_ratios_senti_ipm_fixed = []
dataset_gains = {dataset: {key: [] for key in _METRIC_KEYS} for dataset in DATASETS}

# Maps each metric key to the overall accumulator it feeds.
_overall = {
    'SENTI_IPM': all_gains_senti_ipm,
    'SENTI_IPM_FIXED': all_gains_senti_ipm_fixed,
    'TIME_SENTI_IPM': all_time_ratios_senti_ipm,
    'TIME_SENTI_IPM_FIXED': all_time_ratios_senti_ipm_fixed,
}
n_undefined = 0  # non-finite values dropped across all files

for dataset in DATASETS:
    for seed in SEEDS:
        # Read the CSV file
        file_path = f"{FOLDER}/{dataset}_{seed}_relative_improvement.csv"
        df = pd.read_csv(file_path)

        metrics, dropped = _file_metrics(df, PERCENTAGE)
        n_undefined += dropped

        # Store every metric both overall and per dataset
        for key in _METRIC_KEYS:
            _overall[key].extend(metrics[key])
            dataset_gains[dataset][key].extend(metrics[key])

if n_undefined:
    print(f"Note: ignored {n_undefined} non-finite values (NaN/inf) when averaging")

# Calculate overall average gains and time ratios
avg_gain_senti_ipm = _mean_or_nan(all_gains_senti_ipm)
avg_gain_senti_ipm_fixed = _mean_or_nan(all_gains_senti_ipm_fixed)
avg_time_ratio_senti_ipm = _mean_or_nan(all_time_ratios_senti_ipm)
avg_time_ratio_senti_ipm_fixed = _mean_or_nan(all_time_ratios_senti_ipm_fixed)

# The time ratios are baseline time / SENTI time, so they are labelled
# IPM/SENTI to match the direction of the division.
print(f"Average accuracy gain SENTI over IPM: {avg_gain_senti_ipm:.2f}%")
print(f"Average accuracy gain SENTI over IPM_fixed: {avg_gain_senti_ipm_fixed:.2f}%")
print(f"Average time ratio IPM/SENTI: {avg_time_ratio_senti_ipm:.2f}x")
print(f"Average time ratio IPM_fixed/SENTI: {avg_time_ratio_senti_ipm_fixed:.2f}x")

# Calculate average gains and time ratios by dataset
print("\nAverage accuracy gains and time ratios by dataset:")
for dataset in DATASETS:
    avg_dataset_gain_ipm = _mean_or_nan(dataset_gains[dataset]['SENTI_IPM'])
    avg_dataset_gain_ipm_fixed = _mean_or_nan(dataset_gains[dataset]['SENTI_IPM_FIXED'])
    avg_dataset_time_ratio_ipm = _mean_or_nan(dataset_gains[dataset]['TIME_SENTI_IPM'])
    avg_dataset_time_ratio_ipm_fixed = _mean_or_nan(dataset_gains[dataset]['TIME_SENTI_IPM_FIXED'])
    print(f"\n{dataset}:")
    print(f"  SENTI over IPM accuracy: {avg_dataset_gain_ipm:.2f}%")
    print(f"  SENTI over IPM_fixed accuracy: {avg_dataset_gain_ipm_fixed:.2f}%")
    print(f"  IPM/SENTI time ratio: {avg_dataset_time_ratio_ipm:.2f}x")
    print(f"  IPM_fixed/SENTI time ratio: {avg_dataset_time_ratio_ipm_fixed:.2f}x")


Average accuracy gain SENTI over IPM: 6.16%
Average accuracy gain SENTI over IPM_fixed: 6.60%
Average time ratio SENTI/IPM: 93.15x
Average time ratio SENTI/IPM_fixed: 51.14x

Average accuracy gains and time ratios by dataset:

adultsample:
  SENTI over IPM accuracy: 4.45%
  SENTI over IPM_fixed accuracy: 5.89%
  SENTI/IPM time ratio: 90.75x
  SENTI/IPM_fixed time ratio: 46.62x

australian:
  SENTI over IPM accuracy: 2.49%
  SENTI over IPM_fixed accuracy: 2.58%
  SENTI/IPM time ratio: 109.48x
  SENTI/IPM_fixed time ratio: 62.85x

contraceptive:
  SENTI over IPM accuracy: 8.96%
  SENTI over IPM_fixed accuracy: 9.44%
  SENTI/IPM time ratio: 86.72x
  SENTI/IPM_fixed time ratio: 45.66x

credit:
  SENTI over IPM accuracy: 1.49%
  SENTI over IPM_fixed accuracy: 2.70%
  SENTI/IPM time ratio: 103.53x
  SENTI/IPM_fixed time ratio: 60.78x

imdb:
  SENTI over IPM accuracy: 12.98%
  SENTI over IPM_fixed accuracy: 12.22%
  SENTI/IPM time ratio: 74.12x
  SENTI/IPM_fixed time ratio: 38.22x
