# In [None]:
import pandas as pd
import numpy as np
from IPython.display import display

# Output formatting for this notebook cell: pandas shows up to 120 rows and
# 75 columns before truncating, and NumPy prints floats with 4 decimals.
pd.set_option('display.max_rows', 120)
pd.set_option('display.max_columns', 75)
np.set_printoptions(precision=4)

def filter(df: pd.DataFrame, col: str, values: list[float], tol: float = 1e-6) -> pd.DataFrame:
  """Keep the rows of ``df`` whose ``col`` value is within ``tol`` of a target.

  NOTE: the name shadows the builtin ``filter``; it is kept unchanged because
  the rest of the file calls the helper under this name.

  Args:
    df: Frame to filter; it is not modified.
    col: Name of the numeric column to compare.
    values: Target values. A row matches when ``abs(value - target) < tol``
      (strict inequality) holds for at least one target. An empty list
      matches no row.
    tol: Absolute tolerance of the comparison.

  Returns:
    The matching rows, in their original order, with their original index
    and all columns of ``df`` (also when ``df`` has no rows).
  """
  column = df[col].to_numpy(dtype=float)
  targets = np.asarray(values, dtype=float).reshape(-1)
  # Compare every row against every target at once: shape (rows, targets).
  # NaN never satisfies ``<``, so rows with a missing value are dropped.
  # The result is always a boolean mask, so indexing selects rows even for an
  # empty frame (an empty non-boolean key would be read as a column list).
  mask = (np.abs(column[:, None] - targets[None, :]) < tol).any(axis=1)
  return df[mask]

# Sample-deletion ratios kept by every loader in this cell.
sample_deletion_ratios = [0.1]

# Load our method's results (one JSON record per line) and keep only the rows
# whose 'sample_deletion_ratio' matches one of the ratios above.
df_ours = filter(
  pd.read_json('../results_final/sample_deletion.json', lines=True),
  'sample_deletion_ratio',
  sample_deletion_ratios,
)

# Allowed values for each result column. The row filters further down keep a
# record only if its value for the corresponding column appears in the list.
all_datanames = ['beijing', 'default', 'phishing', 'shoppers']
all_num_users = [1000]
all_watermark = ['pair_compare_one_pair']
all_num_classes = [256]
all_num_watermark_bits = [32]
all_min_hamming_dist = [7]
all_ratio_num_samples_per_class_interval = [-30000]
all_quality_loss = ['quad_random_init985']
all_embedding_models = ['orig']
all_quality_modes = ['average']
all_gen_code_losses = ['general_bfs']
all_tao_approximations = [0]
all_time_limits = [180]
all_error_rates = [0.001]
all_deletion_rates = [0.1]
all_dim_ratios = ['correct-pca-0.99']
all_upper_bounds = ['6stage_splus-0.01-0.01']
# Keep only the columns used below. ``.copy()`` makes the result an independent
# frame, so the column assignment that follows does not write into a slice of
# the loaded data (which pandas reports as SettingWithCopyWarning).
df_ours = df_ours[['dataname', 'num_users', 'watermark', 'num_classes', 'num_samples', 'num_watermark_bits', 'min_hamming_dist', 'ratio_num_samples_per_class_interval', 'classifier', 'quality_loss', 'embedding_model', 'correct', 'tao_approximation', 'num_tested_samples_per_class', 'loss', 'quad_loss', 'time_limit', 'min_gap', 'error_rate', 'gap', 'deletion_rate', 'sample_deletion_ratio', 'dim_ratio', 'num_samples_per_class_upper_bound', 'gen_code_loss', 'gauss', 'alter']].copy()

# Only the first four characters of the embedding-model name are compared with
# ``all_embedding_models``. ``.str`` leaves missing values as NaN instead of
# raising the way slicing a non-string inside ``apply`` would.
df_ours['embedding_model'] = df_ours['embedding_model'].str[:4]

# (column, allowed values): a row is kept only if it passes every entry.
ours_allowed_values = [
  ('embedding_model', all_embedding_models),
  ('time_limit', all_time_limits),
  ('dataname', all_datanames),
  ('num_users', all_num_users),
  ('watermark', all_watermark),
  ('num_classes', all_num_classes),
  ('num_watermark_bits', all_num_watermark_bits),
  ('min_hamming_dist', all_min_hamming_dist),
  ('ratio_num_samples_per_class_interval', all_ratio_num_samples_per_class_interval),
  ('tao_approximation', all_tao_approximations),
  ('error_rate', all_error_rates),
  ('deletion_rate', all_deletion_rates),
  ('quality_loss', all_quality_loss),
  ('dim_ratio', all_dim_ratios),
  ('num_samples_per_class_upper_bound', all_upper_bounds),
  ('gen_code_loss', all_gen_code_losses),
]
for column, allowed in ours_allowed_values:
  df_ours = df_ours[df_ours[column].isin(allowed)]

# Remaining row conditions that are not membership tests.
df_ours = df_ours[df_ours['num_samples'] < 50000]
# (column, required value): exact-equality pins. 'deletion_rate' is pinned to
# 0.1 here in addition to the ``all_deletion_rates`` membership test above.
ours_required_values = [
  ('classifier', 'nn'),
  ('deletion_rate', 0.1),
  ('gauss', 0.01),
  ('alter', 0.01),
]
for column, required in ours_required_values:
  df_ours = df_ours[df_ours[column] == required]

# Shorter column names for the summary table.
df_ours = df_ours.rename(columns={
  'ratio_num_samples_per_class_interval': 'ratio',
  'num_watermark_bits': 'bits',
  'min_hamming_dist': 'hamming_distance'
})

# Columns that identify one configuration in the summary.
ours_group_keys = ['num_users', 'bits', 'hamming_distance', 'ratio', 'dataname', 'sample_deletion_ratio', 'error_rate', 'deletion_rate', 'gauss', 'alter']
# Keep at most the first 100 rows of every configuration, then report per
# configuration how many rows were used and the mean of 'correct'. The
# resulting columns form a two-level index, e.g. ('correct', 'mean').
df_ours = (
  df_ours.groupby(by=ours_group_keys, as_index=False).head(100)
  .groupby(by=ours_group_keys, as_index=False)[['correct']]
  .agg({'correct': ['count', 'mean']})
)
# Round every floating-point column to three decimals for display.
for col in df_ours.columns:
  if df_ours[col].dtype in [np.float32, np.float64, float]:
    df_ours[col] = df_ours[col].round(3)

# Per-dataset triples keyed by dataset name. The mapping is not referenced in
# the visible part of this file and has no entry for 'phishing'.
# NOTE(review): the meaning of the three numbers is not evident here - confirm.
dataname2btz = dict(
  beijing=(5, 0, 53),
  default=(5, 0, 53),
  shoppers=(5, 0, 7),
)

# FreqyWM results: one JSON record per line; only rows with t == 'dyn' are used.
df_freqwm = pd.read_json('../results_final/freqwm_sample_deletion.json', lines=True)
df_freqwm = df_freqwm[df_freqwm['t'] == 'dyn']
# Round the ratio before matching it against ``sample_deletion_ratios``.
# ``assign`` returns a new frame, so nothing is written into the filtered
# slice above (which pandas reports as SettingWithCopyWarning).
df_freqwm = df_freqwm.assign(sample_deletion_ratio=df_freqwm['sample_deletion_ratio'].round(3))
df_freqwm = filter(df_freqwm, 'sample_deletion_ratio', sample_deletion_ratios)

# Keep at most the first 100 rows of every (dataset, bits, ratio) group, then
# report per group how many rows were used and the mean of 'correct'.
freqwm_group_keys = ['dataname', 'num_watermark_bits', 'sample_deletion_ratio']
df_freqwm = (
  df_freqwm.groupby(freqwm_group_keys, as_index=False).head(100)
  .groupby(freqwm_group_keys, as_index=False)[['correct']]
  .agg({'correct': ['count', 'mean']})
  .reset_index(drop=True)
)

# TabularMark results: one JSON record per line, restricted to a single
# setting (num_cells_ratio == 0.15, p_ratio == 0.2, num_units == 2).
df_tabular_mark = pd.read_json('../results_final/tabular_mark_sample_deletion.json', lines=True)
df_tabular_mark = df_tabular_mark[df_tabular_mark['num_cells_ratio'] == 0.15]
df_tabular_mark = df_tabular_mark[df_tabular_mark['p_ratio'] == 0.2]
df_tabular_mark = df_tabular_mark[df_tabular_mark['num_units'] == 2]
# Round the ratio before matching it against ``sample_deletion_ratios``.
# ``assign`` returns a new frame, so nothing is written into the filtered
# slice above (which pandas reports as SettingWithCopyWarning).
df_tabular_mark = df_tabular_mark.assign(sample_deletion_ratio=df_tabular_mark['sample_deletion_ratio'].round(3))
df_tabular_mark = filter(df_tabular_mark, 'sample_deletion_ratio', sample_deletion_ratios)

# Keep at most the first 100 rows of every (dataset, bits, ratio) group, then
# report per group how many rows were used and the mean of 'correct'. With
# ``as_index=True`` the group keys become the index of the summary.
tabular_mark_group_keys = ['dataname', 'num_watermark_bits', 'sample_deletion_ratio']
df_tabular_mark = (
  df_tabular_mark.groupby(tabular_mark_group_keys, as_index=True).head(100)
  .groupby(tabular_mark_group_keys, as_index=True)[['correct']]
  .agg({'correct': ['count', 'mean']})
)

# TabWak (partition) results: one JSON record per line.
df_tabwak_partition = pd.read_json('../results_final/tabwak_partition_sample_deletion.json', lines=True)
# Keep rows with token_dim 5 or 6, plus every row of the 'default', 'phishing'
# and 'shoppers' datasets regardless of token_dim.
tabwak_keep = (
  df_tabwak_partition['token_dim'].isin([5, 6])
  | df_tabwak_partition['dataname'].isin(['default', 'phishing', 'shoppers'])
)
df_tabwak_partition = df_tabwak_partition[tabwak_keep]
# Round the ratio before matching it against ``sample_deletion_ratios``.
# ``assign`` returns a new frame, so nothing is written into the filtered
# slice above (which pandas reports as SettingWithCopyWarning).
df_tabwak_partition = df_tabwak_partition.assign(sample_deletion_ratio=df_tabwak_partition['sample_deletion_ratio'].round(3))
df_tabwak_partition = filter(df_tabwak_partition, 'sample_deletion_ratio', sample_deletion_ratios)

# Keep at most the first 100 rows of every (dataset, bits, ratio) group, then
# report per group how many rows were used and the mean of 'correct'.
tabwak_group_keys = ['dataname', 'num_watermark_bits', 'sample_deletion_ratio']
df_tabwak_partition = (
  df_tabwak_partition.groupby(tabwak_group_keys, as_index=False).head(100)
  .groupby(tabwak_group_keys, as_index=False)[['correct']]
  .agg({'correct': ['count', 'mean']})
  .reset_index(drop=True)
)

# Show each method's summary table, preceded by the method's name.
for method_name, summary_table in (
  ('TableMark', df_ours),
  ('FreqyWM', df_freqwm),
  ('TabularMark', df_tabular_mark),
  ('TabWak Partition', df_tabwak_partition),
):
  display(method_name, summary_table)
