# BiSR

In [56]:
import pandas as pd
import wandb
# Connect to the W&B public API and list every run of the project,
# which is addressed as "<entity>/<project-name>".
api = wandb.Api()
runs = api.runs("stupidtree/[CR]BiSR(b+f)")

# One list per DataFrame column, filled in run order.
columns = {"summary": [], "config": [], "name": []}
for run in runs:
    # Output keys/values for metrics; `_json_dict` is used to omit large files.
    columns["summary"].append(run.summary._json_dict)
    # Hyperparameters, minus the special entries whose key starts with "_".
    columns["config"].append(
        {key: value for key, value in run.config.items() if not key.startswith('_')}
    )
    # Human-readable name of the run.
    columns["name"].append(run.name)

summary_list, config_list, name_list = columns["summary"], columns["config"], columns["name"]
runs_df = pd.DataFrame(columns)

In [58]:

def print_big_table(df, model, metrics, methods, all_datasets=None):
    r"""Print one tab-separated, LaTeX-ready result row per method.

    Each row starts with the method name, followed by one ``mean$\pm$std``
    cell for every (dataset, metric) pair, and ends with one cell per metric
    holding the average of the per-dataset means and the average of the
    per-dataset standard deviations.

    Values are read from ``summary[f'client0_{method}_{metric}_avg']``,
    averaged over the runs that share a seed, and multiplied by 100. The mean
    and the standard deviation (NumPy default, i.e. population std) of a cell
    are taken across the seeds that actually report the metric. No values are
    synthesised: a cell backed by a single seed shows a std of 0.00, and a
    cell with no usable result prints ``n/a`` and is left out of the per-metric
    averages.

    Args:
        df: DataFrame with a ``config`` and a ``summary`` column, each holding
            one dict per run. ``config`` must provide ``model_name``,
            ``dataset`` and ``seed``.
        model: Only runs whose ``config['model_name']`` equals this are used.
        metrics: Metric names inserted into the summary key.
        methods: Method names inserted into the summary key; one row each.
        all_datasets: Datasets (and their column order). Defaults to every
            distinct ``config['dataset']`` in ``df``, in order of appearance.

    Returns:
        None. The table is written to standard output.
    """
    # Imported locally so the function does not rely on some other cell
    # having put numpy into the notebook namespace first.
    import numpy as np

    def rows_where(frame, field, value):
        # astype(bool) keeps the mask boolean for an empty `frame` as well;
        # apply() on an empty column yields an object-dtype Series, which is
        # not a reliable boolean indexer.
        mask = frame['config'].apply(lambda cfg: cfg[field] == value).astype(bool)
        return frame[mask]

    def cell(mean, std):
        # Escaped backslash: emits the same "$\pm$" text without relying on
        # the invalid "\p" escape sequence.
        return f'{mean:.2f}$\\pm${std:.2f}\t'

    missing_cell = 'n/a\t'

    if all_datasets is None:
        all_datasets = df['config'].apply(lambda cfg: cfg['dataset']).unique()

    # Keep only the runs of the requested model.
    df_model = rows_where(df, 'model_name', model)

    for method in methods:
        print(method, end='\t')
        all_ress = {metric: [] for metric in metrics}
        all_stds = {metric: [] for metric in metrics}
        for dataset in all_datasets:
            df_dataset = rows_where(df_model, 'dataset', dataset)
            seeds = df_dataset['config'].apply(lambda cfg: cfg['seed']).unique()
            for metric in metrics:
                key = f'client0_{method}_{metric}_avg'
                ress = []
                for seed in seeds:
                    df_seed = rows_where(df_dataset, 'seed', seed)
                    try:
                        performance = df_seed['summary'].apply(lambda summary: summary[key]).mean()
                    except (KeyError, TypeError):
                        # A run of this seed lacks the metric (or logged a
                        # non-numeric value): skip the seed instead of failing.
                        continue
                    ress.append(performance * 100)
                if not ress:
                    # Nothing measured for this cell; report that honestly
                    # rather than inventing numbers.
                    print(missing_cell, end='\t')
                    continue
                mean, std = np.mean(ress), np.std(ress)
                all_ress[metric].append(mean)
                all_stds[metric].append(std)
                print(cell(mean, std), end='\t')
        for metric in metrics:
            if all_ress[metric]:
                print(cell(np.mean(all_ress[metric]), np.mean(all_stds[metric])), end='\t')
            else:
                print(missing_cell, end='\t')
        print('\n')



In [61]:
import numpy as np
# Settings for the BiSR table: which model's runs to use, which metrics to
# report, and which methods get a row.
model = 'gpt2-large'
metrics = ['rouge-l_f', 'METEOR']
methods = ['SIP_b2tr', 'BiSR(b)', 'BiSR(f)', 'BiSR(b+f)']
# The former bare `runs_df['summary'][0]` expression was dropped: it was not
# the last expression of the cell, so it displayed nothing.
print_big_table(runs_df, model, metrics, methods)

SIP_b2tr	80.84$\pm$0.75		90.69$\pm$0.27		88.77$\pm$0.32		93.22$\pm$0.38		93.19$\pm$0.17		92.85$\pm$0.86		78.41$\pm$1.25		84.84$\pm$1.04		93.53$\pm$0.34		78.04$\pm$4.68		86.95$\pm$0.57		87.93$\pm$1.44		

BiSR(b)	88.16$\pm$0.74		92.87$\pm$0.24		94.81$\pm$0.35		96.48$\pm$0.06		95.76$\pm$0.21		95.12$\pm$0.71		91.14$\pm$0.34		93.41$\pm$0.15		97.28$\pm$0.75		82.26$\pm$3.66		93.43$\pm$0.48		92.03$\pm$0.96		

BiSR(f)	95.07$\pm$0.60		97.43$\pm$0.24		97.22$\pm$0.81		99.05$\pm$0.28		99.06$\pm$0.29		99.30$\pm$0.13		92.30$\pm$1.12		96.17$\pm$0.63		99.71$\pm$0.26		83.91$\pm$3.71		96.67$\pm$0.62		95.17$\pm$1.00		

BiSR(b+f)	95.20$\pm$0.83		97.46$\pm$0.43		97.16$\pm$0.55		99.11$\pm$0.11		99.14$\pm$0.11		99.38$\pm$0.01		92.64$\pm$0.70		96.20$\pm$0.45		99.79$\pm$0.29		84.06$\pm$3.89		96.79$\pm$0.50		95.24$\pm$0.98		



# AE

In [76]:
import pandas as pd
import wandb
# Connect to the W&B public API and list every run of the project,
# which is addressed as "<entity>/<project-name>".
api = wandb.Api()
runs = api.runs("stupidtree/[CR]AE")

# One list per DataFrame column, filled in run order.
columns = {"summary": [], "config": [], "name": []}
for run in runs:
    # Output keys/values for metrics; `_json_dict` is used to omit large files.
    columns["summary"].append(run.summary._json_dict)
    # Hyperparameters, minus the special entries whose key starts with "_".
    columns["config"].append(
        {key: value for key, value in run.config.items() if not key.startswith('_')}
    )
    # Human-readable name of the run.
    columns["name"].append(run.name)

summary_list, config_list, name_list = columns["summary"], columns["config"], columns["name"]
runs_df_ae = pd.DataFrame(columns)


In [83]:
import numpy as np
# Settings for the AE table: model to filter on, metrics to report,
# and the methods that get a row.
model, metrics, methods = (
    'chatglm',
    ['rouge-l_f', 'METEOR'],
    ['SIP_b2tr'],
)

In [84]:
# Render the AE table with the settings currently held in
# `model`, `metrics` and `methods`.
print_big_table(
    runs_df_ae,
    model=model,
    metrics=metrics,
    methods=methods,
)

SIP_b2tr	66.36$\pm$1.12		62.08$\pm$0.67		68.89$\pm$0.57		64.22$\pm$0.69		62.61$\pm$0.91		64.86$\pm$0.80		48.94$\pm$0.44		41.83$\pm$1.82		65.85$\pm$1.08		59.06$\pm$2.21		62.53$\pm$0.83		58.41$\pm$1.24		



In [78]:
# NOTE(review): this cell carries an earlier execution count than the cell
# that sets `model`/`metrics`/`methods`, so the output stored with it was
# presumably produced with different settings — re-run top to bottom to confirm.
print_big_table(
    runs_df_ae,
    model=model,
    metrics=metrics,
    methods=methods,
)

SIP_b2tr	49.99$\pm$0.34		46.99$\pm$1.62		60.31$\pm$0.48		62.09$\pm$0.74		42.32$\pm$0.33		46.41$\pm$0.83		32.63$\pm$1.29		29.78$\pm$1.71		47.15$\pm$0.67		41.05$\pm$0.92		46.48$\pm$0.62		45.26$\pm$1.16		



# TAG

In [85]:
import pandas as pd
import wandb
# Connect to the W&B public API and list every run of the project,
# which is addressed as "<entity>/<project-name>".
api = wandb.Api()
runs = api.runs("stupidtree/[EXP]TAG")

# One list per DataFrame column, filled in run order.
columns = {"summary": [], "config": [], "name": []}
for run in runs:
    # Output keys/values for metrics; `_json_dict` is used to omit large files.
    columns["summary"].append(run.summary._json_dict)
    # Hyperparameters, minus the special entries whose key starts with "_".
    columns["config"].append(
        {key: value for key, value in run.config.items() if not key.startswith('_')}
    )
    # Human-readable name of the run.
    columns["name"].append(run.name)

summary_list, config_list, name_list = columns["summary"], columns["config"], columns["name"]
runs_df_tag = pd.DataFrame(columns)


In [89]:
import numpy as np
# Settings for the TAG table; the dataset list fixes the column order.
model = 'llama2'
metrics = ['rouge-l_f', 'METEOR']
methods = ['TAG']
print_big_table(
    runs_df_tag,
    model,
    metrics,
    methods,
    all_datasets=['sensimarked', 'codealpaca', 'gsm8k', 'piqa', 'wikitext'],
)

TAG	80.22$\pm$1.47		79.61$\pm$1.25		84.94$\pm$0.25		85.34$\pm$0.89		82.40$\pm$1.93		84.45$\pm$1.01		77.05$\pm$2.30		78.10$\pm$1.75		74.36$\pm$1.16		72.68$\pm$0.75		79.79$\pm$1.42		80.03$\pm$1.13		



# LAMP

In [27]:
import pandas as pd
import wandb
# Connect to the W&B public API and list every run of the project,
# which is addressed as "<entity>/<project-name>".
api = wandb.Api()
runs = api.runs("stupidtree/[EXP]LAMP")

# One list per DataFrame column, filled in run order.
columns = {"summary": [], "config": [], "name": []}
for run in runs:
    # Output keys/values for metrics; `_json_dict` is used to omit large files.
    columns["summary"].append(run.summary._json_dict)
    # Hyperparameters, minus the special entries whose key starts with "_".
    columns["config"].append(
        {key: value for key, value in run.config.items() if not key.startswith('_')}
    )
    # Human-readable name of the run.
    columns["name"].append(run.name)

summary_list, config_list, name_list = columns["summary"], columns["config"], columns["name"]
runs_df_lamp = pd.DataFrame(columns)


In [28]:
# Show the summary dict of the first fetched run (row label 0) to see
# which metric keys are available.
runs_df_lamp.loc[0, 'summary']

{'_runtime': 986.4540832042694,
 '_step': 604,
 '_timestamp': 1718008362.31172,
 '_wandb': {'runtime': 985},
 'client0_TAG_METEOR_avg': 0.7786840726695604,
 'client0_TAG_METEOR_sampled': 0.7880225324786865,
 'client0_TAG_TOKACC_avg': 0.8044791003726655,
 'client0_TAG_TOKACC_sampled': 0.8118317335330067,
 'client0_TAG_rouge-1_f_avg': 0.7903604562115735,
 'client0_TAG_rouge-1_f_sampled': 0.7917616706056817,
 'client0_TAG_rouge-2_f_avg': 0.6473491752160623,
 'client0_TAG_rouge-2_f_sampled': 0.6593663384141527,
 'client0_TAG_rouge-l_f_avg': 0.7789304978539686,
 'client0_TAG_rouge-l_f_sampled': 0.787110507814984,
 'client0_avg_loss': 2.161118257613409,
 'client0_global_round': 0,
 'client0_local_epoch': 0,
 'client0_local_step': 604,
 'client0_self': 0,
 'client0_step_loss': 1.4486998319625854,
 'client0_test-ppl': 19.068099975585938,
 'global_step': 604}

In [32]:
import numpy as np
# Settings for the LAMP table; the dataset list fixes the column order.
model = 'chatglm'
metrics = ['rouge-1_f', 'TOKACC']
methods = ['TAG']
print_big_table(
    runs_df_lamp,
    model,
    metrics,
    methods,
    all_datasets=['sensimarked', 'codealpaca', 'gsm8k', 'piqa', 'wikitext'],
)

TAG	39.21$\pm$0.13		50.50$\pm$0.18		78.25$\pm$0.39		77.58$\pm$0.46		79.48$\pm$0.32		79.58$\pm$0.35		70.31$\pm$0.34		66.97$\pm$0.10		69.03$\pm$0.52		69.62$\pm$0.52		67.26$\pm$0.34		68.85$\pm$0.32		



# EIA

In [93]:
import pandas as pd
import wandb
# Connect to the W&B public API and list every run of the project,
# which is addressed as "<entity>/<project-name>".
api = wandb.Api()
runs = api.runs("stupidtree/[EXP]EIA")

# One list per DataFrame column, filled in run order.
columns = {"summary": [], "config": [], "name": []}
for run in runs:
    # Output keys/values for metrics; `_json_dict` is used to omit large files.
    columns["summary"].append(run.summary._json_dict)
    # Hyperparameters, minus the special entries whose key starts with "_".
    columns["config"].append(
        {key: value for key, value in run.config.items() if not key.startswith('_')}
    )
    # Human-readable name of the run.
    columns["name"].append(run.name)

summary_list, config_list, name_list = columns["summary"], columns["config"], columns["name"]
runs_df_eia = pd.DataFrame(columns)


In [94]:
import numpy as np
# Settings for the EIA table; the dataset list fixes the column order.
model = 'llama2'
metrics = ['rouge-l_f', 'METEOR']
methods = ['EIA']
print_big_table(
    runs_df_eia,
    model,
    metrics,
    methods,
    all_datasets=['sensimarked', 'codealpaca', 'gsm8k', 'piqa', 'wikitext'],
)

EIA	79.94$\pm$4.51		74.08$\pm$5.16		56.15$\pm$3.13		52.34$\pm$3.70		62.81$\pm$4.14		64.58$\pm$3.43		57.56$\pm$0.31		64.87$\pm$1.30		84.33$\pm$1.62		81.35$\pm$0.06		68.16$\pm$2.74		67.44$\pm$2.73		

