In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def make_df_into_right_columns(d, samplingmodels = False):
    """Flatten a nested dictionary into a long-format DataFrame.

    Every leaf of ``d`` becomes one row. The keys met on the way down to the
    leaf are stored in ``Level_1`` ... ``Level_k`` (``k`` being the deepest
    nesting found in ``d``) and the leaf itself is stored in ``Value``.

    Parameters
    ----------
    d : dict
        Nested dictionary with any number of levels. A non-dict ``d`` is
        treated as a single leaf and yields a one-row, ``Value``-only frame.
    samplingmodels : bool, default False
        When True, every ``Level_1`` entry is expanded with ``pd.Series`` into
        the two columns ``'data shard'`` and ``'nbr of shards'`` and
        ``Level_1`` is dropped, so the top-level keys must be 2-item
        sequences.

    Returns
    -------
    pandas.DataFrame
        One row per leaf. Branches shallower than the deepest one are padded
        with ``None`` so that the leaf always sits in ``Value``. An input
        without any leaf gives an empty frame instead of raising.
    """
    def _leaf_rows(node, path=()):
        """Return one ``(key path, leaf)`` pair for every leaf below ``node``."""
        if not isinstance(node, dict):
            return [(list(path), node)]
        found = []
        for key, child in node.items():
            found.extend(_leaf_rows(child, (*path, key)))
        return found

    split_columns = ['data shard', 'nbr of shards']
    leaves = _leaf_rows(d)

    # No leaf at all (e.g. ``{}``): max() below would raise on an empty
    # sequence, so hand back an empty frame with the expected columns.
    if not leaves:
        empty_columns = ['Value'] + (split_columns if samplingmodels else [])
        return pd.DataFrame(columns=empty_columns)

    depth = max(len(path) for path, _ in leaves)

    # Pad short paths so the leaf is never mistaken for a level key.
    rows = [path + [None] * (depth - len(path)) + [leaf] for path, leaf in leaves]
    level_columns = [f'Level_{i + 1}' for i in range(depth)]
    df_final = pd.DataFrame(rows, columns=level_columns + ['Value'])

    if samplingmodels:
        df_final[split_columns] = df_final['Level_1'].apply(pd.Series)
        df_final = df_final.drop(columns='Level_1')
    return df_final

def make_df_into_right_columns_secondmode(data, acc_col_name = ''):
    """Reshape per-shard accuracy records into one row per (shard, split).

    Parameters
    ----------
    data : dict
        Maps each data shard to a record holding ``'acc_test'`` and,
        optionally, ``'acc_train'``.
    acc_col_name : str, default ''
        Name given to the accuracy column of the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``'data shard'``, ``acc_col_name`` and ``'data split'``.
        The ``'test'`` rows come first; the ``'train'`` rows follow when
        ``'acc_train'`` is present in ``data``.

    Raises
    ------
    KeyError
        If the records carry no ``'acc_test'`` entry.
    """
    df = (
        pd.DataFrame.from_dict(data, orient='index')
        .rename_axis('data shard')
        .reset_index()
    )

    def _split_rows(source_col, split_label):
        """Return the shard/accuracy pairs of one split, tagged with its label."""
        return (
            df[['data shard', source_col]]
            .rename(columns={source_col: acc_col_name})
            .assign(**{'data split': split_label})
        )

    parts = [_split_rows('acc_test', 'test')]
    if 'acc_train' in df.columns:
        # The train rows must be the ones appended here; appending the test
        # rows again would only duplicate them and lose the train accuracies.
        parts.append(_split_rows('acc_train', 'train'))

    return pd.concat(parts, ignore_index=True)





In [None]:
# Load the first three pickled accuracy objects. They are kept exactly as
# unpickled; nothing here converts them back to defaultdict.
# NOTE(review): pickle.load can execute arbitrary code, so only point these
# paths at files from a trusted source.

with open('global_accuracies_5beams.pkl', 'rb') as fh:
    ga_5beams = pickle.load(fh)

with open('inter_group_accs_5beams.pkl', 'rb') as fh:
    ga_samplingModels_5beams_probs = pickle.load(fh)

with open('localnglobal_accuracies_allgroups_allpeers.pkl', 'rb') as fh:
    ga_1beam = pickle.load(fh)

In [None]:
    
# Load the remaining pickled accuracy objects: the 1-beam inter-group file
# and the two files bound to the *_intragroup names (top1 / top5).
# They are kept exactly as unpickled; no defaultdict conversion is applied.
# NOTE(review): pickle.load can execute arbitrary code, so only point these
# paths at files from a trusted source.

with open('inter_group_accs_1beams.pkl', 'rb') as fh:
    ga_samplingModels_1beam_probs = pickle.load(fh)

with open('accuracies_top1.pkl', 'rb') as fh:
    ga_samplingModels_1beam_probs_intragroup = pickle.load(fh)

with open('accuracies_top5.pkl', 'rb') as fh:
    ga_samplingModels_5beams_probs_intragroup = pickle.load(fh)

In [None]:
# Print the two raw intra-group objects, one per line: top-5 first, then top-1.
print(
    ga_samplingModels_5beams_probs_intragroup,
    ga_samplingModels_1beam_probs_intragroup,
    sep='\n',
)

In [None]:
# Turn every loaded object into a DataFrame.
# NOTE: each name is rebound from the loaded object to its DataFrame, so this
# cell has to run exactly once after the loading cells.

# Nested dictionaries -> long format (Level_1 ... Level_k, Value).
ga_1beam = make_df_into_right_columns(ga_1beam)
ga_5beams = make_df_into_right_columns(ga_5beams)

# Same flattening for the inter-group results; the top-level key is kept as
# a single Level_1 column (samplingmodels stays off).
ga_samplingModels_5beams_probs = make_df_into_right_columns(
    ga_samplingModels_5beams_probs, samplingmodels=False
)
ga_samplingModels_1beam_probs = make_df_into_right_columns(
    ga_samplingModels_1beam_probs, samplingmodels=False
)

# Intra-group results go through the per-shard record reshaper instead.
ga_samplingModels_1beam_probs_intragroup = make_df_into_right_columns_secondmode(
    ga_samplingModels_1beam_probs_intragroup,
    acc_col_name='probabilistic suggestions',
)
ga_samplingModels_5beams_probs_intragroup = make_df_into_right_columns_secondmode(
    ga_samplingModels_5beams_probs_intragroup,
    acc_col_name='probabilistic suggestions',
)


In [None]:
# Normalizing format to not rewrite code
# Give the 1-beam inter-group table a constant Level_2 column so that the
# later Level_2 -> 'data split' rename applies to it as well.
ga_samplingModels_1beam_probs = ga_samplingModels_1beam_probs.assign(Level_2='acc_test')

In [None]:
# Print "<variable name> <columns>" for each table before the renaming step.
for table_name, table in (
    ("ga_1beam", ga_1beam),
    ('ga_5beams', ga_5beams),
    ('ga_samplingModels_5beams_probs', ga_samplingModels_5beams_probs),
    ('ga_samplingModels_1beam_probs', ga_samplingModels_1beam_probs),
    ('ga_samplingModels_1beam_probs_intragroup', ga_samplingModels_1beam_probs_intragroup),
    ('ga_samplingModels_5beams_probs_intragroup', ga_samplingModels_5beams_probs_intragroup),
):
    print(table_name, table.columns)

In [None]:
# Column names for the flattened per-peer accuracy tables.
# DataFrame.rename ignores keys that are not present, so one mapping can be
# shared by every table of the same kind.
PEER_COLUMN_NAMES = {
    'Level_1': 'data shard',
    'Level_2': 'peer',
    'Level_3': 'split',
    'Level_4': 'pers',
    'dataset_group': 'data shard',
}

# Column names for the flattened inter-group ensemble tables. Both the
# 1-beam and the 5-beam table use the same mapping so they stay consistent.
ENSEMBLE_COLUMN_NAMES = {
    'Level_1': 'data shard',
    'Level_2': 'data split',
    'Value': 'probabilistic suggestions',
    'dataset_group': 'data shard',
}

ga_1beam = ga_1beam.rename(columns=PEER_COLUMN_NAMES)
ga_5beams = ga_5beams.rename(columns=PEER_COLUMN_NAMES)

ga_samplingModels_5beams_probs = ga_samplingModels_5beams_probs.rename(columns=ENSEMBLE_COLUMN_NAMES)
ga_samplingModels_1beam_probs = ga_samplingModels_1beam_probs.rename(columns=ENSEMBLE_COLUMN_NAMES)

# Tag the intra-group tables with their beam count (assigned once; the
# original cell repeated these two lines verbatim).
ga_samplingModels_1beam_probs_intragroup['beams'] = 1
ga_samplingModels_5beams_probs_intragroup['beams'] = 5

In [None]:
# Display the 1-beam accuracy table.
ga_1beam

In [None]:
# Print "<variable name> <columns>" for each table to check the column names.
for table_name, table in (
    ("ga_1beam", ga_1beam),
    ('ga_5beams', ga_5beams),
    ('ga_samplingModels_5beams_probs', ga_samplingModels_5beams_probs),
    ('ga_samplingModels_1beam_probs', ga_samplingModels_1beam_probs),
    ('ga_samplingModels_1beam_probs_intragroup', ga_samplingModels_1beam_probs_intragroup),
    ('ga_samplingModels_5beams_probs_intragroup', ga_samplingModels_5beams_probs_intragroup),
):
    print(table_name, table.columns)

In [None]:
# Display the intra-group 1-beam table.
ga_samplingModels_1beam_probs_intragroup

In [None]:
# Inter-group, 5 beams: the split label is simply dropped.
ga_sm_5bms = ga_samplingModels_5beams_probs.drop(columns = 'data split')

# Intra-group tables: keep the test rows only BEFORE dropping the split
# label. Once 'data split' is gone, train and test accuracies could no longer
# be told apart in the concatenated table below.
ga_sm_1bm_ig = (
    ga_samplingModels_1beam_probs_intragroup
    .loc[lambda frame: frame['data split'] == 'test']
    .drop(columns='data split')
)

ga_sm_5bms_ig = (
    ga_samplingModels_5beams_probs_intragroup
    .loc[lambda frame: frame['data split'] == 'test']
    .drop(columns='data split')
)

# Stack the 1-beam and 5-beam intra-group results (original indices kept).
ga_sm_ig = pd.concat([ga_sm_1bm_ig, ga_sm_5bms_ig])
# ga_sm_ig['nbr of shards'] = 3



In [None]:
# Keep only the shard, the beam count and the accuracy, in that order.
ga_sm_ig = ga_sm_ig.loc[:, ['data shard', 'beams', 'probabilistic suggestions']]
# ga_sm_5bms[ga_sm_5bms

In [None]:
# Display the intra-group ensemble table.
ga_sm_ig

In [None]:
# 1-beam inter-group table: drop the split label, cast the accuracy column
# to float and tag the rows with their beam count.
ga_sm_1bm = (
    ga_samplingModels_1beam_probs
    .drop(columns='data split')
    .astype({'probabilistic suggestions': float})
    .assign(beams=1)
)

# The 5-beam inter-group table only needs its beam count.
ga_sm_5bms = ga_sm_5bms.assign(beams=5)

# Presentation names for the intra-group table ...
ga_sm_ig = ga_sm_ig.rename(
    columns={'probabilistic suggestions': 'ensemble accuracy', 'beams': 'topk'}
)

# ... and for the stacked 1-beam + 5-beam inter-group table.
ga_sm_interg = (
    pd.concat([ga_sm_1bm, ga_sm_5bms])
    .rename(columns={'probabilistic suggestions': 'ensemble accuracy', 'beams': 'topk'})
)

In [None]:

# ga_sm_interg['nbr of shards'] = ga_sm_interg['nbr of shards'].replace({'two': 2, 'three': 3})



Above we can see the intra-group ensemble, where peers were chosen from a pool belonging to the same shard.

In [None]:
# Show the inter-group accuracies ordered by shard, then by top-k.
display(
    ga_sm_interg
    .sort_values(by=['data shard', 'topk'])
    .loc[:, ['data shard', 'topk', 'ensemble accuracy']]
)


Above we can see the inter-group ensemble, where peers were chosen from a pool spanning different shards: 3 peers were sampled from each of the three shards.

In [None]:
# Tag every row with the kind of model pool its ensemble was drawn from.
ga_sm_ig = ga_sm_ig.assign(**{'model pool': 'Only belonging to shard '})
ga_sm_interg = ga_sm_interg.assign(**{'model pool': 'Belonging to all shards'})

In [None]:
# Display the intra-group ensemble table again.
ga_sm_ig

In [None]:
# Shard ids as stored in the tables -> shard letters used for presentation.
shard_labels = {'0': 'A', '1': 'B', '2': 'C'}

# Name the shard in the intra-group pool label. This must happen BEFORE the
# concat: pd.concat builds a new frame, so labelling ga_sm_ig afterwards
# would never show up in df_total.
ga_sm_ig['model pool'] = (
    'Only belonging to shard '
    + ga_sm_ig['data shard'].replace(shard_labels).astype(str)
)

# Stack inter-group and intra-group results, then relabel the shards.
df_total = pd.concat([ga_sm_interg, ga_sm_ig])
df_total['data shard'] = df_total['data shard'].replace(shard_labels)

In [None]:
# Wide layout: one row per (shard, model pool) and one accuracy column per
# top-k value; the first accuracy found is kept for each cell.
# NOTE: df_total is rebound to the pivoted table, so run this cell once.
df_total = (
    df_total
    .pivot_table(
        values='ensemble accuracy',
        index=['data shard', 'model pool'],
        columns='topk',
        aggfunc='first',
    )
    .reset_index()
    .rename(columns={'data shard': 'Shard', 1: 'top1', 5: 'top5'})
)

In [None]:
# Display the final table.
df_total

Thus, `df_total` holds the top-1 and top-5 accuracies for each of the trained shards. It is equivalent to Table 3 in the paper.