# The code in this Jupyter notebook
* reads the models and the datasets
* implements the ensemble method and calculates its accuracy when the model pool is restricted to the shard's own models (i.e. no models can be chosen from outside the shard)


In [None]:
import pandas as pd
import os
from collections import defaultdict
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
import pickle

# Tokenizer shared by every model loaded in this notebook.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

# Experiment configuration
groups = list(map(str, range(3)))          # group ids as strings: '0', '1', '2'
peers = list(map(str, range(8090, 8100)))  # peer ids as strings: '8090' .. '8099'
epoch_cutoff = 6000                        # suffix of the saved-model name loaded for each peer
number_of_models_sampled = 3               # how many models are drawn per query for the ensemble

In [None]:
def find_first_files_with_str(directory, str_contain, epoch_cutoff):
    """Build the saved-model name ``'<str_contain>_<epoch_cutoff>'``.

    Despite the name, nothing is searched on disk and ``directory`` is not
    used; only the relative name is returned.

    Args:
        directory: accepted for interface compatibility, ignored.
        str_contain: string prefix of the name (the peer id in this notebook).
        epoch_cutoff: value appended after an underscore.

    Returns:
        The string ``str_contain + '_' + str(epoch_cutoff)``.
    """
    model_name = '_'.join((str_contain, str(epoch_cutoff)))
    return model_name

In [None]:
# reading individual peer datasets & group datasets and publishing them as
# notebook-level variables, e.g. train_df_group1, train_df_group1_peer1,
# test_df_group1, test_df_group1_peer1.
# The names are assigned through globals() instead of exec() on generated
# source, so later cells can keep looking them up by name.
for group in groups:
    datasets_folder = os.path.join('aggregated_results', f'group{group}', 'datasets')

    # creating train_df's: one frame per peer ...
    peer_train_dfs = {}
    for peer in peers:
        peer_nbr = int(peer) - 8089  # peer ids start at 8090, so '8090' -> peer1
        peer_train_df = pd.read_csv(os.path.join(datasets_folder, f'{peer}_df.csv'))
        peer_train_dfs[peer_nbr] = peer_train_df
        globals()[f'train_df_group{group}_peer{peer_nbr}'] = peer_train_df

    # ... and the group frame as their de-duplicated union. A single concat
    # avoids re-copying the growing frame on every iteration and avoids
    # concatenating with an empty placeholder DataFrame.
    group_train_df = pd.concat(list(peer_train_dfs.values())).drop_duplicates()
    globals()[f'train_df_group{group}'] = group_train_df

    # creating test_df's: test_df_group1, test_df_group1_peer1
    # Only rows whose doc_id occurs in the matching training data are kept.
    group_test_df = pd.read_csv(os.path.join(datasets_folder, 'test_df.csv'))
    group_test_df = group_test_df[group_test_df['doc_id'].isin(group_train_df['doc_id'].unique())]
    globals()[f'test_df_group{group}'] = group_test_df

    for peer_nbr, peer_train_df in peer_train_dfs.items():
        peer_doc_ids = peer_train_df['doc_id'].unique()
        globals()[f'test_df_group{group}_peer{peer_nbr}'] = group_test_df[group_test_df['doc_id'].isin(peer_doc_ids)]
    
    
    

# reading models, e.g. model_group1_peer1
# Each model is published as a notebook-level variable through globals()
# instead of exec() on generated source.
for group in groups:
    model_folder = os.path.join('aggregated_results', f'group{group}', 'models')
    for peer in peers:
        # epoch_cutoff selects which saved model is loaded for every peer.
        # NOTE(review): an earlier comment here said "10 is the largest number of
        # saved models that all peers have finished training", which does not
        # match the configured epoch_cutoff -- confirm the intended cutoff.
        model_file = find_first_files_with_str(model_folder, peer, epoch_cutoff)
        print (group, peer, model_file)
        model_path = os.path.join(model_folder, model_file)
        globals()[f'model_group{group}_peer{int(peer) - 8089}'] = T5ForConditionalGeneration.from_pretrained(model_path)

# Sampling random models and aggregating their suggestions - 5 beams, with probabilities

In [None]:
# Collect each group's peer models into models_group<g>_list, in the order of
# `peers`. The lists are derived from `groups`, so no per-group list has to be
# declared by hand and no source code is generated for exec().
for group in groups:
    globals()[f'models_group{group}_list'] = [
        globals()[f'model_group{group}_peer{int(peer) - 8089}'] for peer in peers
    ]
       

In [None]:
import random
from collections import defaultdict
import threading
from torch.nn.functional import softmax

class ModelManager:
    """Hold the models, tokenizer and dataframes - including the results once calculated.

    For every query a random subset of ``model_list`` is sampled. Each sampled
    model proposes its beam-search candidates, the beam scores of one model are
    softmax-normalised, and the resulting weights are summed per decoded text
    across the sampled models. The five highest-weighted texts are stored in a
    ``generated_doc_id`` column added to the manager's own copies of the
    dataframes.

    Args:
        model_list: pool of models; each must provide ``generate(...)``.
        train_df: dataframe with ``query`` and ``doc_id`` columns (copied).
        test_df: dataframe with ``query`` and ``doc_id`` columns (copied).
        tokenizer: object providing ``encode`` and ``decode``.
        num_models_sampled: ensemble size per query. When ``None`` (default)
            the notebook-level ``number_of_models_sampled`` is read each time
            models are sampled, which is the original behaviour.
    """
    def __init__(self, model_list, train_df, test_df, tokenizer, num_models_sampled=None):
        self.model_list = model_list
        # Work on copies so the result column never leaks into the caller's frames.
        self.train_df = train_df.copy()
        self.test_df = test_df.copy()

        print ('train set size:', self.train_df.shape[0])
        print ('test set size:', self.test_df.shape[0])

        self.tokenizer = tokenizer
        self.num_models_sampled = num_models_sampled
        self.counter = 0  # number of queries processed so far (train and test combined)

    def generate_text_beams(self, query):
        '''Return the ensemble's top-5 generated texts for a single query.

        Raises:
            ValueError: from ``random.sample`` when the pool holds fewer models
                than the requested ensemble size.
        '''
        self.counter += 1
        if self.counter % 300 == 0:
            print(f"Processed {self.counter} queries")

        # sampling the ensemble members from the ones available
        sample_size = self.num_models_sampled
        if sample_size is None:
            sample_size = number_of_models_sampled
        sampled_models = random.sample(self.model_list, sample_size)

        # The encoded query is the same for every model, so tokenise it once.
        input_ids = self.tokenizer.encode(query, return_tensors='pt')

        # Computing the ensemble: accumulated weight per decoded text
        results = defaultdict(float)
        for model in sampled_models:
            output = model.generate(input_ids, do_sample=False, return_dict_in_generate=True, output_scores=True,
                                    num_beams=5, num_return_sequences=5, max_length = 20)

            # Normalise this model's beam scores into weights that sum to 1.
            probabilities = softmax(output.sequences_scores, dim=0).tolist()
            model_res = [self.tokenizer.decode(output_id, skip_special_tokens=True) for output_id in output.sequences]

            for res, prob in zip(model_res, probabilities):
                results[res] += prob

        return self.top_5_generated_texts(results)

    def top_5_generated_texts(self, input_dict):
        '''Return up to 5 keys of ``input_dict`` with the largest values, best first.'''
        sorted_responses = sorted(input_dict.items(), key=lambda x: x[1], reverse=True)
        return [response[0] for response in sorted_responses[:5]]

    def evaluate_accuracy(self):
        '''Generate suggestions for every train and test query and score them.

        Adds a ``generated_doc_id`` column to ``self.train_df`` and
        ``self.test_df``.

        Returns:
            Tuple ``(acc_train, acc_test)``.
        '''
        self.train_df['generated_doc_id'] = self.train_df['query'].apply(self.generate_text_beams)
        self.test_df['generated_doc_id'] = self.test_df['query'].apply(self.generate_text_beams)

        acc_train = self.calculate_accuracy(self.train_df)
        acc_test = self.calculate_accuracy(self.test_df)

        return acc_train, acc_test

    def calculate_accuracy(self, df):
        '''Return the fraction of rows whose ``doc_id`` is among ``generated_doc_id``.

        An empty dataframe has no defined accuracy; NaN is returned instead of
        dividing by zero.
        '''
        row_count = df.shape[0]
        if row_count == 0:
            return float('nan')
        # NOTE(review): the generated candidates are decoded strings; if doc_id
        # was parsed as a number the membership test cannot succeed -- confirm
        # that the doc_id column holds strings.
        return df.apply(lambda row: row['doc_id'] in row['generated_doc_id'], axis=1).sum() / row_count

# run_evaluation assumes the following notebook-level names already exist for
# the group it is given, such as:
# models_group1_list = [...]
# train_df_group1 = ...
# test_df_group1 = ...
# ... and so on for other groups
# And a tokenizer instance

def run_evaluation(group_nbr, tokenizer):
    """Evaluate one group's model pool and record the outcome.

    Looks up ``models_group<N>_list``, ``train_df_group<N>`` and
    ``test_df_group<N>`` in the module globals, builds a ModelManager from
    them, and stores the manager in ``model_managers`` and the accuracies in
    ``global_accuracies`` under the key ``group_nbr``. Nothing is returned;
    the accuracies are also printed.
    """
    namespace = globals()
    manager = ModelManager(
        namespace[f'models_group{group_nbr}_list'],
        namespace[f'train_df_group{group_nbr}'],
        namespace[f'test_df_group{group_nbr}'],
        tokenizer,
    )

    # Register the manager before evaluating so it can be reached via model_managers.
    model_managers[group_nbr] = manager

    acc_train, acc_test = manager.evaluate_accuracy()
    global_accuracies[group_nbr] = dict(acc_train=acc_train, acc_test=acc_test)

    print(f"Group: {group_nbr}, Train Acc: {acc_train}, Test Acc: {acc_test}")

    
# Result stores filled in by run_evaluation, keyed by group number:
# the ModelManager instances and their train/test accuracies.
model_managers = dict()
global_accuracies = dict()

# One worker thread per group: start them all, then wait until every one is done.
threads = []

for group_nbr in groups:
    thread = threading.Thread(
        target=run_evaluation,
        kwargs={'group_nbr': group_nbr, 'tokenizer': tokenizer},
    )
    threads.append(thread)
    thread.start()

for thread in threads:
    thread.join()


In [None]:

def defaultdict_to_dict(d):
    """Recursively convert defaultdicts to plain dicts.

    Both ``defaultdict`` instances and plain ``dict`` instances are walked, so
    a defaultdict stored inside a regular dict is converted as well. Any other
    value (including other dict subclasses and lists) is returned unchanged.

    Args:
        d: the value to convert.

    Returns:
        A new plain ``dict`` when ``d`` is a defaultdict or a plain dict,
        otherwise ``d`` itself.
    """
    if isinstance(d, defaultdict) or type(d) is dict:
        return {key: defaultdict_to_dict(value) for key, value in d.items()}
    return d


In [None]:
# Pass the per-group accuracies through defaultdict_to_dict before pickling
regular_dict_top5 = defaultdict_to_dict(global_accuracies)

# Write the top-5 accuracies to disk
with open('accuracies_top5.pkl', 'wb') as out_file:
    pickle.dump(regular_dict_top5, out_file)

In [None]:
# Load the saved top-5 accuracies back from disk
with open('accuracies_top5.pkl', 'rb') as in_file:
    loaded_dict = pickle.load(in_file)

# Last expression of the cell: shows the loaded dictionary as cell output
loaded_dict

In [None]:
# Top-1 test accuracy per group: only the highest-ranked suggestion counts.
regular_dict_top1 = {}
for group, manager in model_managers.items():
    df_temp = manager.test_df.copy()
    # Keep just the first (best) entry of each ranked suggestion list.
    df_temp['generated_doc_id_1beam'] = df_temp['generated_doc_id'].apply(lambda ranked: [ranked[0]])
    hits = df_temp.apply(lambda row: row['doc_id'] in row['generated_doc_id_1beam'], axis=1)
    top1_accuracy = hits.sum() / df_temp.shape[0]
    regular_dict_top1[group] = {'acc_test': top1_accuracy}
    print (group, top1_accuracy)
    
    
# display(df_temp)

In [None]:
# Write the top-1 accuracies to disk
with open('accuracies_top1.pkl', 'wb') as out_file:
    pickle.dump(regular_dict_top1, out_file)

In [None]:
# Load the saved top-1 accuracies back from disk
with open('accuracies_top1.pkl', 'rb') as in_file:
    loaded_dict = pickle.load(in_file)

# Last expression of the cell: shows the loaded dictionary as cell output
loaded_dict