In [None]:
import collections

import math
import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow_federated as tff
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from keras.models import Sequential 
from keras.layers import Dense
from keras.layers import Dense, Dropout
from keras.initializers import GlorotUniform


# Experiment-wide settings shared by the cells below.
TEST_SIZE = 0.2  # NOTE(review): not referenced anywhere else in the visible notebook
NUM_CLIENTS = 4  # default number of clients for create_clients() / init()
BATCH_SIZE = 512  # batch size applied in preprocess()
DROPOUT = 0.2  # rate of the Dropout layers in create_keras_model()
EPOCHS = 10  # how many times preprocess() repeats each client dataset
PREFETCH_BUFFER = 10  # prefetch buffer size used in preprocess()
NUM_ROUNDS = 10  # number of federated rounds run by every training loop
UNBALANCED = False  # NOTE(review): not referenced anywhere else in the visible notebook
# Print the directory the tensorflow_federated package was imported from.
path = os.path.dirname(tff.__file__)
print(path)

# Seed NumPy's global RNG (it drives the client assignment) and only log TF errors.
np.random.seed(42)
tf.get_logger().setLevel('ERROR')

In [77]:
# Load the pre-split train and test CSV files.
train_df = pd.read_csv('datasets/train_internet.csv')
test_df = pd.read_csv('datasets/test_internet.csv')

# Features are every column except 'label'; the label column is cast to int.
train_x, train_y = train_df.drop(columns=['label']), train_df['label'].astype(int)
test_x, test_y = test_df.drop(columns=['label']), test_df['label'].astype(int)

# Per-client preprocessing: repeat the client's data, then split it into batches.
def preprocess(dataset):
  """Repeat, batch and prefetch a single client's dataset.

  Args:
    dataset: object exposing ``repeat``/``batch``/``prefetch`` (a
      ``tf.data.Dataset`` in this notebook).

  Returns:
    The dataset repeated ``EPOCHS`` times, batched by ``BATCH_SIZE`` and
    prefetched with a buffer of ``PREFETCH_BUFFER``.
  """
  repeated = dataset.repeat(EPOCHS)
  batched = repeated.batch(BATCH_SIZE)
  return batched.prefetch(PREFETCH_BUFFER)

# Adds a client_num column so that every client owns a different share of the rows.
def client_unbalanced(dataset, num_clients):
    """Assign each row to a client using Pareto-distributed client weights.

    The per-client probabilities are drawn from ``np.random.pareto(1, num_clients)``
    and normalised to sum to one, so the result depends on NumPy's global RNG state.

    Args:
        dataset: pandas DataFrame whose rows must be distributed.
        num_clients: number of clients; ids are drawn from ``0 .. num_clients - 1``.

    Returns:
        A new DataFrame equal to ``dataset`` plus a ``client_num`` column.
        The input frame is not modified.
    """
    prob = np.random.pareto(1, num_clients)
    prob /= np.sum(prob)
    print(f'Le probabilità per client sono {prob}')
    # One vectorised draw for all rows instead of one np.random.choice call per row.
    client_num = np.random.choice(num_clients, size=len(dataset), p=prob)
    # assign() builds a new frame, so the caller's DataFrame keeps its columns.
    return dataset.assign(client_num=client_num)

# Adds a client_num column so that every client draws its examples from all the
# classes except `exclude_count` of them.
def distribute_clients(dataset, exclude_count, num_clients=4):
    """Split the rows among clients so that each client misses some classes.

    Classes are taken in order of first appearance in ``dataset['label']``.
    Client ``c`` skips the ``exclude_count`` classes starting at position ``c``
    (wrapping around). The rows of every class are then handed out, in ascending
    client order, in runs of ``ceil(n_rows / n_keepers)`` rows to the clients
    that keep that class.

    Args:
        dataset: pandas DataFrame with a ``label`` column.
        exclude_count: number of classes hidden from each client.
        num_clients: number of clients to create (defaults to 4, the value that
            used to be hard-coded).

    Returns:
        A copy of ``dataset`` sorted by ``label`` with a ``client_num`` column.
        The input frame is not modified.

    Raises:
        ValueError: if a class is excluded by every client, so its rows cannot
            be assigned to anyone.
    """
    class_order = dataset['label'].unique()
    n_classes = len(class_order)

    # keepers[label] -> ascending list of the clients that are allowed to hold it
    keepers = {label: [] for label in class_order}
    for client in range(num_clients):
        if n_classes:
            classes_to_exclude = [class_order[(client + shift) % n_classes]
                                  for shift in range(exclude_count)]
        else:
            classes_to_exclude = []
        classes_to_include = [x for x in class_order if x not in classes_to_exclude]
        print(classes_to_include)
        for label in classes_to_include:
            keepers[label].append(client)

    # A stable sort keeps the row order inside each class deterministic.
    ordered = dataset.sort_values(by='label', kind='mergesort')
    sorted_labels = ordered['label']

    client_ids = []
    for label in sorted(keepers):
        class_size = int((sorted_labels == label).sum())
        holders = keepers[label]
        if not holders:
            raise ValueError(
                f'class {label!r} is excluded by every client; '
                f'lower exclude_count (got {exclude_count})')
        share = math.ceil(class_size / len(holders))
        run = [client for client in holders for _ in range(share)]
        # Truncate per class: ceil() may over-allocate, and the surplus ids must
        # not spill onto the rows of the next class.
        client_ids.extend(run[:class_size])

    ordered['client_num'] = client_ids
    return ordered

# Builds a TFF ClientData object from the training DataFrame after tagging every
# row with a client_num column.
def create_clients(dataset, unbalanced, num_clients=NUM_CLIENTS, exclude_count=0):
    """Partition ``dataset`` among clients and wrap the result as TFF ClientData.

    Args:
        dataset: pandas DataFrame with the feature columns and a ``label`` column.
        unbalanced: partitioning mode. ``0`` assigns each row to a uniformly
            random client, ``1`` uses ``client_unbalanced`` and ``2`` uses
            ``distribute_clients``.
        num_clients: number of clients for modes 0 and 1. Mode 2 calls
            ``distribute_clients(dataset, exclude_count)`` and therefore uses
            that function's own client count.
        exclude_count: classes hidden from each client (mode 2 only).

    Returns:
        The ClientData built by ``from_clients_and_tf_fn``. Each client dataset
        yields ``{'y': label, 'x': features}`` taken only from that client's
        rows. For an unknown mode the string ``'non valido'`` is returned.
    """
    # Work on a copy so the caller's frame never gains a client_num column.
    dataset = dataset.copy()
    if unbalanced == 1:
        dataset = client_unbalanced(dataset, num_clients)
    elif unbalanced == 0:
        # Integer ids in [0, num_clients). Continuous uniform draws would give
        # every row its own unique "client".
        dataset['client_num'] = np.random.randint(0, num_clients, len(dataset))
    elif unbalanced == 2:
        dataset = distribute_clients(dataset, exclude_count)
    else:
        ret = 'non valido'
        return ret

    # One dictionary per client, built from that client's own rows only.
    client_train_dataset = collections.OrderedDict()
    for client_id, client_rows in dataset.groupby('client_num'):
        labels = client_rows['label'].astype(int)
        # client_num is bookkeeping, not a feature, so it is dropped with the label.
        features = client_rows.drop(columns=['label', 'client_num'])
        client_train_dataset[client_id] = collections.OrderedDict(
            (('y', labels), ('x', features)))

    # The dictionaries are exposed as tf.data datasets through ClientData.
    def serializable_dataset_fn(client_id):
        client_data = client_train_dataset[client_id]
        return tf.data.Dataset.from_tensor_slices(client_data)

    tff_train_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
        client_ids=list(client_train_dataset.keys()),
        serializable_dataset_fn=serializable_dataset_fn
    )

    return tff_train_data

# Builds the list holding one preprocessed dataset per selected client.
elem_spec = {}
def init(dataset, active_clients=NUM_CLIENTS, unbalanced=0, exclude_count=0):
    """Create the federated training data as a list of client datasets.

    Args:
        dataset: training DataFrame forwarded to ``create_clients``.
        active_clients: passed to ``create_clients`` as the client count and
            also used as the maximum number of clients kept.
        unbalanced: partitioning mode forwarded to ``create_clients``.
        exclude_count: forwarded to ``create_clients``.

    Returns:
        A list with ``preprocess`` applied to the dataset of each of the first
        ``active_clients`` client ids (in sorted order).
    """
    client_data = create_clients(dataset, unbalanced, active_clients, exclude_count)
    selected_ids = sorted(client_data.client_ids)[:active_clients]
    federated = []
    for client_id in selected_ids:
        federated.append(preprocess(client_data.create_tf_dataset_for_client(client_id)))
    return federated

In [101]:
def create_keras_model():
  """Build the (uncompiled) feed-forward classifier used by every experiment.

  The input width is the number of columns of the global ``train_x``. The
  network is Dense(112) -> Dropout -> Dense(272) -> Dropout -> Dense(4), and
  every Dense layer uses a Glorot-uniform initializer created with seed 1.

  Returns:
    A Keras ``Sequential`` model.
  """
  seed = 1
  # NOTE(review): the first hidden layer uses 'softmax' as activation. Confirm
  # this is intended and not a leftover from the output layer.
  layers = [
      Dense(112, activation='softmax', kernel_initializer=GlorotUniform(seed), input_dim=train_x.shape[-1]),
      Dropout(DROPOUT),
      Dense(272, kernel_initializer=GlorotUniform(seed), activation='tanh'),
      Dropout(DROPOUT),
      Dense(4, kernel_initializer=GlorotUniform(seed), activation='softmax'),
  ]

  model = Sequential()
  for layer in layers:
    model.add(layer)
  return model

In [73]:
def model_fn():
  """Wrap a fresh Keras model for TFF.

  Reads the global ``elem_spec`` as input spec, so it must be set from the
  federated data before a training process is built.

  Returns:
    The result of ``tff.learning.models.from_keras_model`` with sparse
    categorical cross-entropy loss and sparse categorical accuracy metric.
  """
  fresh_model = create_keras_model()
  loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
  metric_list = [tf.keras.metrics.SparseCategoricalAccuracy()]
  return tff.learning.models.from_keras_model(
      fresh_model,
      input_spec=elem_spec,
      loss=loss_fn,
      metrics=metric_list)

In [74]:
def aggregator(algo, prox):
    """Build the TFF training process for the requested aggregation algorithm.

    Args:
        algo: one of ``'weighted avg'``, ``'unweighted avg'``,
            ``'weighted prox'`` or ``'unweighted prox'``.
        prox: proximal strength. It is only used by the two FedProx variants.

    Returns:
        The learning process built by the matching
        ``tff.learning.algorithms.build_*`` function. Client and server both
        use Adam with learning rate 0.01.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.
    """
    def adam():
        # A fresh optimizer per role, as in the original explicit calls.
        return tff.learning.optimizers.build_adam(learning_rate=0.01)

    if algo == 'weighted avg':
        return tff.learning.algorithms.build_weighted_fed_avg(
            model_fn,
            client_optimizer_fn=adam(),
            server_optimizer_fn=adam())

    if algo == 'unweighted avg':
        return tff.learning.algorithms.build_unweighted_fed_avg(
            model_fn,
            client_optimizer_fn=adam(),
            server_optimizer_fn=adam())

    if algo == 'weighted prox':
        return tff.learning.algorithms.build_weighted_fed_prox(
            model_fn,
            proximal_strength=prox,
            client_optimizer_fn=adam(),
            server_optimizer_fn=adam())

    if algo == 'unweighted prox':
        # This branch used to call build_weighted_fed_prox (copy-paste slip), so
        # the "unweighted prox" experiment silently repeated the weighted one.
        return tff.learning.algorithms.build_unweighted_fed_prox(
            model_fn,
            proximal_strength=prox,
            client_optimizer_fn=adam(),
            server_optimizer_fn=adam())

    # Fail with a clear message instead of an unbound-local error.
    raise ValueError(f'Unknown aggregation algorithm: {algo!r}')

In [None]:
# Baseline run: default client split, weighted FedAvg, NUM_ROUNDS rounds.
federated_train_data = init(train_df)
# model_fn() reads the global elem_spec, so it must be set before aggregator() is called.
elem_spec = federated_train_data[0].element_spec
# The second argument (proximal strength) is not used by the 'weighted avg' branch.
training_process = aggregator('weighted avg', 20.0)
train_state = training_process.initialize()
for round_num in range(NUM_ROUNDS):
  # Each round consumes the current state and returns the updated state plus metrics.
  result = training_process.next(train_state, federated_train_data)
  train_state = result.state
  train_metrics = result.metrics
  print('round {:2d}, metrics={}'.format(round_num, train_metrics))

In [75]:
def keras_evaluate(state, training_process):
  """Evaluate the federated model weights on the global test set.

  Args:
    state: training state whose model weights are evaluated.
    training_process: process providing ``get_model_weights(state)``.

  Returns:
    ``(loss, accuracy)`` as returned by ``evaluate`` on ``test_x``/``test_y``.
  """
  eval_model = create_keras_model()
  eval_model.compile(
      loss=tf.keras.losses.SparseCategoricalCrossentropy(),
      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
  # Copy the weights learned by the federated process into the fresh Keras model.
  training_process.get_model_weights(state).assign_weights_to(eval_model)
  loss, accuracy = eval_model.evaluate(x=test_x, y=test_y)
  print('\tEval: loss={l:.3f}, accuracy={a:.3f}'.format(l=loss, a=accuracy))
  return loss, accuracy

In [None]:
# Evaluate the weights of the most recent training run on test_x / test_y.
keras_evaluate(train_state, training_process)

Esperimenti
==============

***Algoritmo di aggregazione***

In [None]:
federated_train_data = init(train_df)
elem_spec = federated_train_data[0].element_spec

# Sweep of the proximal-strength parameter for weighted FedProx.
def tune_proximal_strength():
    """Train weighted FedProx once per candidate proximal strength.

    Uses the global ``federated_train_data`` and runs ``NUM_ROUNDS`` rounds per
    candidate.

    Returns:
        A list of ``(strength, [(round_num, train_accuracy), ...])`` tuples.
    """
    candidate_strengths = [1.0, 10.0, 20.0, 128.0, 256.0, 512.0]
    results = []
    for strength in candidate_strengths:
        training_process = aggregator('weighted prox', strength)
        train_state = training_process.initialize()
        history = []
        for round_num in range(NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state, train_metrics = result.state, result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))
            accuracy = train_metrics['client_work']['train']['sparse_categorical_accuracy']
            history.append((round_num, accuracy))
        results.append((strength, history))

    return results

prox_list = tune_proximal_strength()

In [None]:
# Training accuracy per round, one curve per proximal strength tried.
# The counter `i` of the earlier version was never read and has been dropped.
rounds = [[x[0] for x in acc_list] for _, acc_list in prox_list]
accuracies = [[x[1] for x in acc_list] for _, acc_list in prox_list]

fig, ax = plt.subplots()
for (strength, _), curve_rounds, curve_acc in zip(prox_list, rounds, accuracies):
    ax.plot(curve_rounds, curve_acc, label='prox strength = {}'.format(strength))
ax.set(xlabel='rounds', ylabel='accuracy',
       title='Proximal strength tuning')
ax.legend()
plt.show()


In [None]:
federated_train_data = init(train_df, unbalanced=1)
elem_spec = federated_train_data[0].element_spec

def agg_experiment():
    """Compare the four aggregation algorithms on the unbalanced client split.

    Uses the global ``federated_train_data``, a proximal strength of 1.0 and
    ``NUM_ROUNDS`` rounds per algorithm.

    Returns:
        A list of ``(algorithm_name, [(round_num, train_accuracy), ...])`` tuples.
    """
    algorithms = ['weighted avg', 'unweighted avg', 'weighted prox', 'unweighted prox']
    results = []
    for algo_name in algorithms:
        training_process = aggregator(algo_name, 1.0)
        train_state = training_process.initialize()
        history = []
        for round_num in range(NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state, train_metrics = result.state, result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))
            accuracy = train_metrics['client_work']['train']['sparse_categorical_accuracy']
            history.append((round_num, accuracy))
        results.append((algo_name, history))
    return results

agg_algo_list = agg_experiment()


In [None]:
# Training accuracy per round, one curve per aggregation algorithm.
rounds = [[x[0] for x in acc_list] for _, acc_list in agg_algo_list]
accuracies = [[x[1] for x in acc_list] for _, acc_list in agg_algo_list]

fig, ax = plt.subplots()
for (algo_name, _), curve_rounds, curve_acc in zip(agg_algo_list, rounds, accuracies):
    # The legend shows the algorithm name. It used to read 'prox strength = ...',
    # a label copied from the proximal-strength plot.
    ax.plot(curve_rounds, curve_acc, label=algo_name)
ax.set(xlabel='rounds', ylabel='accuracy',
       title='Aggregation Algorithm')
ax.legend()
plt.show()

***Numero e Percentuale clients***

In [None]:
def client_perc_experiment():
    """Measure the effect of the number of clients and of the active fraction.

    For every client count the training data is partitioned once by asking
    ``create_clients`` for that many clients (mode 0). For every fraction only
    the first ``floor(count * fraction)`` client ids (in sorted order) take
    part in training. Previously ``floor(count * fraction)`` was passed as the
    total client count, so "10 clients at 25%" was really "2 clients at 100%".

    Sets the global ``elem_spec`` before each training process is built.

    Returns:
        ``(client_list, eval_list)`` where ``client_list`` holds
        ``(count, [(fraction, [(round_num, train_accuracy), ...]), ...])`` and
        ``eval_list`` holds ``(count, fraction, (loss, accuracy))``.
    """
    global elem_spec
    client_list = []
    eval_list = []
    for total_clients in [10, 50, 100]:
        # One partition per client count, shared by all fractions.
        client_data = create_clients(train_df, 0, num_clients=total_clients)
        ordered_ids = sorted(client_data.client_ids)
        perc_list = []
        for fraction in [0.25, 0.50, 0.75, 1]:
            active_ids = ordered_ids[:math.floor(total_clients * fraction)]
            federated_train_data = [
                preprocess(client_data.create_tf_dataset_for_client(client_id))
                for client_id in active_ids
            ]
            elem_spec = federated_train_data[0].element_spec
            training_process = aggregator('weighted avg', 1.0)
            train_state = training_process.initialize()

            curr = []
            for round_num in range(NUM_ROUNDS):
                result = training_process.next(train_state, federated_train_data)
                train_state = result.state
                train_metrics = result.metrics
                print('round {:2d}, metrics={}'.format(round_num, train_metrics))
                acc_tuple = (round_num,
                             train_metrics['client_work']['train']['sparse_categorical_accuracy'])
                curr.append(acc_tuple)
            # Named `evaluation` so the builtin eval() is not shadowed.
            evaluation = keras_evaluate(train_state, training_process)
            eval_list.append((total_clients, fraction, evaluation))
            perc_list.append((fraction, curr))
        client_list.append((total_clients, perc_list))
    return client_list, eval_list

client_list, eval_list = client_perc_experiment()

In [None]:
def plot_metric(data, metric_index, metric_name):
    """Plot one panel per client configuration, one curve per portion.

    Args:
        data: iterable of ``(clients, portions_data)`` where ``portions_data``
            is an iterable of ``(portion, [(round_num, value, ...), ...])``.
        metric_index: position of the plotted value inside each per-round tuple.
        metric_name: label of the y axis.

    The grid has two columns and as many rows as needed, so any number of
    configurations can be drawn. Panels without data are hidden.
    """
    data = list(data)
    n_panels = len(data)
    n_cols = 2
    n_rows = max(1, (n_panels + n_cols - 1) // n_cols)
    # squeeze=False always returns a 2-D array of axes, even for a single row.
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)
    flat_axes = axs.ravel()

    for ax, (clients, portions_data) in zip(flat_axes, data):
        for portion, epoch_data in portions_data:
            round_numbers = [e[0] for e in epoch_data]
            metric_values = [e[metric_index] for e in epoch_data]
            ax.plot(round_numbers, metric_values, label=f'{portion*100}% data')

        ax.set_title(f'{clients} Clients')
        # The x values are round numbers, so the axis is labelled accordingly.
        ax.set_xlabel('Rounds')
        ax.set_ylabel(metric_name)
        ax.legend()
        ax.grid(True)

    # Hide the grid cells that received no configuration.
    for unused_ax in flat_axes[n_panels:]:
        unused_ax.set_visible(False)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

plot_metric(client_list, 1, 'Accuracy')


In [None]:
# Show the eval_list produced by the most recently run experiment cell.
print(eval_list)

***Distribuzione classi***

In [None]:
# NOTE(review): this definition reuses the name client_perc_experiment from the
# earlier client-count cell and shadows it. Re-running that earlier call after
# this cell would execute this function instead. Consider renaming.
def client_perc_experiment():
    """Measure the effect of hiding 0 to 3 classes from every client.

    For each exclusion count the data is rebuilt with
    ``init(..., unbalanced=2, exclude_count=...)``, weighted FedAvg is trained
    for ``NUM_ROUNDS`` rounds and the final model is evaluated with
    ``keras_evaluate``. Sets the global ``elem_spec`` on every iteration.

    Returns:
        ``(perc_list, eval_list)`` where ``perc_list`` holds
        ``(exclude_count, [(round_num, train_accuracy), ...])`` and
        ``eval_list`` holds ``(exclude_count, (loss, accuracy))``.
    """
    global elem_spec
    eval_list, perc_list = [], []
    for excluded in [0, 1, 2, 3]:
        federated_train_data = init(train_df, active_clients=NUM_CLIENTS, unbalanced=2, exclude_count=excluded)
        elem_spec = federated_train_data[0].element_spec
        training_process = aggregator('weighted avg', 1.0)
        train_state = training_process.initialize()

        history = []
        for round_num in range(NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state, train_metrics = result.state, result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))
            accuracy = train_metrics['client_work']['train']['sparse_categorical_accuracy']
            history.append((round_num, accuracy))
        evaluation = keras_evaluate(train_state, training_process)
        eval_list.append((excluded, evaluation))
        perc_list.append((excluded, history))
    return perc_list, eval_list

perc_list, eval_list = client_perc_experiment()

In [None]:
# Split every (exclude_count, history) entry into its round numbers and accuracies.
rounds = [[x[0] for x in acc_list] for _, acc_list in perc_list]
accuracies = [[x[1] for x in acc_list] for _, acc_list in perc_list]

def plot_metrics(ax, rounds, metrics, labels=['4 classi','3 classi', '2 classi', '1 classe']):
    """Draw one curve per entry of ``metrics`` on ``ax``.

    Args:
        ax: matplotlib axes to draw on.
        rounds: list of x-value lists, parallel to ``metrics``.
        metrics: list of y-value lists.
        labels: legend labels, parallel to ``metrics``.
    """
    for position, values in enumerate(metrics):
        ax.plot(rounds[position], values, label=labels[position])
        ax.legend(loc='lower right')

fig, ax = plt.subplots(figsize=(12, 8))
plot_metrics(ax, rounds, accuracies)

ax.set_title('Accuracy')
ax.set_xlabel('Rounds')
ax.set_ylabel('Accuracy')

plt.tight_layout()
plt.show()

In [None]:
# Display the eval_list produced by the most recently run experiment cell.
eval_list