In [1]:
import collections
import math

import numpy as np
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
tf.get_logger().setLevel('ERROR')
from tensorflow import keras
import tensorflow_federated as tff
import pandas as pd
import matplotlib.pyplot as plt

from keras.models import Sequential 
from keras.layers import Dense
from keras.initializers import GlorotUniform
from keras.initializers import HeUniform
from keras.layers import Dense, Dropout


# --- Experiment configuration -------------------------------------------------
TEST_SIZE = 0.2  # NOTE(review): not referenced anywhere in the visible notebook - confirm it is still needed
NUM_CLIENTS = 10  # default number of clients the training set is partitioned into (create_clients)
ACTIVE_CLIENTS = 10  # default number of clients kept for training (init)
BATCH_SIZE = 512  # batch size applied to every client dataset (preprocess)
DROPOUT = 0.1  # rate of the Dropout layers in create_keras_model()
EPOCHS = 20  # number of times each client dataset is repeated (preprocess)
PREFETCH_BUFFER = 10  # prefetch buffer size of each client dataset (preprocess)
NUM_ROUNDS = 5  # number of federated rounds executed by the training loops
AGG_ALGO = "weighted prox"  # algorithm key passed to aggregator() by the baseline training cell
UNBALANCED = False  # True -> client_unbalanced() partitioning, False -> client_distribution()

# Seed the global NumPy and TensorFlow random generators.
np.random.seed(42)
tf.random.set_seed(42)


In [2]:
# Load the training and test CSV files.
train_df = pd.read_csv('datasets/train_titanic.csv')
test_df = pd.read_csv('datasets/test_titanic.csv')

# Split the test frame into features (every column except 'Transported') and labels.
test_x = test_df.drop(columns=['Transported'])
test_y = test_df['Transported']

def preprocess(dataset):
  """Prepare a single client's dataset for federated training.

  The dataset is repeated EPOCHS times, split into batches of BATCH_SIZE
  elements and finally prefetched with a buffer of PREFETCH_BUFFER.
  """
  repeated = dataset.repeat(EPOCHS)
  batched = repeated.batch(BATCH_SIZE)
  return batched.prefetch(PREFETCH_BUFFER)

def client_distribution(dataset, part, num_clients):
    """Assign every row to a client with a controlled label skew.

    Rows are handed out in chunks of 4 per client and per pass: the first
    half of the clients receives `part` positive rows and `4 - part` negative
    rows, the second half receives the mirrored split. `part=2` therefore
    gives a 50/50 label mix, `part=3` a 75/25 mix and `part=4` a 100/0 mix.

    Args:
        dataset: DataFrame with a 'Transported' column holding 0/1 labels.
        part: number of positive rows (out of 4) given to each client of the
            first half on every pass; must lie in 0..4.
        num_clients: number of clients to spread the rows over; must be > 0.

    Returns:
        A new DataFrame (positive rows first, then negative rows) with an
        extra 'client_num' column. Rows that cannot be assigned because the
        other label ran out first are dropped. The input frame is not
        modified.

    Raises:
        ValueError: if `num_clients` is not positive or `part` is outside 0..4.
    """
    total_parts = 4
    if num_clients <= 0:
        # With no clients the assignment loop below would never make progress.
        raise ValueError('num_clients must be a positive integer, got {!r}'.format(num_clients))
    if not 0 <= part <= total_parts:
        raise ValueError('part must be between 0 and {}, got {!r}'.format(total_parts, part))

    positive = []
    negative = []

    # Explicit copies: 'client_num' is added below and must neither touch the
    # caller's frame nor trigger pandas' SettingWithCopyWarning.
    dataset_0 = dataset[dataset['Transported'] == 0].copy()
    dataset_1 = dataset[dataset['Transported'] == 1].copy()
    print(len(dataset_0), len(dataset_1))

    half = round(num_clients / 2)
    # Keep handing out chunks until one of the two label pools is exhausted.
    while len(positive) < len(dataset_1) and len(negative) < len(dataset_0):
        for c in range(half):
            positive.extend([c] * part)
            negative.extend([c] * (total_parts - part))
        for c in range(half, num_clients):
            positive.extend([c] * (total_parts - part))
            negative.extend([c] * part)
    print(len(negative), len(positive))

    # Make assignment list and frame the same length for each label:
    # trim whichever of the two is longer.
    if len(dataset_1) < len(positive):
        positive = positive[:len(dataset_1)]
    else:
        dataset_1 = dataset_1.iloc[:len(positive), :].copy()

    if len(dataset_0) < len(negative):
        negative = negative[:len(dataset_0)]
    else:
        dataset_0 = dataset_0.iloc[:len(negative), :].copy()

    dataset_1['client_num'] = positive
    dataset_0['client_num'] = negative
    ret = pd.concat([dataset_1, dataset_0])
    print(ret.shape)

    return ret

def client_unbalanced(dataset, num_clients):
    """Assign every row to a random client so that client sizes are unbalanced.

    Per-client probabilities are drawn from a Pareto distribution (shape 1)
    and normalised to sum to 1; each row is then assigned to a client sampled
    from those probabilities using NumPy's global random state.

    Args:
        dataset: DataFrame to partition.
        num_clients: number of clients to sample from.

    Returns:
        A copy of `dataset` with an extra 'client_num' column. The input
        frame is left untouched so the cell can be re-run safely.
    """
    prob = np.random.pareto(1, num_clients)
    prob /= np.sum(prob)
    print(prob)

    # One vectorised draw for all rows instead of one np.random.choice call per row.
    client_num = np.random.choice(num_clients, size=len(dataset), p=prob)
    print(len(client_num))
    # Number of rows assigned to each client, in client order.
    print(np.bincount(client_num, minlength=num_clients).tolist())

    # Work on a copy: adding the column in place would modify the caller's frame.
    dataset = dataset.copy()
    dataset['client_num'] = client_num
    return dataset


def create_clients(dataset, perc, num_clients=NUM_CLIENTS):
    """Partition the training frame across clients and wrap it as TFF ClientData.

    A 'client_num' column is added by client_unbalanced() when UNBALANCED is
    set, otherwise by client_distribution(); the rows of every client are then
    exposed as a tf.data.Dataset with keys 'y' (label) and 'x' (features).

    Args:
        dataset: DataFrame containing the features and the 'Transported' label.
        perc: label-split argument forwarded to client_distribution(); ignored
            when UNBALANCED is True.
        num_clients: number of clients to partition the rows into.

    Returns:
        The object built by tff.simulation.datasets.ClientData.from_clients_and_tf_fn,
        with one client id per distinct 'client_num' value.
    """
    if UNBALANCED:
        dataset = client_unbalanced(dataset, num_clients)
    else:
        dataset = client_distribution(dataset, perc, num_clients)

    # One dictionary per client holding its labels and its feature columns.
    # Columns are selected by name so the result does not depend on where
    # 'Transported' and 'client_num' happen to sit in the frame.
    client_train_dataset = collections.OrderedDict()
    for key, current_client in dataset.groupby('client_num'):
        data = collections.OrderedDict((
            ('y', current_client['Transported']),
            ('x', current_client.drop(columns=['Transported', 'client_num'])),
        ))
        client_train_dataset[key] = data

    # Turn the per-client dictionaries into a ClientData object.
    def serializable_dataset_fn(client_id):
        client_data = client_train_dataset[client_id]
        return tf.data.Dataset.from_tensor_slices(client_data)

    tff_train_data = tff.simulation.datasets.ClientData.from_clients_and_tf_fn(
        client_ids=list(client_train_dataset.keys()),
        serializable_dataset_fn=serializable_dataset_fn
    )

    return tff_train_data

# Element spec of the client datasets. model_fn() reads this global, so it must
# be overwritten with the real spec before a training process is built.
elem_spec = {}
def init(dataset, perc, active_clients=ACTIVE_CLIENTS, num_clients=NUM_CLIENTS):
    """Build the list of preprocessed client datasets used for training.

    Args:
        dataset: training DataFrame, forwarded to create_clients().
        perc: label-split argument, forwarded to create_clients().
        active_clients: how many clients (taken in sorted id order) to keep.
        num_clients: how many clients the data is partitioned into. Defaults to
            NUM_CLIENTS, which is what this function always used before the
            parameter existed.

    Returns:
        A list with one preprocessed tf.data.Dataset per selected client.
    """
    client_data_df = create_clients(dataset, perc, num_clients=num_clients)
    client_ids = sorted(client_data_df.client_ids)[:active_clients]
    return [preprocess(client_data_df.create_tf_dataset_for_client(x)) for x in client_ids]

In [3]:
def create_keras_model():
  """Build the (uncompiled) binary classifier used by every experiment.

  Architecture: an input-sized ReLU layer, three L2-regularised ReLU layers
  (1024, 256 and 128 units) and a single sigmoid output unit. Every layer
  except the output is followed by Dropout(DROPOUT). The input width is the
  number of columns of the global `test_x`.
  """
  n_features = test_x.shape[-1]
  model = Sequential()

  model.add(Dense(n_features, kernel_initializer=HeUniform(42), activation='relu', input_dim=n_features))
  model.add(Dropout(DROPOUT))

  # Hidden stack: same initializer, activation and L2 penalty, decreasing width.
  for units in (1024, 256, 128):
    model.add(Dense(units,
                    kernel_initializer=HeUniform(42),
                    activation='relu',
                    kernel_regularizer=tf.keras.regularizers.l2(30e-6)))
    model.add(Dropout(DROPOUT))

  model.add(Dense(1, kernel_initializer=GlorotUniform(42), activation='sigmoid'))
  return model

In [4]:
def model_fn():
  """Wrap a fresh Keras model for TFF.

  Uses the global `elem_spec` as input spec, binary focal cross-entropy as
  loss, and binary accuracy, precision and recall as metrics.
  """
  net = create_keras_model()
  focal_loss = tf.keras.losses.BinaryFocalCrossentropy()
  tracked_metrics = [
      tf.keras.metrics.BinaryAccuracy(),
      tf.keras.metrics.Precision(),
      tf.keras.metrics.Recall(),
  ]
  return tff.learning.models.from_keras_model(
      net,
      input_spec=elem_spec,
      loss=focal_loss,
      metrics=tracked_metrics)

In [5]:
def aggregator(algo, proximal_strength=20.0, learning_rate=0.0001):
    """Build the federated training process for the requested aggregation algorithm.

    Args:
        algo: one of 'weighted avg', 'unweighted avg', 'weighted prox',
            'unweighted prox'.
        proximal_strength: proximal-term coefficient used by the two FedProx
            variants (ignored by the averaging variants).
        learning_rate: learning rate of the Adam optimizers used on both the
            clients and the server.

    Returns:
        The learning process built by the matching
        tff.learning.algorithms.build_* function.

    Raises:
        ValueError: if `algo` is not one of the supported keys.
    """
    known_algos = ('weighted avg', 'unweighted avg', 'weighted prox', 'unweighted prox')
    if algo not in known_algos:
        # Previously an unknown key fell through every branch and failed with
        # an unrelated UnboundLocalError.
        raise ValueError('unknown aggregation algorithm {!r}; expected one of {}'.format(algo, known_algos))

    optimizers = dict(
        client_optimizer_fn=tff.learning.optimizers.build_adam(learning_rate=learning_rate),
        server_optimizer_fn=tff.learning.optimizers.build_adam(learning_rate=learning_rate))

    algorithms = tff.learning.algorithms
    if algo == 'weighted avg':
        return algorithms.build_weighted_fed_avg(model_fn, **optimizers)
    if algo == 'unweighted avg':
        return algorithms.build_unweighted_fed_avg(model_fn, **optimizers)
    if algo == 'weighted prox':
        return algorithms.build_weighted_fed_prox(model_fn,
                                                  proximal_strength=proximal_strength,
                                                  **optimizers)
    # 'unweighted prox': must use the UNWEIGHTED builder (the weighted one was
    # used here before, making this option identical to 'weighted prox').
    return algorithms.build_unweighted_fed_prox(model_fn,
                                                proximal_strength=proximal_strength,
                                                **optimizers)


In [None]:
# Baseline run: 50/50 label split, algorithm selected by AGG_ALGO.
federated_train_data = init(train_df, 2)

print(federated_train_data)
elem_spec = federated_train_data[0].element_spec
training_process = aggregator(AGG_ALGO)

# Initialise the server state ONCE, before the loop: initialising inside the
# loop would discard the state produced by the previous round, so every round
# would start again from a fresh state.
train_state = training_process.initialize()
for round_num in range(NUM_ROUNDS):
  result = training_process.next(train_state, federated_train_data)
  train_state = result.state
  train_metrics = result.metrics
  print('round {:2d}, metrics={}'.format(round_num, train_metrics))

In [None]:
# Sanity check of the federated dataset: number of clients, batches per client
# and elements per client. The client datasets are repeated EPOCHS times by
# preprocess(), so the element counts include those repetitions.
print('Numero di clients: '+str(len(federated_train_data)))
total = 0
for x, client_dataset in enumerate(federated_train_data):
    num_elem = sum(len(list(batch['x'])) for batch in client_dataset)
    total += num_elem
    print('Numero di batch per client {}: {}\nNumero elementi per client: {}'.format(x, str(len(client_dataset)), str(num_elem)))
print('TOT TRAIN CD: {} \nTOT TRAIN DF: {}'.format(total, train_df.shape))

In [6]:
def keras_evaluate(state, training_process):
  """Evaluate the federated model held in `state` on the global test set.

  A fresh Keras model is compiled with the same loss and metrics used for
  training, receives the weights extracted from `state` through
  `training_process`, and is evaluated on `test_x` / `test_y`.

  Returns:
      The tuple (loss, accuracy, precision, recall).
  """
  keras_model = create_keras_model()
  keras_model.compile(
      loss=tf.keras.losses.BinaryFocalCrossentropy(),
      metrics=[
          tf.keras.metrics.BinaryAccuracy(),
          tf.keras.metrics.Precision(),
          tf.keras.metrics.Recall(),
      ])

  # Copy the server-side weights into the local Keras model.
  training_process.get_model_weights(state).assign_weights_to(keras_model)

  scores = keras_model.evaluate(x=test_x, y=test_y)
  loss, accuracy, precision, recall = scores
  print('\tEval: loss={l:.3f}, accuracy={a:.3f}'.format(l=loss, a=accuracy))
  return loss, accuracy, precision, recall

In [None]:
# keras_evaluate() needs the training process as well, to extract the model
# weights from the state; calling it with the state alone raises a TypeError.
keras_evaluate(train_state, training_process)

Esperimenti
==============

***Algoritmo di aggregazione***\
Il primo set di esperimenti riguarda la strategia con cui il server aggrega i pesi dei client. Sono state valutate 4 strategie: due basate sulla media dei pesi (pesata e non pesata rispetto al numero di esempi di ciascun client) e due basate sull'algoritmo di prossimità (FedProx), che aggrega i pesi come nella media ma introduce un parametro aggiuntivo (`proximal_strength`) che aggiunge un termine di regolarizzazione per evitare che i pesi dei client si allontanino troppo dai pesi del server.

In [None]:
federated_train_data = init(train_df, 2)
elem_spec = federated_train_data[0].element_spec
# Tuning of the proximal-strength hyper-parameter
def tune_proximal_strength():
    """Train weighted FedProx once per candidate proximal strength.

    Returns:
        A list of (proximal_strength, history) pairs, where history is a list
        of (round, binary_accuracy, precision, recall) tuples, one per round.
    """
    prox_list = []
    for strength in [1.0, 10.0, 20.0, 128.0, 256.0, 512.0]:
        # The process is built here, with the candidate value passed
        # explicitly: going through aggregator('weighted prox') would train
        # every candidate with the same fixed proximal strength.
        training_process = tff.learning.algorithms.build_weighted_fed_prox(
            model_fn,
            proximal_strength=strength,
            client_optimizer_fn=tff.learning.optimizers.build_adam(learning_rate=0.0001),
            server_optimizer_fn=tff.learning.optimizers.build_adam(learning_rate=0.0001))
        train_state = training_process.initialize()
        curr = []
        for round_num in range(NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state = result.state
            train_metrics = result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))
            acc_tuple = (round_num,
                         train_metrics['client_work']['train']['binary_accuracy'],
                         train_metrics['client_work']['train']['precision'],
                         train_metrics['client_work']['train']['recall'])
            curr.append(acc_tuple)
        prox_list.append((strength, curr))

    return prox_list

prox_list = tune_proximal_strength()

In [None]:
# Raw per-round metrics returned by tune_proximal_strength(), printed for inspection.
print(prox_list)

In [None]:
# Plot accuracy, precision and recall per round, one line per proximal strength.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
rounds = []
accuracies = []
precisions = []
recalls = []
for algo, acc_list in prox_list:
    rounds.append([x[0] for x in acc_list])
    accuracies.append([x[1] for x in acc_list])
    precisions.append([x[2] for x in acc_list])
    recalls.append([x[3] for x in acc_list])

def plot_metrics(ax, rounds, metrics, labels=(1, 10, 20, 128, 256, 512)):
    """Draw one line per experiment on `ax`.

    Args:
        ax: matplotlib axes to draw on.
        rounds: x values (round numbers) shared by every line.
        metrics: list of y-value lists, one per experiment.
        labels: legend labels, in the same order as `metrics`.
    """
    for i, series in enumerate(metrics):
        ax.plot(rounds, series, label=labels[i])
    ax.set_xlabel('Round')
    # Build the legend once, after all the lines have been added.
    ax.legend(loc='lower right')

print(accuracies[0])
# Every experiment runs the same rounds, so the first list serves all panels.
plot_metrics(axs[0, 0], rounds[0], accuracies)
plot_metrics(axs[0, 1], rounds[0], precisions)
plot_metrics(axs[1, 0], rounds[0], recalls)

axs[0, 0].set_title('Accuracy')
axs[0, 1].set_title('Precision')
axs[1, 0].set_title('Recall')
# Only three metrics are plotted: hide the unused fourth panel.
axs[1, 1].axis('off')

plt.tight_layout()
plt.show()

Viene creato il dataset federato assegnando ogni riga del dataset ad un client con una probabilità che segue una distribuzione di Pareto, in modo da rendere sbilanciato il numero di esempi per client e poter valutare l'importanza o meno di considerare la media pesata (questa modalità di assegnazione richiede `UNBALANCED = True` nella cella di configurazione). Il numero di epoche per client per round e il numero totale di round sono fissati dalle costanti `EPOCHS` e `NUM_ROUNDS` (rispettivamente 20 e 5 nella configurazione corrente). Anche il numero di client è fissato a 10.

In [None]:
federated_train_data = init(train_df, 2)
elem_spec = federated_train_data[0].element_spec

def agg_experiment():
    """Train once with each aggregation algorithm and collect the training metrics.

    Returns:
        A list of (algorithm_name, history) pairs, where history is a list of
        (round, binary_accuracy, precision, recall) tuples, one per round.
    """
    results = []
    for algo_name in ['weighted avg', 'unweighted avg', 'weighted prox', 'unweighted prox']:
        training_process = aggregator(algo_name)
        train_state = training_process.initialize()
        history = []
        for round_num in range(NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state = result.state
            train_metrics = result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))
            train_part = train_metrics['client_work']['train']
            history.append((round_num,
                            train_part['binary_accuracy'],
                            train_part['precision'],
                            train_part['recall']))
        results.append((algo_name, history))
    return results

agg_algo_list = agg_experiment()


In [None]:
# Raw per-round metrics returned by agg_experiment(), printed for inspection.
print(agg_algo_list)

In [None]:
# Plot accuracy, precision and recall per round, one line per aggregation algorithm.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
rounds = []
accuracies = []
precisions = []
recalls = []
for algo, acc_list in agg_algo_list:
    rounds.append([x[0] for x in acc_list])
    accuracies.append([x[1] for x in acc_list])
    precisions.append([x[2] for x in acc_list])
    recalls.append([x[3] for x in acc_list])

def plot_metrics(ax, rounds, metrics, labels=('weighted avg', 'unweighted avg', 'weighted prox', 'unweighted prox')):
    """Draw one line per experiment on `ax`.

    Args:
        ax: matplotlib axes to draw on.
        rounds: x values (round numbers) shared by every line.
        metrics: list of y-value lists, one per experiment.
        labels: legend labels, in the same order as `metrics`.
    """
    for i, series in enumerate(metrics):
        ax.plot(rounds, series, label=labels[i])
    ax.set_xlabel('Round')
    # Build the legend once, after all the lines have been added.
    ax.legend(loc='lower right')

print(accuracies[0])
# Every experiment runs the same rounds, so the first list serves all panels.
plot_metrics(axs[0, 0], rounds[0], accuracies)
plot_metrics(axs[0, 1], rounds[0], precisions)
plot_metrics(axs[1, 0], rounds[0], recalls)

axs[0, 0].set_title('Accuracy')
axs[0, 1].set_title('Precision')
axs[1, 0].set_title('Recall')
# Only three metrics are plotted: hide the unused fourth panel.
axs[1, 1].axis('off')

plt.tight_layout()
plt.show()

***Numero e percentuale client***\
Viene eseguita una serie di esperimenti al variare del numero di client (10, 50, 100 e 500), individuando in 500 il limite massimo di client supportati a livello di risorse. Per ogni numero di client viene preso il 25%, il 50%, il 75% e il 100% dei client. Il numero di epoche per client per round e il numero di round sono fissati come nel caso precedente. Questa volta i client vengono presi in modo bilanciato, con circa lo stesso numero di entry ciascuno e le label divise in 50% positive e 50% negative.

In [None]:
def client_perc_experiment():
    """Train with a varying number of clients and a varying fraction of active clients.

    For every client count in (10, 50, 100, 500) the training set is
    partitioned across that many clients; the first 25%, 50%, 75% and 100% of
    them (in sorted id order) are then used to train a weighted FedAvg model,
    which is finally evaluated on the test set.

    Returns:
        client_list: list of (num_clients, [(fraction, history), ...]) where
            history holds (round, binary_accuracy, precision, recall) tuples.
        eval_list: list of (num_clients, fraction, keras_evaluate() result).
    """
    global elem_spec
    client_list = []
    eval_list = []
    for i in [10, 50, 100, 500]:
        # Partition across exactly `i` clients. init() partitions with its
        # default client count, so the client list is built here with
        # num_clients=i; otherwise the larger settings would never get more
        # clients than that default.
        client_data = create_clients(train_df, 2, num_clients=i)
        all_client_ids = sorted(client_data.client_ids)

        perc_list = []
        for j in [0.25, 0.50, 0.75, 1]:
            active_ids = all_client_ids[:math.floor(i*j)]
            federated_train_data = [preprocess(client_data.create_tf_dataset_for_client(cid))
                                    for cid in active_ids]
            elem_spec = federated_train_data[0].element_spec
            training_process = aggregator('weighted avg')
            train_state = training_process.initialize()

            curr = []
            for round_num in range(NUM_ROUNDS):
                result = training_process.next(train_state, federated_train_data)
                train_state = result.state
                train_metrics = result.metrics
                print('round {:2d}, metrics={}'.format(round_num, train_metrics))
                acc_tuple = (round_num,
                             train_metrics['client_work']['train']['binary_accuracy'],
                             train_metrics['client_work']['train']['precision'],
                             train_metrics['client_work']['train']['recall'])
                curr.append(acc_tuple)
            # Named eval_result so the builtin eval() is not shadowed.
            eval_result = keras_evaluate(train_state, training_process)
            eval_list.append((i, j, eval_result))
            perc_list.append((j, curr))
        client_list.append((i, perc_list))
    return client_list, eval_list

client_list, eval_list = client_perc_experiment()

In [None]:
def plot_metric(data, metric_index, metric_name):
    """Plot one metric per round on a 2x2 grid, one panel per client count.

    Args:
        data: list of (clients, portions_data) pairs, where portions_data is a
            list of (portion, history) pairs and history is a list of tuples
            whose first item is the round number.
        metric_index: position of the metric inside each history tuple.
        metric_name: name shown as figure title and y-axis label.
    """
    fig, axs = plt.subplots(2, 2, figsize=(15, 10))
    # tight_layout below reserves the top 5% of the figure for this title.
    fig.suptitle(metric_name)

    for i, (clients, portions_data) in enumerate(data):
        ax = axs[i // 2, i % 2]
        for portion, epoch_data in portions_data:
            rounds = [e[0] for e in epoch_data]
            metric_values = [e[metric_index] for e in epoch_data]
            ax.plot(rounds, metric_values, label=f'{portion*100}% data')

        ax.set_title(f'{clients} Clients')
        # The x values are the round numbers stored first in each tuple.
        ax.set_xlabel('Round')
        ax.set_ylabel(metric_name)
        ax.legend()
        ax.grid(True)
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

plot_metric(client_list, 1, 'Accuracy')
plot_metric(client_list, 2, 'Precision')
plot_metric(client_list, 3, 'Recall')

In [None]:
# Test-set results collected by client_perc_experiment(), printed for inspection.
print(eval_list)

***Distribuzione delle label***\
Viene eseguita una serie di esperimenti per valutare le prestazioni al variare della percentuale delle due label osservata da ogni client. I parametri di epoche per round e numero di round vengono lasciati invariati; si scelgono 10 client, che vengono presi tutti quanti. Gli esperimenti fatti prevedono che:
* I client osservano per metà il 75% di label positive ed il 25% negative e per metà il viceversa
* Tutti i client osservano il 50% di label positive ed il 50% negative
* Metà dei client osservano solo esempi negativi, metà solo positivi

In [None]:
# NOTE(review): this definition reuses the name of the earlier client-count
# experiment and replaces it once this cell has run.
def client_perc_experiment():
    """Train with different label splits across the clients.

    For each `perc` value in (4, 3, 2) the data is partitioned with
    init(train_df, perc=...), a weighted FedAvg model is trained for
    NUM_ROUNDS rounds and then evaluated on the test set.

    Returns:
        perc_list: list of (perc, history) pairs, where history holds
            (round, binary_accuracy, precision, recall) tuples.
        eval_list: list of (perc, keras_evaluate() result) pairs.
    """
    global elem_spec
    eval_list = []
    perc_list = []
    for i in [4, 3, 2]:
        federated_train_data = init(train_df, perc=i, active_clients=NUM_CLIENTS)
        elem_spec = federated_train_data[0].element_spec
        training_process = aggregator('weighted avg')
        train_state = training_process.initialize()

        history = []
        for round_num in range(NUM_ROUNDS):
            result = training_process.next(train_state, federated_train_data)
            train_state = result.state
            train_metrics = result.metrics
            print('round {:2d}, metrics={}'.format(round_num, train_metrics))
            train_part = train_metrics['client_work']['train']
            history.append((round_num,
                            train_part['binary_accuracy'],
                            train_part['precision'],
                            train_part['recall']))
        scores = keras_evaluate(train_state, training_process)
        eval_list.append((i, scores))
        perc_list.append((i, history))
    return perc_list, eval_list

perc_list, eval_list = client_perc_experiment()

In [None]:
# Test-set results collected by the label-distribution experiment, printed for inspection.
print(eval_list)

In [None]:
# Plot accuracy, precision and recall per round, one line per label split.
fig, axs = plt.subplots(2, 2, figsize=(12, 8))
rounds = []
accuracies = []
precisions = []
recalls = []
for algo, acc_list in perc_list:
    rounds.append([x[0] for x in acc_list])
    accuracies.append([x[1] for x in acc_list])
    precisions.append([x[2] for x in acc_list])
    recalls.append([x[3] for x in acc_list])

def plot_metrics(ax, rounds, metrics, labels=('100/0', '75/25', '50/50')):
    """Draw one line per experiment on `ax`.

    Args:
        ax: matplotlib axes to draw on.
        rounds: x values (round numbers) shared by every line.
        metrics: list of y-value lists, one per experiment.
        labels: legend labels, in the same order as `metrics`.
    """
    for i, series in enumerate(metrics):
        ax.plot(rounds, series, label=labels[i])
    ax.set_xlabel('Round')
    # Build the legend once, after all the lines have been added.
    ax.legend(loc='lower right')

print(accuracies[0])
# Every experiment runs the same rounds, so the first list serves all panels.
plot_metrics(axs[0, 0], rounds[0], accuracies)
plot_metrics(axs[0, 1], rounds[0], precisions)
plot_metrics(axs[1, 0], rounds[0], recalls)

axs[0, 0].set_title('Accuracy')
axs[0, 1].set_title('Precision')
axs[1, 0].set_title('Recall')
# Only three metrics are plotted: hide the unused fourth panel.
axs[1, 1].axis('off')

plt.tight_layout()
plt.show()