# Federated Insurance

## Prerequisites
---

In [1]:

import pandas as pd
import numpy as np
import tensorflow_federated as tff
import tensorflow as tf
from sklearn.model_selection import RepeatedKFold

from FLutils import load_df, create_keras_model, model_fn, prep_fed_train, prep_fed_test, train_fed
#import FLutils

2023-05-11 22:14:25.281610: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Data preperation
---

In [2]:
# ingest data

df_locs = [
    '../output/data/insurance-clean.csv',
    "https://raw.githubusercontent.com/Olhaau/fl-official-statistics-addon/main/output/data/insurance-clean.csv"
]

df = load_df(df_locs)
df.head(3)

loaded data from ../output/data/insurance-clean.csv


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region0,region1,region2,region3
0,0.021739,0.0,0.321227,0.0,1.0,southwest,16884.924,0.0,0.0,0.0,1.0
1,0.0,1.0,0.47915,0.2,0.0,southeast,1725.5523,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.458434,0.6,0.0,southeast,4449.462,0.0,0.0,1.0,0.0


In [3]:
# select features and target (first column)
# =========================================
features = ['age', 'sex', 'bmi', 'children', 'smoker'
            , 'region0', 'region1', 'region2', 'region3']
target = 'charges'

df.loc[:, [target]+features].head(3)

Unnamed: 0,charges,age,sex,bmi,children,smoker,region0,region1,region2,region3
0,16884.924,0.021739,0.0,0.321227,0.0,1.0,0.0,0.0,0.0,1.0
1,1725.5523,0.0,1.0,0.47915,0.2,0.0,0.0,0.0,1.0,0.0
2,4449.462,0.217391,1.0,0.458434,0.6,0.0,0.0,0.0,1.0,0.0


In [4]:
# create client data
# ==================

clients = [
    df.loc[df['region'] == x, [target] + features] for x in df['region'].unique()]

# or randomly
# clients = [df[[target] + features].sample(frac = 1./4, ignore_index = True) for _ in range(4)]

print("clients shape: %s" % [client.shape for client in clients])

clients shape: [(325, 10), (364, 10), (325, 10), (324, 10)]


## Evaluation split
---

In [5]:
# create evaluation splits
# ========================

nfolds, nreps = 5, 5

rsmp = RepeatedKFold(n_splits = nfolds, n_repeats = nreps, random_state = 42)
client_splits = [list(rsmp.split(data)) for data in clients]

[np.array(client, dtype = object).shape[0] for client in client_splits]

print("number of splits per client: %s" % [np.array(client, dtype = object).shape[0] for client in client_splits])

number of splits per client: [25, 25, 25, 25]


## Federated model
---

In [6]:
def keras_blueprint(compile = False):
    return create_keras_model(
        nfeatures = 9, 
        units = [40, 40, 20], 
        activations = ['relu'] * 3, 
        compile = compile)

client_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05)
server_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05)

process_fed = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn(keras_creator = keras_blueprint),
    client_optimizer_fn = client_optimizer,
    server_optimizer_fn = server_optimizer)


results = []

for i in range(2#nfolds * nreps
    ):
    indices_train = [split[i][0] for split in client_splits]
    clients_train = [clients[i].iloc[indices_train[i]] for i in range(len(clients))]

    train_data_fed = [prep_fed_train(df[features], df[target]) for df in clients_train]

    result = train_fed(
        process_fed,
        #process_fed, 
        train_data_fed,
        #validation_split = 0.0,
        NUM_ROUNDS = 50,
        NUM_EPOCHS = 50,
        BATCH_SIZE = 128,
        SHUFFLE_BUFFER = 20,
        PREFETCH_BUFFER = 5,
        SEED = 42,
        verbose = False
    )

    results.append(result)
    print("trial {} / {}: performance train = {}".format(i + 1, nfolds * nreps,result["history"][-1]))


# tba: eval
# state = process_fed.initialize()
# #hist = []

# eval = tff.learning.build_federated_evaluation(model_fn(keras_creator = keras_blueprint))

# for round in range(100):
#     state, train_perf = process_fed.next(state, train_data)
#     eval_perf = eval(process_fed.get_model_weights(state), test_data_fed)

#     print(' TRAINING round {:2d}, loss={}'.format(round, train_perf['client_work']['train']['loss']))
#     print(' TESTING round {:2d}, loss={}'.format(round, eval_perf['eval']['loss']))
#     print('Train: {}'.format(dict(train_perf['client_work']['train'].items())))
#     print('Test: {}'.format(dict(eval_perf['eval'])))


# tba: test

2023-05-11 22:14:37.837394: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:14:38.484493: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:14:38.484641: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:14:42.700708: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:14:42.701130: I tensorflow/compile

trial 1 / 25: performance train = {'mean_absolute_error': 2092.1199, 'mean_squared_error': 24955292.0, 'loss': 2089.2192, 'num_examples': 53500, 'num_batches': 420}
trial 2 / 25: performance train = {'mean_absolute_error': 1969.738, 'mean_squared_error': 22623290.0, 'loss': 1967.9249, 'num_examples': 53500, 'num_batches': 420}


In [9]:
from sklearn.model_selection import train_test_split
clients_split = [train_test_split(data, test_size = 20, random_state = 42) for data in clients]
clients_train = [split[0] for split in clients_split]
clients_test  = [split[1] for split in clients_split]

def prep_fed_test(X_test, y_test):
   return tf.data.Dataset.from_tensor_slices((
       tf.convert_to_tensor(np.expand_dims(X_test, axis=0)), 
       tf.convert_to_tensor(np.expand_dims(y_test, axis=0))
       )) 

train_data_fed = [prep_fed_train(df[features], df[target]) for df in clients_train]

test_data_fed  = [prep_fed_test (df[features], df[target]) for df in clients_test]


In [13]:
def train_fed(
    model,
    train_data,
    eval_data = None,
    #process, 
    client_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05),
    server_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05),
    validation_split = 0.0,
    NUM_ROUNDS = 50,
    NUM_EPOCHS = 50,
    BATCH_SIZE = 128,
    SHUFFLE_BUFFER = 20,
    PREFETCH_BUFFER = 5,
    SEED = 42,
    verbose = True
    ):
    
    # eval setup
    #n_eval = [int(
    #    validation_split * 
    #    len(list(data.as_numpy_iterator()))
    #) for data in train_data]

    #train_data = [data.shuffle(10, seed = SEED, reshuffle_each_iteration=False) for data in train_data]
    #eval_data  = [train_data[i].take(n_eval[i]) for i in range(len(train_data))]
    #train_data = [train_data[i].skip(n_eval[i]) for i in range(len(train_data))]


    # prep the data
    train_data = [
        data.
            repeat(NUM_EPOCHS).
            shuffle(SHUFFLE_BUFFER, seed = SEED).
            batch(BATCH_SIZE).
            prefetch(PREFETCH_BUFFER)

        for data in train_data]
    
    process = tff.learning.algorithms.build_weighted_fed_avg(
        model,
        client_optimizer_fn = client_optimizer,
        server_optimizer_fn = server_optimizer)
    
    #eval_process = tff.learning.algorithms.build_fed_eval(model)


    # initialize the process
    state      =      process.initialize()
    #eval_state = eval_process.initialize()
    
    eval = tff.learning.algorithms.build_fed_eval(model)



    hist = []

    for round in range(NUM_ROUNDS):
        
        if SEED != None: tf.keras.utils.set_random_seed(SEED)
        
        state, perf_train = process.next(state, train_data)
        
        if verbose: 
            print('ROUND {:2d} / {}'.format(round + 1, NUM_ROUNDS))
            print('- train: {}'.format(dict(perf_train['client_work']['train'].items())))
                
        
        if eval_data != None:
            perf_eval = None
            #FIXME: error
            #perf_eval = eval(process.get_model_weights(state), eval_data)
            if verbose:
                print(perf_eval)   


        #model_weights = process.get_model_weights(state)
        #eval_state = eval_process.set_model_weights(eval_state, model_weights)
        #eval_output = eval_process.next(eval_state,test_data_fed# train_data                               )
        #perf_eval = eval_output.metrics

        #perf_eval = evaluate(process.get_model_weights(state), eval_data)
        #result  = process.next(state, train_data)
        #state   = result.state

        perf_train = dict(perf_train['client_work']['train'].items())


        hist.append(perf_train) # fix add also perf_eval
        #result['history'][1].keys()
        #result['history'][0] | result['history'][1]

  



    return {'process': process, 'history': hist, 'state': state}



#process_fed = tff.learning.algorithms.build_weighted_fed_avg(
#    model_fn(keras_creator = keras_blueprint),
#    client_optimizer_fn = client_optimizer,
#    server_optimizer_fn = server_optimizer)



indices_train = [split[3][0] for split in client_splits]
clients_train = [clients[3].iloc[indices_train[3]] for i in range(len(clients))]

train_data_fed = [prep_fed_train(df[features], df[target]) for df in clients_train]

result = train_fed(
    model = model_fn(keras_creator = keras_blueprint),
    #process_fed, 
    train_data = train_data_fed,
    eval_data  = test_data_fed,
    validation_split = 0.1,
    NUM_ROUNDS = 3,
    NUM_EPOCHS = 3,
    BATCH_SIZE = 128,
    SHUFFLE_BUFFER = 20,
    PREFETCH_BUFFER = 5,
    SEED = 42,
    verbose = True
)

2023-05-11 22:19:36.626458: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:19:36.626586: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2023-05-11 22:19:36.626867: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-05-11 22:19:36.627513: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:19:36.627633: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 22:19:36.627716: I tensor

ROUND  1 / 3
- train: {'mean_absolute_error': 13092.607, 'mean_squared_error': 288860830.0, 'loss': 13345.83, 'num_examples': 3108, 'num_batches': 28}
ModelWeights(trainable=[array([[ 0.20068134,  0.1638154 , -0.09888187,  0.36492583, -0.2452966 ,
         0.10661042,  0.37928   , -0.182414  ,  0.06664816,  0.14151147,
        -0.2394478 , -0.03754193,  0.27615684,  0.02924222, -0.04793675,
         0.23592661, -0.28674978, -0.21528852,  0.05996547, -0.30343947,
        -0.27358198,  0.29352465,  0.34073812, -0.11618793,  0.30161902,
        -0.21071473, -0.0763935 ,  0.09389494,  0.09667373, -0.20984285,
        -0.1531738 ,  0.39234257, -0.22301891,  0.01053665, -0.11440462,
        -0.31522837,  0.21316484, -0.17231613,  0.17687973,  0.15527052],
       [-0.32009175,  0.09453493, -0.11609063, -0.02919249,  0.36809784,
        -0.06271109,  0.07202365, -0.12124158, -0.21397388,  0.20104836,
         0.13147177,  0.00556773, -0.3304978 , -0.21617037,  0.15393317,
         0.3062712 , 

TypeError: 'LearningProcess' object is not callable

In [83]:
train_data_fed

[<_TensorSliceDataset element_spec=(TensorSpec(shape=(9,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(9,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(9,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(9,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>]

In [84]:
test_data_fed

[<_TensorSliceDataset element_spec=(TensorSpec(shape=(20, 9), dtype=tf.float64, name=None), TensorSpec(shape=(20,), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(20, 9), dtype=tf.float64, name=None), TensorSpec(shape=(20,), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(20, 9), dtype=tf.float64, name=None), TensorSpec(shape=(20,), dtype=tf.float64, name=None))>,
 <_TensorSliceDataset element_spec=(TensorSpec(shape=(20, 9), dtype=tf.float64, name=None), TensorSpec(shape=(20,), dtype=tf.float64, name=None))>]

In [86]:
np.expand_dims(clients_test[0], axis=0)

array([[[1.99334580e+04, 1.52173913e-01, 1.00000000e+00, 3.69652946e-01,
         6.00000000e-01, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.00000000e+00],
        [2.21440320e+04, 5.21739130e-01, 1.00000000e+00, 3.77723971e-01,
         0.00000000e+00, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.00000000e+00],
        [5.38353600e+03, 4.34782609e-01, 0.00000000e+00, 3.13155771e-01,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.00000000e+00],
        [5.11945591e+04, 2.17391304e-01, 1.00000000e+00, 5.49905838e-01,
         2.00000000e-01, 1.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.00000000e+00],
        [3.26019900e+03, 2.82608696e-01, 1.00000000e+00, 1.19451170e-01,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
         0.00000000e+00, 1.00000000e+00],
        [1.74963060e+04, 3.04347826e-01, 0.00000000e+00, 2.3

{'mean_absolute_error': 13463.292,
 'mean_squared_error': 306679580.0,
 'loss': 13560.814,
 'num_examples': 2808,
 'num_batches': 24}

In [35]:
len(list(train_data_fed[0].as_numpy_iterator()))

259

In [49]:
validation_split = 0.0

n_eval = int(validation_split * len(list(train_data_fed[0].as_numpy_iterator())))

data = train_data_fed[0].shuffle(1)

eval_data = data.take(n_val)
train_data = data.skip(n_val)

In [50]:
eval_data
len(list(eval_data.as_numpy_iterator()))

2023-05-11 20:53:37.679822: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'Placeholder/_1' with dtype double and shape [259]
	 [[{{node Placeholder/_1}}]]


0

In [27]:
train_data_fed[0].take(100)

<_TakeDataset element_spec=(TensorSpec(shape=(9,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>

## Centralized model
---

In [64]:
for i in range(nfolds * nreps):
    train_indices = [split[1][0] for split in client_splits]
    test_indices = [split[1][1] for split in client_splits]

    data_train = pd.concat([clients[i].iloc[train_indices[i]] for i in range(len(clients))])
    X_train = data_train[features]
    y_train = data_train[target]

