# Federated Insurance

## Prerequisites
---

In [12]:

import pandas as pd
import numpy as np
import tensorflow_federated as tff
import tensorflow as tf
from sklearn.model_selection import RepeatedKFold

from FLutils import load_df, create_keras_model, model_fn, prep_fed_train, prep_fed_test, train_fed
#import FLutils

## Data preparation
---

In [3]:
# ingest data
# ===========

# Candidate locations of the cleaned insurance data: a relative local path
# first, the copy hosted on GitHub second.
# NOTE(review): presumably load_df tries the locations in order and uses the
# first one that can be read -- confirm in FLutils.
df_locs = [
    '../output/data/insurance-clean.csv',
    "https://raw.githubusercontent.com/Olhaau/fl-official-statistics-addon/main/output/data/insurance-clean.csv",
]

df = load_df(df_locs)

# preview the first rows
df.head(n = 3)

loaded data from ../output/data/insurance-clean.csv


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region0,region1,region2,region3
0,0.021739,0.0,0.321227,0.0,1.0,southwest,16884.924,0.0,0.0,0.0,1.0
1,0.0,1.0,0.47915,0.2,0.0,southeast,1725.5523,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.458434,0.6,0.0,southeast,4449.462,0.0,0.0,1.0,0.0


In [4]:
# select features and target (first column)
# =========================================
target = 'charges'
features = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region0', 'region1', 'region2', 'region3',
]

# preview the selected columns, target first
df.loc[:, [target, *features]].head(3)

Unnamed: 0,charges,age,sex,bmi,children,smoker,region0,region1,region2,region3
0,16884.924,0.021739,0.0,0.321227,0.0,1.0,0.0,0.0,0.0,1.0
1,1725.5523,0.0,1.0,0.47915,0.2,0.0,0.0,0.0,1.0,0.0
2,4449.462,0.217391,1.0,0.458434,0.6,0.0,0.0,0.0,1.0,0.0


In [5]:
# create client data
# ==================

# one client per distinct value of df['region'] (in order of first appearance),
# each holding the target column followed by the feature columns
clients = [
    df.loc[df['region'].eq(region), [target, *features]]
    for region in df['region'].unique()
]

# or randomly
# clients = [df[[target] + features].sample(frac = 1./4, ignore_index = True) for _ in range(4)]

client_shapes = [client.shape for client in clients]
print("clients shape: %s" % client_shapes)

clients shape: [(325, 10), (364, 10), (325, 10), (324, 10)]


## Evaluation split
---

In [6]:
# create evaluation splits
# ========================

nfolds, nreps = 5, 5

# repeated k-fold: nreps repetitions of an nfolds-fold split,
# i.e. nfolds * nreps (train, test) splits per client
rsmp = RepeatedKFold(n_splits = nfolds, n_repeats = nreps, random_state = 42)

# client_splits[c][s] is the (train indices, test indices) pair of split s for
# client c; the indices are positional with respect to the rows of clients[c]
client_splits = [list(rsmp.split(data)) for data in clients]

# len() counts the splits directly; no need to build an object-dtype array
# from the ragged index arrays just to read its first dimension
print("number of splits per client: %s" % [len(splits) for splits in client_splits])

number of splits per client: [25, 25, 25, 25]


## Federated model
---

In [14]:
def keras_blueprint(compile = False):
    """Create the Keras model that serves as blueprint for the federated model.

    Calls `create_keras_model` with a fixed configuration: 9 input features
    (the number of entries in `features`), units [40, 40, 20] and a 'relu'
    activation for each of them.

    Args:
        compile: passed through unchanged to `create_keras_model`.

    Returns:
        The object returned by `create_keras_model` for this configuration.
    """
    layer_units = [40, 40, 20]
    layer_activations = ['relu' for _ in layer_units]
    return create_keras_model(
        nfeatures = 9,
        units = layer_units,
        activations = layer_activations,
        compile = compile)

def client_optimizer():
    """Return a new Adam optimizer (learning rate 0.05), passed as client_optimizer_fn."""
    return tf.optimizers.Adam(learning_rate = .05)


def server_optimizer():
    """Return a new Adam optimizer (learning rate 0.05), passed as server_optimizer_fn."""
    return tf.optimizers.Adam(learning_rate = .05)


# weighted federated averaging process, built from the Keras blueprint and the
# two optimizer factories defined above
process_fed = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn(keras_creator = keras_blueprint),
    client_optimizer_fn = client_optimizer,
    server_optimizer_fn = server_optimizer,
)


results = []

# Number of resampling splits to train on. Kept at 2 for a quick run; set it to
# nfolds * nreps to train on every split produced by RepeatedKFold.
n_trials = 2

for i in range(n_trials):
    # training row positions of split i, one index array per client
    indices_train = [splits[i][0] for splits in client_splits]
    clients_train = [
        client.iloc[idx] for client, idx in zip(clients, indices_train)]

    # one federated training dataset per client (features, target)
    train_data_fed = [
        prep_fed_train(client_train[features], client_train[target])
        for client_train in clients_train]

    result = train_fed(
        process_fed,
        train_data_fed,
        NUM_ROUNDS = 50,
        NUM_EPOCHS = 50,
        BATCH_SIZE = 128,
        SHUFFLE_BUFFER = 20,
        PREFETCH_BUFFER = 5,
        SEED = 42,
        verbose = False
    )

    results.append(result)
    # report against the number of trials actually run, and show the last
    # entry of the training history
    print("trial {} / {}: performance train = {}".format(i + 1, n_trials, result["history"][-1]))


# tba: eval
# state = process_fed.initialize()
# #hist = []

# eval = tff.learning.build_federated_evaluation(model_fn(keras_creator = keras_blueprint))

# for round in range(100):
#     state, train_perf = process_fed.next(state, train_data)
#     eval_perf = eval(process_fed.get_model_weights(state), test_data_fed)

#     print(' TRAINING round {:2d}, loss={}'.format(round, train_perf['client_work']['train']['loss']))
#     print(' TESTING round {:2d}, loss={}'.format(round, eval_perf['eval']['loss']))
#     print('Train: {}'.format(dict(train_perf['client_work']['train'].items())))
#     print('Test: {}'.format(dict(eval_perf['eval'])))


# tba: test

2023-05-11 16:50:40.856761: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 16:50:40.856822: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2023-05-11 16:50:40.858067: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-05-11 16:50:40.859157: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 16:50:40.859226: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-11 16:50:40.859265: I tensor

trial 1 / 25: performance train = {'mean_absolute_error': 2085.082, 'mean_squared_error': 25082944.0, 'loss': 2081.8523, 'num_examples': 53500, 'num_batches': 420}
trial 2 / 25: performance train = {'mean_absolute_error': 1982.3473, 'mean_squared_error': 22663422.0, 'loss': 1979.8163, 'num_examples': 53500, 'num_batches': 420}


## Centralized model
---

In [64]:
for i in range(nfolds * nreps):
    # row positions of split i, one index array per client
    # (index with the loop variable so every iteration uses its own split)
    train_indices = [splits[i][0] for splits in client_splits]
    test_indices = [splits[i][1] for splits in client_splits]

    # centralized training set: the training rows of all clients stacked together
    data_train = pd.concat(
        [client.iloc[idx] for client, idx in zip(clients, train_indices)])
    X_train = data_train[features]
    y_train = data_train[target]

