# Federated Insurance

## Prerequisites
---

In [29]:
# imports
import pandas as pd
import numpy as np
import tensorflow_federated as tff
import tensorflow as tf
from sklearn.model_selection import RepeatedKFold, train_test_split


from FLutils import load_df, create_keras_model, model_fn, prep_fed_train, prep_fed_test, train_fed
#import FLutils

## Data preperation
---

In [2]:
# ingest data

df_locs = [
    '../output/data/insurance-clean.csv',
    "https://raw.githubusercontent.com/Olhaau/fl-official-statistics-addon/main/output/data/insurance-clean.csv"
]

df = load_df(df_locs)
df.head(3)

loaded data from ../output/data/insurance-clean.csv


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region0,region1,region2,region3
0,0.021739,0.0,0.321227,0.0,1.0,southwest,16884.924,0.0,0.0,0.0,1.0
1,0.0,1.0,0.47915,0.2,0.0,southeast,1725.5523,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.458434,0.6,0.0,southeast,4449.462,0.0,0.0,1.0,0.0


In [3]:
# select features and target (first column)
# =========================================
features = ['age', 'sex', 'bmi', 'children', 'smoker'
            , 'region0', 'region1', 'region2', 'region3']
target = 'charges'

df.loc[:, [target]+features].head(3)

Unnamed: 0,charges,age,sex,bmi,children,smoker,region0,region1,region2,region3
0,16884.924,0.021739,0.0,0.321227,0.0,1.0,0.0,0.0,0.0,1.0
1,1725.5523,0.0,1.0,0.47915,0.2,0.0,0.0,0.0,1.0,0.0
2,4449.462,0.217391,1.0,0.458434,0.6,0.0,0.0,0.0,1.0,0.0


In [4]:
# create client data
# ==================

clients = [
    df.loc[df['region'] == x, [target] + features] for x in df['region'].unique()]

# or randomly
# clients = [df[[target] + features].sample(frac = 1./4, ignore_index = True) for _ in range(4)]

print("clients shape: %s" % [client.shape for client in clients])

clients shape: [(325, 10), (364, 10), (325, 10), (324, 10)]


## Evaluation split
---

In [5]:
# create evaluation splits
# ========================

nfolds, nreps = 5, 5

rsmp = RepeatedKFold(n_splits = nfolds, n_repeats = nreps, random_state = 42)
client_splits = [list(rsmp.split(data)) for data in clients]


print("number of splits per client: %s" % [np.array(client, dtype = object).shape[0] for client in client_splits])

number of splits per client: [25, 25, 25, 25]


## Federated Learning
---

In [39]:
# define model architecture
def keras_blueprint(compile = False):
    return create_keras_model(
        nfeatures = 9, 
        units = [40, 40, 20], 
        activations = ['relu'] * 3, 
        compile = compile)

# Note: we do not compile the model yet. The loss, metrics, and optimizers are introduced later.
# S. https://www.tensorflow.org/federated/tutorials/federated_learning_for_image_classification#creating_a_model_with_keras

keras_blueprint().summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 40)                400       
                                                                 
 dense_9 (Dense)             (None, 40)                1640      
                                                                 
 dense_10 (Dense)            (None, 20)                820       
                                                                 
 dense_11 (Dense)            (None, 1)                 21        
                                                                 
Total params: 2,881
Trainable params: 2,881
Non-trainable params: 0
_________________________________________________________________


In [44]:
# train

# create train, eval data and prep
eval_ind = 0

train_data_fed = []
eval_data_fed  = []
for client_ind in range(len(clients)):
    indices_train = client_splits[client_ind][eval_ind][0]

    data = clients[client_ind].iloc[indices_train]
    train_data, eval_data = train_test_split(data, test_size = 0.1, random_state = 42)

    train_data_fed.append(prep_fed_train(train_data[features], train_data[target])) 
    eval_data_fed.append(prep_fed_test(eval_data[features], eval_data[target]))
    
# train
result = train_fed(
    model = model_fn(keras_creator = keras_blueprint),
    train_data = train_data_fed,
    eval_data  = eval_data_fed,
    client_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05),
    server_optimizer = lambda: tf.optimizers.Adam(learning_rate = .05),
    NUM_ROUNDS = 3,
    NUM_EPOCHS = 3,
    BATCH_SIZE = 128,
    SHUFFLE_BUFFER = 20,
    PREFETCH_BUFFER = 5,
    SEED = 42,
    verbose = True
)

2023-05-12 01:23:51.581138: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-12 01:23:51.581195: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 1
2023-05-12 01:23:51.581393: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2023-05-12 01:23:51.581778: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-12 01:23:51.581849: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:982] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-05-12 01:23:51.581898: I tensor

TRAIN: {'mean_absolute_error': 13153.898, 'mean_squared_error': 318542850.0, 'loss': 13224.697, 'num_examples': 2886, 'num_batches': 25}
EVAL:  {'mean_absolute_error': 14236.82, 'mean_squared_error': 367672000.0, 'loss': 14347.774, 'num_examples': 108, 'num_batches': 4}
TRAIN: {'mean_absolute_error': 13005.955, 'mean_squared_error': 314334240.0, 'loss': 13109.725, 'num_examples': 2886, 'num_batches': 25}
EVAL:  {'mean_absolute_error': 14232.834, 'mean_squared_error': 367543360.0, 'loss': 14343.762, 'num_examples': 108, 'num_batches': 4}
TRAIN: {'mean_absolute_error': 12792.372, 'mean_squared_error': 308366660.0, 'loss': 12934.466, 'num_examples': 2886, 'num_batches': 25}
EVAL:  {'mean_absolute_error': 14215.776, 'mean_squared_error': 366994800.0, 'loss': 14326.642, 'num_examples': 108, 'num_batches': 4}


In [51]:
# train result
result['history'][-1]

{'mean_absolute_error': 12792.372,
 'mean_squared_error': 308366660.0,
 'loss': 12934.466,
 'num_examples': 2886,
 'num_batches': 25,
 'val_mean_absolute_error': 14215.776,
 'val_mean_squared_error': 366994800.0,
 'val_loss': 14326.642,
 'val_num_examples': 108,
 'val_num_batches': 4}

In [66]:
# test result
eval_ind = 0

# fetch test data
clients_test = []
for client_ind in range(len(clients)):
    indices_test = client_splits[client_ind][eval_ind][1]
    clients_test.append(clients[client_ind].iloc[indices_test])

test_data = pd.concat(clients_test)
X_test = test_data[features]
y_test = test_data[target]


model = keras_blueprint(compile = True)



perf_test = zip(
    model.metrics_names, 
    model.evaluate(X_test, y_test, verbose = 0)
    )
#perf_test = dict(perf_test)

#dict(zip(model.metrics_names, model.evaluate(X_test, y_test, verbose = 0)))


{}


In [67]:
print(dict(perf_test))


{}


In [62]:
perf_test = zip(model.metrics_names, model.evaluate(X_test, y_test, verbose = 0))
perf_test = dict(perf_test)
perf_test

{'loss': 12880.8779296875,
 'mae': 12880.8779296875,
 'mean_squared_error': 307815744.0,
 'r2_score': -1.3974257707595825}

## Centralized DNN
---

In [64]:
for i in range(nfolds * nreps):
    train_indices = [split[1][0] for split in client_splits]
    test_indices = [split[1][1] for split in client_splits]

    data_train = pd.concat([clients[i].iloc[train_indices[i]] for i in range(len(clients))])
    X_train = data_train[features]
    y_train = data_train[target]

