<a href="https://colab.research.google.com/github/Olhaau/fl-official-statistics-addon/blob/main/_dev/04b_wrapup_tuned_insurance_federated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medical Insurance - a Federated Learning Use Case.

This notebook contains Federated Learning.

-- **more tba** --

## Setup
---

In [1]:
# Is a repo-clone and installs needed (e.g. in colabs)? 
need_clone_install = True

### Pull Repo

In [35]:
if need_clone_install:
    import os
    
    # rm repo from gdrive
    if os.path.exists("fl-official-statistics-addon"):
      %rm -r fl-official-statistics-addon

    # clone
    !git clone https://github.com/Olhaau/fl-official-statistics-addon
    %cd fl-official-statistics-addon

    # pull (the currenct version of the repo)
    !git pull

Cloning into 'fl-official-statistics-addon'...
remote: Enumerating objects: 906, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (78/78), done.[K
remote: Total 906 (delta 60), reused 14 (delta 10), pack-reused 818[K
Receiving objects: 100% (906/906), 33.83 MiB | 18.00 MiB/s, done.
Resolving deltas: 100% (420/420), done.
/content/fl-official-statistics-addon/fl-official-statistics-addon
Already up to date.


### Installs

In [36]:
if need_clone_install: 
  !pip install --quiet nest_asyncio
  !pip install --quiet tensorflow_federated
  !pip install --quiet tensorflow_addons

In [4]:
!python --version

Python 3.9.16


In [37]:
!pip list

Package                       Version
----------------------------- ------------
absl-py                       1.0.0
alabaster                     0.7.13
albumentations                1.2.1
altair                        4.2.2
anyio                         3.6.2
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arviz                         0.15.1
astropy                       5.2.2
astunparse                    1.6.3
attrs                         21.4.0
audioread                     3.0.0
autograd                      1.5
Babel                         2.12.1
backcall                      0.2.0
beautifulsoup4                4.11.2
bleach                        6.0.0
blis                          0.7.9
blosc2                        2.0.0
bokeh                         2.4.3
branca                        0.6.0
CacheControl                  0.12.11
cached-property               1.5.2
cachetools                    3.1.1
catalogue   

### Imports

In [38]:
import collections
import numpy as np
import pandas as pd
import tqdm
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
import tensorflow_federated as tff
from tensorflow_addons.metrics import RSquare
import tensorflow_addons as tfa
import nest_asyncio
from sklearn.metrics import r2_score

np.random.seed(0)

## Ingest and Split the Data

In [39]:
df = pd.read_csv("https://raw.githubusercontent.com/Olhaau/fl-official-statistics-addon/main/output/data/insurance-clean.csv", index_col = 0)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,region0,region1,region2,region3
0,0.021739,0.0,0.321227,0.0,1.0,southwest,16884.924,0.0,0.0,0.0,1.0
1,0.0,1.0,0.47915,0.2,0.0,southeast,1725.5523,0.0,0.0,1.0,0.0
2,0.217391,1.0,0.458434,0.6,0.0,southeast,4449.462,0.0,0.0,1.0,0.0
3,0.326087,1.0,0.181464,0.0,0.0,northwest,21984.47061,0.0,1.0,0.0,0.0
4,0.304348,1.0,0.347592,0.0,0.0,northwest,3866.8552,0.0,1.0,0.0,0.0


In [40]:
NUM_CLIENTS = 4
NUM_EPOCHS = 50
BATCH_SIZE = 128
SHUFFLE_BUFFER = 20
PREFETCH_BUFFER = 5
NUM_ROUNDS = 50
RUN_NAME = f'0,8-3({NUM_ROUNDS})-{NUM_EPOCHS}-epochs-{BATCH_SIZE}-batch-WithRegion/'

In [41]:
syn_samples_per_region = 1000

def get_dataset_for_region(dataset, region_index, test_size_per_region=20):
    """Min-max scale and return data for a single, given region. The scaler must be fitted before.

    :param dataset: The dataset to get the regional data from
    :type dataset: pandas.DataFrame
    :param region_index: The index number of the region to return
    :type region_index: int
    :param test_size_per_region: The amount of values to separate for testing, default are 20
    :type test_size_per_region: int, optional
    :return: The dataset specific for the defined region, the test values, the test labels
    :rtype: tensorflow.python.data.ops.dataset_ops.PrefetchDataset, tuple of pandas.core.series.Series
    """
    region_ds = dataset[dataset['region'] == region_index]
    region_ds = region_ds.drop(columns=['region'])
    len = region_ds.shape[0]

    # The scaling into [0, 1] is not necessary anymore, it happens when the data loads already
    # region_ds[['age', 'bmi', 'children']] = scaler.transform(region_ds[['age', 'bmi', 'children']])

    X_test = region_ds.head(test_size_per_region)
    y_test = X_test.pop('charges')

    X_train = region_ds.tail(len - test_size_per_region)
    y_train = X_train.pop('charges')

    fed_train_dataset = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))

    return (
        fed_train_dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=1).batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER),
        (X_test, y_test)
    )

In [42]:
# Create test and train sets and put them into random_client_ds, use four clients which are independent of the region
def get_dataset_random_region(dataset, num_clients=4, test_size_per_region=20):
    """Creates a list with client datasets independent of the region.

    :param dataset: The dataset to get the regional data from
    :type dataset: pandas.DataFrame
    :param num_clients: the number of clients create (equal big datasets per client), default value is 4 clients
    :type num_clients: int, optional
    :param test_size_per_region: The amount of values to separate for testing, default are 20
    :type test_size_per_region: int, optional
    :return: List of the prepared dataset with one entry per region, the test values and labels for each region
    :rtype: List of (tensorflow.python.data.ops.dataset_ops.PrefetchDataset, tuple of pandas.core.series.Series)"""
    size_of_client_ds = int(dataset.shape[0] / num_clients)

    dataset_to_split = dataset.copy()
    dataset_to_split.pop("region")
    random_client_ds = []
    for i in range(num_clients):
        sampled = dataset_to_split.sample(n=size_of_client_ds)
        dataset_to_split.drop(sampled.index)

        X_test = sampled.head(test_size_per_region)
        y_test = X_test.pop('charges')

        X_train = sampled.tail(size_of_client_ds - test_size_per_region)
        y_train = X_train.pop('charges')

        fed_train_dataset = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))

        train_set = fed_train_dataset.repeat(NUM_EPOCHS).shuffle(SHUFFLE_BUFFER, seed=1).batch(BATCH_SIZE).prefetch(PREFETCH_BUFFER)
        test_set = (X_test, y_test)

        random_client_ds.append((train_set, test_set))

    return random_client_ds

In [43]:
# Training data for clients with regional data (each client one region)
test_size_per_region = 20
regions = ['region0', 'region1', 'region2', 'region3']

federated_insurance_data = [
    get_dataset_for_region(df.drop(regions, axis=1), i, test_size_per_region=test_size_per_region)
    for i in range(NUM_CLIENTS-1)]
federated_insurance_data

[(<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 5), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>,
  (Empty DataFrame
   Columns: [age, sex, bmi, children, smoker]
   Index: [],
   Series([], Name: charges, dtype: float64))),
 (<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 5), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>,
  (Empty DataFrame
   Columns: [age, sex, bmi, children, smoker]
   Index: [],
   Series([], Name: charges, dtype: float64))),
 (<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 5), dtype=tf.float64, name=None), TensorSpec(shape=(None,), dtype=tf.float64, name=None))>,
  (Empty DataFrame
   Columns: [age, sex, bmi, children, smoker]
   Index: [],
   Series([], Name: charges, dtype: float64)))]

In [44]:
# Training data for clients with regional independent data
random_client_ds = get_dataset_random_region(df, num_clients=NUM_CLIENTS, test_size_per_region=test_size_per_region)

### Code Tests

In [45]:
print("splitted datasets: {}".format(len(federated_insurance_data)))
print("------------------------------")
# Q: why range(NUM_CLIENTS-1) and, thus, only 3 clients?

# Q: how to output? -> does not work or nothing in it?
# -> i think its empty
list(federated_insurance_data[0][0].as_numpy_iterator())[:10]

splitted datasets: 3
------------------------------


[]

#### Show Tensor

In [46]:
# Create a Tensor Object
X_train = df.iloc[:,0:4]
y_train = df['charges']
df2 = tf.data.Dataset.from_tensor_slices((tf.convert_to_tensor(X_train), tf.convert_to_tensor(y_train)))
df2

<_TensorSliceDataset element_spec=(TensorSpec(shape=(4,), dtype=tf.float64, name=None), TensorSpec(shape=(), dtype=tf.float64, name=None))>

In [47]:
# Show Tensor Object
# S. https://stackoverflow.com/questions/62436302/extract-target-from-tensorflow-prefetchdataset
list(df2.as_numpy_iterator())[:10]

[(array([0.02173913, 0.        , 0.3212268 , 0.        ]), 16884.924),
 (array([0.        , 1.        , 0.47914985, 0.2       ]), 1725.5523),
 (array([0.2173913 , 1.        , 0.45843422, 0.6       ]), 4449.462),
 (array([0.32608696, 1.        , 0.18146355, 0.        ]), 21984.47061),
 (array([0.30434783, 1.        , 0.34759214, 0.        ]), 3866.8552),
 (array([0.2826087 , 0.        , 0.26311542, 0.        ]), 3756.6216),
 (array([0.60869565, 0.        , 0.47027172, 0.2       ]), 8240.5896),
 (array([0.41304348, 0.        , 0.31692225, 0.6       ]), 7281.5056),
 (array([0.41304348, 1.        , 0.37315039, 0.4       ]), 6406.4107),
 (array([0.91304348, 0.        , 0.26580576, 0.        ]), 28923.13692)]

## Model Builder

In [48]:
from keras.layers import ReLU
from keras.models import Sequential
from keras.layers import Dense

def create_keras_model(
  input_features = 9
  ):

  model = Sequential([
    Dense(40, input_dim = 9, activation = 'relu'),
    Dense(40, activation = 'relu'),
    Dense(20, activation = 'relu'),
    Dense(1)
    ])
  #model = Sequential()
  #model.add(Dense(40, input_dim=9))
  #model.add(ReLU())
  #model.add(Dense(40))
  #model.add(ReLU())
  #model.add(Dense(20))
  #model.add(ReLU())
  #model.add(Dense(1))
  return model


# A helper function for federated learning
def model_fn(    
  input_features = 9):

  # We _must_ create a new model here, and _not_ capture it from an external
  # scope. TFF will call this within different graph contexts.
  keras_model = create_keras_model(
          input_features = input_features
  )
  return tff.learning.models.from_keras_model(
      keras_model,
      # without region: 
      #input_spec = federated_insurance_data[0][0].element_spec,
      input_spec = random_client_ds[0][0].element_spec,
      loss = tf.keras.losses.MeanAbsoluteError(),
      #loss = tf.keras.losses.MeanSquaredError(),
      metrics = [tf.keras.metrics.MeanAbsoluteError()
      #metrics = [tf.keras.losses.MeanSquaredError()
      #,tfa.metrics.RSquare()
      ]
  )

# Helper functions for different features as input
def model_fn_5():
    return model_fn(5)

def model_fn_5_mod():
  return model_fn(5, initializer = 'glorot_uniform', activation = 'relu')

def model_fn_9():
  return model_fn(9)

def model_fn_9_mod():
  return model_fn(9, initializer = 'glorot_uniform', activation = 'relu')

## Federated Learning

### Tuned Model

#### Create Learning Process

In [49]:
# Create iterative learning process which will perform the federated learning
iterative_process = tff.learning.algorithms.build_weighted_fed_avg(
    model_fn_9,
    client_optimizer_fn=lambda: tf.optimizers.Adam(learning_rate = .05),
    server_optimizer_fn=lambda: tf.optimizers.Adam(learning_rate = .05),)

In [50]:
# The initial setup of the learning process
print(iterative_process.initialize.type_signature.formatted_representation())

( -> <
  global_model_weights=<
    trainable=<
      float32[9,40],
      float32[40],
      float32[40,40],
      float32[40],
      float32[40,20],
      float32[20],
      float32[20,1],
      float32[1]
    >,
    non_trainable=<>
  >,
  distributor=<>,
  client_work=<>,
  aggregator=<
    value_sum_process=<>,
    weight_sum_process=<>
  >,
  finalizer=<
    int64,
    float32[9,40],
    float32[9,40],
    float32[40],
    float32[40],
    float32[40,40],
    float32[40,40],
    float32[40],
    float32[40],
    float32[40,20],
    float32[40,20],
    float32[20],
    float32[20],
    float32[20,1],
    float32[20,1],
    float32[1],
    float32[1]
  >
>@SERVER)


#### Train

In [None]:
state = iterative_process.initialize()

In [None]:
# Train the federated model with random clients
for round_num in tqdm.tqdm(range(1, NUM_ROUNDS)):
    result = iterative_process.next(state, [f[0] for f in random_client_ds])
    state = result.state
    metrics = result.metrics
    for name, value in metrics['client_work']['train'].items():
        tf.summary.scalar(name, value, step=round_num)

In [None]:
result.metrics

In [None]:
result.state
# all weights are zero
# -> nothing is learned 

#### Evaluate

In [None]:
# Create the test data for model evaluation
X_test = pd.concat([f[1][0] for f in random_client_ds])
y_test = pd.concat([f[1][1] for f in random_client_ds])

test_sets = [tf.data.Dataset.from_tensor_slices(
    (tf.convert_to_tensor(np.expand_dims(el[1][0], axis=0)), 
    tf.convert_to_tensor(np.expand_dims(el[1][1], axis=0)))) 
    for el in random_client_ds]

In [None]:
# Model evaluation
evaluation = tff.learning.build_federated_evaluation(model_fn_9)
# print(evaluation.type_signature.formatted_representation())
model_weights = iterative_process.get_model_weights(state)
train_metrics = evaluation(model_weights, test_sets)
train_metrics

In [None]:
# Create model from training results and evaluate
model = create_keras_model(input_features = 9)
model_weights.assign_weights_to(model)
model.compile(
    loss=tf.losses.mae,
    # loss=tf.losses.mean_squared_error,
    optimizer=tf.optimizers.Adam(),
    metrics=["mae", 'mean_squared_error']
)
# The evaluation results, for technical reasons the metrics_names is called afterwards. However, its order fits to the results
print(model.evaluate(X_test, y_test))
print(model.metrics_names)

Results federated:

fl-loss (mae): 2280.2578 
🚀 JW

OH: 3185.3455