<a href="https://colab.research.google.com/github/Olhaau/fl-official-statistics-addon/blob/main/_dev/04_insurance_wrapup.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medical Insurance - a Federated Learning Use Case.

Revision by Destatis (Julius Weißmann and Oliver Hauke)

## Summary
---

- We stabilized the centralized deep neural network (DNN)
  - model with more units and layers, without dropout, Xavier initialization
  - more robust, faster and precise results
- We fixed the Federated Learning (FL) Algorithm
  - same model as in the centralized setting
  - great improvement in loss, MAE similar to the centralized setting
- outline
  - FL highly suitable for the available data
  - suggestions:
    - fixed train/val/test-split for centralized vs federated
    - cross validation
    - tests for 5 or 9 features

## Initial Results
---

### Centralized

*Training Performance after tuning:*
![](https://github.com/Olhaau/fl-official-statistics-addon/blob/main/original_work/med-insurance/rsquared_hyperparams.jpg?raw=1)




### Federated

See the original federated notebook:
https://github.com/joshua-stock/fl-official-statistics/blob/main/med-insurance/med-insurance-federated.ipynb

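- "*Ergebnisse sehen deutlich schlechter aus als zentralisiert.*" (Results look considerably worse than in the centralized setting.)
- "*MAE geht nicht unter ~8700 (vs. ~2900 im zentralisierten Modell)*" (MAE does not drop below ~8700, vs. ~2900 in the centralized model.)
- "*R² ist negativ!*" (R² is negative!)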

## Setup
---

In [1]:
# Set to True when the repository has to be cloned and the pinned packages
# installed first (e.g. on a fresh Google Colab runtime). The clone cell and
# the pip-install cell of this notebook are both guarded by this flag.
need_clone_install = True

### Pull Repo

In [2]:
if need_clone_install:
    import os
    
    # remove an existing checkout from the working directory so the clone starts clean
    if os.path.exists("fl-official-statistics-addon"):
      %rm -r fl-official-statistics-addon

    # clone the repository and make it the working directory
    !git clone https://github.com/Olhaau/fl-official-statistics-addon
    %cd fl-official-statistics-addon

    # pull (the current version of the repo)
    !git pull

Cloning into 'fl-official-statistics-addon'...
remote: Enumerating objects: 877, done.[K
remote: Counting objects: 100% (59/59), done.[K
remote: Compressing objects: 100% (49/49), done.[K
remote: Total 877 (delta 39), reused 15 (delta 10), pack-reused 818[K
Receiving objects: 100% (877/877), 33.56 MiB | 14.15 MiB/s, done.
Resolving deltas: 100% (399/399), done.
/content/fl-official-statistics-addon
Already up to date.


### Installs

#### Python Version

In [3]:
#https://www.datasciencelearner.com/change-python-version-in-google-colab-steps/
# Disabled on purpose (`if False`): kept as a recipe for pointing `python3` at
# Python 3.9 via update-alternatives. Change the condition to run it; the
# Python version is printed before and after the switch.
if False:
  !python --version
  print("-----------------------------------------------------------")
  !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1
  !sudo update-alternatives --config python3
  print("-----------------------------------------------------------")
  !python --version

#### Packages

In [None]:
if need_clone_install:
  # Explicit pins that are currently disabled -- presumably already satisfied
  # by the tensorflow-federated requirements (TODO confirm).
  #!pip install --quiet nest-asyncio==1.5.6
  #!pip install --quiet tensorflow==2.11.*
  # Pinned minor versions of the two packages installed by this notebook.
  !pip install --quiet tensorflow-federated==0.48.*
  !pip install --quiet tensorflow-addons==0.19.*

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.3/71.3 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.7/142.7 KB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.9/89.9 KB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m243.8/243.8 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.9/238.9 KB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# List the packages (and versions) installed in the current runtime.
!pip list

### Imports

In [None]:
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, ShuffleSplit, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import r2_score

# DNN
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, ReLU
from tensorflow_addons.metrics import RSquare

# TFF
import tensorflow_federated as tff

### Visualizations for neural networks

(optional)

In [None]:
# Optional dependency: keras_visualizer writes a diagram of a Keras model to an
# image file, IPython's Image is used to show that file inline.
!pip install --quiet keras_visualizer==3.1.1 
from keras_visualizer import visualizer
from IPython.display import Image

## Ingest Data
---

In [None]:
# Load the cleaned insurance data set straight from the repository; the first
# CSV column becomes the row index. The preview is the cell's output.
df = pd.read_csv(
    "https://raw.githubusercontent.com/Olhaau/fl-official-statistics-addon/main/output/data/insurance-clean.csv",
    index_col=0,
)
df.head()

### Train Test Split

In [None]:
# Pick the model inputs and the regression target, then hold out 20 % of the
# rows as test data (shuffled with a fixed random_state for a repeatable split).
target = 'charges'
features = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region0', 'region1', 'region2', 'region3',
]

df_ml = df[features + [target]]

X_train, X_test, y_train, y_test = train_test_split(
    df_ml[features],
    df_ml[[target]],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

## Centralized Neural Networks
---

### Initial Model

See section 'DNN' from https://github.com/joshua-stock/fl-official-statistics/blob/main/med-insurance/med-insurance.ipynb

#### Define + Compile

In [None]:


# Seed Python, NumPy and TensorFlow *before* the layers are created: the
# initial weights are drawn when the model is built, so seeding only right
# before `fit` leaves the starting point of the training different on each run.
tf.keras.utils.set_random_seed(42)

# Initial network from the original work: 9 inputs -> 32 -> 16 -> 1, with a
# light dropout after the first hidden layer.
model0 = Sequential([
    keras.Input(shape=(9,)),
    Dense(32, activation='relu'),
    Dropout(0.05),
    Dense(16, activation='relu'),
    Dense(1)
])

# alternatively ...
#model0 = Sequential()
#model0.add(Dense(32, input_dim = 9))
#model0.add(ReLU())
#model0.add(Dropout(0.05))
#...

model0.compile(
    loss        = 'mae',
    optimizer   = tf.optimizers.SGD(),
    # `r2_score` is the scikit-learn function imported at the top; it is
    # presumably the reason eager execution is switched on here.
    metrics     = ["mae", 'mean_squared_error', r2_score],
    run_eagerly = True
    )

model0.summary()

In [None]:
# Draw the initial model into '_initial_model.png' and display that image inline.
visualizer(model0, file_format = 'png', file_name = '_initial_model', view=True)
Image('_initial_model.png')

#### Train

In [None]:
# Fit the initial model for 100 epochs, using 20 % of the training rows for
# validation, and print the elapsed wall-clock time in minutes.
t_start = time.time()
tf.random.set_seed(42)

with tf.device('/device:GPU:0'):
  hist0 = model0.fit(
      x = X_train,
      y = y_train,
      epochs = 100,
      validation_split = 0.2,
      shuffle = True,
      verbose = 0,
  )

rtime = time.time() - t_start
print(rtime / 60)

#### Evaluate

In [None]:
def plot_loss(hist, msr = 'loss'):
  """Draw the training and validation curve of one metric on the current figure.

  The function only plots; it neither shows nor saves the figure, so the
  caller can still add a title or axis limits.

  :param hist: History object whose ``history`` dict holds the series
      ``msr`` and ``'val_' + msr``
  :type hist: keras.callbacks.History
  :param msr: Name of the metric to plot, defaults to ``'loss'``
  :type msr: str, optional
  """
  series = hist.history
  # training curve first, validation curve second -- matches the legend order
  for key in (msr, 'val_' + msr):
    plt.plot(series[key])
  plt.xlabel('epoch')
  plt.ylabel(msr)
  plt.legend(['train', 'eval'], loc='upper left')


# R² over the epochs for the initial model, zoomed to the range 0.5 .. 0.9.
plot_loss(hist = hist0, msr = 'r2_score')
plt.ylim(0.5, 0.9)
plt.title('Initial Model')
plt.show()
#fig = plt.figure()
#fig.savefig(plot_experiment_path + "/40_40_20_mse_adam_0_05_ReLU_bs256.png", dpi=fig.dpi)

In [None]:
# Test
# `evaluate` returns the loss followed by the compiled metrics; the loss is
# dropped so the remaining values line up with the three measure names.
pd.DataFrame(dict(
    measure = ["mae", 'mse', 'r2_score'],
    value   = hist0.model.evaluate(X_test, y_test, verbose = 0)[1:],
))

### Improved Model

In [None]:
# Seed Python, NumPy and TensorFlow *before* the layers are created: the
# initial weights are drawn when the model is built, so seeding only right
# before `fit` leaves the starting point of the training different on each run.
tf.keras.utils.set_random_seed(42)

# define + compile
model = Sequential([
    Dense(40, input_dim = 9, activation = 'relu'),
    Dense(40, activation = 'relu'),
    Dense(20, activation = 'relu'),
    Dense(1)
])
# removed dropout, increased units and added another big layer in the middle

model.compile(
      loss = 'mean_squared_error',
      # Adam instead of plain SGD, with an explicitly chosen learning rate
      optimizer = tf.optimizers.Adam(learning_rate = .05),
      metrics = ["mae", 'mean_squared_error', r2_score],
      run_eagerly = True
      )

model.summary()
print("=================================================================")

# train
rtime = time.time()
# re-seed TensorFlow right before fitting (as before), so the shuffling during
# training starts from the same state regardless of what ran in between
tf.random.set_seed(42)

with tf.device('/device:GPU:0'):
  hist = model.fit(
      X_train, y_train,
      batch_size = 128, # <- higher batch_size
      shuffle = True,
      validation_split = 0.2,
      epochs = 100,
      verbose = 0
  )

rtime = time.time() - rtime
print('-- Training --')
print('time to train: ', rtime / 60)
print("=================================================================")

# evaluate
plot_loss(hist, 'r2_score')
plt.title('Modified Model')
plt.ylim([0.5, 0.9])
plt.show()
#fig = plt.figure()
#fig.savefig(plot_experiment_path + "/40_40_20_mse_adam_0_05_ReLU_bs256.png", dpi=fig.dpi)

# Test
print("=================================================================")
print("-- Test Performance --")
# `evaluate` returns the loss followed by the compiled metrics; the loss is
# dropped so the values line up with the three measure names.
pd.DataFrame({
    'measure': ["mae", 'mse', 'r2_score'],
    'value': hist.model.evaluate(X_test, y_test, verbose = 0)[1:]
})

## Federated Learning
---

### Setup

### FedAvg