# Pandas Dataframe oefening

# Inladen van een pandas DataFrame

Maak gebruik van een kleine <a href="https://archive.ics.uci.edu/ml/datasets/heart+Disease" class="external">dataset</a> over hartziektes die beschikbaar gesteld wordt door de UCI Machine Learning Repository.
Deze dataset is een csv-bestand dat uit enkele honderden lijnen bestaat.
Elke lijn beschrijft een patiënt en elke kolom een kenmerk van de patiënt.
We gaan in deze notebook proberen te voorspellen of een patiënt een hartziekte heeft op basis van deze gegevens.
Dit is een binaire classificatietaak.

## Inlezen van de data via pandas

Schrijf in de code cell hieronder de nodige code om met behulp van pandas (zoals we bij datascience gezien hebben) de dataset te downloaden en in te lezen via pandas.
De link naar de dataset zelf is https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/.
Print daarna de eerste 10 rijen uit van de dataset en de datatypes van elke kolom.
Splits ten slotte de dataset in twee delen, namelijk de features en de labels/targets.

In [1]:
import opendatasets as od
import pandas as pd

# Fetch the heart-disease CSV into the working directory, then load it.
od.download("https://storage.googleapis.com/download.tensorflow.org/data/heart.csv")
df = pd.read_csv("heart.csv")

# Preview: the first ten rows and the dtype of every column.
display(df.head(10))
display(df.dtypes)

# Split into features (every column except "target") and labels.
is_feature_column = df.columns != "target"
features = df.loc[:, is_feature_column]
display(features.head())

targets = df["target"]
display(targets.head())

Using downloaded and verified file: .\heart.csv


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0
5,56,1,2,120,236,0,0,178,0,0.8,1,0,normal,0
6,62,0,4,140,268,0,2,160,0,3.6,3,2,normal,1
7,57,0,4,120,354,0,0,163,1,0.6,1,0,normal,0
8,63,1,4,130,254,0,2,147,0,1.4,2,1,reversible,1
9,53,1,4,140,203,1,2,155,1,3.1,3,0,reversible,0


age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal         object
target        int64
dtype: object

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal


0    0
1    1
2    0
3    0
4    0
Name: target, dtype: int64

In [2]:
import tensorflow as tf
import pandas as pd
import os

path = os.getcwd()
# An explicit target path is passed because, per the original author's note,
# get_file would otherwise place the file in a cache directory.
# os.path.join is used instead of concatenating '\heart.csv': '\h' is an
# invalid escape sequence in a normal string literal and a backslash is only
# a path separator on Windows.
csv_file = tf.keras.utils.get_file(
    os.path.join(path, 'heart.csv'),
    'https://storage.googleapis.com/download.tensorflow.org/data/heart.csv',
)

# Load the CSV and preview the first ten rows and the column dtypes.
df = pd.read_csv(csv_file)
display(df.head(10))
display(df.dtypes)

# pop() removes the label column from df in place, so from here on df holds
# only the feature columns and `targets` holds the labels.
targets = df.pop("target")
display(df.head())
display(targets.head())

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed,0
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal,1
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible,0
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal,0
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal,0
5,56,1,2,120,236,0,0,178,0,0.8,1,0,normal,0
6,62,0,4,140,268,0,2,160,0,3.6,3,2,normal,1
7,57,0,4,120,354,0,0,163,1,0.6,1,0,normal,0
8,63,1,4,130,254,0,2,147,0,1.4,2,1,reversible,1
9,53,1,4,140,203,1,2,155,1,3.1,3,0,reversible,0


age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal         object
target        int64
dtype: object

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0,fixed
1,67,1,4,160,286,0,2,108,1,1.5,2,3,normal
2,67,1,4,120,229,0,2,129,1,2.6,2,2,reversible
3,37,1,3,130,250,0,0,187,0,3.5,3,0,normal
4,41,0,2,130,204,0,2,172,0,1.4,1,0,normal


0    0
1    1
2    0
3    0
4    0
Name: target, dtype: int64

## Model trainen met minimale preprocessing

Op basis van deze dataset kan je nu een model trainen.
Bij Data Science hebben we hiervoor gebruik gemaakt van de sklearn library.

Maak voor het model te trainen een pipeline aan.
Voer in deze pipeline de volgende preprocessing stappen uit.
* Voer normalisatie uit op de numerieke kolommen ('age', 'thalach', 'trestbps',  'chol', 'oldpeak')

Na het uitvoeren van de preprocessing stappen, train een Random Forest Classifier met zelfgekozen hyperparameters.
Welke accuraatheid behaal je met een test-size van 20%?

In [3]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Columns that are scaled and fed to the classifier.
num_cols = ['age', 'thalach', 'trestbps', 'chol', 'oldpeak']

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(df, targets, test_size=0.2)

# Preprocessing step: standardise the numeric columns. Only the columns listed
# here are passed to the transformer.
numeric_scaling = ColumnTransformer(
    transformers=[('num_imputer', StandardScaler(), num_cols)]
)

# Full pipeline: scaling followed by a random forest with default settings.
p = Pipeline(
    steps=[
        ("preprocessor", numeric_scaling),
        ("rf", RandomForestClassifier()),
    ]
)

# Fit on the training rows, then report the score on the held-out rows.
p.fit(X_train, y_train)
p.score(X_test, y_test)

0.7704918032786885

In [4]:
# Same task with TensorFlow/Keras and a neural network.

# Only the numeric columns are used. The normalisation statistics are learned
# from the training rows alone so that no information from the held-out test
# rows leaks into preprocessing (this matches the sklearn pipeline, which is
# fitted on X_train only).
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(X_train[num_cols])

# Sample output of the normalizer on the first three rows.
normalizer(df[num_cols].iloc[:3])

# Normalisation, two hidden ReLU layers of 10 units, and a single output unit.
model = tf.keras.Sequential([
    normalizer,
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1)
])

# The output layer has no activation, so the loss is told to expect logits.
model.compile(optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'])

# Training progress is written to the terminal/command line.
model.fit(X_train[num_cols], y_train, epochs=500, batch_size=32)

Epoch 1/1500
Epoch 2/1500
Epoch 3/1500
Epoch 4/1500
Epoch 5/1500
Epoch 6/1500
Epoch 7/1500
Epoch 8/1500
Epoch 9/1500
Epoch 10/1500
Epoch 11/1500
Epoch 12/1500
Epoch 13/1500
Epoch 14/1500
Epoch 15/1500
Epoch 16/1500
Epoch 17/1500
Epoch 18/1500
Epoch 19/1500
Epoch 20/1500
Epoch 21/1500
Epoch 22/1500
Epoch 23/1500
Epoch 24/1500
Epoch 25/1500
Epoch 26/1500
Epoch 27/1500
Epoch 28/1500
Epoch 29/1500
Epoch 30/1500
Epoch 31/1500
Epoch 32/1500
Epoch 33/1500
Epoch 34/1500
Epoch 35/1500
Epoch 36/1500
Epoch 37/1500
Epoch 38/1500
Epoch 39/1500
Epoch 40/1500
Epoch 41/1500
Epoch 42/1500
Epoch 43/1500
Epoch 44/1500
Epoch 45/1500
Epoch 46/1500
Epoch 47/1500
Epoch 48/1500
Epoch 49/1500
Epoch 50/1500
Epoch 51/1500
Epoch 52/1500
Epoch 53/1500
Epoch 54/1500
Epoch 55/1500
Epoch 56/1500
Epoch 57/1500
Epoch 58/1500
Epoch 59/1500
Epoch 60/1500
Epoch 61/1500
Epoch 62/1500
Epoch 63/1500
Epoch 64/1500
Epoch 65/1500
Epoch 66/1500
Epoch 67/1500
Epoch 68/1500
Epoch 69/1500
Epoch 70/1500
Epoch 71/1500
Epoch 72/1500
E

<keras.callbacks.History at 0x1ec4792db70>

In [5]:
# Score the numeric-only network on the held-out rows and print the returned
# values (loss first, then accuracy, as labelled in the message).
results = model.evaluate(x=X_test[num_cols], y=y_test, batch_size=128)
print("test loss, test acc:", results)

test loss, test acc: [0.5995242595672607, 0.7868852615356445]


## Model trainen met volledige preprocessing

Maak voor het model te trainen een pipeline aan.
Voer in deze pipeline de volgende preprocessing stappen uit.
* Voer normalisatie uit op de numerieke kolommen ('age', 'thalach', 'trestbps',  'chol', 'oldpeak')
* Voer one-hot encoding uit op de categorische kolommen

Na het uitvoeren van de preprocessing stappen, train een Random Forest Classifier met zelfgekozen hyperparameters.
Welke accuraatheid behaal je met een test-size van 20%?

In [7]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Numeric columns are standardised; every other column is one-hot encoded.
num_cols = ['age', 'thalach', 'trestbps', 'chol', 'oldpeak']
# Built by filtering df.columns (instead of a set difference) so the column
# order is deterministic between runs.
cat_cols = [col for col in df.columns if col not in num_cols]

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(df, targets, test_size = 0.2)

# Pipeline: preprocessing followed by a random forest with default settings.
p = Pipeline(steps = [
    ("preprocessor", ColumnTransformer(transformers=[
        ('num_imputer', StandardScaler(), num_cols),
        # handle_unknown='ignore': the split is random, so a rare category can
        # end up only in the test rows; the default setting would then raise
        # an error when scoring.
        ('ohe_encoder', OneHotEncoder(handle_unknown='ignore'), cat_cols)
    ])),
    ("rf", RandomForestClassifier())
])

# Fit on the training rows, then report the score on the held-out rows.
p.fit(X_train, y_train)
p.score(X_test, y_test)

0.8032786885245902

In [None]:
# Same task with TensorFlow/Keras and a neural network.
# This is more involved because a tensor expects all values to share one
# dtype, which is not the case for the columns of this DataFrame.

# Three kinds of columns:
#   numeric     -> apply scaling
#   binary      -> nothing to do
#   categorical -> map the classes with an IntegerLookup or StringLookup
binary_feature_names = ['sex', 'fbs', 'exang']
categorical_feature_names = ['cp', 'restecg', 'slope', 'thal', 'ca']

# Create one Keras Input per DataFrame column, keyed by column name.
inputs = {}
for name, column in df.items():
    # Inspect the first value by position (iloc) so the check does not depend
    # on the index containing the label 0; isinstance is the idiomatic check.
    if isinstance(column.iloc[0], str):
        dtype = tf.string
    elif (name in categorical_feature_names or name in binary_feature_names):
        dtype = tf.int64
    else:
        dtype = tf.float32

    inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)

print(inputs)

# Collects the preprocessed tensor(s) of every column group.
preprocessed = []
# Binary columns need no transformation: only add a trailing axis and cast
# to float32.
for name in binary_feature_names:
    inp = inputs[name]
    inp = inp[:, tf.newaxis]
    float_value = tf.cast(inp, tf.float32)
    preprocessed.append(float_value)
    
# Helper that turns a dict of columns into a single stacked tensor.
def stack_dict(inputs, fun=tf.stack):
    """Cast every value of *inputs* to float32 and combine them with *fun*.

    Values are taken in sorted key order, so the result does not depend on
    the insertion order of the dict.

    Args:
        inputs: Mapping from name to a value accepted by ``tf.cast``.
        fun: Callable invoked as ``fun(values, axis=-1)``; ``tf.stack`` by
            default.

    Returns:
        Whatever ``fun`` returns for the list of float32 values.
    """
    as_float = [tf.cast(inputs[key], tf.float32) for key in sorted(inputs)]
    return fun(as_float, axis=-1)

# Numeric inputs have to be scaled. The normalisation statistics are learned
# from the training rows only, so the held-out test rows do not influence
# preprocessing (this matches the sklearn pipeline, which is fitted on X_train).
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(stack_dict(dict(X_train[num_cols])))

# Pick the Keras inputs that belong to the numeric columns.
numeric_inputs = {name: inputs[name] for name in num_cols}

# Pass them through the normalizer as one stacked tensor instead of one
# feature at a time; stack_dict is used both here and for adapt() above.
numeric_inputs = stack_dict(numeric_inputs)
numeric_normalized = normalizer(numeric_inputs)

preprocessed.append(numeric_normalized)

# Categorical inputs are converted with one-hot encoding.
for name in categorical_feature_names:
    # The vocabulary is the sorted set of values present in the column.
    vocab = sorted(set(df[name]))
    print(f'name: {name}')
    print(f'vocab: {vocab}\n')

    # String-valued columns need a StringLookup, all others an IntegerLookup.
    if isinstance(vocab[0], str):
        lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
    else:
        lookup = tf.keras.layers.IntegerLookup(vocabulary=vocab, output_mode='one_hot')

    # Add a trailing axis before the lookup.
    x = inputs[name][:, tf.newaxis]
    x = lookup(x)
    preprocessed.append(x)

# Concatenate all preprocessing results along the last axis and wrap the
# whole preprocessing graph in its own model.
preprocesssed_result = tf.concat(preprocessed, axis=-1)
preprocessor = tf.keras.Model(inputs, preprocesssed_result)
# Optional: uncomment to draw the preprocessing graph.
#tf.keras.utils.plot_model(preprocessor, rankdir="LR", show_shapes=True)

# Classifier: two hidden ReLU layers of 10 units and a single output unit.
body = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(1),
])

# Chain the preprocessing model and the classifier into one model that takes
# the dict of raw inputs.
result = body(preprocessor(inputs))
model = tf.keras.Model(inputs, result)

# The output layer has no activation, so the loss is told to expect logits.
logit_loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=logit_loss, metrics=['accuracy'])

# TensorBoard callback so training can be followed in an interactive window.
tensorboard_logger = tf.keras.callbacks.TensorBoard(
    log_dir="./logs",
    histogram_freq=0,     # how often to log histogram visualizations
    embeddings_freq=0,    # how often to log embedding visualizations
    update_freq="epoch",  # how often to write logs
)
callbacks = [tensorboard_logger]

# Train on the training rows, then evaluate on the held-out rows. The
# DataFrames are converted to dicts so each column feeds its named input.
history = model.fit(
    dict(X_train),
    y_train,
    batch_size=32,
    epochs=500,
    callbacks=callbacks,
)
results = model.evaluate(dict(X_test), y_test, batch_size=128)
print("test loss, test acc:", results)

{'age': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'age')>, 'sex': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'sex')>, 'cp': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'cp')>, 'trestbps': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'trestbps')>, 'chol': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'chol')>, 'fbs': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'fbs')>, 'restecg': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'restecg')>, 'thalach': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'thalach')>, 'exang': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'exang')>, 'oldpeak': <KerasTensor: shape=(None,) dtype=float32 (created by layer 'oldpeak')>, 'slope': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'slope')>, 'ca': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'ca')>, 'thal': <KerasTensor: shape=(None,) dtype=string (created by 

In [None]:
# Write a diagram of the complete model to model.png, without tensor shapes.
tf.keras.utils.plot_model(model, to_file="model.png", show_shapes=False)