# Gestructureerde data

In deze notebook gaan we het klassieke Titanic classificatieprobleem oplossen.
In het eerste deel gaan we dit met pytorch doen, in het tweede deel met keras en tensorflow.

Hierbij worden de volgende stappen uitgevoerd:
- Data inladen
- Data Modelling
- Model evaluation

De dataset die we hierbij gebruiken kan als volgt gedownload worden:

In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Define URL to load the Titanic dataset
url = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"

# Load the dataset into a Pandas DataFrame
df_orig = pd.read_csv(url)

# Display the first few rows of the dataset
df_orig.head()


Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y


Deze data kan dan als volgt gepreprocessed worden

In [2]:
df = df_orig.copy()

# Handle missing data: Fill missing values with the median (for numerical data) or mode (for categorical data)
age_mean = df['age'].median()
df['age'].fillna(age_mean, inplace=True)
df['embark_town'].fillna("unknown", inplace=True)

# Encode categorical variables
df['sex'] = LabelEncoder().fit_transform(df['sex'])
df['class'] = LabelEncoder().fit_transform(df['class'])
df['embark_town'] = LabelEncoder().fit_transform(df['embark_town'])
df['alone'] = LabelEncoder().fit_transform(df['alone'])
df = pd.concat([df, pd.get_dummies(df['deck'], dtype=int)],axis=1)
df = df.drop('deck', axis=1)

# Split features and target
X = df.drop(columns=['survived'])
y = df['survived']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(age_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embark_town'].fillna("unknown", inplace=True)


## Modelling met pytorch

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device('cpu')

# Convert the data into PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Create DataLoader for training and testing sets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the Feed-Forward Neural Network model in PyTorch
class TitanicModel(nn.Module):
    def __init__(self):
        super(TitanicModel, self).__init__()
        self.fc1 = nn.Linear(16, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Instantiate the model, define the loss function and optimizer
model = TitanicModel()
criterion = nn.BCELoss() # deze verwacht een sigmoid activatiefunctie
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model with Early Stopping
early_stopping_tolerance = 5
min_loss = np.inf
patience = 0

for epoch in range(100):
    model.train()
    epoch_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs).squeeze()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for inputs, labels in test_loader:
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, labels)
            val_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {epoch_loss/len(train_loader)}, Validation Loss: {val_loss/len(test_loader)}")

    # Early Stopping
    if val_loss < min_loss:
        min_loss = val_loss
        patience = 0
    else:
        patience += 1
        if patience >= early_stopping_tolerance:
            print("Early stopping")
            break


Epoch 1, Training Loss: 0.6599463410675526, Validation Loss: 0.637076199054718
Epoch 2, Training Loss: 0.638684269040823, Validation Loss: 0.614509791135788
Epoch 3, Training Loss: 0.6167835108935833, Validation Loss: 0.5869619250297546
Epoch 4, Training Loss: 0.5899760611355305, Validation Loss: 0.5546158701181412
Epoch 5, Training Loss: 0.5591618604958057, Validation Loss: 0.5167573317885399
Epoch 6, Training Loss: 0.5247539039701223, Validation Loss: 0.4839297905564308
Epoch 7, Training Loss: 0.49535467103123665, Validation Loss: 0.4559408128261566
Epoch 8, Training Loss: 0.46660900861024857, Validation Loss: 0.4400290176272392
Epoch 9, Training Loss: 0.44957354851067066, Validation Loss: 0.4291984736919403
Epoch 10, Training Loss: 0.43465789780020714, Validation Loss: 0.424580417573452
Epoch 11, Training Loss: 0.4217523820698261, Validation Loss: 0.4235852286219597
Epoch 12, Training Loss: 0.41702260076999664, Validation Loss: 0.4209333658218384
Epoch 13, Training Loss: 0.411472218

Hieronder staat de code om een visuele weergave van het model te bekomen

In [4]:
import torch
from torchviz import make_dot

# Gebruik een voorbeeldinput om het computationele grafiek te genereren
example_input = torch.randn(1, 16)  # A batch of 1, with 7 features (assuming 7 input features)
output = model(example_input)

# Maak een visualisatie van het computationele grafiek
make_dot(output, params=dict(model.named_parameters())).render("model_structure", format="png")

'model_structure.png'

In [5]:
from torchsummary import summary

summary(model, input_size=(16,), device=str(device))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 32]             544
              ReLU-2                   [-1, 32]               0
            Linear-3                   [-1, 16]             528
              ReLU-4                   [-1, 16]               0
            Linear-5                    [-1, 1]              17
           Sigmoid-6                    [-1, 1]               0
Total params: 1,089
Trainable params: 1,089
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00
----------------------------------------------------------------


## Tensorflow implementation

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Define the Feed-Forward Neural Network model in TensorFlow
model_tf = models.Sequential([
    layers.InputLayer(input_shape=(16,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Compile the model
model_tf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define callbacks for Early Stopping and TensorBoard
early_stopping_cb = callbacks.EarlyStopping(patience=5, restore_best_weights=True)
tensorboard_cb = callbacks.TensorBoard(log_dir='./logs')

# Train the model with Early Stopping and TensorBoard callbacks
history = model_tf.fit(X_train, y_train, validation_split=0.2, epochs=100, 
                       callbacks=[early_stopping_cb, tensorboard_cb], batch_size=32)

# Evaluate the model
loss, accuracy = model_tf.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Create predictions for the confusion matrix
y_pred_tf = (model_tf.predict(X_test) > 0.5).astype("int32")

# Plot the confusion matrix
cm_tf = confusion_matrix(y_test, y_pred_tf)
disp_tf = ConfusionMatrixDisplay(confusion_matrix=cm_tf)
disp_tf.plot()
plt.show()

In [None]:
# teken de graaf van dit preprocessing model
tf.keras.utils.plot_model(model = model_tf , rankdir="LR", dpi=72, show_shapes=True)

In [None]:
# print de samenvatting van dit preprocessing model
model_tf.summary()

## Tensorflow met preprocessing in het netwerk

Hierboven werd het preprocessing gedeelte nog gedaan met sci-kit learn.
Het kan beter zijn voor productie om de (sommige) preprocessing mee te betrekken in het neuraal netwerk.
Dit om te voorkomen dat je alle preprocessingstappen moet beschikbaar maken voor developers die het op dezelfde manier moeten implementeren.
Hierdoor kan je soms data vrijgeven (zoals gemiddelden voor scalers, woordenboeken voor categorieke data, ...) wat niet gewenst is.

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Normalization, StringLookup, CategoryEncoding, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras import Input

# Split into features and labels
target = "survived"
features = df_orig.copy()
labels = features.pop(target)

# Identify categorical and numerical columns
categorical_columns = ['sex', 'class', 'deck', 'embark_town', 'alone']
numerical_columns = ['age', 'n_siblings_spouses', 'parch', 'fare']


# Create Input layers and preprocessing layers
inputs = {}
preprocessed_inputs = []

# Process numerical features
for col in numerical_columns:
    input_col = Input(shape=(1,), name=col)
    # Create a Normalization layer and adapt it to the data
    normalization_layer = Normalization()
    normalization_layer.adapt(features[col].values.reshape(-1, 1))
    normalized_col = normalization_layer(input_col)
    inputs[col] = input_col
    preprocessed_inputs.append(normalized_col)

# Process categorical features
for col in categorical_columns:
    input_col = Input(shape=(1,), name=col, dtype=tf.string)
    # Create a StringLookup layer and adapt it to the data
    lookup_layer = StringLookup(output_mode='int')  # output_mode='int' for one-hot encoding
    lookup_layer.adapt(features[col])
    encoded_col = lookup_layer(input_col)
    # One-hot encode the integer categorical indices
    one_hot_layer = CategoryEncoding(num_tokens=lookup_layer.vocabulary_size(), output_mode='one_hot')
    one_hot_col = one_hot_layer(encoded_col)
    inputs[col] = input_col
    preprocessed_inputs.append(one_hot_col)

# Concatenate all preprocessed inputs
all_features = Concatenate()(preprocessed_inputs)

# Define the neural network model
x = Dense(128, activation='relu')(all_features)
x = Dense(64, activation='relu')(x)
output = Dense(1, activation='sigmoid')(x)  # Output layer for binary classification

# Create the model
model = Model(inputs=inputs, outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model.summary()

# Convert the DataFrame to a dictionary of NumPy arrays
features_dict = {name: tf.convert_to_tensor(features[name].values) for name in features.columns}

# Train the model
model.fit(x=features_dict, y=labels, epochs=10, validation_split=0.2)

In [None]:
# teken de graaf van dit preprocessing model

tf.keras.utils.plot_model(model = model , rankdir="LR", dpi=72, show_shapes=True)