# Exercises in neural network and deep learning II

## Exercise 2

Based on the **adult dataset**, build a neural network classifier for the target variable `income`. 

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_recall_fscore_support, precision_recall_curve, auc
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers 


<h1>GPU fun</h1>

In [None]:
# Configure TensorFlow for any visible GPUs: allocate GPU memory on demand and
# switch Keras to the mixed_float16 policy. The two settings are independent,
# so each one is applied and reported separately.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    print(f"GPUs available: {gpus}")

    # Memory growth is a per-device setting, so it is applied to every GPU.
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Memory growth enabled.")
    except Exception as e:
        print(f"Could not set memory growth: {e}")

    # The mixed-precision policy is global: set it once, outside the device
    # loop, and report its own failure instead of blaming memory growth.
    try:
        tf.keras.mixed_precision.set_global_policy('mixed_float16')
        print("Mixed precision enabled (float16).")
    except Exception as e:
        print(f"Could not enable mixed precision: {e}")
else:
    print("No GPUs found. TensorFlow will run on CPU")

In [None]:
# Seed NumPy's and TensorFlow's global random generators with the same value
# so repeated runs start from the same random state.
SEED = 2
tf.random.set_seed(SEED)
np.random.seed(SEED)

In [None]:
# Load the adult dataset and turn the `income` target into a 0/1 integer.
# skipinitialspace drops the blank that may follow each comma, so a padded
# missing-value marker (" ?") is read as "?" and is caught by the replace below.
df = pd.read_csv("adult.csv", skipinitialspace=True)
df.columns = df.columns.str.strip()

# '?' is the dataset's missing-value marker; convert it to a real NaN.
df = df.replace('?', np.nan)

# Normalise the target labels: remove any '.' characters and surrounding
# whitespace, then map the two classes to 1 (>50K) and 0 (<=50K).
income_labels = df['income'].str.replace('.', '', regex=False).str.strip()
income_encoded = income_labels.map({'>50K': 1, '<=50K': 0})

# Fail with a readable message instead of the NaN-to-int cast error that an
# unmapped or missing label would otherwise trigger.
if income_encoded.isna().any():
    unexpected = sorted(income_labels[income_encoded.isna()].dropna().unique())
    raise ValueError(
        f"Missing or unexpected income labels; unexpected values: {unexpected}"
    )

df['income'] = income_encoded.astype(int)
print(df['income'].value_counts())


<h5>Test, validation and train split </h5>

In [None]:
# Separate the binary target from the predictors.
y = df['income'].values
X = df.drop(columns=['income'])

# Group predictor columns by dtype: numeric columns vs. everything else.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
cat_cols = list(X.select_dtypes(exclude=[np.number]).columns)

# First hold out 20% of all rows as the test set (stratified on the target).
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

# Then take 25% of the remaining 80% as validation, i.e. a 60/20/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.25,
    stratify=y_temp,
    random_state=2,
)

<h5>Handle nulls with imputations </h5>

In [None]:
# Work on copies so the raw splits stay untouched.
X_train_imp, X_val_imp, X_test_imp = X_train.copy(), X_val.copy(), X_test.copy()

# Validation and test frames receive the fill values learned on train.
held_out_frames = (X_val_imp, X_test_imp)

# Numeric columns: fill missing values with the training median.
for col in num_cols:
    if col not in X_train_imp.columns:
        continue
    train_median = X_train_imp[col].median()
    X_train_imp[col] = X_train_imp[col].fillna(train_median)
    for frame in held_out_frames:
        if col in frame.columns:
            frame[col] = frame[col].fillna(train_median)

# Non-numeric columns: fill missing values with the most frequent training value.
for col in cat_cols:
    if col not in X_train_imp.columns:
        continue
    train_mode = X_train_imp[col].mode(dropna=True).iloc[0]
    X_train_imp[col] = X_train_imp[col].fillna(train_mode)
    for frame in held_out_frames:
        if col in frame.columns:
            frame[col] = frame[col].fillna(train_mode)

<h4>One-hot encode and align columns</h4>

In [None]:
# One-hot encode the categorical columns. The training split defines the
# feature space: drop_first=True removes one (baseline) level per column there.
X_train_enc = pd.get_dummies(X_train_imp, columns=cat_cols, drop_first=True)

# Validation and test are encoded with ALL their levels and then aligned to the
# training columns. Using drop_first on these splits independently could drop a
# different level than train did (when a split lacks train's baseline level);
# the reindex would then zero-fill that column and silently turn those rows
# into the baseline category. Reindexing instead discards train's baseline
# column and any level unseen in train, and zero-fills levels absent from the split.
X_val_enc = pd.get_dummies(X_val_imp, columns=cat_cols).reindex(
    columns=X_train_enc.columns, fill_value=0
)
X_test_enc = pd.get_dummies(X_test_imp, columns=cat_cols).reindex(
    columns=X_train_enc.columns, fill_value=0
)

<h4>Scale</h4>

In [None]:
# Only the original numeric features are standardised; the one-hot columns are
# left as 0/1. The mask marks which encoded columns are numeric features.
encoded_num_mask = [c in num_cols for c in X_train_enc.columns]
num_idx = np.where(encoded_num_mask)[0]
scaled_cols = X_train_enc.columns[num_idx]

scaler = StandardScaler()

X_train_scaled = X_train_enc.copy()
X_val_scaled = X_val_enc.copy()
X_test_scaled = X_test_enc.copy()

# Assign by column label so each column is replaced as a whole with the float
# result. Writing floats into integer columns through .iloc is an
# incompatible-dtype setitem, which pandas has deprecated.
# The scaler is fitted on train only and reused for validation and test.
X_train_scaled[scaled_cols] = scaler.fit_transform(X_train_enc[scaled_cols])
X_val_scaled[scaled_cols] = scaler.transform(X_val_enc[scaled_cols])
X_test_scaled[scaled_cols] = scaler.transform(X_test_enc[scaled_cols])

# float32 feature frames for Keras
X_tr = X_train_scaled.astype(np.float32)
X_vl = X_val_scaled.astype(np.float32)
X_te = X_test_scaled.astype(np.float32)

# int32 label arrays for Keras
y_tr = y_train.astype(np.int32)
y_vl = y_val.astype(np.int32)
y_te = y_test.astype(np.int32)

n_features = X_tr.shape[1]

<h4>Class weights</h4>
<p>We add class weights because the target is rarely positive. They keep the model from cheating by always favoring the majority class, i.e. income of 50K or less.</p>

In [None]:
# 'balanced' class weights computed from the training labels, stored as a
# {class label: weight} dict of plain Python numbers for model.fit.
classes = np.array([0, 1])
cw_values = compute_class_weight(class_weight='balanced', classes=classes, y=y_tr)
class_weight = {
    int(label): float(weight) for label, weight in zip(classes, cw_values)
}

<h4>The model</h4>

In [None]:
def build_model(n_features: int) -> keras.Model:
    """Build and compile the binary income classifier.

    Architecture: two ReLU hidden layers (128 and 64 units), each followed by
    20% dropout, and a single sigmoid output unit.

    Args:
        n_features: Number of input features per sample.

    Returns:
        A compiled Keras model using Adam (learning rate 1e-3) and binary
        cross-entropy, tracking accuracy, PR-AUC ("pr_auc") and ROC-AUC
        ("roc_auc").
    """
    inputs = keras.Input(shape=(n_features,))
    x = layers.Dense(128, activation='relu')(inputs)
    x = layers.Dropout(0.2)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.2)(x)
    # Pin the output layer to float32: when the global mixed_float16 policy is
    # active, this keeps the predicted probabilities (and therefore the loss
    # and the threshold tuning) in full precision. Without mixed precision
    # float32 is already the default, so nothing changes.
    outputs = layers.Dense(1, activation='sigmoid', dtype='float32')(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss='binary_crossentropy',
        metrics=[
            "accuracy",
            keras.metrics.AUC(curve="PR", name="pr_auc"),
            keras.metrics.AUC(curve="ROC", name="roc_auc"),
        ],
    )
    return model

# Instantiate the classifier for the encoded feature width.
model = build_model(n_features)

# File that receives the model with the lowest validation loss.
checkpoint_path = "adult_best.keras"

# Stop once val_loss has not improved for 10 epochs, rolling back to the best weights.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Save the model only when val_loss improves.
best_checkpoint = keras.callbacks.ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
)

callbacks = [early_stopping, best_checkpoint]

<h4>Train</h4>

In [None]:
# Training settings: up to 300 epochs (the callbacks may stop earlier),
# mini-batches of 64, class-weighted loss, no per-epoch console output.
fit_options = dict(
    epochs=300,
    batch_size=64,
    class_weight=class_weight,
    callbacks=callbacks,
    verbose=0,
)

# Train on the training split and evaluate on the validation split each epoch.
history = model.fit(
    x=X_tr,
    y=y_tr,
    validation_data=(X_vl, y_vl),
    **fit_options,
)

<h4>Tune threshold</h4>

We tune on validation data to avoid peeking into the test set.

In [None]:
# Pick the decision threshold that maximises F1 on the validation split, then
# evaluate that fixed threshold on the test split.
val_proba = model.predict(X_vl, verbose=0).ravel()

prec_v, rec_v, thr_v = precision_recall_curve(y_vl, val_proba)

# F1 at every point of the validation PR curve (epsilon avoids 0/0).
f1_v = 2 * (prec_v * rec_v) / (prec_v + rec_v + 1e-12)

# precision_recall_curve returns one more precision/recall point than
# thresholds: prec_v[i] / rec_v[i] belong to thr_v[i], and the final point
# (precision 1, recall 0) has no threshold. So the search skips that last
# point and the winning index is used on thr_v directly, with no "- 1" shift.
if thr_v.size > 0:
    best_idx = int(np.nanargmax(f1_v[:-1]))
    best_thr = float(thr_v[best_idx])
else:
    best_idx = 0
    best_thr = 0.5
print(f"Best val F1: {f1_v[best_idx]:.4f} at threshold {best_thr:.3f}")

# Apply the validation-tuned threshold to the test probabilities.
test_proba = model.predict(X_te, verbose=0).ravel()
y_test_pred = (test_proba >= best_thr).astype(int)

acc_te = accuracy_score(y_te, y_test_pred)
prec_te, rec_te, f1_te, _ = precision_recall_fscore_support(
    y_te, y_test_pred, average='binary', zero_division=0
)

# Area under the test precision-recall curve (threshold-independent).
prec_te_curve, rec_te_curve, _ = precision_recall_curve(y_te, test_proba)
pre_auc = auc(rec_te_curve, prec_te_curve)

# Report the test metrics, which were previously computed but never shown.
print(
    f"Test accuracy: {acc_te:.4f} | precision: {prec_te:.4f} | "
    f"recall: {rec_te:.4f} | F1: {f1_te:.4f} | PR-AUC: {pre_auc:.4f}"
)

<h4>Save plots on PC after run</h4>

In [None]:
# Folder that collects every exported figure; created on first run.
outdir = "Figures"
os.makedirs(outdir, exist_ok=True)


def savefig(name):
    """Save the current matplotlib figure as `name` inside `outdir`.

    The file is written at 200 dpi with a tight bounding box, and the
    destination path is printed afterwards.
    """
    destination = os.path.join(outdir, name)
    plt.savefig(destination, dpi=200, bbox_inches='tight')
    print(f"Saved figure to {destination}")

# Per-epoch training curves: one figure per tracked metric, each comparing the
# training series with its "val_"-prefixed validation counterpart.
# Fields: history key, train label, validation label, title, y-axis label, file name.
history_plots = [
    ("loss", "Train loss", "Val loss",
     "Training vs Validation Loss", "Binary cross-entropy", "loss.png"),
    ("pr_auc", "Train PR-AUC", "Val PR-AUC",
     "PR-AUC over Epochs", "PR-AUC", "pr_auc_history.png"),
]

for metric, train_label, val_label, title, ylabel, filename in history_plots:
    plt.figure(figsize=(7, 4))
    plt.plot(history.history[metric], label=train_label)
    plt.plot(history.history[f"val_{metric}"], label=val_label)
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(ylabel)
    plt.grid(True, alpha=0.3)
    plt.legend()
    # Save before show(): the figure must still be the current one when written.
    savefig(filename)
    plt.show()

# Validation precision-recall curve, with the point of maximum F1 marked.
plt.figure(figsize=(6, 4))
plt.plot(rec_v, prec_v, label="Validation PR curve")
plt.scatter(rec_v[best_idx], prec_v[best_idx], label="Best F1 (val)")
plt.title("Validation Precision–Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True, alpha=0.3)
plt.legend()
savefig("val_pr_curve.png")
plt.show()

# Test precision-recall curve; the marker is the precision/recall obtained on
# the test split at the threshold that was tuned on the validation split.
plt.figure(figsize=(6, 4))
plt.plot(rec_te_curve, prec_te_curve, label="Test PR curve")
plt.scatter(rec_te, prec_te, label="Tuned threshold (from val)")
plt.title("Test Precision–Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.grid(True, alpha=0.3)
plt.legend()
savefig("test_pr_curve.png")
plt.show()

# Confusion matrix of the thresholded test predictions, drawn as an annotated
# image: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_te, y_test_pred)
class_names = ["<=50K", ">50K"]

plt.figure(figsize=(5,4.5))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix / Classification results")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.xticks([0,1], class_names)
plt.yticks([0,1], class_names)

# Write each count at the centre of its cell (x = column, y = row).
for row_idx, row in enumerate(cm):
    for col_idx, count in enumerate(row):
        plt.text(col_idx, row_idx, str(count), ha="center", va="center")

plt.colorbar()
plt.tight_layout()
savefig("confusion_matrix.png")
plt.show()