# Server Failure Prediction - Model Training

This notebook trains a neural network to predict server failures within 48 hours based on system metrics.

**Note:** This notebook contains intentional bugs for learning purposes.

## 1. Import Libraries

In [None]:
import os
import pandas as pd
import tensorflow as tf
import keras

import data_utils

# Seed Keras with a fixed value so repeated runs are comparable
SEED = 83
keras.utils.set_random_seed(SEED)

# Additionally ask TensorFlow for deterministic op implementations
tf.config.experimental.enable_op_determinism()

## 2. Load and Prepare Data

Read the original data from `server_metrics.csv`.
The goal is to predict the `failure_within_48h` target variable from the other variables in the dataset.

In [None]:
# Read the raw server metrics and preview the first rows
metrics_csv = "server_metrics.csv"
raw_server_metrics = data_utils.load_data(metrics_csv)
raw_server_metrics.head()

Process and split the raw dataset into a training and a test set.

In [None]:
# Turn the raw metrics into model inputs X and target y (scaling is part of this step)
# NOTE(review): scaling happens before the train/test split. If the scaler is fit on
# the full dataset, test-set statistics leak into training — confirm in data_utils.
X, y = data_utils.preprocess_features(raw_server_metrics)

# Partition the data into train/test subsets
X_train, X_test, y_train, y_test = data_utils.split_data(X, y)

# Persist the prepared train/test data so it can be inspected
data_utils.save_prepared_data(X_train, X_test, y_train, y_test)

# Report the shape of each feature subset
for subset_name, subset in (("Training", X_train), ("Test", X_test)):
    print(f"{subset_name} set shape: {subset.shape}")

# Report how the target classes are distributed in the training subset
train_class_counts = y_train.value_counts()
train_failure_rate = y_train.mean()
print("\nClass distribution in training set:")
print(train_class_counts)
print(f"\nFailure rate: {train_failure_rate:.1%}")

## 3. Handle Class Imbalance

Server failures are rare (~16%), so use class weights to handle the imbalance.

Class weights tell the model to pay more attention to the minority class (failures) during training.

In [None]:
# Derive per-class weights from the training labels to counter the imbalance
class_weights = data_utils.calculate_class_weights(y_train)

# NOTE(review): the value printed is the weight of class 1 itself; it only equals
# "x more weight" if class 0 is weighted 1.0 — confirm in data_utils.
failure_weight = class_weights[1]
print(f"\nThe minority class (failures) should have {failure_weight:.1f}x more weight during training.")

## 4. Inspect Training Set Dimensions

In [None]:
# Inspect dimensions of training set (num_samples x num_features)
X_num_samples = X_train.shape[0]
print("\nNumber of training samples:")
print(X_num_samples)

# Read the feature count from the data instead of hard-coding it, so the
# model's input layer always matches the columns produced by preprocessing
X_num_features = X_train.shape[1]
print("\nNumber of training features:")
print(X_num_features)

## 5. Define Neural Network Model

In [None]:
# Define the network: a small fully connected classifier that narrows
# towards a single sigmoid output (probability of failure)
model = keras.Sequential([
    keras.layers.Input(shape=(X_num_features,)),
    keras.layers.Dense(50, activation='relu'),
    keras.layers.Dropout(0.3),
    keras.layers.Dense(10, activation='relu'),
    keras.layers.Dropout(0.2),
    # This layer previously had 2,000,000 units, which adds about 24M parameters
    # and a (batch_size x 2,000,000) activation tensor — enough to exhaust
    # memory during training. A handful of units is plenty at this depth.
    keras.layers.Dense(5, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])

# Binary classification: cross-entropy loss, tracked with precision and recall
# (more informative than accuracy when failures are the rare class)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall')
    ]
)

model.summary()

## 6. Setup TensorBoard Logging

In [None]:
# Log this model's training run to its own TensorBoard directory
log_dir = "logs/neural_network"
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=log_dir)

print(f"TensorBoard logs will be saved to: {log_dir}")

## 7. Train the Model

**Note:** This cell fails initially due to the bugs above. Fix them first.

**Tip:** Open the Kernel Usage panel (right sidebar) to monitor memory usage during training.

In [None]:
# Train the neural network; 20% of the training data is held out for validation
# and the class weights compensate for the rarity of failures
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=3000,
    epochs=50,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[tensorboard_callback],
)

## 8. Evaluate the Model

In [None]:
# Evaluate on test set (loss first, then the metrics in the order they were compiled)
test_loss, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")

# Calculate F1 score (harmonic mean of precision and recall).
# When there are no true positives both values are 0 and the plain formula
# would divide by zero, so report 0.0 in that case.
precision_recall_sum = test_precision + test_recall
if precision_recall_sum > 0:
    f1_score = 2 * (test_precision * test_recall) / precision_recall_sum
else:
    f1_score = 0.0
print(f"Test F1 Score: {f1_score:.4f}")

## 9. Train a Second Model for Comparison

After fixing the bugs, train a second model and compare it with the first one.

This model uses a simpler logistic regression architecture.

In [None]:
# Baseline for comparison: logistic regression expressed as a Keras model,
# i.e. one sigmoid unit fed directly by the input features
logreg_layers = [
    keras.layers.Input(shape=(X_train.shape[1],)),
    keras.layers.Dense(1, activation='sigmoid'),
]
model_v2 = keras.Sequential(logreg_layers)

# Same optimizer, loss and metrics as the first model so results are comparable
logreg_metrics = [
    keras.metrics.Precision(name='precision'),
    keras.metrics.Recall(name='recall'),
]
model_v2.compile(optimizer='adam', loss='binary_crossentropy', metrics=logreg_metrics)

print("Logistic Regression model")
model_v2.summary()

# Give the second model its own TensorBoard run directory
log_dir_v2 = "logs/logistic_regression"
tensorboard_callback_v2 = keras.callbacks.TensorBoard(log_dir=log_dir_v2)

# Train with the same settings used for the first model
print("\nTraining model v2...")
history_v2 = model_v2.fit(
    x=X_train,
    y=y_train,
    batch_size=3000,
    epochs=50,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[tensorboard_callback_v2],
)

print("\nLogistic regression model trained. Compare both models in TensorBoard.")

## 11. Compare Models

Evaluate the second model and compare results.

In [None]:
def _f1(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 when both are 0."""
    denominator = precision + recall
    return 2 * (precision * recall) / denominator if denominator > 0 else 0.0


# Calculate model sizes
params_v1 = model.count_params()
params_v2 = model_v2.count_params()
print(f"\nModel v1 (neural net): {params_v1:,} parameters")
print(f"Model v2 (logistic regression): {params_v2:,} parameters")

# Evaluate second model on test set (loss first, then the compiled metrics in order)
test_loss_v2, test_precision_v2, test_recall_v2 = model_v2.evaluate(X_test, y_test, verbose=0)

# Calculate F1 scores; the helper avoids a division by zero when a model
# has no true positives (precision and recall both 0)
f1_v1 = _f1(test_precision, test_recall)
f1_v2 = _f1(test_precision_v2, test_recall_v2)

# Each value column is padded to the same 20-character width as its header
# so the numbers line up underneath "Model v1" / "Model v2"
print("\nModel Comparison (Test set):")
print("="*60)
print(f"{'Metric':<20} {'Model v1':<20} {'Model v2':<20}")
print("="*60)
print(f"{'Precision':<20} {test_precision:<20.4f} {test_precision_v2:.4f}")
print(f"{'Recall':<20} {test_recall:<20.4f} {test_recall_v2:.4f}")
print(f"{'F1 Score':<20} {f1_v1:<20.4f} {f1_v2:.4f}")
print("="*60)

## 12. Test Predictions on Sample Servers

Test both models on specific server scenarios to see how they perform in practice.

In [None]:
# Hand-written server scenarios, each with its known outcome (1 = failed, 0 = did not fail)
test_cases = [
    {
        'desc': 'Healthy server',
        'actual': 0,  # Did not fail
        'metrics': {
            'server_age_months': 12,
            'cpu_temp_celsius': 55.0,
            'cpu_utilization_percent': 45.0,
            'memory_usage_percent': 60.0,
            'disk_io_ops_per_sec': 500,
            'network_throughput_mbps': 300.0,
            'fan_speed_rpm': 3000,
            'power_draw_watts': 250.0,
            'disk_read_errors_24h': 0,
            'memory_errors_24h': 0,
            'workload_type': 'web'
        }
    },
    {
        'desc': 'Failed server',
        'actual': 1,  # Failed
        'metrics': {
            'server_age_months': 48,
            'cpu_temp_celsius': 85.0,
            'cpu_utilization_percent': 92.0,
            'memory_usage_percent': 88.0,
            'disk_io_ops_per_sec': 1800,
            'network_throughput_mbps': 850.0,
            'fan_speed_rpm': 2100,
            'power_draw_watts': 420.0,
            'disk_read_errors_24h': 15,
            'memory_errors_24h': 8,
            'workload_type': 'database'
        }
    },
    {
        'desc': 'Moderate load',
        'actual': 0,  # Did not fail
        'metrics': {
            'server_age_months': 15,
            'cpu_temp_celsius': 70.0,
            'cpu_utilization_percent': 63.0,
            'memory_usage_percent': 49.0,
            'disk_io_ops_per_sec': 1000,
            'network_throughput_mbps': 550.0,
            'fan_speed_rpm': 2600,
            'power_draw_watts': 320.0,
            'disk_read_errors_24h': 5,
            'memory_errors_24h': 2,
            'workload_type': 'compute'
        }
    }
]

# Which model(s) matched the known outcome, keyed by (v1 correct, v2 correct)
WINNER_BY_OUTCOME = {
    (True, False): "Model v1",
    (False, True): "Model v2",
    (True, True): "Both",
    (False, False): "Neither",
}


def _status(label):
    """Render a 0/1 class label as the text shown in the table."""
    return "Fail" if label == 1 else "OK"


def _mark(is_correct):
    """Render a correctness flag as a check mark or a cross."""
    return "✓" if is_correct else "✗"


print(f"{'Case':<20} {'Actual':<10} {'Model v1':<20} {'Model v2':<20} {'Correct Model':<10}")
print("="*90)

for case in test_cases:
    expected = case['actual']

    # Run the single-row sample through the inference-time preprocessing
    sample_row = pd.DataFrame([case['metrics']])
    features = data_utils.preprocess_for_inference(sample_row)

    # Predicted failure probability from each model
    pred_v1 = model.predict(features, verbose=0)[0][0]
    pred_v2 = model_v2.predict(features, verbose=0)[0][0]

    # A probability above 0.5 counts as a predicted failure
    label_v1 = int(pred_v1 > 0.5)
    label_v2 = int(pred_v2 > 0.5)

    v1_hit = label_v1 == expected
    v2_hit = label_v2 == expected
    winner = WINNER_BY_OUTCOME[(v1_hit, v2_hit)]

    print(
        f"{case['desc']:<20} {_status(expected):<10} "
        f"{_mark(v1_hit)} - {_status(label_v1):<4} ({pred_v1:.2f}){'':<5} "
        f"{_mark(v2_hit)} - {_status(label_v2):<4} ({pred_v2:.2f}){'':<5} "
        f"{winner:<10}"
    )


## 13. Launch TensorBoard

Complete and run the cells below to visualize training metrics in TensorBoard.

**Comparing models in TensorBoard:**
- Both models log to separate directories: `logs/neural_network` and `logs/logistic_regression`
- In TensorBoard's left sidebar, ensure both runs are checked (selected)
- The `SCALARS` tab can overlay both models on the same graph for easy comparison

In [None]:
# TODO: Load TensorBoard extension

In [None]:
# TODO: Set the TensorBoard proxy URL for RHOAI workbench

In [None]:
# TODO: Launch TensorBoard