# Server Failure Prediction - Model Training

This notebook trains a neural network to predict server failures within 48 hours based on system metrics.

**Note:** This notebook contains intentional bugs for learning purposes.

## 1. Import Libraries

In [None]:
import os
import datetime
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.utils import resample

## 2. Load Prepared Data

In [None]:
# Read the prepared train and test splits from the working directory
train_df, test_df = (pd.read_csv(path) for path in ('train_data.csv', 'test_data.csv'))

print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")

# Split each frame into its feature matrix and the target column.
# drop() returns a new frame, so train_df/test_df still carry the target.
X_train = train_df.drop(columns='failure_within_48h')
y_train = train_df['failure_within_48h']

X_test = test_df.drop(columns='failure_within_48h')
y_test = test_df['failure_within_48h']

print("\nClass distribution in training set:")
print(y_train.value_counts())
print(f"\nFailure rate: {y_train.mean():.1%}")

## 3. Handle Class Imbalance

Server failures are rare (~5%), so we balance the classes for better model training.

In [None]:
# Split the training rows by label: 0 is treated as the majority class,
# 1 (failure) as the minority class
failure_labels = train_df['failure_within_48h']
train_majority = train_df[failure_labels == 0]
train_minority = train_df[failure_labels == 1]

print(f"Majority class samples: {len(train_majority)}")
print(f"Minority class samples: {len(train_minority)}")

# Draw minority rows with replacement until both classes have the same size
train_minority_upsampled = resample(
    train_minority, replace=True, n_samples=len(train_majority), random_state=42
)

print(f"Upsampled minority class: {len(train_minority_upsampled)} samples")

# Stack the two classes into a single frame
train_balanced = pd.concat([train_majority, train_minority_upsampled])

print(f"Balanced training set: {len(train_balanced)} samples")

# Randomise the row order (fixed seed), then rebuild a clean 0..n-1 index
train_balanced = train_balanced.sample(frac=1, random_state=42)
train_balanced = train_balanced.reset_index(drop=True)

# Re-derive features and target from the balanced frame
X_train_balanced = train_balanced.drop(columns='failure_within_48h')
y_train_balanced = train_balanced['failure_within_48h']

print("\nBalanced class distribution:")
print(y_train_balanced.value_counts())

## 4. Inspect Training Set Dimensions

In [None]:
# Inspect dimensions of the balanced training set (num_samples x num_features).
# Both values are read from the data so the model's input layer always matches
# the actual number of feature columns instead of a hard-coded guess.
X_num_samples, X_num_features = X_train_balanced.shape

print("\nNumber of training samples:")
print(X_num_samples)

print("\nNumber of training features:")
print(X_num_features)

## 5. Define Neural Network Model

In [None]:
# Set random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)

# Define the network: a small fully connected stack ending in a single sigmoid
# unit whose output is read as the probability of failure within 48 hours.
model = tf.keras.Sequential([
    # Input width comes straight from the training frame (same as model_v2),
    # so it cannot drift out of sync with the number of feature columns.
    tf.keras.layers.Input(shape=(X_train_balanced.shape[1],)),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    # Previously int(2e7) units: that single layer held over 100 million
    # parameters and exhausted kernel memory. Sized to match the layer above.
    tf.keras.layers.Dense(5, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer='adam',
    # Binary cross-entropy is the matching loss for one sigmoid output
    # trained against 0/1 labels.
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

model.summary()

## 5. Setup TensorBoard Logging

In [None]:
# Timestamped run directory so every training run gets its own TensorBoard entry
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/fit/{run_stamp}"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir,
    histogram_freq=1,
)

# Stop once val_loss has gone 10 epochs without improving, then roll the model
# back to the weights from its best epoch
early_stopping = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

print(f"TensorBoard logs will be saved to: {log_dir}")

## 6. Train the Model

**Note:** This cell will fail due to the bugs above. Fix them first!

**Tip:** Open the Kernel Usage panel (right sidebar) to monitor memory usage during training.

In [None]:
# Training configuration for model v1.
# NOTE(review): X_train_balanced contains minority rows duplicated by
# upsampling and was shuffled afterwards, so the 20% that validation_split
# holds out can contain copies of rows that are also trained on. val_loss
# (and therefore early stopping) is likely optimistic — consider splitting
# off validation data before upsampling.
fit_options = dict(
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[tensorboard_callback, early_stopping],
    verbose=1,
)

# Train the model
history = model.fit(X_train_balanced, y_train_balanced, **fit_options)

## 7. Evaluate the Model

In [None]:
# Evaluate on the test set. evaluate() returns the loss followed by the
# compiled metrics in order: accuracy, precision, recall.
test_loss, test_acc, test_precision, test_recall = model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")

# F1 is the harmonic mean of precision and recall. Both can be 0 (e.g. when
# the model never predicts the positive class), which would otherwise raise
# ZeroDivisionError; report F1 = 0 in that case.
precision_plus_recall = test_precision + test_recall
if precision_plus_recall > 0:
    f1_score = 2 * (test_precision * test_recall) / precision_plus_recall
else:
    f1_score = 0.0
print(f"Test F1 Score: {f1_score:.4f}")

## 8. Launch TensorBoard

Run the cells below to visualize training metrics in TensorBoard.

In [None]:
# TODO: Load TensorBoard extension

In [None]:
# TODO: Set the TensorBoard proxy URL for RHOAI workbench

In [None]:
# TODO: Launch TensorBoard

## 9. Train a Second Model for Comparison

After fixing the bugs, train a second model with a different architecture to compare in TensorBoard.

This model is a simple logistic regression (a single dense layer with a sigmoid activation), which serves as a lightweight baseline for the neural network above.

In [None]:
# Baseline for comparison: logistic regression expressed as a Keras model,
# i.e. one Dense unit with a sigmoid applied directly to the input features
logreg_layers = [
    tf.keras.layers.Input(shape=(X_train_balanced.shape[1],)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
]
model_v2 = tf.keras.Sequential(logreg_layers)

# Same optimizer, loss and metric set as the first model
model_v2.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()],
)

print("Model v2 architecture")
model_v2.summary()

# Separate, suffixed log directory so the run is easy to pick out in TensorBoard
stamp_v2 = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir_v2 = f"logs/fit/{stamp_v2}_v2_logreg"
tensorboard_callback_v2 = tf.keras.callbacks.TensorBoard(
    log_dir=log_dir_v2,
    histogram_freq=1,
)

# Train the second model with the same schedule and the shared
# early_stopping callback
print("\nTraining model v2...")
fit_options_v2 = dict(
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=[tensorboard_callback_v2, early_stopping],
    verbose=1,
)
history_v2 = model_v2.fit(X_train_balanced, y_train_balanced, **fit_options_v2)

print("\nLogistic regression model trained! Compare both models in TensorBoard.")

## 10. Compare Models

Evaluate the second model and compare results.

In [None]:
def _f1_from(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 if both are 0."""
    total = precision + recall
    if total > 0:
        return 2 * (precision * recall) / total
    # Avoids ZeroDivisionError when a model never predicts the positive class
    return 0.0


# Evaluate second model on test set (loss, then accuracy, precision, recall)
test_loss_v2, test_acc_v2, test_precision_v2, test_recall_v2 = model_v2.evaluate(X_test, y_test, verbose=0)

# Each value column is padded to 20 characters so the rows line up with the
# header (the previous rows were 2 characters wider than the header).
print("Model Comparison:")
print("="*60)
print(f"{'Metric':<20} {'Model v1':<20} {'Model v2':<20}")
print("="*60)
print(f"{'Test Accuracy':<20} {test_acc:<20.4f} {test_acc_v2:.4f}")
print(f"{'Test Precision':<20} {test_precision:<20.4f} {test_precision_v2:.4f}")
print(f"{'Test Recall':<20} {test_recall:<20.4f} {test_recall_v2:.4f}")

# Calculate F1 scores (guarded against precision + recall == 0)
f1_v1 = _f1_from(test_precision, test_recall)
f1_v2 = _f1_from(test_precision_v2, test_recall_v2)
print(f"{'Test F1 Score':<20} {f1_v1:<20.4f} {f1_v2:.4f}")
print("="*60)

# Calculate model sizes
params_v1 = model.count_params()
params_v2 = model_v2.count_params()
print(f"\nModel v1 parameters: {params_v1:,}")
print(f"Model v2 parameters: {params_v2:,}")
print(f"Parameter reduction: {(1 - params_v2/params_v1)*100:.1f}%")

## 11. Test Predictions on Sample Servers

Let's test both models on specific server scenarios to see how they perform in practice.

In [None]:
# Hand-crafted example servers are built below; they must carry the same
# feature columns as the training data.

# Feature column names of the training frame (target already removed)
feature_names = list(X_train.columns)
print(f"Features: {feature_names}\n")

def _server_frame(workload, **metrics):
    """Build a one-row DataFrame from metric keywords plus one-hot workload flags.

    The metric columns keep the order in which they are passed; the four
    workload_type_* columns follow, with a 1 for `workload` and 0 elsewhere.
    """
    row = dict(metrics)
    for kind in ('compute', 'database', 'storage', 'web'):
        row[f'workload_type_{kind}'] = int(kind == workload)
    return pd.DataFrame([row])


# Sample 1: HEALTHY server - normal metrics
healthy_server = _server_frame(
    'web',
    server_age_months=12,
    cpu_temp_celsius=55.0,
    cpu_utilization_percent=45.0,
    memory_usage_percent=60.0,
    disk_io_ops_per_sec=500,
    network_throughput_mbps=300.0,
    fan_speed_rpm=3000,
    power_draw_watts=250.0,
    disk_read_errors_24h=0,
    memory_errors_24h=0,
)

# Sample 2: AT-RISK server - high temperature, errors, old server
at_risk_server = _server_frame(
    'database',
    server_age_months=48,
    cpu_temp_celsius=85.0,
    cpu_utilization_percent=92.0,
    memory_usage_percent=88.0,
    disk_io_ops_per_sec=1800,
    network_throughput_mbps=850.0,
    fan_speed_rpm=2100,
    power_draw_watts=420.0,
    disk_read_errors_24h=15,
    memory_errors_24h=8,
)

# Sample 3: MODERATE-RISK server - some warning signs
moderate_server = _server_frame(
    'compute',
    server_age_months=30,
    cpu_temp_celsius=72.0,
    cpu_utilization_percent=75.0,
    memory_usage_percent=78.0,
    disk_io_ops_per_sec=1200,
    network_throughput_mbps=550.0,
    fan_speed_rpm=2600,
    power_draw_watts=320.0,
    disk_read_errors_24h=5,
    memory_errors_24h=2,
)

# Make predictions
def predict_and_explain(server_data, server_name):
    """Score one server with both models and print a side-by-side risk report.

    Args:
        server_data: Single-row DataFrame holding every feature column the
            models were trained on (the columns of X_train_balanced).
        server_name: Label printed, upper-cased, as the report heading.

    Raises:
        ValueError: If server_data lacks a column present in X_train_balanced.

    NOTE(review): values are fed to the models exactly as given. If
    train_data.csv was scaled or normalised during preparation, the same
    transform must be applied to server_data first — confirm against the
    data-preparation step.
    """
    # The models see the frame as a plain matrix, so the columns must be in the
    # same order as during training; do not rely on the caller's column order.
    expected_columns = list(X_train_balanced.columns)
    missing = [col for col in expected_columns if col not in server_data.columns]
    if missing:
        raise ValueError(f"server_data is missing feature columns: {missing}")
    model_input = server_data[expected_columns]

    # predict() returns shape (1, 1) for a single row; take the scalar
    pred_v1 = model.predict(model_input, verbose=0)[0][0]
    pred_v2 = model_v2.predict(model_input, verbose=0)[0][0]

    print(f"\n{'='*70}")
    print(f"{server_name.upper()}")
    print(f"{'='*70}")

    # Display key metrics
    print(f"\nServer Characteristics:")
    print(f"  Age: {server_data['server_age_months'].values[0]} months")
    print(f"  CPU Temp: {server_data['cpu_temp_celsius'].values[0]:.1f}°C")
    print(f"  CPU Usage: {server_data['cpu_utilization_percent'].values[0]:.1f}%")
    print(f"  Memory Usage: {server_data['memory_usage_percent'].values[0]:.1f}%")
    print(f"  Fan Speed: {server_data['fan_speed_rpm'].values[0]} RPM")
    print(f"  Disk Errors (24h): {server_data['disk_read_errors_24h'].values[0]}")
    print(f"  Memory Errors (24h): {server_data['memory_errors_24h'].values[0]}")

    # Display predictions
    print(f"\nFailure Predictions (within 48 hours):")
    print(f"  Model v1: {pred_v1*100:.1f}% probability")
    print(f"  Model v2: {pred_v2*100:.1f}% probability")

    # Interpretation: probabilities strictly above the threshold count as high risk
    threshold = 0.5
    risk_v1 = "HIGH RISK" if pred_v1 > threshold else "Low risk"
    risk_v2 = "HIGH RISK" if pred_v2 > threshold else "Low risk"

    print(f"\nRisk Assessment (threshold = 50%):")
    print(f"  Model v1: {risk_v1}")
    print(f"  Model v2: {risk_v2}")

    # Agreement: compare the thresholded decisions, not the raw probabilities
    if (pred_v1 > threshold) == (pred_v2 > threshold):
        print(f"\n✓ Both models agree on the risk level")
    else:
        print(f"\n⚠ Models disagree - further investigation recommended")

# Run predictions for the three hand-crafted scenarios
predict_and_explain(healthy_server, "Healthy Server")
predict_and_explain(at_risk_server, "At-Risk Server (High Failure Probability)")
predict_and_explain(moderate_server, "Moderate-Risk Server")

# The lines below are fixed text describing the expected pattern; they are not
# computed from the predictions printed above.
print(f"\n{'='*70}")
print(f"\nConclusion:")
print(f"- Healthy servers show low failure probability (<10%)")
print(f"- At-risk servers (high temp, errors, old) show high probability (>50%)")
print(f"- Both models should identify similar risk patterns")
# Model v2 is the single-layer logistic regression, so the earlier claim that
# it has a "deeper architecture" was incorrect.
print(f"- Model v2 (logistic regression) is a simpler baseline with far fewer parameters than Model v1")
print(f"{'='*70}")