In [8]:
# kubernetes_multiclass_failure_predictor.py

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix

# ----------------------
# Step 1: Load + Label
# ----------------------
df = pd.read_csv("kubernetes_merged_dataset.csv")

def label_failure_type(row):
    if row['cpu_usage'] > row['cpu_limit'] * 0.9 or row['memory_usage'] > row['memory_limit'] * 0.9:
        return 1  # Resource exhaustion
    elif row['network_latency'] > 100:
        return 2  # Network issue
    elif row['disk_io'] > 800:
        return 3  # Disk I/O
    elif row['node_temperature'] > 85:
        return 4  # Overheating
    elif row['restart_count'] > 3:
        return 5  # Restart loop
    else:
        return 0  # No failure

df['failure_type'] = df.apply(label_failure_type, axis=1)

# ----------------------
# Step 2: Oversample
# ----------------------
features = [
    'cpu_usage', 'memory_usage', 'cpu_limit', 'memory_limit',
    'node_temperature', 'disk_io', 'network_latency',
    'network_bandwidth_usage', 'restart_count', 'uptime_seconds'
]

# Balance all classes
dfs = [df[df['failure_type'] == i] for i in range(6)]
max_len = max(len(d) for d in dfs)
dfs_resampled = [resample(d, replace=True, n_samples=max_len, random_state=42) for d in dfs]
df_balanced = pd.concat(dfs_resampled).sample(frac=1, random_state=42)

X = df_balanced[features].values
y = df_balanced['failure_type'].values

# ----------------------
# Step 3: Preprocess
# ----------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.long)

dataset = TensorDataset(X_tensor, y_tensor)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ----------------------
# Step 4: Model
# ----------------------
class MulticlassNN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(MulticlassNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.GELU(),
            nn.LayerNorm(64),
            nn.Dropout(0.05),
            nn.Linear(64, 32),
            nn.GELU(),
            nn.Linear(32, output_dim)
        )

    def forward(self, x):
        return self.model(x)

model = MulticlassNN(input_dim=X.shape[1], output_dim=6).to(device)

# ----------------------
# Step 5: Training
# ----------------------
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)

for epoch in range(1, 201):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(xb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    scheduler.step(total_loss)
    if epoch % 10 == 0 or epoch == 1:
        print(f"Epoch {epoch}, Loss: {total_loss:.4f}")

# ----------------------
# Step 6: Evaluation
# ----------------------
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for xb, yb in val_loader:
        xb = xb.to(device)
        out = model(xb)
        preds = torch.argmax(out, dim=1).cpu().numpy()
        all_preds.extend(preds)
        all_labels.extend(yb.numpy())

print("\nClassification Report:\n", classification_report(all_labels, all_preds))
print("Confusion Matrix:\n", confusion_matrix(all_labels, all_preds))

# ----------------------
# Step 7: Inference
# ----------------------

# Sample Input (customize values)
sample_input = pd.DataFrame([{
    'cpu_usage': 4.8,
    'memory_usage': 8000,
    'cpu_limit': 5.0,
    'memory_limit': 8192,
    'node_temperature': 88.0,
    'disk_io': 850.0,
    'network_latency': 110.0,
    'network_bandwidth_usage': 500.0,
    'restart_count': 4,
    'uptime_seconds': 4000
}])

sample_scaled = scaler.transform(sample_input[features])
sample_tensor = torch.tensor(sample_scaled, dtype=torch.float32).to(device)

with torch.no_grad():
    output = model(sample_tensor)
    probs = torch.softmax(output, dim=1).squeeze()
    pred_class = torch.argmax(probs).item()
    pred_prob = probs[pred_class].item()

# ----------------------
# Step 8: Interpret
# ----------------------
failure_labels = {
    0: "🟢 No Failure",
    1: "🧠 Resource Exhaustion (CPU/RAM)",
    2: "🌐 Network Latency Issue",
    3: "💾 Disk I/O Bottleneck",
    4: "🔥 Node Overheating",
    5: "🔁 Pod Restart Loop"
}

# Risk Level Mapping based on any failure prediction probability
if pred_class == 0:
    risk = "🟢 LOW"
elif pred_prob <= 0.6:
    risk = "🟡 MEDIUM"
elif pred_prob <= 0.85:
    risk = "🟠 HIGH"
else:
    risk = "🔴 CRITICAL"

print(f"\n🧠 Predicted Risk Level: {risk}")
print(f"📊 Predicted Failure Type: {failure_labels[pred_class]}")
print(f"🔢 Probability: {pred_prob:.4f}")

Epoch 1, Loss: 173.6973
Epoch 10, Loss: 13.5641
Epoch 20, Loss: 5.9192
Epoch 30, Loss: 3.6997
Epoch 40, Loss: 2.5499
Epoch 50, Loss: 1.9999
Epoch 60, Loss: 1.1335
Epoch 70, Loss: 0.6302
Epoch 80, Loss: 0.7063
Epoch 90, Loss: 0.8345
Epoch 100, Loss: 0.5053
Epoch 110, Loss: 0.5718
Epoch 120, Loss: 0.7004
Epoch 130, Loss: 1.0755
Epoch 140, Loss: 0.7820
Epoch 150, Loss: 0.5839
Epoch 160, Loss: 0.7033
Epoch 170, Loss: 0.7053
Epoch 180, Loss: 0.7679
Epoch 190, Loss: 0.6150
Epoch 200, Loss: 0.6909

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       569
           1       1.00      0.99      0.99       571
           2       1.00      1.00      1.00       563
           3       0.99      1.00      1.00       588
           4       1.00      1.00      1.00       563
           5       1.00      1.00      1.00       571

    accuracy                           1.00      3425
   macro avg       1.00      1.00      1.00  

