In [8]:
import subprocess
import sys

def install_and_import(package):
    try:
        __import__(package)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"{package} installed successfully!")

packages = [
    "pandas", "numpy", "matplotlib", "seaborn", "scikit-learn",
    "tensorflow"
]

# Install missing packages
for package in packages:
    install_and_import(package)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import IsolationForest
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf

# Load dataset
file_path = "../data/guidewire-2.csv"
df = pd.read_csv(file_path)

# Convert Timestamp to datetime
df["Timestamp"] = pd.to_datetime(df["Timestamp"])
df = df.sort_values(by="Timestamp")

# Select relevant features
features = ["CPU Usage (%)", "Memory Usage (%)", "Pod Restarts",
            "Network Receive Bytes", "Network Transmit Bytes",
            "Network Receive Packets (p/s)", "Network Transmit Packets (p/s)"]

target = "Pod Status"

# Drop NaNs
df = df[features + [target]].dropna()

# Normalize features
scaler = MinMaxScaler()
df[features] = scaler.fit_transform(df[features])

# Convert Pod Status to binary classification (0: normal, 1: failure)
df[target] = (df[target] != 5).astype(int)  # Assuming 5 is the 'healthy' status

# Convert to NumPy array
data = df[features].values
labels = df[target].values

# Create sequences for LSTM (using past 10 steps)
def create_sequences(data, labels, seq_length=10):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(labels[i+seq_length])
    return np.array(X), np.array(y)

seq_length = 10
X, y = create_sequences(data, labels, seq_length)

# Split into train and test sets
split = int(0.8 * len(X))
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]


Installing scikit-learn...
scikit-learn installed successfully!


In [None]:
# Build LSTM Model
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(seq_length, X.shape[2])),
    Dropout(0.2),
    LSTM(32, return_sequences=False),
    Dropout(0.2),
    Dense(1, activation="sigmoid")
])

# Compile Model
model.compile(loss="binary_crossentropy", optimizer=Adam(learning_rate=0.001), metrics=["accuracy"])

# Train Model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))


  super().__init__(**kwargs)


Epoch 1/10
[1m887/887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 9ms/step - accuracy: 0.7437 - loss: 0.5390 - val_accuracy: 0.8964 - val_loss: 0.3361
Epoch 2/10
[1m887/887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.7499 - loss: 0.5192 - val_accuracy: 0.8964 - val_loss: 0.3374
Epoch 3/10
[1m887/887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 9ms/step - accuracy: 0.7474 - loss: 0.5193 - val_accuracy: 0.8964 - val_loss: 0.3368
Epoch 4/10
[1m887/887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - accuracy: 0.7492 - loss: 0.5155 - val_accuracy: 0.8964 - val_loss: 0.3361
Epoch 5/10
[1m887/887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7ms/step - accuracy: 0.7455 - loss: 0.5176 - val_accuracy: 0.8964 - val_loss: 0.3366
Epoch 6/10
[1m887/887[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 9ms/step - accuracy: 0.7525 - loss: 0.5089 - val_accuracy: 0.8964 - val_loss: 0.3406
Epoch 7/10
[1m887/887[0m

In [None]:
# Train Isolation Forest on training data
iso_forest = IsolationForest(n_estimators=100, contamination=0.05, random_state=42)
iso_forest.fit(X_train.reshape(X_train.shape[0], -1))  # Flatten time-series

# Predict anomalies (-1 means anomaly, 1 means normal)
anomaly_scores = iso_forest.predict(X_test.reshape(X_test.shape[0], -1))
anomaly_scores = np.where(anomaly_scores == -1, 1, 0)  # Convert to binary (1 = anomaly)

# Combine LSTM and Isolation Forest results
final_predictions = np.logical_or(model.predict(X_test).flatten() > 0.5, anomaly_scores).astype(int)

# Evaluate
print("Accuracy:", accuracy_score(y_test, final_predictions))
print(classification_report(y_test, final_predictions))


[1m222/222[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
Accuracy: 0.8420236753100339
              precision    recall  f1-score   support

           0       0.89      0.94      0.91      6361
           1       0.05      0.03      0.04       735

    accuracy                           0.84      7096
   macro avg       0.47      0.48      0.48      7096
weighted avg       0.81      0.84      0.82      7096

