In [15]:
import pandas as pd
import numpy as np
import os
import zipfile
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [16]:
k8_data_path = "/Users/shreyasingh/kubernetes1/K8"  # ✅ Your actual folder path


In [17]:
datasets = {
    'metrics_aks': pd.read_csv(os.path.join(k8_data_path, 'metrics_aks.csv')),
    'dataSynthetic': pd.read_csv(os.path.join(k8_data_path, 'dataSynthetic.csv')),
    'metrics': pd.read_csv(os.path.join(k8_data_path, 'metrics.csv')),
    'processed': pd.read_csv(os.path.join(k8_data_path, 'processed.csv'))
}
processed_dfs = []

In [18]:
# Clean & label each dataset
for name, df in datasets.items():
    try:
        if name in ['metrics_aks', 'dataSynthetic']:
            df = df[[
                'CPU Usage (%)', 'Memory Usage (%)', 'Pod Restarts',
                'Network Receive Packets Dropped (p/s)', 'Network Transmit Packets Dropped (p/s)'
            ]].copy()
            df['Risk_Label'] = (
                (df['CPU Usage (%)'] > 90).astype(int) +
                (df['Memory Usage (%)'] > 80).astype(int) +
                (df['Network Receive Packets Dropped (p/s)'] > 1).astype(int)
            ).clip(upper=1)
            processed_dfs.append(df.dropna())
        elif name == 'metrics':
            df = df.drop(columns=[col for col in df.columns if 'Feature' in col])
            df['Risk_Label'] = df['Adjusted_Anomaly']
            df = df.drop(columns=['Anomaly', 'Adjusted_Anomaly'])
            processed_dfs.append(df.dropna())
        elif name == 'processed':
            df = df[['metric_value']]
            df['CPU Usage (%)'] = df['metric_value']
            df['Memory Usage (%)'] = df['metric_value'] / 2
            df['Pod Restarts'] = 0
            df['Network Receive Packets Dropped (p/s)'] = 0
            df['Network Transmit Packets Dropped (p/s)'] = 0
            df['Risk_Label'] = (df['CPU Usage (%)'] > 90).astype(int)
            df = df[['CPU Usage (%)', 'Memory Usage (%)', 'Pod Restarts',
                     'Network Receive Packets Dropped (p/s)', 'Network Transmit Packets Dropped (p/s)', 'Risk_Label']]
            processed_dfs.append(df.dropna())
    except Exception as e:
        print(f"Error in {name}: {e}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CPU Usage (%)'] = df['metric_value']


In [None]:
# Combine all
final_df = pd.concat(processed_dfs, ignore_index=True)
final_df.fillna(0, inplace=True)


In [7]:
# ------------------ TRAINING ------------------
X = final_df.drop(columns=['Risk_Label'])
y = final_df['Risk_Label']
feature_names = X.columns.tolist()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


In [22]:
CPU_HARD_THRESHOLD = 85
MEMORY_HARD_THRESHOLD = 750
NET_IO_HARD_THRESHOLD = 1.5
CPU_MEMORY_COMBO_THRESHOLD = 100
def predict_failure(cpu, memory, net_io, pod_name):
    restarts = 0
    net_recv = net_io / 2
    net_trans = net_io / 2

    # Create input DataFrame
    input_df = pd.DataFrame([{
        'CPU Usage (%)': cpu,
        'Memory Usage (%)': memory,
        'Pod Restarts': restarts,
        'Network Receive Packets Dropped (p/s)': net_recv,
        'Network Transmit Packets Dropped (p/s)': net_trans
    }])

    for col in feature_names:
        if col not in input_df.columns:
            input_df[col] = 0

    input_df = input_df[feature_names]
    scaled_input = scaler.transform(input_df)

    pred = model.predict(scaled_input)[0]
    prob = model.predict_proba(scaled_input).max()
    prob_percent = round(prob * 100, 2)

    # -------------------- 🧠 Enhanced Logic --------------------

    issue = None
    root_causes = []

    if cpu > CPU_HARD_THRESHOLD:
        root_causes.append("High CPU usage")
    if memory > MEMORY_HARD_THRESHOLD:
        root_causes.append("High Memory usage")
    if net_io > NET_IO_HARD_THRESHOLD:
        root_causes.append("Network packet drops")
    if cpu + (memory / 10) > CPU_MEMORY_COMBO_THRESHOLD and not root_causes:
        root_causes.append("High combined CPU+Memory usage")

    if root_causes:
        # Determine primary issue
        if "High Memory usage" in root_causes or "High combined CPU+Memory usage" in root_causes:
            issue = "Resource Exhaustion (Memory)"
        elif "Network packet drops" in root_causes:
            issue = "Network Issues"
        elif "High CPU usage" in root_causes:
            issue = "Resource Exhaustion (CPU)"
        else:
            issue = "Node Failure"
    else:
        issue = "No Failure Detected"

    # -------------------- 🔁 Risk & Time Estimation --------------------
    if issue == "No Failure Detected":
        risk_level = "Low"
        expected_time = "N/A"
        suggested_action = "No Action Needed"
    else:
        risk_level = (
            "Critical" if prob > 0.9 else
            "High" if prob > 0.75 else
            "Medium"
        )
        if risk_level == "Critical":
            expected_time = "Within 5–10 minutes"
        elif risk_level == "High":
            expected_time = "Within 30 minutes"
        else:
            expected_time = "Within 1–2 hours"

        # Suggestion map
        suggestion_map = {
            "Resource Exhaustion (Memory)": "Scale pod or optimize memory usage",
            "Resource Exhaustion (CPU)": "Limit CPU-bound processes or autoscale pods",
            "Network Issues": "Check service mesh / CNI configs / pod network policies",
            "Node Failure": "Migrate workload or verify node health"
        }
        suggested_action = suggestion_map.get(issue, "Monitor and log anomaly")

    # -------------------- 📦 Final Output --------------------
    return {
        "Predicted Issue": issue,
        "Pod/Node": pod_name,
        "Probability (%)": prob_percent,
        "Risk Level": risk_level,
        "Expected Time": expected_time,
        "Suggested Action": suggested_action,
        "Causes": ", ".join(root_causes) if root_causes else "None"
    }

In [23]:
# ------------------ DRIVER CODE ------------------
if __name__ == "__main__":
    test_input = {
        "cpu": 12,
        "memory": 110,
        "net_io": 0.05,
        "pod_name": "pod-low-1"
    }

    result = predict_failure(**test_input)

    print("\n📊 MODEL OUTPUT FORMAT:")
    for k, v in result.items():
        print(f"{k}: {v}")


📊 MODEL OUTPUT FORMAT:
Predicted Issue: No Failure Detected
Pod/Node: pod-low-1
Probability (%): 76.0
Risk Level: Low
Expected Time: N/A
Suggested Action: No Action Needed
Causes: None


In [24]:
# ------------------ DRIVER CODE ------------------
if __name__ == "__main__":
    test_input = {
        "cpu": 8,
        "memory": 70,
        "net_io": 0.01,
        "pod_name": "low-risk-1"
    }

    result = predict_failure(**test_input)

    print("\n📊 MODEL OUTPUT FORMAT:")
    for k, v in result.items():
        print(f"{k}: {v}")


📊 MODEL OUTPUT FORMAT:
Predicted Issue: No Failure Detected
Pod/Node: low-risk-1
Probability (%): 98.0
Risk Level: Low
Expected Time: N/A
Suggested Action: No Action Needed
Causes: None


In [25]:
# ------------------ DRIVER CODE ------------------
if __name__ == "__main__":
    test_input = {
        "cpu": 50,
        "memory": 880,
        "net_io": 1.0,
        "pod_name": "low-risk-1"
    }

    result = predict_failure(**test_input)

    print("\n📊 MODEL OUTPUT FORMAT:")
    for k, v in result.items():
        print(f"{k}: {v}")


📊 MODEL OUTPUT FORMAT:
Predicted Issue: Resource Exhaustion (Memory)
Pod/Node: low-risk-1
Probability (%): 96.0
Risk Level: Critical
Expected Time: Within 5–10 minutes
Suggested Action: Scale pod or optimize memory usage
Causes: High Memory usage


In [14]:
import joblib
import numpy as np
import pandas as pd

# ------------------- 🔁 Load model and scaler -------------------
model = joblib.load("model/classifier.pkl")
scaler = joblib.load("model/scaler.pkl")

# ------------------- 📂 Load training dataset -------------------
training_data = pd.read_csv("/Users/shreyasingh/Downloads/kubernetes12/K8")  # Change path if needed

# ------------------- 🧠 Dynamically Calculate Thresholds -------------------
CPU_HARD_THRESHOLD = training_data["CPU Usage (%)"].mean() + 1.5 * training_data["CPU Usage (%)"].std()
MEMORY_HARD_THRESHOLD = training_data["Memory Usage (%)"].mean() + 1.5 * training_data["Memory Usage (%)"].std()
NET_IO_HARD_THRESHOLD = (
    training_data["Network Receive Packets Dropped (p/s)"].mean() +
    training_data["Network Transmit Packets Dropped (p/s)"].mean()
) + 2 * (
    training_data["Network Receive Packets Dropped (p/s)"].std() +
    training_data["Network Transmit Packets Dropped (p/s)"].std()
)
CPU_MEMORY_COMBO_THRESHOLD = (training_data["CPU Usage (%)"] + training_data["Memory Usage (%)"] / 10).quantile(0.9)

# ------------------- 📌 Get feature order from scaler -------------------
feature_names = scaler.feature_names_in_

# ------------------- 🔮 Main Prediction Function -------------------
def predict_failure(cpu, memory, net_io, pod_name):
    restarts = 0
    net_recv = net_io / 2
    net_trans = net_io / 2

    # Create input DataFrame
    input_df = pd.DataFrame([{
        'CPU Usage (%)': cpu,
        'Memory Usage (%)': memory,
        'Pod Restarts': restarts,
        'Network Receive Packets Dropped (p/s)': net_recv,
        'Network Transmit Packets Dropped (p/s)': net_trans
    }])

    # Fill missing columns with zero
    for col in feature_names:
        if col not in input_df.columns:
            input_df[col] = 0

    input_df = input_df[feature_names]
    scaled_input = scaler.transform(input_df)

    # Predict probability of failure (class 1)
    failure_prob = model.predict_proba(scaled_input)[0][1]
    prob_percent = round(failure_prob * 100, 2)

    # ------------------- 🧠 Diagnose Issue -------------------
    issue = None
    root_causes = []

    if cpu > CPU_HARD_THRESHOLD:
        root_causes.append("High CPU usage")
    if memory > MEMORY_HARD_THRESHOLD:
        root_causes.append("High Memory usage")
    if net_io > NET_IO_HARD_THRESHOLD:
        root_causes.append("Network packet drops")
    if cpu + (memory / 10) > CPU_MEMORY_COMBO_THRESHOLD and not root_causes:
        root_causes.append("High combined CPU+Memory usage")

    if root_causes:
        if "High Memory usage" in root_causes or "High combined CPU+Memory usage" in root_causes:
            issue = "Resource Exhaustion (Memory)"
        elif "Network packet drops" in root_causes:
            issue = "Network Issues"
        elif "High CPU usage" in root_causes:
            issue = "Resource Exhaustion (CPU)"
        else:
            issue = "Node Failure"
    else:
        issue = "No Failure Detected"

    # ------------------- ⚠️ Risk & Suggestion -------------------
    if issue == "No Failure Detected":
        risk_level = "Low"
        expected_time = "N/A"
        suggested_action = "No Action Needed"
    else:
        risk_level = (
            "Critical" if failure_prob > 0.9 else
            "High" if failure_prob > 0.75 else
            "Medium"
        )
        expected_time = {
            "Critical": "Within 5–10 minutes",
            "High": "Within 30 minutes",
            "Medium": "Within 1–2 hours"
        }.get(risk_level, "Unknown")

        suggestion_map = {
            "Resource Exhaustion (Memory)": "Scale pod or optimize memory usage",
            "Resource Exhaustion (CPU)": "Limit CPU-bound processes or autoscale pods",
            "Network Issues": "Check service mesh / CNI configs / pod network policies",
            "Node Failure": "Migrate workload or verify node health"
        }
        suggested_action = suggestion_map.get(issue, "Monitor and log anomaly")

    # ------------------- ✅ Final Output -------------------
    return {
        "Predicted Issue": issue,
        "Pod/Node": pod_name,
        "Probability (%)": prob_percent,
        "Risk Level": risk_level,
        "Expected Time": expected_time,
        "Suggested Action": suggested_action,
        "Causes": ", ".join(root_causes) if root_causes else "None"
    }

# ------------------- 🚀 Run Example -------------------
if __name__ == "__main__":
    test_input = {
        "cpu": 92,
        "memory": 850,
        "net_io": 2.0,
        "pod_name": "nginx-high-load"
    }

    result = predict_failure(**test_input)
    print("\n📊 MODEL OUTPUT FORMAT:")
    for k, v in result.items():
        print(f"{k}: {v}")

IsADirectoryError: [Errno 21] Is a directory: '/Users/shreyasingh/Downloads/kubernetes12/K8'