In [1]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Generate synthetic data
n_samples = 1000
data = {
    'timestamp': pd.date_range(start='2024-01-01', periods=n_samples, freq='h'),
    'cpu_usage': np.random.normal(50, 10, n_samples),       # CPU usage in percentage
    'memory_usage': np.random.normal(60, 15, n_samples),    # Memory usage in percentage
    'network_latency': np.random.normal(100, 20, n_samples), # Network latency in ms
    'disk_io': np.random.normal(75, 10, n_samples),         # Disk I/O in MB/s
    'error_rate': np.random.choice([0, 1], n_samples, p=[0.95, 0.05])  # 5% error rate
}

# Create DataFrame
df = pd.DataFrame(data)

# Display the first few rows of the dataset
print(df.head())
print(df.info())

            timestamp  cpu_usage  memory_usage  network_latency    disk_io  \
0 2024-01-01 00:00:00  54.967142     80.990332        86.496435  55.921924   
1 2024-01-01 01:00:00  48.617357     73.869505        97.109627  66.396150   
2 2024-01-01 02:00:00  56.476885     60.894456        84.151602  70.863945   
3 2024-01-01 03:00:00  65.230299     50.295948        93.840769  93.876877   
4 2024-01-01 04:00:00  47.658466     70.473350        62.127707  80.565531   

   error_rate  
0           0  
1           0  
2           1  
3           0  
4           0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   timestamp        1000 non-null   datetime64[ns]
 1   cpu_usage        1000 non-null   float64       
 2   memory_usage     1000 non-null   float64       
 3   network_latency  1000 non-null   float64       
 4   disk_io

In [2]:
from sklearn.ensemble import IsolationForest

# Implement anomaly detection using Isolation Forest
def detect_anomalies(data):
    model = IsolationForest(contamination=0.05, random_state=42)
    model.fit(data)
    anomalies = model.predict(data)
    return anomalies

# Detect anomalies in the dataset
numeric_data = df.select_dtypes(include=[float, int]) # Only numeric columns
df['anomaly'] = detect_anomalies(numeric_data)

print(df['anomaly'].value_counts()) # -1 denotes an anomaly

anomaly
 1    950
-1     50
Name: count, dtype: int64


In [3]:
from scipy.stats import zscore

# Calculate z-scores to identify anomalous values per column in anomalous rows
z_scores = numeric_data.apply(zscore)

# Function to identify anomalous columns for each row
def find_anomalous_columns(row, threshold=3):
    return [col for col in numeric_data.columns if abs(z_scores.loc[row.name, col]) > threshold]

# Apply the function to each anomalous row
df['anomalous_columns'] = df.apply(lambda row: find_anomalous_columns(row) if row['anomaly'] == -1 else [], axis=1)

# Display rows with anomalies and their anomalous columns
print(df[df['anomaly'] == -1][['timestamp', 'anomaly', 'anomalous_columns']])

              timestamp  anomaly              anomalous_columns
37  2024-01-02 13:00:00       -1                   [error_rate]
38  2024-01-02 14:00:00       -1                   [error_rate]
62  2024-01-03 14:00:00       -1                   [error_rate]
132 2024-01-06 12:00:00       -1                   [error_rate]
179 2024-01-08 11:00:00       -1                   [error_rate]
192 2024-01-09 00:00:00       -1                   [error_rate]
208 2024-01-09 16:00:00       -1                   [error_rate]
241 2024-01-11 01:00:00       -1                   [error_rate]
245 2024-01-11 05:00:00       -1                   [error_rate]
251 2024-01-11 11:00:00       -1                   [error_rate]
262 2024-01-11 22:00:00       -1        [cpu_usage, error_rate]
272 2024-01-12 08:00:00       -1                   [error_rate]
285 2024-01-12 21:00:00       -1                   [error_rate]
315 2024-01-14 03:00:00       -1                   [error_rate]
329 2024-01-14 17:00:00       -1        

In [8]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# 0. (Re)load your DataFrame; parse dates if reading from CSV:
# df = pd.read_csv("your_data.csv", parse_dates=['timestamp'])

# 1. Print the anomalies you care about, up front:
print(df[df['anomaly'] == -1][['timestamp', 'anomaly', 'anomalous_columns']])

# 2. Convert all datetime cols to integer ordinals
for col in df.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns:
    df[col] = df[col].map(pd.Timestamp.toordinal)

# 3. Split into features X and target y
X = df.drop('anomaly', axis=1)
y = df['anomaly']

# 4. Drop any columns that contain Python lists (they’re unhashable)
list_cols = [
    col for col in X.columns
    if X[col].apply(lambda v: isinstance(v, list)).any()
]
if list_cols:
    print("Dropping list-valued columns:", list_cols)
    X = X.drop(list_cols, axis=1)

# 5. One-hot encode any remaining object/string columns
X = pd.get_dummies(X, drop_first=True)

# 6. Define and run your root-cause tree
def root_cause_analysis(X_train, y_train, X_test):
    model = DecisionTreeClassifier()
    model.fit(X_train, y_train)
    return model.predict(X_test)

predicted_causes = root_cause_analysis(X, y, X)

# 7. (Optional) Inspect your predictions
print(predicted_causes[:10])



     timestamp  anomaly              anomalous_columns
37      738887       -1                   [error_rate]
38      738887       -1                   [error_rate]
62      738888       -1                   [error_rate]
132     738891       -1                   [error_rate]
179     738893       -1                   [error_rate]
192     738894       -1                   [error_rate]
208     738894       -1                   [error_rate]
241     738896       -1                   [error_rate]
245     738896       -1                   [error_rate]
251     738896       -1                   [error_rate]
262     738896       -1        [cpu_usage, error_rate]
272     738897       -1                   [error_rate]
285     738897       -1                   [error_rate]
315     738899       -1                   [error_rate]
329     738899       -1                   [error_rate]
330     738899       -1                   [error_rate]
334     738899       -1                   [error_rate]
350     73

In [12]:
# Example solution recommendation based on root cause
def recommend_solution(root_cause):
    solutions = {
        "network_error":    "Restart the network service.",
        "database_issue":   "Check the database connection and restart the service.",
        "high_cpu_usage":   "Optimize running processes or allocate more resources."
    }

    # If we get a list (or tuple), return a list of recommendations
    if isinstance(root_cause, (list, tuple)):
        return [solutions.get(c, "No recommendation available.") for c in root_cause]

    # Otherwise just look up the single cause
    return solutions.get(root_cause, "No recommendation available.")

# ——— Tests ———
# Single cause
solution = recommend_solution("network_error")
print(f"Single cause → {solution}")

# Multiple causes
solutions = recommend_solution(["network_error", "cpu_spike", "foo"])
print(f"Multiple causes → {solutions}")

Single cause → Restart the network service.
Multiple causes → ['Restart the network service.', 'No recommendation available.', 'No recommendation available.']


Detected issue:         1
Recommended solution:   No recommendation available.
