# Notebook: Analyzing Labeled Anomaly Data

**Location:** `notebooks/Analytics.ipynb`  
**Purpose:** Load `data/results/new_data_labeled.csv` (the output of our anomaly-detection pipeline) and run exploratory and summary analyses to understand how the sensors behave and where anomalies occur.  


In [None]:
# Cell 1: Imports and path definitions

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Configure matplotlib for inline display
%matplotlib inline

# Define paths. Path().resolve() is the current working directory; this assumes
# the notebook is run from the notebooks/ folder, so its parent is the project root.
project_root = Path().resolve().parent
notebook_dir = project_root / "notebooks"
# Path to the labeled CSV:
labeled_csv = project_root / "data" / "results" / "new_data_labeled.csv"

# Fail early with an explicit exception: a bare `assert` is stripped when Python
# runs with -O, which would defer the failure to the later read_csv call.
if not labeled_csv.exists():
    raise FileNotFoundError(f"File not found: {labeled_csv}")
print(f"Loading data from: {labeled_csv}")

In [None]:
# Cell 2: Read the labeled CSV into a DataFrame
# pandas is asked to parse the "timestamp" column as datetimes while loading.
df = pd.read_csv(labeled_csv, parse_dates=["timestamp"])

# Preview the first 10 rows
preview = df.head(10)
display(preview)

# Column overview: dtypes and non-null counts
df.info()


In [None]:
# Cell 3: Summary statistics for the raw sensor columns plus accel_mag
sensor_cols = ["temp_C", "humidity_%", "accel_x", "accel_y", "accel_z", "accel_mag"]
sensors = df[sensor_cols]

# One row per sensor column: describe() output extended with skewness and
# kurtosis, both rounded to two decimals.
desc = sensors.describe().T.assign(
    skew=sensors.skew().round(2),
    kurtosis=sensors.kurtosis().round(2),
)
display(desc)


In [None]:
# Cell 4: Overall anomaly counts
# Count rows per value of label_overall and express each count as a percentage
# of all counted rows.
counts = df["label_overall"].value_counts().rename_axis("label").reset_index(name="count")
counts["percent"] = (counts["count"] / counts["count"].sum() * 100).round(2)
display(counts)

# Bar chart of anomaly vs. normal.
# Colors are keyed by label, not by position: value_counts() sorts by frequency,
# so positional colors would swap if anomalies ever outnumbered normal rows.
label_colors = {"normal": "skyblue", "anomaly": "salmon"}
bar_colors = [label_colors.get(label, "lightgray") for label in counts["label"]]

plt.figure(figsize=(4, 4))
plt.bar(counts["label"], counts["count"], color=bar_colors)
plt.title("Overall Row Counts: Normal vs. Anomaly")
plt.ylabel("Count")
# Write each count just above its bar; the gap is 1% of the tallest bar and is
# computed once rather than on every iteration.
text_offset = counts["count"].max() * 0.01
for i, v in enumerate(counts["count"]):
    plt.text(i, v + text_offset, f"{v}", ha="center")
plt.tight_layout()
plt.show()


In [None]:
# Cell 5: Count how many times each feature was flagged
# 'contributing_features' is split on commas and exploded to one token per row.
# Rows without a value are dropped first (astype(str) keeps the .str accessor
# usable even when nothing is left), and tokens are stripped so "a, b" and "a,b"
# are counted identically.
all_contrib = (
    df["contributing_features"]
    .dropna()
    .astype(str)
    .str.split(",")
    .explode()
    .str.strip()
    .reset_index(drop=True)
)
feature_counts = all_contrib.value_counts().rename_axis("feature").reset_index(name="count")
# Remove empty strings if present
feature_counts = feature_counts[feature_counts["feature"] != ""].reset_index(drop=True)

display(feature_counts)

# Bar chart
plt.figure(figsize=(6, 4))
plt.bar(feature_counts["feature"], feature_counts["count"], color="mediumseagreen")
plt.title("Count of Anomaly Occurrences by Contributing Feature")
plt.ylabel("Number of Rows Flagged")
plt.xticks(rotation=45)
# Write each count just above its bar; the gap is 1% of the tallest bar and is
# computed once rather than on every iteration.
text_offset = feature_counts["count"].max() * 0.01
for idx, val in enumerate(feature_counts["count"]):
    plt.text(idx, val + text_offset, f"{val}", ha="center")
plt.tight_layout()
plt.show()


In [None]:
# Cell 6: Express each feature's flag count as a percentage of all rows in df
total_rows = len(df)
share_of_rows = feature_counts["count"].div(total_rows).mul(100)
feature_counts["percent_of_rows"] = share_of_rows.round(2)
display(feature_counts)


In [None]:
# Cell 7: Function to plot a feature over time and highlight anomalies
def plot_time_series(feature, ylabel):
    """Plot one column of ``df`` over time and mark its anomalous rows in red.

    Reads the notebook-level DataFrame ``df``. A row is highlighted when its
    ``label_overall`` equals ``"anomaly"`` and ``feature`` occurs as a
    substring of its ``contributing_features`` value.

    Args:
        feature: Name of the ``df`` column to plot.
        ylabel: Text for the y-axis label.
    """
    plt.figure(figsize=(12, 4))
    plt.plot(df["timestamp"], df[feature], label=feature, linewidth=0.8)
    # regex=False matches the feature name literally instead of as a regular
    # expression; na=False turns rows with no contributing_features into plain
    # False, so the mask is strictly boolean.
    is_contributor = df["contributing_features"].str.contains(feature, regex=False, na=False)
    mask = (df["label_overall"] == "anomaly") & is_contributor
    plt.scatter(df.loc[mask, "timestamp"], df.loc[mask, feature],
                color="red", s=20, label="Anomaly", zorder=5)
    plt.xlabel("Time (US/Eastern)")
    plt.ylabel(ylabel)
    plt.title(f"Time Series: {feature} (Anomalies in Red)")
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()

# Draw one time-series chart per sensor column, pairing each column with its
# y-axis label.
series_to_plot = [
    ("temp_C", "Temperature (°C)"),
    ("humidity_%", "Humidity (%)"),
    ("accel_x", "Acceleration X (g)"),
    ("accel_y", "Acceleration Y (g)"),
    ("accel_z", "Acceleration Z (g)"),
]
for column, axis_label in series_to_plot:
    plot_time_series(column, axis_label)


In [None]:
# Cell 8: Pairwise correlation between the sensor columns, rounded to 2 decimals
corr_matrix = df[sensor_cols].corr().round(2)
tick_positions = range(len(sensor_cols))

# Heatmap with the color scale pinned to the full [-1, 1] correlation range
plt.figure(figsize=(5, 5))
plt.imshow(corr_matrix, cmap="coolwarm", vmin=-1, vmax=1)
plt.colorbar(pad=0.04, fraction=0.046)
plt.xticks(tick_positions, sensor_cols, rotation=45)
plt.yticks(tick_positions, sensor_cols)
plt.title("Correlation Matrix: Raw Sensor Columns")
plt.tight_layout()
plt.show()

# Numeric values behind the heatmap
display(corr_matrix)


In [None]:
# Cell 9: Histograms of z-scores for each feature
# The z-score columns are discovered from the DataFrame by their "z_" prefix
# instead of being built from a separately defined feature list.
# NOTE(review): assumes every column starting with "z_" is a z-score column;
# confirm against the pipeline's output schema.
z_cols = [col for col in df.columns if str(col).startswith("z_")]

if not z_cols:
    print("No z-score columns (prefix 'z_') found in the DataFrame.")
else:
    # Three histograms per row; the number of rows grows with the column count.
    grid_cols = 3
    grid_rows = -(-len(z_cols) // grid_cols)  # ceiling division
    plt.figure(figsize=(10, 3 * grid_rows))
    for idx, zcol in enumerate(z_cols):
        plt.subplot(grid_rows, grid_cols, idx + 1)
        df[zcol].hist(bins=30, edgecolor="black")
        # Dashed guides at +3 and -3; only one carries the legend label.
        plt.axvline(3, color="red", linestyle="--", label="±3σ")
        plt.axvline(-3, color="red", linestyle="--")
        plt.title(zcol)
        plt.xlabel("z-score")
        plt.ylabel("Count")
        plt.legend()

    plt.tight_layout()
    plt.show()


In [None]:
# Cell 10: Extract hour from timestamp
df["hour"] = df["timestamp"].dt.hour

# Percentage of rows labeled "anomaly" within each hour.
# The mean of the boolean flag is the anomaly fraction, so a vectorized group
# mean replaces the per-group Python lambda. It also avoids the grouping-column
# deprecation warning that newer pandas emits for groupby.apply, and it still
# yields the two expected columns when df is empty.
is_anomaly = df["label_overall"] == "anomaly"
hourly = (
    is_anomaly.groupby(df["hour"])
    .mean()
    .mul(100)
    .rename("percent_anomaly")
    .reset_index()
)

display(hourly)

# Plot hourly anomaly rate
plt.figure(figsize=(8, 4))
plt.plot(hourly["hour"], hourly["percent_anomaly"], marker="o", linestyle="-", color="purple")
plt.xlabel("Hour of Day (US/Eastern)")
plt.ylabel("% of Rows with Anomaly")
plt.title("Hourly Anomaly Rate (Overall)")
plt.xticks(range(0, 24))
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Cell 11: Temperature against humidity, one point per row, colored by label_overall
label_palette = {"normal": "blue", "anomaly": "red"}
colors = df["label_overall"].map(label_palette)

plt.figure(figsize=(6, 6))
plt.scatter(df["temp_C"], df["humidity_%"], s=20, alpha=0.6, c=colors)
plt.title("Temperature vs. Humidity (Red = Anomaly)")
plt.xlabel("Temperature (°C)")
plt.ylabel("Humidity (%)")
plt.tight_layout()
plt.show()


In [None]:
# Cell 12: Write the summary tables as CSV files under data/results/
export_dir = Path().resolve().parent / "data" / "results"
export_dir.mkdir(parents=True, exist_ok=True)

feature_counts_csv = export_dir / "feature_anomaly_counts.csv"
hourly_csv = export_dir / "hourly_anomaly_percent.csv"

# Per-feature counts
feature_counts.to_csv(feature_counts_csv, index=False)

# Hourly anomaly percentages
hourly.to_csv(hourly_csv, index=False)

# Report where each file was written
print("Exported feature and hourly anomaly summaries to:")
for written_path in (feature_counts_csv, hourly_csv):
    print(f" • {written_path}")
