# New Data Labeled Analytics

This notebook loads `data/results/new_data_labeled.csv` (the output of our anomaly-detection pipeline) and runs basic analytics on the newly labeled data:  
1. Previewing the data  
2. Descriptive statistics of sensor readings  
3. Counting anomalies per feature  
4. Visualizing anomaly rates  
5. Time-based summaries (e.g., hourly anomaly counts)  
6. Correlation matrix of the sensor readings  
7. Scatter plot of temperature vs. humidity, colored by the temperature anomaly label

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Build the location of the labeled CSV from the current working directory.
# NOTE(review): the name `notebook_dir` assumes the kernel was started in the
# notebook's own folder — confirm when running from elsewhere.
notebook_dir = Path().resolve()
labeled_path = notebook_dir.joinpath("..", "data", "results", "new_data_labeled.csv")

# Stop immediately if the expected input file is missing
if not labeled_path.exists():
    raise FileNotFoundError(f"{labeled_path} not found.")

In [None]:
# Read the labeled CSV, parsing the "timestamp" column as datetimes
df = pd.read_csv(
    labeled_path,
    parse_dates=["timestamp"],
)

# Preview the leading rows, then print the per-column summary
display(df.head())
df.info()

In [None]:
# Sensor measurement columns included in the summary statistics
numeric_cols = [
    "temp_C",
    "humidity_%",
    "accel_x",
    "accel_y",
    "accel_z",
]

# Transposed so each sensor is a row and each statistic is a column
desc_stats = df.loc[:, numeric_cols].describe().transpose()
desc_stats

In [None]:
# Features whose "label_<feature>" column is tallied below
features = [
    "temp_C",
    "humidity_%",
    "accel_x",
    "accel_y",
    "accel_z",
]

# For every feature, count how many rows are labeled "normal" vs. "anomaly"
# (a label that never occurs is reported as 0)
anomaly_counts = {}
for name in features:
    tally = df[f"label_{name}"].value_counts()
    anomaly_counts[name] = {
        outcome: int(tally.get(outcome, 0)) for outcome in ("normal", "anomaly")
    }

# One row per feature, with columns "normal" and "anomaly"
counts_df = pd.DataFrame(anomaly_counts).transpose()

# Anomaly share, in percent, of the rows counted above (normal + anomaly)
rows_counted = counts_df.sum(axis=1)
counts_df["percent_anomaly"] = counts_df["anomaly"].div(rows_counted).mul(100).round(2)

counts_df

In [None]:
# Bar chart of the per-feature anomaly percentage computed in `counts_df`
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(
    counts_df.index,
    counts_df["percent_anomaly"],
    color="skyblue",
    edgecolor="black",
)
ax.set_ylabel("% of Rows Flagged Anomaly")
ax.set_title("Anomaly Rate by Feature")
# Tilt the feature names so they do not overlap
ax.tick_params(axis="x", labelrotation=45)
fig.tight_layout()
plt.show()


In [None]:
# Flag each row where at least one feature's label equals "anomaly".
# All label columns are compared in one vectorized step rather than with a
# row-wise `apply`, which is far slower on large frames and does not reliably
# yield a boolean Series when `df` has no rows.
label_cols = [f"label_{feat}" for feat in features]
df["any_anomaly"] = df[label_cols].eq("anomaly").any(axis=1)

# Row counts for the two outcomes: no anomaly at all vs. one or more
summary = {
    "all_normal": int((~df["any_anomaly"]).sum()),
    "at_least_one_anomaly": int(df["any_anomaly"].sum()),
}
pd.DataFrame.from_dict(summary, orient="index", columns=["count"])


In [None]:
# Make sure "timestamp" has a datetime dtype so the `.dt` accessor works
# (this changes nothing if the column is already datetime).
df["timestamp"] = pd.to_datetime(df["timestamp"])

# Hour of day (0-23) of every row
df["hour"] = df["timestamp"].dt.hour

# Count the rows with at least one anomaly in each hour of the day.
# Reindexing over all 24 hours keeps hours without anomalies as explicit
# zeros; otherwise they would be absent from the table and the line plot
# would draw straight across the gap.
hourly_anomaly_counts = (
    df.loc[df["any_anomaly"]]
    .groupby("hour")
    .size()
    .reindex(range(24), fill_value=0)
    .rename_axis("hour")
    .rename("anomaly_count")
    .reset_index()
)

plt.figure(figsize=(8, 4))
plt.plot(
    hourly_anomaly_counts["hour"],
    hourly_anomaly_counts["anomaly_count"],
    marker="o",
    linestyle="-",
)
plt.xlabel("Hour of Day")
plt.ylabel("Number of Anomalous Rows")
plt.title("Hourly Count of Rows with ≥1 Anomaly")
plt.xticks(range(0, 24))
plt.grid(alpha=0.3)
plt.tight_layout()
plt.show()

hourly_anomaly_counts


In [None]:
# Pairwise Pearson correlations between the sensor columns, rounded to 2 decimals
corr_matrix = df[numeric_cols].corr().round(2)

# Heatmap with the color scale fixed to the full correlation range [-1, 1]
fig, ax = plt.subplots(figsize=(5, 5))
heatmap = ax.imshow(corr_matrix, vmin=-1, vmax=1)
fig.colorbar(heatmap, ax=ax, fraction=0.046, pad=0.04)

# Label both axes with the sensor column names
tick_positions = range(len(numeric_cols))
ax.set_xticks(tick_positions)
ax.set_xticklabels(numeric_cols, rotation=45)
ax.set_yticks(tick_positions)
ax.set_yticklabels(numeric_cols)
ax.set_title("Correlation Matrix (new_data_labeled.csv)")
fig.tight_layout()
plt.show()

corr_matrix


In [None]:
# Example: scatter of temp_C vs. humidity_% colored by the temperature label.
# Labels other than "normal"/"anomaly" (including missing ones) map to NaN,
# which matplotlib cannot interpret as a color, so they fall back to gray
# instead of aborting the plot.
label_palette = {"normal": "blue", "anomaly": "red"}
colors = df["label_temp_C"].map(label_palette).fillna("gray")

plt.figure(figsize=(6, 6))
plt.scatter(df["temp_C"], df["humidity_%"], c=colors, alpha=0.6, s=20)
plt.xlabel("Temperature (°C)")
plt.ylabel("Humidity (%)")
plt.title("Temp vs Humidity (Red = Temp Anomaly)")
plt.tight_layout()
plt.show()
