In [2]:
%pip install statsmodels

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.metrics import silhouette_score, mean_squared_error, mean_absolute_error
from statsmodels.tsa.arima.model import ARIMA
from typing import List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

# Make sure the 'plots' and 'outputs' folders exist before anything is saved;
# exist_ok=True makes re-running this cell harmless.
import os
os.makedirs('plots', exist_ok=True)
os.makedirs('outputs', exist_ok=True)

# Global figure appearance: apply the matplotlib style sheet first,
# then set the seaborn colour palette on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the raw CSV. If the file is missing, print the hint and re-raise so the
# cell stops with a real exception: `exit()` is not a reliable way to halt a
# notebook cell (under ipykernel it requests a shutdown instead of raising),
# which would let the statements below run against a missing or stale `df`.
try:
    df = pd.read_csv('data.csv')
    print(f"✓ Loaded dataset: {df.shape[0]:,} rows x {df.shape[1]} columns")
except FileNotFoundError:
    print("ERROR: data.csv not found in current directory")
    print("Please ensure data.csv is in the same folder as this script")
    raise

# Parse the time column and order rows chronologically. A stable sort keeps
# rows that share a timestamp in their original file order, so a later
# "keep the first duplicate" step gives the same result on every run.
df['time'] = pd.to_datetime(df['time'])
df = df.sort_values('time', kind='mergesort').reset_index(drop=True)

# Every column except 'time' is treated as numeric; values that cannot be
# parsed become NaN (errors='coerce') rather than raising.
numeric_cols: List[str] = [col for col in df.columns if col != 'time']
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Report the covered period (min/max computed once and reused).
time_min, time_max = df['time'].min(), df['time'].max()
print(f"✓ Data spans: {time_min} to {time_max}")
print(f"✓ Duration: {(time_max - time_min).days} days")

✓ Loaded dataset: 378,719 rows x 7 columns
✓ Data spans: 2017-01-01 00:00:00 to 2020-08-07 12:15:00
✓ Duration: 1314 days


In [None]:
# --- Missing values ---------------------------------------------------------
# Count NaNs per column and print each numeric column's share of all rows.
missing_info = df.isna().sum()
total_rows = len(df)
print("\n* Missing values per column:")
for col in numeric_cols:
    pct = (missing_info[col] / total_rows) * 100
    print(f"  - {col}: {missing_info[col]:,} ({pct:.2f}%)")

# Fill holes with the previous observation; the backward fill afterwards
# covers leading NaNs that have no earlier value to copy from.
df[numeric_cols] = df[numeric_cols].ffill().bfill()
print("\n* Missing values handled via forward/backward fill")

# --- Duplicate timestamps ---------------------------------------------------
# Flag every repeated timestamp (the first occurrence is not flagged) and
# drop the flagged rows before 'time' becomes the index.
duplicate_mask = df['time'].duplicated()
duplicates = duplicate_mask.sum()
if duplicates > 0:
    print(f"\n! Found {duplicates} duplicate timestamps - removing duplicates")
    df = df[~duplicate_mask]
    print(f"* Removed duplicates, remaining records: {len(df):,}")

# From here on the frame is indexed by timestamp.
df = df.set_index('time')

# Build the complete 5-minute grid between the first and last timestamp.
# The 'min' alias is used because the older 'T' spelling is deprecated in
# recent pandas releases; 'min' is accepted by older versions as well.
time_range = pd.date_range(start=df.index.min(), end=df.index.max(), freq='5min')
print(f"\n* Expected records at 5-min frequency: {len(time_range):,}")
print(f"* Actual records: {len(df):,}")
gap_count = len(time_range) - len(df)
gap_pct = (gap_count / len(time_range)) * 100
print(f"* Gap: {gap_count:,} records ({gap_pct:.2f}%)")

# Reindexing keeps only rows whose timestamp lies exactly on the grid, so
# report off-grid rows instead of letting them disappear silently.
off_grid_count = len(df.index.difference(time_range))
if off_grid_count > 0:
    print(f"! {off_grid_count:,} records are not aligned to the 5-min grid and will be dropped")

# Align the frame to the grid; grid slots without a record become NaN rows.
df = df.reindex(time_range)

# Fill the NaNs introduced by reindexing: linear interpolation first, then
# bfill/ffill as a fallback for anything interpolation leaves unfilled.
gaps = df[numeric_cols].isnull().sum()
if gaps.sum() > 0:
    print(f"\n! Found {gaps.sum()} gaps after reindexing - filling with interpolation")
    df[numeric_cols] = df[numeric_cols].interpolate(method='linear')
    df[numeric_cols] = df[numeric_cols].bfill().ffill()
    print("* All gaps filled")

# Descriptive statistics for the numeric columns, rounded to two decimals.
print("\n* Summary Statistics:")
print(df[numeric_cols].describe().round(2))

# Pairwise correlations between the numeric columns (DataFrame.corr defaults).
print("\n* Correlation Matrix:")
corr_matrix = df[numeric_cols].corr()
print(corr_matrix.round(3))

# Draw the annotated heatmap on an explicit Axes, write it to disk, and close
# the figure afterwards so it does not stay open.
fig1, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0, ax=heatmap_ax)
heatmap_ax.set_title('Sensor Variables Correlation Matrix', fontsize=14, fontweight='bold')
fig1.tight_layout()
fig1.savefig('plots/01_correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.close(fig1)
print("  -> Saved: plots/01_correlation_matrix.png")