# Feature Engineering Debug Notebook

This notebook will help identify and fix issues in the feature engineering process of the bearing anomaly detection pipeline. We'll step through each operation to validate the data at each stage.

In [1]:
# 1. Set Up Required Libraries
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import logging

# Logging setup: INFO and above, each record rendered as
# "<timestamp> - <level> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

## 2. Load Sample Data

We'll load a small subset of the bearing data to test the feature engineering process. This will make it easier to debug any issues.

In [2]:
# Load and preprocess a small subset of data
path = "../data/1st_test/"
feature_columns = ['mean', 'std', 'rms', 'kurt', 'skew', 'ptp']


def compute_row_stats(data):
    """Compute summary statistics along each row of a 2-D array.

    Parameters
    ----------
    data : array-like
        Numeric values; promoted to at least two dimensions, so a 1-D
        input is treated as a single row.

    Returns
    -------
    numpy.ndarray of shape (n_rows, 6)
        One row per input row, with columns in this order: mean,
        standard deviation (numpy default, ddof=0), root-mean-square,
        kurtosis and skewness (pandas ``kurt`` / ``skew``), and
        peak-to-peak range.
    """
    values = np.atleast_2d(np.asarray(data, dtype=float))
    frame = pd.DataFrame(values)
    # Whole-array reductions along axis=1 replace a Python loop that built
    # two pandas Series per row; the statistics themselves are unchanged.
    return np.column_stack([
        values.mean(axis=1),
        values.std(axis=1),
        np.sqrt(np.mean(values ** 2, axis=1)),
        frame.kurt(axis=1).to_numpy(),
        frame.skew(axis=1).to_numpy(),
        np.ptp(values, axis=1),
    ])


# Keep regular files only, so a sub-directory that happens to match the
# pattern cannot take one of the five slots and fail in np.loadtxt.
candidates = sorted(glob.glob(os.path.join(path, "*[!.]*")))
files = [p for p in candidates if os.path.isfile(p)][:5]  # Load first 5 files for testing
logger.info(f"Loading {len(files)} files for testing")

# NOTE(review): the recorded output shows each file loading as (20480, 8) and
# the statistics below are taken across the 8 values of every row. If rows are
# time samples and columns are sensor channels, per-column (or per-file)
# statistics may be what is actually wanted -- confirm before relying on these.
stats = []
for f in files:
    try:
        # Tab-delimited text; ndmin=2 keeps a single-line file as one row
        # instead of a 1-D array that would be iterated value by value.
        data = np.loadtxt(f, delimiter='\t', ndmin=2)
        logger.info(f"Loaded file {f} with shape {data.shape}")

        # Calculate statistics per row; a file contributes rows only if the
        # whole computation succeeded.
        stats.extend(compute_row_stats(data).tolist())

    except Exception as e:
        # Best effort: report the failure and carry on with the other files.
        logger.error(f"Error loading file {f}: {str(e)}")

if not stats:
    logger.warning(f"No rows were loaded from {path}; the DataFrame below will be empty")

# Create DataFrame
df = pd.DataFrame(stats, columns=feature_columns)
df['time'] = np.arange(len(df))  # running row counter across all loaded files

print("\nDataFrame Summary:")
print(f"Shape: {df.shape}")
print("\nFirst few rows:")
print(df.head())
print("\nData Types:")
print(df.dtypes)

2025-11-06 17:21:25,926 - INFO - Loading 5 files for testing
2025-11-06 17:21:25,969 - INFO - Loaded file ../data/1st_test\2003.10.22.12.06.24 with shape (20480, 8)
2025-11-06 17:21:25,969 - INFO - Loaded file ../data/1st_test\2003.10.22.12.06.24 with shape (20480, 8)
2025-11-06 17:21:34,462 - INFO - Loaded file ../data/1st_test\2003.10.22.12.09.13 with shape (20480, 8)
2025-11-06 17:21:34,462 - INFO - Loaded file ../data/1st_test\2003.10.22.12.09.13 with shape (20480, 8)
2025-11-06 17:21:43,079 - INFO - Loaded file ../data/1st_test\2003.10.22.12.14.13 with shape (20480, 8)
2025-11-06 17:21:43,079 - INFO - Loaded file ../data/1st_test\2003.10.22.12.14.13 with shape (20480, 8)
2025-11-06 17:21:51,738 - INFO - Loaded file ../data/1st_test\2003.10.22.12.19.13 with shape (20480, 8)
2025-11-06 17:21:51,738 - INFO - Loaded file ../data/1st_test\2003.10.22.12.19.13 with shape (20480, 8)
2025-11-06 17:22:00,482 - INFO - Loaded file ../data/1st_test\2003.10.22.12.24.13 with shape (20480, 8)
202


DataFrame Summary:
Shape: (102400, 7)

First few rows:
       mean       std       rms      kurt      skew    ptp  time
0 -0.101000  0.053137  0.114125 -1.329223  0.163090  0.161     0
1 -0.090750  0.068159  0.113496 -0.754834  0.475306  0.212     1
2 -0.106875  0.070464  0.128013 -1.494880  0.314280  0.190     2
3 -0.135125  0.035974  0.139832 -1.741211  0.206636  0.100     3
4 -0.147625  0.067487  0.162320 -1.214865 -0.298120  0.198     4

Data Types:
mean    float64
std     float64
rms     float64
kurt    float64
skew    float64
ptp     float64
time      int64
dtype: object


## 3. Debug Feature Engineering

Let's step through the rolling statistics calculation carefully and validate the data at each step.

In [3]:
# Debug feature engineering process
logger.info("Starting feature engineering process")

# Work on a copy so the frame produced by the loading cell stays untouched
df_debug = df.copy()

# Baseline summary of the column every rolling feature is derived from
print("Initial RMS statistics:")
print(df_debug['rms'].describe())

# Rolling mean / std of RMS for several window lengths
windows = [3, 5, 10]
for w in windows:
    logger.info(f"\nProcessing window size {w}")
    mean_col = f'roll_mean_{w}'
    std_col = f'roll_std_{w}'

    # min_periods=1 lets the leading, partially filled windows yield a value
    rms_windows = df_debug['rms'].rolling(window=w, min_periods=1)

    # Rolling mean
    roll_mean = rms_windows.mean()
    logger.info(f"Rolling mean (window={w}) - First 5 values: {roll_mean.head().tolist()}")
    df_debug[mean_col] = roll_mean

    # Rolling std; any NaN it produces is replaced with 0
    roll_std = rms_windows.std().fillna(0)
    logger.info(f"Rolling std (window={w}) - First 5 values: {roll_std.head().tolist()}")
    df_debug[std_col] = roll_std

    # Report how many NaN values each new column contains
    print(f"\nValidation for window size {w}:")
    print(f"Number of NaN values in {mean_col}: {df_debug[mean_col].isna().sum()}")
    print(f"Number of NaN values in {std_col}: {df_debug[std_col].isna().sum()}")

# Fill whatever gaps are left: backward fill first, then forward fill
df_debug = df_debug.bfill().ffill()

# Per-column NaN totals after filling
print("\nFinal NaN check:")
print(df_debug.isna().sum())

2025-11-06 17:22:19,222 - INFO - Starting feature engineering process
2025-11-06 17:22:19,239 - INFO - 
Processing window size 3
2025-11-06 17:22:19,246 - INFO - Rolling mean (window=3) - First 5 values: [0.11412493154433873, 0.1138102630862502, 0.11854456969579537, 0.12711345756173467, 0.14338809773718905]
2025-11-06 17:22:19,254 - INFO - Rolling std (window=3) - First 5 values: [0.0, 0.00044500840107982825, 0.008206094896969295, 0.013191033320085763, 0.017427496692453547]
2025-11-06 17:22:19,239 - INFO - 
Processing window size 3
2025-11-06 17:22:19,246 - INFO - Rolling mean (window=3) - First 5 values: [0.11412493154433873, 0.1138102630862502, 0.11854456969579537, 0.12711345756173467, 0.14338809773718905]
2025-11-06 17:22:19,254 - INFO - Rolling std (window=3) - First 5 values: [0.0, 0.00044500840107982825, 0.008206094896969295, 0.013191033320085763, 0.017427496692453547]
2025-11-06 17:22:19,257 - INFO - 
Processing window size 5
2025-11-06 17:22:19,263 - INFO - Rolling mean (window

Initial RMS statistics:
count    102400.000000
mean          0.120473
std           0.025734
min           0.033701
25%           0.103064
50%           0.118501
75%           0.135381
max           0.365391
Name: rms, dtype: float64

Validation for window size 3:
Number of NaN values in roll_mean_3: 0
Number of NaN values in roll_std_3: 0

Validation for window size 5:
Number of NaN values in roll_mean_5: 0
Number of NaN values in roll_std_5: 0

Validation for window size 10:
Number of NaN values in roll_mean_10: 0
Number of NaN values in roll_std_10: 0

Final NaN check:
mean            0
std             0
rms             0
kurt            0
skew            0
ptp             0
time            0
roll_mean_3     0
roll_std_3      0
roll_mean_5     0
roll_std_5      0
roll_mean_10    0
roll_std_10     0
dtype: int64


## 4. Data Validation

Check for any anomalies in the processed data, including NaN values, infinite values, and data type consistency.

In [4]:
# Comprehensive data validation
# Emit a log record marking the start of the validation cell.
logger.info("Starting data validation")

def validate_dataframe(df, name="DataFrame"):
    """Print a diagnostic report for ``df``.

    The report lists, in order: per-column NaN counts, per-column counts of
    +/- infinity, the column dtypes, and ``describe()`` for every numeric
    column. Nothing is returned; all results go to standard output.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str, optional
        Label printed in the report header.
    """

    def report_counts(counts, fallback):
        # Show only the columns with a non-zero count, else the fallback text.
        flagged = counts > 0
        if flagged.any():
            print(counts[flagged])
        else:
            print(fallback)

    print(f"\nValidating {name}")

    # Missing values per column
    nan_counts = df.isna().sum()
    print("\nNaN value counts:")
    report_counts(nan_counts, "No NaN values found")

    # Positive / negative infinity per column
    inf_counts = df.isin([np.inf, -np.inf]).sum()
    print("\nInfinite value counts:")
    report_counts(inf_counts, "No infinite values found")

    # Column dtypes
    print("\nData types:")
    print(df.dtypes)

    # Summary statistics expose very large or very small values
    print("\nValue ranges:")
    numeric_columns = df.select_dtypes(include=np.number).columns
    for col in numeric_columns:
        print(f"\n{col}:")
        print(df[col].describe())

# Report on the six per-row statistics produced by the loading cell
print("Original Features Validation:")
original_features = df[['mean', 'std', 'rms', 'kurt', 'skew', 'ptp']]
validate_dataframe(original_features, "Original Features")

# Report on every derived column whose name starts with 'roll_'
print("\nRolling Statistics Validation:")
roll_cols = [label for label in df_debug.columns if label.startswith('roll_')]
rolling_features = df_debug[roll_cols]
validate_dataframe(rolling_features, "Rolling Statistics")

2025-11-06 17:22:37,045 - INFO - Starting data validation


Original Features Validation:

Validating Original Features

NaN value counts:
No NaN values found

Infinite value counts:
No infinite values found

Data types:
mean    float64
std     float64
rms     float64
kurt    float64
skew    float64
ptp     float64
dtype: object

Value ranges:

mean:
count    102400.000000
mean         -0.093867
std           0.025695
min          -0.249000
25%          -0.110250
50%          -0.093750
75%          -0.077250
max           0.117750
Name: mean, dtype: float64

std:
count    102400.000000
mean          0.071454
std           0.024476
min           0.009493
25%           0.054674
50%           0.068296
75%           0.084377
max           0.280767
Name: std, dtype: float64

rms:
count    102400.000000
mean          0.120473
std           0.025734
min           0.033701
25%           0.103064
50%           0.118501
75%           0.135381
max           0.365391
Name: rms, dtype: float64

kurt:
count    102400.000000
mean          0.070300
std        

## 5. Feature Scaling Validation

Test the StandardScaler implementation and verify the scaled features are within expected ranges.

In [None]:
# Test feature scaling
logger.info("Starting feature scaling validation")

# Settings used by the checks below
outlier_sigma = 5        # flag scaled values farther than this from zero
roundtrip_tol = 1e-10    # largest acceptable inverse-transform error
scaler_path = "../models/scaler_debug.joblib"

# Prepare features for scaling ('time' is a row counter, not a feature)
X = df_debug.drop(columns=['time'])

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Carry X's index over so scaled rows stay aligned with the source rows
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Validate scaling results
print("Scaling Validation:")
print("\nScaled features summary:")
print(X_scaled_df.describe())

# Check if scaled features have mean ≈ 0 and std ≈ 1
print("\nVerifying scaling properties:")
for column in X_scaled_df.columns:
    mean = X_scaled_df[column].mean()
    # StandardScaler divides by the population std, so compare with ddof=0;
    # pandas' default (ddof=1) is slightly larger than 1 by construction.
    std = X_scaled_df[column].std(ddof=0)
    print(f"\n{column}:")
    print(f"Mean: {mean:.6f} (should be close to 0)")
    print(f"Std:  {std:.6f} (should be close to 1)")

    # Check for any values outside expected range
    outside_range = int((X_scaled_df[column].abs() > outlier_sigma).sum())
    if outside_range > 0:
        print(f"Warning: {outside_range} values more than {outlier_sigma} standard deviations from mean")

# Save scaler for consistency (best effort: a failure is logged, not raised)
try:
    import joblib
    os.makedirs(os.path.dirname(scaler_path), exist_ok=True)
    joblib.dump(scaler, scaler_path)
    # Log the path that was actually written
    logger.info(f"Saved scaler to {scaler_path}")
except Exception as e:
    logger.error(f"Error saving scaler: {str(e)}")

# Verify inverse transform works correctly
X_inverse = scaler.inverse_transform(X_scaled)
max_diff = np.max(np.abs(X_inverse - X.values))
print(f"\nMaximum difference after inverse transform: {max_diff:.10f}")
if max_diff > roundtrip_tol:
    logger.warning("Large difference found in inverse transform")