In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Step 1: Load and preprocess the dataset

def load_and_preprocess(filepath):
    """Read the sensor CSV, fill gaps, and standardize the sensor columns.

    Args:
        filepath: Location of the CSV, passed straight to ``pd.read_csv``.
            The file must contain the columns 'SensorA', 'SensorB' and
            'SensorC' (the original author's note lists 'time' as well).

    Returns:
        A ``(df, scaler)`` tuple: the DataFrame with the three sensor
        columns interpolated and standardized, and the fitted
        ``StandardScaler`` that produced the standardized values.

    Raises:
        AssertionError: If a sensor column still has missing values after
            interpolation, or if the frame contains duplicated rows.
    """
    sensor_cols = ['SensorA', 'SensorB', 'SensorC']
    frame = pd.read_csv(filepath)

    # Only the sensor columns are gap-filled; limit_direction='both' lets
    # the fill extend to leading and trailing gaps as well as interior ones.
    filled = frame[sensor_cols].interpolate(method='linear', limit_direction='both')
    frame[sensor_cols] = filled

    unfilled = frame[sensor_cols].isnull().sum().sum()
    assert unfilled == 0, "Missing values remain after interpolation"

    # Fit the scaler on the gap-filled sensors and overwrite them in place.
    scaler = StandardScaler()
    standardized = scaler.fit_transform(frame[sensor_cols])
    frame[sensor_cols] = standardized

    # Integrity check: whole-row duplicates are not expected in the series.
    duplicate_rows = frame.duplicated().sum()
    assert duplicate_rows == 0, "Duplicate rows found in dataset"

    return frame, scaler

# Step 2: Create rolling window sequences

def create_rolling_windows(df, window_size=30, sensor_cols=None):
    """Build every overlapping, fixed-length window over the sensor columns.

    Window ``i`` holds rows ``i .. i + window_size - 1`` of the selected
    columns, so consecutive windows are shifted by one row.

    Args:
        df: DataFrame containing the columns named in ``sensor_cols``.
        window_size: Number of consecutive rows per window; must be >= 1.
        sensor_cols: Column names to window over. Defaults to
            ``['SensorA', 'SensorB', 'SensorC']``.

    Returns:
        A new array of shape
        ``(max(len(df) - window_size + 1, 0), window_size, len(sensor_cols))``.
        When ``df`` has fewer rows than ``window_size`` the result is an
        empty array that still has three dimensions.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(
            f"window_size must be a positive integer, got {window_size!r}"
        )
    if sensor_cols is None:
        sensor_cols = ['SensorA', 'SensorB', 'SensorC']

    data = df[list(sensor_cols)].values
    # Clamp at zero so a too-short frame yields an empty (0, window, features)
    # array instead of a shapeless one.
    n_windows = max(len(data) - window_size + 1, 0)

    # Grid of row indices: row i is [i, i + 1, ..., i + window_size - 1].
    starts = np.arange(n_windows)[:, np.newaxis]
    offsets = np.arange(window_size)[np.newaxis, :]

    # Integer-array indexing gathers all windows at once and returns a copy.
    return data[starts + offsets]

# Step 3: Compute Remaining Useful Life (RUL)
# The dataset has no engine_id or cycle column, so treat it as one continuous time series whose RUL counts down to zero at the final row

def compute_RUL(df):
    """Attach a Remaining Useful Life countdown column to ``df``.

    The frame is modified in place: a 'RUL' column is added (or overwritten)
    holding ``len(df) - 1`` on the first row and decreasing by one per row
    down to 0 on the last row.

    Args:
        df: DataFrame to annotate.

    Returns:
        The same DataFrame object that was passed in.
    """
    n_rows = len(df)
    # Ascending 0..n_rows-1, reversed, gives the countdown n_rows-1..0.
    countdown = np.arange(n_rows)[::-1]
    df['RUL'] = countdown
    return df

# Main execution

# Script entry point: runs the full pipeline (load -> RUL labels -> windows)
# and prints a short summary of the results.
if __name__ == "__main__":
    filepath = 'sensor_data.csv'  # Relative path to the input CSV; adjust if necessary

    # Load the CSV, interpolate and standardize the sensor columns.
    # The fitted scaler is returned as well but is not used further below.
    df, scaler = load_and_preprocess(filepath)

    # Add the 'RUL' countdown column (0 on the last row) to the frame
    df = compute_RUL(df)

    # Generate rolling window sequences over the three sensor columns only;
    # the 'RUL' column added above is not part of the windows.
    window_size = 30
    sequences = create_rolling_windows(df, window_size)

    # Final sanity check across every column of the processed frame
    # (not just the sensor columns).
    assert df.isnull().sum().sum() == 0, "Missing data present after processing"

    # Display output info: frame shape, window-array shape, and the first
    # five RUL labels.
    print("Processed data shape:", df.shape)
    print("Sequences shape:", sequences.shape)
    print("Sample RUL values:", df['RUL'].head())


Processed data shape: (1440, 5)
Sequences shape: (1411, 30, 3)
Sample RUL values: 0    1439
1    1438
2    1437
3    1436
4    1435
Name: RUL, dtype: int64
