# Predictive Maintenance — Predict failures using sensor time-series

**Goal:** Predict imminent machinery failure (classification) using vibration, temperature, and pressure sensor data. This notebook includes data generation, preprocessing, feature engineering, RandomForest/XGBoost baseline, and an LSTM sequence model.

**Files:** `sensor_data.csv` (synthetic), `requirements.txt`.


In [None]:
# Standard imports
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Plot/display defaults for the rest of the notebook:
# seaborn's 'whitegrid' style for figures, and up to 50 columns shown when a DataFrame is printed.
sns.set(style='whitegrid')
pd.options.display.max_columns = 50

In [None]:
# Read the synthetic sensor readings from the CSV shipped with the notebook
# and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='sensor_data.csv')
df.head(n=5)

In [None]:
# Quick EDA: dataset size, overall failure rate, and a smoothed failure rate over time.
# (Assumes `failure` is a 0/1 flag, so its mean is the fraction of failing rows.)
print('Shape:', df.shape)
print('Failure rate:', df['failure'].mean())

# Mean failure flag per timestep across all rows, smoothed with a 5-step rolling mean.
# The first four smoothed values are NaN because a full 5-step window is required.
failure_rate_by_time = df.groupby('time')['failure'].mean()
smoothed_failure_rate = failure_rate_by_time.rolling(window=5).mean()
display(smoothed_failure_rate.head())

In [None]:
# Feature engineering for the tabular baseline.
# For every sensor we add two columns per row: the rolling mean and rolling std over the
# machine's last (up to) 10 readings, including the current one. The target stays the
# `failure` flag of the current row.
df_sorted = df.sort_values(['machine_id', 'time']).copy()

for sensor in ('vibration', 'temperature', 'pressure'):
    # Rolling window is computed within each machine so readings never mix across machines.
    windowed = df_sorted.groupby('machine_id')[sensor].rolling(window=10, min_periods=1)
    # Dropping the machine_id index level restores the original row index for alignment.
    rolling_mean = windowed.mean().reset_index(level=0, drop=True)
    # std of a single observation is NaN (first row of each machine) -> treat as 0 spread.
    rolling_std = windowed.std().reset_index(level=0, drop=True).fillna(0)
    df_sorted[f'{sensor}_rmean'] = rolling_mean
    df_sorted[f'{sensor}_rstd'] = rolling_std

# Model inputs are exactly the engineered rolling-statistic columns, in column order.
features = [name for name in df_sorted.columns if name.endswith(('_rmean', '_rstd'))]
X = df_sorted[features]
y = df_sorted['failure']
print('Features used:', features)

In [None]:
# Train/test split by machine: every row of a given machine lands on one side only,
# so the rolling-window features of a test machine are never seen during training.
machines = df_sorted['machine_id'].unique()
train_machines, test_machines = train_test_split(
    machines,
    test_size=0.3,
    random_state=42,
)
train_mask = df_sorted['machine_id'].isin(train_machines)

X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[~train_mask], y.loc[~train_mask]

# Standardisation statistics come from the training rows only and are reused for the test rows.
scaler = StandardScaler().fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
print(X_train_s.shape, X_test_s.shape)

In [None]:
# Baseline: RandomForest on the scaled rolling-statistic features.
rf = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,        # use all available cores
    random_state=42,  # reproducible trees
)
rf.fit(X_train_s, y_train)

# Hard predictions for the per-class precision/recall report.
pred = rf.predict(X_test_s)
print(classification_report(y_test, pred))

# Column 1 of predict_proba is the second entry of rf.classes_
# (the positive class, assuming `failure` is coded 0/1).
failure_proba = rf.predict_proba(X_test_s)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, failure_proba))

In [None]:
# Feature importance: pair each feature name with the forest's importance score
# and keep the ten highest-scoring pairs (highest first).
importances = rf.feature_importances_
ranked_pairs = sorted(
    zip(features, importances),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)
feat_imp = ranked_pairs[:10]
print('Top features:', feat_imp)

In [None]:
# LSTM approach: convert per-machine sequences into samples
# We'll create sequences of length 30 timesteps with corresponding label = whether failure occurs in that window.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Masking
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
SEQ_LEN = 30
seqs = []
labels = []
grouped = df.sort_values(['machine_id','time']).groupby('machine_id')
for name, g in grouped:
    arr = g[['vibration','temperature','pressure']].values
    labs = g['failure'].values
    for i in range(len(arr)-SEQ_LEN):
        seqs.append(arr[i:i+SEQ_LEN])
        labels.append(1 if labs[i+SEQ_LEN-1]==1 else 0)
seqs = np.array(seqs)
labels = np.array(labels)
print('Sequences:', seqs.shape, 'Labels distribution:', labels.mean())

In [None]:
# train/test split by machines for LSTM
# We'll shuffle but keep proportion
from sklearn.model_selection import train_test_split
X_seq_train, X_seq_test, y_seq_train, y_seq_test = train_test_split(seqs, labels, test_size=0.25, random_state=42, stratify=labels)
# simple LSTM model
model = Sequential([Masking(mask_value=0., input_shape=(SEQ_LEN,3)),
                    LSTM(64, return_sequences=False),
                    Dense(1, activation='sigmoid')])
model.compile(optimizer=Adam(1e-3), loss='binary_crossentropy', metrics=['AUC'])
model.summary()

In [None]:
# Train for a handful of epochs (raise `epochs` for real runs).
# The held-out split doubles as validation data, so the per-epoch val metrics
# are computed on the same samples as the final evaluation below.
history = model.fit(
    X_seq_train,
    y_seq_train,
    validation_data=(X_seq_test, y_seq_test),
    epochs=6,
    batch_size=64,
)

# evaluate() returns the loss followed by the compiled metrics.
print('Eval:')
eval_results = model.evaluate(X_seq_test, y_seq_test)
print(eval_results)

## Save artifacts
You can save `scaler` and `rf` with `joblib.dump(...)` and the LSTM `model` with `model.save(...)` for production use.