In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.utils import class_weight
from tensorflow.keras.regularizers import l2

# Load datasets
errors_df = pd.read_csv('PdM_errors.csv')
failures_df = pd.read_csv('PdM_failures.csv')
machines_df = pd.read_csv('PdM_machines.csv')
maint_df = pd.read_csv('PdM_maint.csv')
telemetry_df = pd.read_csv('PdM_telemetry.csv')

# Convert datetime columns to datetime type
errors_df['datetime'] = pd.to_datetime(errors_df['datetime'])
failures_df['datetime'] = pd.to_datetime(failures_df['datetime'])
maint_df['datetime'] = pd.to_datetime(maint_df['datetime'])
telemetry_df['datetime'] = pd.to_datetime(telemetry_df['datetime'])

# Add target variable 'failure' to indicate any failure in next month
failures_df['failure'] = 1

# Merge all relevant data on machineID and datetime
df = telemetry_df.copy()
df = df.merge(machines_df, on='machineID', how='left')
df = df.merge(errors_df, on=['datetime', 'machineID'], how='left')
df = df.merge(maint_df, on=['datetime', 'machineID'], how='left')
df = df.merge(failures_df[['datetime', 'machineID', 'failure']], on=['datetime', 'machineID'], how='left')
df['failure'] = df['failure'].fillna(0)  # Set failures to 0 where there's no failure

# Fill missing categorical values and encode categorical columns
df['errorID'] = df['errorID'].fillna('no_error')
df['comp'] = df['comp'].fillna('no_maintenance')
label_encoder = LabelEncoder()
df['errorID'] = label_encoder.fit_transform(df['errorID'])
df['comp'] = label_encoder.fit_transform(df['comp'])
df['model'] = label_encoder.fit_transform(df['model'])

# Advanced Feature Engineering: Adding Rolling Statistics and Lag Features
for window in [3, 6, 12]:  # Advanced windows in months
    for feature in ['volt', 'rotate', 'pressure', 'vibration']:
        df[f'{feature}_rolling_mean_{window}'] = df.groupby('machineID')[feature].transform(lambda x: x.rolling(window, min_periods=1).mean())
        df[f'{feature}_rolling_std_{window}'] = df.groupby('machineID')[feature].transform(lambda x: x.rolling(window, min_periods=1).std())

# Track failure frequency and maintenance trends over last 6 months
df['failure_last_6m'] = df.groupby('machineID')['failure'].transform(lambda x: x.rolling(6, min_periods=1).sum())
df['maint_freq_last_6m'] = df.groupby('machineID')['comp'].transform(lambda x: x.rolling(6, min_periods=1).count())

# Monthly Aggregation
df['month'] = df['datetime'].dt.to_period('M')
agg_funcs = {
    'volt': ['mean', 'std'],
    'rotate': ['mean', 'std'],
    'pressure': ['mean', 'std'],
    'vibration': ['mean', 'std'],
    'age': 'max',
    'errorID': 'nunique',
    'comp': 'nunique',
    'failure_last_6m': 'max',
    'maint_freq_last_6m': 'max'
}
features = df.groupby(['machineID', 'month']).agg(agg_funcs)
features.columns = ['_'.join(col) for col in features.columns]
features = features.reset_index()

# Target variable creation
df['next_month'] = df['month'] + 1
failures_next_month = df[df['failure'] == 1][['machineID', 'next_month']].drop_duplicates()
failures_next_month['failure_in_next_month'] = 1
features = features.merge(failures_next_month, left_on=['machineID', 'month'], right_on=['machineID', 'next_month'], how='left')
features['failure_in_next_month'] = features['failure_in_next_month'].fillna(0)
features.drop('next_month', axis=1, inplace=True)

# Data Scaling
scaler = StandardScaler()
X = features.drop(['failure_in_next_month', 'machineID', 'month'], axis=1)
X_scaled = scaler.fit_transform(X)
y = features['failure_in_next_month'].values

# Convert data to sequential format
def create_sequences(X, y, time_steps=3):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y[i + time_steps])
    return np.array(Xs), np.array(ys)

X_seq, y_seq = create_sequences(X_scaled, y)

# Train-test-validation split for time series
train_size = int(len(X_seq) * 0.7)
val_size = int(len(X_seq) * 0.2)
X_train = X_seq[:train_size]
y_train = y_seq[:train_size]
X_val = X_seq[train_size:train_size + val_size]
y_val = y_seq[train_size:train_size + val_size]
X_test = X_seq[train_size + val_size:]
y_test = y_seq[train_size + val_size:]

# Adjusted LSTM Model with Higher Regularization and Dropout
model = Sequential()
model.add(LSTM(12, input_shape=(X_train.shape[1], X_train.shape[2]), return_sequences=False,
               kernel_regularizer=l2(0.01)))  # Reduce LSTM units and increase L2 regularization
model.add(BatchNormalization())
model.add(Dropout(0.5))  # Higher dropout rate for stronger regularization

# Adding a dense layer with L2 regularization
model.add(Dense(1, activation='sigmoid', kernel_regularizer=l2(0.01)))

# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Early stopping and learning rate reduction callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6)

# Train the model
history = model.fit(
    X_train, y_train,
    epochs=20,  # Adjust as needed based on your model's performance
    batch_size=64,
    validation_data=(X_val, y_val),
    class_weight=class_weights_dict,  # Ensure class weights are applied
    callbacks=[early_stopping, reduce_lr],
    verbose=1
)

# Evaluate model
y_pred = (model.predict(X_test) > 0.5).astype("int32")
print("Adjusted LSTM Model Classification Report:\n", classification_report(y_test, y_pred))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Adjusted LSTM Model Classification Report:
               precision    recall  f1-score   support

         0.0       0.89      0.98      0.94        60
         1.0       0.98      0.90      0.94        71

    accuracy                           0.94       131
   macro avg       0.94      0.94      0.94       131
weighted avg       0.94      0.94      0.94       131



In [21]:
# Evaluate the model's predictions on the test data
y_pred = (model.predict(X_test) > 0.5).astype("int32")  # Convert probabilities to binary predictions

# Print the classification report for the test data
print("Classification Report for Test Data:\n")
print(classification_report(y_test, y_pred))


Classification Report for Test Data:

              precision    recall  f1-score   support

         0.0       0.89      0.98      0.94        60
         1.0       0.98      0.90      0.94        71

    accuracy                           0.94       131
   macro avg       0.94      0.94      0.94       131
weighted avg       0.94      0.94      0.94       131

