In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# Load your datasets (adjust file paths as necessary)
telemetry = pd.read_csv('data\PdM_telemetry.csv')  # Columns: [datetime, machineID, volt, rotate, pressure]
errors = pd.read_csv('data\PdM_errors.csv')    # Columns: [datetime, machineID, errorID]
maintenance = pd.read_csv('data\PdM_machines.csv')  # Columns: [datetime, machineID, comp]
machines = pd.read_csv('data\PdM_machines.csv')    # Columns: [machineID, model, age]
failures = pd.read_csv('data\PdM_failures.csv')    # Columns: [datetime, machineID, failure]

# Convert datetime columns to pandas datetime
telemetry['datetime'] = pd.to_datetime(telemetry['datetime'])
errors['datetime'] = pd.to_datetime(errors['datetime'])
failures['datetime'] = pd.to_datetime(failures['datetime'])

# Feature engineering (example features like mean, std of telemetry)
telemetry['volt_mean'] = telemetry.groupby('machineID')['volt'].transform('mean')
telemetry['rotate_mean'] = telemetry.groupby('machineID')['rotate'].transform('mean')
telemetry['pressure_mean'] = telemetry.groupby('machineID')['pressure'].transform('mean')
telemetry['vibration_mean'] = telemetry.groupby('machineID')['vibration'].transform('mean')

telemetry['volt_std'] = telemetry.groupby('machineID')['volt'].transform('std')
telemetry['rotate_std'] = telemetry.groupby('machineID')['rotate'].transform('std')
telemetry['pressure_std'] = telemetry.groupby('machineID')['pressure'].transform('std')
telemetry['vibration_std'] = telemetry.groupby('machineID')['vibration'].transform('std')

# Merge telemetry with machines data (using machineID as the key)
data = pd.merge(telemetry, machines, on='machineID', how='left')

# Merge telemetry with failure data (label creation)
failures['failure'] = 1
data = pd.merge(data, failures[['datetime', 'machineID', 'failure']], on=['datetime', 'machineID'], how='left')
data['failure'] = data['failure'].fillna(0)

# Handle categorical columns (OneHotEncoding for model and errorID)
data = pd.get_dummies(data, columns=['model'], drop_first=True)
errors_encoded = pd.get_dummies(errors, columns=['errorID'], drop_first=True)

# Example of time since the last failure (feature engineering)
data['time_since_last_failure'] = data.groupby('machineID')['datetime'].diff().dt.total_seconds().fillna(0)

# Convert boolean columns to integers
bool_columns = data.select_dtypes(include='bool').columns
data[bool_columns] = data[bool_columns].astype(int)

# Prepare features (X) and label (y)
X = data.drop(columns=['datetime', 'failure', 'machineID'])
y = data['failure']

# Define numerical and categorical columns
numerical_cols = ['volt', 'rotate', 'pressure', 'vibration', 'volt_mean', 'rotate_mean', 'pressure_mean', 'vibration_mean', 'volt_std', 'rotate_std', 'pressure_std', 'vibration_std', 'age', 'time_since_last_failure']
categorical_cols = [col for col in X.columns if 'model' in col or 'errorID' in col]

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_cols),
        ('cat', SimpleImputer(strategy='most_frequent'), categorical_cols)
    ])

# Define pipelines for both models
gb_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', GradientBoostingClassifier(random_state=42))
])

rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Gradient Boosting Model ###
gb_pipeline.fit(X_train, y_train)

# Predict on test data
y_pred_gb = gb_pipeline.predict(X_test)
y_pred_prob_gb = gb_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the Gradient Boosting model
print("Gradient Boosting Classifier Report:")
print(classification_report(y_test, y_pred_gb))
print(f"Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_prob_gb):.4f}")

### Random Forest Model ###
rf_pipeline.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_pipeline.predict(X_test)
y_pred_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

# Evaluate the Random Forest model
print("\nRandom Forest Classifier Report:")
print(classification_report(y_test, y_pred_rf))
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(f"ROC AUC Score: {roc_auc_score(y_test, y_pred_prob_rf):.4f}")

# Compare Models
print("\nModel Comparison:")
print(f"Gradient Boosting - Accuracy: {accuracy_score(y_test, y_pred_gb):.4f}, ROC AUC: {roc_auc_score(y_test, y_pred_prob_gb):.4f}")
print(f"Random Forest - Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}, ROC AUC: {roc_auc_score(y_test, y_pred_prob_rf):.4f}")


  telemetry = pd.read_csv('data\PdM_telemetry.csv')  # Columns: [datetime, machineID, volt, rotate, pressure]
  errors = pd.read_csv('data\PdM_errors.csv')    # Columns: [datetime, machineID, errorID]
  maintenance = pd.read_csv('data\PdM_machines.csv')  # Columns: [datetime, machineID, comp]
  machines = pd.read_csv('data\PdM_machines.csv')    # Columns: [machineID, model, age]
  failures = pd.read_csv('data\PdM_failures.csv')    # Columns: [datetime, machineID, failure]


Gradient Boosting Classifier Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    175084
         1.0       0.19      0.03      0.06       145

    accuracy                           1.00    175229
   macro avg       0.60      0.52      0.53    175229
weighted avg       1.00      1.00      1.00    175229

Accuracy: 0.9991
ROC AUC Score: 0.7885

Random Forest Classifier Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    175084
         1.0       0.67      0.01      0.03       145

    accuracy                           1.00    175229
   macro avg       0.83      0.51      0.51    175229
weighted avg       1.00      1.00      1.00    175229

Accuracy: 0.9992
ROC AUC Score: 0.5928

Model Comparison:
Gradient Boosting - Accuracy: 0.9991, ROC AUC: 0.7885
Random Forest - Accuracy: 0.9992, ROC AUC: 0.5928


In [17]:
data.to_csv('data\processed_data.csv', index=False)


  data.to_csv('data\processed_data.csv', index=False)


In [4]:
telemetry = pd.read_csv('data\PdM_telemetry.csv')  # Columns: [datetime, machineID, volt, rotate, pressure]
errors = pd.read_csv('data\PdM_errors.csv')    # Columns: [datetime, machineID, errorID]
maintenance = pd.read_csv('data\PdM_machines.csv')  # Columns: [datetime, machineID, comp]
machines = pd.read_csv('data\PdM_machines.csv')    # Columns: [machineID, model, age]
failures = pd.read_csv('data\PdM_failures.csv')    # Columns: [datetime, machineID, failure]

Telemetry Columns: Index(['datetime', 'machineID', 'volt', 'rotate', 'pressure', 'vibration'], dtype='object')
Error Logs Columns: Index(['datetime', 'machineID', 'errorID'], dtype='object')
Maintenance Columns: Index(['machineID', 'model', 'age'], dtype='object')
Machines Columns: Index(['machineID', 'model', 'age'], dtype='object')
Failures Columns: Index(['datetime', 'machineID', 'failure'], dtype='object')
