# Predictive Maintenance - Complete ML Pipeline
This notebook covers EDA, preprocessing, feature engineering, model training, and evaluation for the Predictive Maintenance dataset.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the dataset
# The path is relative, so the CSV must sit in the notebook's working directory.
DATA_FILE = 'predictive_maintenance.csv'
df = pd.read_csv(DATA_FILE)

# Preview the first rows to confirm the file parsed as expected.
df.head()

In [None]:
# Basic Info
# info() prints its column/dtype/non-null summary directly to the cell output.
df.info()

# describe() returns a summary-statistics frame; as the last expression of
# the cell it is rendered as the cell's result.
df.describe()

In [None]:
# Check for missing values
# Boolean mask of missing cells, summed per column -> count of nulls per column.
missing_mask = df.isna()
missing_mask.sum()

In [None]:
# Visualize target variable
# One bar per distinct 'Failure Type' value, height = number of rows.
failure_ax = sns.countplot(x='Failure Type', data=df)
failure_ax.set_title('Distribution of Failure Types')
# Tilt the category labels so longer names do not overlap.
plt.xticks(rotation=45)
plt.show()

In [None]:
# Drop UDI, encode labels
# Rebind instead of dropping in place: with errors='ignore' the drop is a
# no-op once 'UDI' is gone, so re-running this cell no longer raises KeyError.
df = df.drop(columns=['UDI'], errors='ignore')

# One encoder per column so each fitted mapping (classes_) stays available for
# inverse_transform; a single shared encoder only remembers its last fit.
# NOTE(review): 'Product ID' looks like an identifier rather than a measurement;
# its integer codes carry no meaningful order - confirm it should be a feature.
product_id_le = LabelEncoder()
df['Product ID'] = product_id_le.fit_transform(df['Product ID'])

# Kept under the original name `le`; after this cell it is fitted on 'Type'.
le = LabelEncoder()
df['Type'] = le.fit_transform(df['Type'])

# Rename the 'No Failure' label to 'No_Failure' (no whitespace in the label).
df['Failure Type'] = df['Failure Type'].replace('No Failure', 'No_Failure')

In [None]:
# Split features and target
TARGET_COL = 'Failure Type'

# NOTE(review): the public predictive-maintenance CSV ships a binary 'Target'
# column flagging whether any failure occurred. Presumably this file has it
# too - verify against df.columns. Left in X it leaks the label into the
# features, so it is removed when present.
leak_cols = [col for col in ('Target',) if col in df.columns]

X = df.drop(columns=[TARGET_COL, *leak_cols])
y = df[TARGET_COL]

# Train-test split
# stratify=y keeps the class proportions of y the same in both splits;
# random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [None]:
# Feature scaling
# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits so no test-set information is used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Try multiple models
# Seed the stochastic estimators so the cross-validation scores are the same
# on every Restart & Run All (42 matches the seed of the train/test split).
CV_SEED = 42

models = {
    'Random Forest': RandomForestClassifier(random_state=CV_SEED),
    'Gradient Boosting': GradientBoostingClassifier(random_state=CV_SEED),
    'Logistic Regression': LogisticRegression(max_iter=500)
}

# NOTE(review): accuracy can be dominated by the most frequent class when the
# target is imbalanced - consider also checking a macro-averaged F1 score.
cv_scores = {}  # model name -> mean 5-fold accuracy, kept for later inspection
for name, model in models.items():
    score = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy').mean()
    cv_scores[name] = score
    print(f"{name}: {score:.4f}")

In [None]:
# Train best model
# NOTE(review): Random Forest is hard-coded here rather than selected from the
# cross-validation scores - confirm it is still the best performer.
# random_state makes the fitted forest, and therefore the report, confusion
# matrix and feature importances derived from it, reproducible across runs.
best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train_scaled, y_train)

# Evaluate once on the held-out test split.
y_pred = best_model.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

In [None]:
# Confusion matrix
# Use the model's own class order for both the matrix rows/columns and the
# tick labels so the two always line up.
class_labels = best_model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_labels)
disp.plot(xticks_rotation=45)
disp.ax_.set_title("Confusion Matrix")
plt.show()

In [None]:
# Feature importance
# Pair each importance value with its column name from X; the model was fit
# on the scaled array, which keeps X's column order.
importances = pd.Series(best_model.feature_importances_, index=X.columns)

# Largest contribution first, drawn as a bar chart.
ranked_importances = importances.sort_values(ascending=False)
importance_ax = ranked_importances.plot(kind='bar')
importance_ax.set_title("Feature Importance")
plt.show()