In [None]:
## Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

: 

In [None]:
## Importing the datasets
import os
from pathlib import Path

# Folder that holds train.xlsx and test.xlsx. Set the MACHINE_FAILURE_DATA_DIR
# environment variable to point somewhere else; when it is unset the original
# location is used, so existing setups keep working unchanged.
DATA_DIR = Path(
    os.environ.get(
        "MACHINE_FAILURE_DATA_DIR",
        "/Users/_.rohan._/Desktop/Machine Failure Prediction",
    )
)

train_df = pd.read_excel(DATA_DIR / "train.xlsx")
test_df = pd.read_excel(DATA_DIR / "test.xlsx")

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Preview the first rows of the test data
test_df.head()

In [None]:
# Column dtypes and non-null counts for the training data
train_df.info()

In [None]:
# Column dtypes and non-null counts for the test data
test_df.info()

In [None]:
## Dataset shape: (rows, columns) of the training data
train_df.shape

In [None]:
# (rows, columns) of the test data
test_df.shape

In [None]:
# Null values: number of missing entries per column in the training data
train_df.isna().sum()

In [None]:
# Null values: number of missing entries per column in the test data
test_df.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import seaborn as sns


In [None]:
# Remove the 'id' column from both frames when it is present.
# The membership check keeps this cell safe to re-run.
for frame in (train_df, test_df):
    if 'id' in frame.columns:
        frame.drop(columns=['id'], inplace=True)

In [None]:
# Encode categorical variable 'Type'
# The encoder is fitted on the training column only and then reused for the
# test column, so both frames share the same string -> integer mapping.
# transform() raises if the test data contains a category not seen in training.
label_enc = LabelEncoder()
train_df['Type'] = label_enc.fit_transform(train_df['Type'])
test_df['Type'] = label_enc.transform(test_df['Type'])

# Drop 'Product ID' since it's a unique identifier
# errors='ignore' mirrors the guarded 'id' drop in the previous cell: re-running
# this cell after the column is already gone no longer raises KeyError.
train_df.drop(columns=['Product ID'], inplace=True, errors='ignore')
test_df.drop(columns=['Product ID'], inplace=True, errors='ignore')

# Define features and target
# X holds every remaining column except the label; y is the label column.
X = train_df.drop(columns=['Machine failure'])
y = train_df['Machine failure']

In [None]:
# Split data for training and validation
# 80/20 split, stratified on y so both parts keep the same class ratio;
# random_state pins the shuffle for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale numerical features
# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test data, so no statistics leak from held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
# Select exactly the training feature columns (in training order) before
# scaling. This keeps the cell re-runnable even after extra columns -- such as
# a prediction column -- have been added to test_df.
X_test_scaled = scaler.transform(test_df[X_train.columns])

In [None]:
# Fit a 100-tree random forest on the scaled training split (seeded for reproducibility)
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train_scaled, y=y_train)

In [None]:
# Score the fitted model on the validation split
y_val_pred = model.predict(X_val_scaled)
# Keep only the second probability column (the one for model.classes_[1])
y_val_prob = model.predict_proba(X_val_scaled)[:, 1]

accuracy = accuracy_score(y_true=y_val, y_pred=y_val_pred)
report = classification_report(y_true=y_val, y_pred=y_val_pred)

# Summarise the validation results
print(f"Validation Accuracy: {accuracy:.4f}")
print("Classification Report:\n", report)

In [None]:
# Horizontal bar chart of the fitted model's per-feature importances
feature_importance = model.feature_importances_

importance_fig, importance_ax = plt.subplots(figsize=(10, 6))
importance_ax.barh(X.columns, feature_importance, color='skyblue')
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_ylabel('Features')
importance_ax.set_title('Feature Importance')
plt.show()

In [None]:
# Predict on test data
# Uses whatever estimator is currently bound to `model`, applied to the
# already-scaled test features.
test_predictions = model.predict(X_test_scaled)
# NOTE: this adds a new column to test_df in place, so from here on test_df
# holds the prediction column in addition to the features it was loaded with.
test_df['Machine failure Prediction'] = test_predictions

# Save predictions
# Written to the current working directory; the row index is not included.
test_df.to_csv("test_predictions.csv", index=False)

In [None]:
# Confusion matrix of validation labels vs. predictions, drawn as an annotated heatmap
conf_matrix = confusion_matrix(y_val, y_val_pred)
class_labels = ['No Failure', 'Failure']

plt.figure(figsize=(6, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',  # integer cell counts
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Plot ROC Curve
# Built from the validation labels and the probabilities stored in y_val_prob.
fpr, tpr, _ = roc_curve(y_val, y_val_prob)
roc_auc = auc(fpr, tpr)
plt.figure(figsize=(8, 6))
# Fixed: the legend label previously lacked its closing parenthesis.
plt.plot(fpr, tpr, color='blue', label=f'ROC curve (area = {roc_auc:.2f})')
# Diagonal reference line for a classifier with no discriminative power
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # small headroom so the curve is not clipped at TPR = 1
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Histograms (with KDE overlay) of the first six training features on a 2x3 grid
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
features = X.columns[:6]  # at most six columns, one per grid cell

# ravel() walks the grid row by row, matching the feature order
for ax, feature in zip(axes.ravel(), features):
    sns.histplot(X_train[feature], bins=30, kde=True, ax=ax, color='purple')
    ax.set_title(f'Distribution of {feature}')

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation of all train_df columns, shown as an annotated heatmap
corr_matrix = train_df.corr()

plt.figure(figsize=(10, 8))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
)
plt.title('Feature Correlation Heatmap')
plt.show()

In [None]:
# Candidate classifiers to compare, keyed by display name
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    # SVM is currently disabled:
    #"Support Vector Machine": SVC(probability=True, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
}

In [None]:
# Train and evaluate models
# Each candidate in `models` is fitted on the scaled training split, scored on
# the validation split, and gets its own ROC figure.
#
# Loop-local names (clf, clf_*) are used on purpose: rebinding `model`,
# `y_val_pred`, `y_val_prob`, `fpr`, `tpr` or `roc_auc` here would make any
# earlier cell that is re-run afterwards silently use the last candidate.
for name, clf in models.items():
    print(f"\nTraining {name}...")
    clf.fit(X_train_scaled, y_train)
    clf_val_pred = clf.predict(X_val_scaled)
    # Second probability column, i.e. the one for clf.classes_[1]
    clf_val_prob = clf.predict_proba(X_val_scaled)[:, 1]
    clf_accuracy = accuracy_score(y_val, clf_val_pred)
    clf_report = classification_report(y_val, clf_val_pred)
    print(f"Validation Accuracy ({name}): {clf_accuracy:.4f}")
    print("Classification Report:\n", clf_report)

    # Plot ROC Curve
    clf_fpr, clf_tpr, _ = roc_curve(y_val, clf_val_prob)
    clf_roc_auc = auc(clf_fpr, clf_tpr)
    plt.figure(figsize=(8, 6))
    # Fixed: the legend label previously lacked its closing parenthesis.
    plt.plot(clf_fpr, clf_tpr, label=f'{name} (AUC = {clf_roc_auc:.2f})')
    # Diagonal reference line for a classifier with no discriminative power
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'ROC Curve - {name}')
    plt.legend()
    plt.show()