<a href="https://colab.research.google.com/github/ShashankAlagawadi/Advanced_PDM/blob/main/Enhanced_PDM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import ConfusionMatrixDisplay

# --- Data loading -----------------------------------------------------------
# Read the maintenance dataset from the CSV path below and echo a short summary.
csv_file = "/content/enhanced_maintenance_dataset.csv"
df = pd.read_csv(csv_file)

for summary_item in (
    f"Dataset loaded from {csv_file}",
    f"Dataset shape: {df.shape}",
    "\nFirst 5 rows of the dataset:",
    df.head(),
):
    print(summary_item)

# --- Target encoding --------------------------------------------------------
# Map each Maintenance_Strategy value to an integer code; `le` is kept so the
# codes can be translated back to names later (le.classes_).
le = LabelEncoder()
encoded_target = le.fit_transform(df['Maintenance_Strategy'])
df['Maintenance_Strategy_Encoded'] = encoded_target

# --- Features / target ------------------------------------------------------
# Every column except the raw and encoded target is used as a feature.
y = df['Maintenance_Strategy_Encoded']
X = df.drop(['Maintenance_Strategy', 'Maintenance_Strategy_Encoded'], axis=1)

# --- Train / test split -----------------------------------------------------
# 70/30 split, stratified on the target, with a fixed seed.
split_options = dict(test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"\nTraining set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

# Initialize XGBoost model.
# `use_label_encoder` is deliberately not passed: the target is already
# integer-encoded by LabelEncoder above, and the flag is deprecated in newer
# XGBoost releases (passing it only produces an "unused parameter" warning).
model = XGBClassifier(random_state=42, eval_metric='mlogloss')

# Train the model
model.fit(X_train, y_train)
print("\nModel trained successfully!")

# Predict the outcomes
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Pin the label order to the encoder's codes (0..n_classes-1) so that the rows
# of the report / confusion matrix always line up with le.classes_, even if a
# class happens to be absent from both y_test and y_pred.
class_labels = np.arange(len(le.classes_))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=le.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print("\nConfusion Matrix:")
print(cm)

# Display Confusion Matrix with heatmap
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Feature Importance Plot
# Sorted ascending so that barh (which draws the first entry at the bottom)
# shows the most important feature at the top.
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values()
plt.figure(figsize=(10, 6))
plt.barh(importances.index, importances.values)
plt.xlabel('Feature Importance')
plt.title('XGBoost Feature Importance')
plt.show()

# Additional Graphs related to cost-effectiveness

# Histogram (with KDE overlay) for each cost-effectiveness column.
# Each entry: (column, bar color, figure title, x-axis label).
distribution_plots = [
    ('Switching_Cost', 'green', "Distribution of Switching Costs", "Switching Cost ($)"),
    ('Effectiveness_Score', 'purple', "Distribution of Effectiveness Scores", "Effectiveness Score"),
]
for column, color, title, xlabel in distribution_plots:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[column], kde=True, color=color, bins=30)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()

# New Graphs for cost threshold-based switching between techniques

# 1. Plot Switching Costs against Maintenance Strategy to understand cost thresholds for switching
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x variable
# to `hue` with dodge=False draws the same per-strategy colored boxes. The
# redundant legend (if one is created) is removed afterwards, which keeps the
# call compatible with older seaborn versions as well.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='Maintenance_Strategy', y='Switching_Cost', hue='Maintenance_Strategy',
                 data=df, palette="Set2", dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Switching Costs Across Maintenance Strategies")
plt.xlabel("Maintenance Strategy")
plt.ylabel("Switching Cost ($)")
plt.show()

# Define thresholds for switching strategies based on cost.
# Example values: a row is flagged when its Switching_Cost is strictly greater
# than the threshold of its own Maintenance_Strategy.
thresholds = {
    'CBM': 800,  # Example threshold: CBM switches if cost > $800
    'PdM': 1200,  # Example threshold: PdM switches if cost > $1200
    'PM': 1000,  # Example threshold: PM switches if cost > $1000
    'CM': 1100   # Example threshold: CM switches if cost > $1100
}

# Fail early, with a readable message, if the data contains a strategy that has
# no threshold (the lookup would otherwise raise a bare KeyError).
unknown_strategies = sorted(set(df['Maintenance_Strategy'].unique()) - set(thresholds), key=str)
if unknown_strategies:
    raise KeyError(f"No cost threshold defined for strategies: {unknown_strategies}")

# Determine when switching occurs based on cost thresholds.
# Vectorized: the strategy name is already available in 'Maintenance_Strategy',
# so there is no need to inverse-transform the encoded label row by row.
# Rows above their threshold keep the strategy name; all others get 'No Switch'
# (a NaN cost compares False and therefore also yields 'No Switch').
cost_limit = df['Maintenance_Strategy'].map(thresholds)
exceeds_limit = df['Switching_Cost'] > cost_limit
df['Switch_Strategy'] = df['Maintenance_Strategy'].where(exceeds_limit, 'No Switch')

# Print statements for new graphs
# NOTE: the three messages below are fixed narrative text; they are not derived
# from the data that was just loaded.
print("\nSwitching costs across different maintenance strategies show significant variation, with some strategies having higher costs than others.")
print("The trade-off between switching costs and effectiveness scores provides valuable insights into the decision-making process when switching strategies.")
print("The box plot illustrates the distribution of switching costs for each maintenance strategy.")

# Print sample of switch strategy decisions based on threshold
print("\nSample of Switching Strategy Decisions based on Cost Threshold:")
print(df[['Maintenance_Strategy', 'Switching_Cost', 'Switch_Strategy']].head())

# Save the model (optional)
model_file = "xgboost_model.json"
model.save_model(model_file)
print(f"\nXGBoost model saved to {model_file}")

# Pairplot of switching cost vs effectiveness, colored by strategy.
# The readable strategy name is used as hue (instead of the integer code) so
# the legend shows names and the classes are treated as categories.
sns.pairplot(df[['Switching_Cost', 'Effectiveness_Score', 'Maintenance_Strategy']], hue='Maintenance_Strategy', palette='Set1')
plt.suptitle("Pairplot of Switching Cost and Effectiveness Scores", y=1.02)
plt.show()

# Pairplot of features colored by Maintenance Strategy.
# Derived columns are dropped first: the encoded target would otherwise be
# plotted as if it were a feature. errors='ignore' keeps this working whether
# or not 'Switch_Strategy' has been added to df.
pairplot_data = df.drop(columns=['Maintenance_Strategy_Encoded', 'Switch_Strategy'], errors='ignore')
sns.pairplot(pairplot_data, hue='Maintenance_Strategy', palette='Set2')
plt.suptitle('Pairplot of Features vs Maintenance Strategy', y=1.02)
plt.show()

# # Correlation heatmap of features
# plt.figure(figsize=(10, 8))
# correlation_matrix = df.drop(columns=['Maintenance_Strategy', 'Maintenance_Strategy_Encoded']).corr()
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title("Correlation Heatmap of Features")
# plt.show()

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# --- Data loading -----------------------------------------------------------
# Read the 5k maintenance dataset from the CSV path below and print a summary.
csv_file = "/content/maintenance_dataset_5k.csv"
df = pd.read_csv(csv_file)

print(f"Dataset loaded from {csv_file}")
print(f"Dataset shape: {df.shape}")
preview = df.head()
print("\nFirst 5 rows of the dataset:")
print(preview)

# --- Target encoding --------------------------------------------------------
# Integer-encode Maintenance_Strategy; `le` is kept for le.classes_ lookups.
le = LabelEncoder()
strategy_codes = le.fit_transform(df['Maintenance_Strategy'])
df['Maintenance_Strategy_Encoded'] = strategy_codes

# --- Features / target ------------------------------------------------------
# Features are all columns other than the raw and encoded target.
non_feature_columns = ['Maintenance_Strategy', 'Maintenance_Strategy_Encoded']
X = df.drop(columns=non_feature_columns)
y = df['Maintenance_Strategy_Encoded']

# --- Train / test split -----------------------------------------------------
# Stratified 70/30 split with a fixed random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Define base learners for Stacking.
# The SVC is wrapped in a pipeline with StandardScaler: SVMs are not scale
# invariant, so feeding raw features on very different scales lets the
# largest-valued columns dominate the kernel. Keeping the scaler inside the
# pipeline means it is re-fit on the training part of every CV fold used by
# StackingClassifier. The random forest is left on raw features.
base_learners = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svc', make_pipeline(StandardScaler(), SVC(probability=True, random_state=42)))
]

# Define meta-learner (final classifier)
meta_learner = LogisticRegression(random_state=42)

# Initialize Stacking Classifier (5-fold CV to build the meta-features)
stacking_model = StackingClassifier(estimators=base_learners, final_estimator=meta_learner, cv=5)

# Train the Stacking model
stacking_model.fit(X_train, y_train)
print("\nStacked Generalization model trained successfully!")

# Predict the outcomes
y_pred = stacking_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Pin the label order to the encoder's codes (0..n_classes-1) so that the rows
# of the report / confusion matrix always line up with le.classes_.
class_labels = np.arange(len(le.classes_))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=le.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print("\nConfusion Matrix:")
print(cm)

# Display Confusion Matrix with heatmap
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Feature Importance Placeholder (if applicable)
# Note: StackingClassifier does not provide direct feature importances.
# Custom implementations can extract base model importances if required.

# Additional Graphs related to cost-effectiveness

# Histogram (with KDE overlay) for each cost-effectiveness column.
# Each entry: (column, bar color, figure title, x-axis label).
distribution_plots = [
    ('Switching_Cost', 'green', "Distribution of Switching Costs", "Switching Cost ($)"),
    ('Effectiveness_Score', 'purple', "Distribution of Effectiveness Scores", "Effectiveness Score"),
]
for column, color, title, xlabel in distribution_plots:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[column], kde=True, color=color, bins=30)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()

# New Graphs for cost threshold-based switching between techniques

# 1. Plot Switching Costs against Maintenance Strategy to understand cost thresholds for switching
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x variable
# to `hue` with dodge=False draws the same per-strategy colored boxes. The
# redundant legend (if one is created) is removed afterwards, which keeps the
# call compatible with older seaborn versions as well.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='Maintenance_Strategy', y='Switching_Cost', hue='Maintenance_Strategy',
                 data=df, palette="Set2", dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Switching Costs Across Maintenance Strategies")
plt.xlabel("Maintenance Strategy")
plt.ylabel("Switching Cost ($)")
plt.show()

# Define thresholds for switching strategies based on cost.
# Example values: a row is flagged when its Switching_Cost is strictly greater
# than the threshold of its own Maintenance_Strategy.
thresholds = {
    'CBM': 800,  # Example threshold: CBM switches if cost > $800
    'PdM': 1200,  # Example threshold: PdM switches if cost > $1200
    'PM': 1000,  # Example threshold: PM switches if cost > $1000
    'CM': 1100   # Example threshold: CM switches if cost > $1100
}

# Fail early, with a readable message, if the data contains a strategy that has
# no threshold (the lookup would otherwise raise a bare KeyError).
unknown_strategies = sorted(set(df['Maintenance_Strategy'].unique()) - set(thresholds), key=str)
if unknown_strategies:
    raise KeyError(f"No cost threshold defined for strategies: {unknown_strategies}")

# Determine when switching occurs based on cost thresholds.
# Vectorized: the strategy name is already available in 'Maintenance_Strategy',
# so there is no need to inverse-transform the encoded label row by row.
# Rows above their threshold keep the strategy name; all others get 'No Switch'
# (a NaN cost compares False and therefore also yields 'No Switch').
cost_limit = df['Maintenance_Strategy'].map(thresholds)
exceeds_limit = df['Switching_Cost'] > cost_limit
df['Switch_Strategy'] = df['Maintenance_Strategy'].where(exceeds_limit, 'No Switch')

# Print statements for new graphs
# NOTE: the three messages below are fixed narrative text; they are not derived
# from the data that was just loaded.
print("\nSwitching costs across different maintenance strategies show significant variation, with some strategies having higher costs than others.")
print("The trade-off between switching costs and effectiveness scores provides valuable insights into the decision-making process when switching strategies.")
print("The box plot illustrates the distribution of switching costs for each maintenance strategy.")

# Print sample of switch strategy decisions based on threshold
print("\nSample of Switching Strategy Decisions based on Cost Threshold:")
print(df[['Maintenance_Strategy', 'Switching_Cost', 'Switch_Strategy']].head())

# Pairplot of features colored by Maintenance Strategy.
# Derived columns are dropped first: the encoded target would otherwise be
# plotted as if it were a feature. errors='ignore' keeps this working whether
# or not 'Switch_Strategy' has been added to df.
pairplot_data = df.drop(columns=['Maintenance_Strategy_Encoded', 'Switch_Strategy'], errors='ignore')
sns.pairplot(pairplot_data, hue='Maintenance_Strategy', palette='Set2')
plt.suptitle('Pairplot of Features vs Maintenance Strategy', y=1.02)
plt.show()

# Correlation heatmap of features
# plt.figure(figsize=(10, 8))
# correlation_matrix = df.drop(columns=['Maintenance_Strategy', 'Maintenance_Strategy_Encoded']).corr()
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title("Correlation Heatmap of Features")
# plt.show()


In [None]:
!pip install catboost


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import ConfusionMatrixDisplay

# --- Data loading -----------------------------------------------------------
# Read the enhanced maintenance dataset from the CSV path below.
csv_file = "/content/enhanced_maintenance_dataset.csv"
df = pd.read_csv(csv_file)

# Short textual summary of what was loaded.
dataset_shape = df.shape
print(f"Dataset loaded from {csv_file}")
print(f"Dataset shape: {dataset_shape}")
print("\nFirst 5 rows of the dataset:")
print(df.head())

# --- Target encoding --------------------------------------------------------
# Integer codes for Maintenance_Strategy; `le` retains the code -> name mapping.
le = LabelEncoder()
df['Maintenance_Strategy_Encoded'] = le.fit_transform(df['Maintenance_Strategy'])

# --- Features / target ------------------------------------------------------
# Target first, then everything that is not a target column as features.
y = df['Maintenance_Strategy_Encoded']
X = df.drop(['Maintenance_Strategy', 'Maintenance_Strategy_Encoded'], axis='columns')

# --- Train / test split -----------------------------------------------------
# Stratified split: 30% of the rows go to the test set, seed fixed at 42.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

n_train_rows = X_train.shape[0]
n_test_rows = X_test.shape[0]
print("\nTraining set size:", n_train_rows)
print("Test set size:", n_test_rows)

# Hold out part of the TRAINING data for early stopping.
# Using the test set as eval_set lets the test data choose the number of
# boosting iterations, which makes the test metrics below optimistically
# biased. The test set is now only touched for the final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Initialize CatBoost Classifier (progress is logged every 200 iterations)
catboost_model = CatBoostClassifier(iterations=1000, learning_rate=0.1, depth=6, random_seed=42, verbose=200)

# Train the CatBoost model; stop when the validation metric has not improved
# for 50 consecutive iterations.
catboost_model.fit(X_fit, y_fit, eval_set=(X_val, y_val), early_stopping_rounds=50)
print("\nCatBoost model trained successfully!")

# Predict the outcomes.
# predict() can return an (n, 1) column for multiclass targets; flatten it so
# the metrics always receive a 1-D label array.
y_pred = np.ravel(catboost_model.predict(X_test))

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy:.4f}")

# Pin the label order to the encoder's codes (0..n_classes-1) so that the rows
# of the report / confusion matrix always line up with le.classes_.
class_labels = np.arange(len(le.classes_))

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=le.classes_))

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print("\nConfusion Matrix:")
print(cm)

# Display Confusion Matrix with heatmap
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
disp.plot(cmap='Blues')
plt.title("Confusion Matrix")
plt.show()

# Feature Importance Plot
# Sorted ascending so that barh (which draws the first entry at the bottom)
# shows the most important feature at the top.
feature_importances = pd.Series(catboost_model.get_feature_importance(), index=X.columns).sort_values()
plt.figure(figsize=(10, 6))
plt.barh(feature_importances.index, feature_importances.values)
plt.xlabel('Feature Importance')
plt.title('CatBoost Feature Importance')
plt.show()

# Additional Graphs related to cost-effectiveness

# Histogram (with KDE overlay) for each cost-effectiveness column.
# Each entry: (column, bar color, figure title, x-axis label).
distribution_plots = [
    ('Switching_Cost', 'green', "Distribution of Switching Costs", "Switching Cost ($)"),
    ('Effectiveness_Score', 'purple', "Distribution of Effectiveness Scores", "Effectiveness Score"),
]
for column, color, title, xlabel in distribution_plots:
    plt.figure(figsize=(8, 6))
    sns.histplot(df[column], kde=True, color=color, bins=30)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel("Frequency")
    plt.show()

# New Graphs for cost threshold-based switching between techniques

# 1. Plot Switching Costs against Maintenance Strategy to understand cost thresholds for switching
# seaborn >= 0.13 deprecates `palette` without `hue`; assigning the x variable
# to `hue` with dodge=False draws the same per-strategy colored boxes. The
# redundant legend (if one is created) is removed afterwards, which keeps the
# call compatible with older seaborn versions as well.
plt.figure(figsize=(10, 6))
ax = sns.boxplot(x='Maintenance_Strategy', y='Switching_Cost', hue='Maintenance_Strategy',
                 data=df, palette="Set2", dodge=False)
if ax.get_legend() is not None:
    ax.get_legend().remove()
plt.title("Switching Costs Across Maintenance Strategies")
plt.xlabel("Maintenance Strategy")
plt.ylabel("Switching Cost ($)")
plt.show()

# Define thresholds for switching strategies based on cost.
# Example values: a row is flagged when its Switching_Cost is strictly greater
# than the threshold of its own Maintenance_Strategy.
thresholds = {
    'CBM': 800,  # Example threshold: CBM switches if cost > $800
    'PdM': 1200,  # Example threshold: PdM switches if cost > $1200
    'PM': 1000,  # Example threshold: PM switches if cost > $1000
    'CM': 1100   # Example threshold: CM switches if cost > $1100
}

# Fail early, with a readable message, if the data contains a strategy that has
# no threshold (the lookup would otherwise raise a bare KeyError).
unknown_strategies = sorted(set(df['Maintenance_Strategy'].unique()) - set(thresholds), key=str)
if unknown_strategies:
    raise KeyError(f"No cost threshold defined for strategies: {unknown_strategies}")

# Determine when switching occurs based on cost thresholds.
# Vectorized: the strategy name is already available in 'Maintenance_Strategy',
# so there is no need to inverse-transform the encoded label row by row.
# Rows above their threshold keep the strategy name; all others get 'No Switch'
# (a NaN cost compares False and therefore also yields 'No Switch').
cost_limit = df['Maintenance_Strategy'].map(thresholds)
exceeds_limit = df['Switching_Cost'] > cost_limit
df['Switch_Strategy'] = df['Maintenance_Strategy'].where(exceeds_limit, 'No Switch')

# Print statements for new graphs
# NOTE: the three messages below are fixed narrative text; they are not derived
# from the data that was just loaded.
print("\nSwitching costs across different maintenance strategies show significant variation, with some strategies having higher costs than others.")
print("The trade-off between switching costs and effectiveness scores provides valuable insights into the decision-making process when switching strategies.")
print("The box plot illustrates the distribution of switching costs for each maintenance strategy.")

# Print sample of switch strategy decisions based on threshold
print("\nSample of Switching Strategy Decisions based on Cost Threshold:")
print(df[['Maintenance_Strategy', 'Switching_Cost', 'Switch_Strategy']].head())

# Pairplot of features colored by Maintenance Strategy.
# Derived columns are dropped first: the encoded target would otherwise be
# plotted as if it were a feature. errors='ignore' keeps this working whether
# or not 'Switch_Strategy' has been added to df.
pairplot_data = df.drop(columns=['Maintenance_Strategy_Encoded', 'Switch_Strategy'], errors='ignore')
sns.pairplot(pairplot_data, hue='Maintenance_Strategy', palette='Set2')
plt.suptitle('Pairplot of Features vs Maintenance Strategy', y=1.02)
plt.show()

# Correlation heatmap of features
# plt.figure(figsize=(10, 8))
# correlation_matrix = df.drop(columns=['Maintenance_Strategy', 'Maintenance_Strategy_Encoded']).corr()
# sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
# plt.title("Correlation Heatmap of Features")
# plt.show()


In [None]:
!pip install numpy scikit-learn

