### Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


# Data Preprocessing & Feature Engineering


In [2]:
# Load datasets
equipment_list = pd.read_excel('/content/1-Equipment-List.xlsx')
utilization = pd.read_excel('/content/2-Equipment-Utilization.xlsx')
maintenance_history = pd.read_excel('/content/3-Equipment-Maintenance-History.xlsx')

# Data Cleaning and Preparation
def convert_column_to_string(df, column_name):
    df[column_name] = df[column_name].astype(str)
    return df

# Convert 'EquiID' columns to string
equipment_list = convert_column_to_string(equipment_list, 'EquiID')
utilization = convert_column_to_string(utilization, 'EquiID')
maintenance_history = convert_column_to_string(maintenance_history, 'EquiID')

# Handle missing values
equipment_list['Equipment Type'].fillna('Unknown', inplace=True)
equipment_list['Manufacture_Year'].fillna(equipment_list['Manufacture_Year'].median(), inplace=True)

utilization['OutDate'].fillna(pd.Timestamp('1970-01-01'), inplace=True)
utilization['InDate'].fillna(pd.Timestamp('1970-01-01'), inplace=True)

maintenance_history['Maintenance_Commnets'].fillna('No comments', inplace=True)
maintenance_history['Maintenance_Date'].fillna(pd.Timestamp('1970-01-01'), inplace=True)

# Convert datetime columns to datetime type in utilization
utilization['OutDate'] = pd.to_datetime(utilization['OutDate'], errors='coerce')
utilization['InDate'] = pd.to_datetime(utilization['InDate'], errors='coerce')

# Calculate utilization duration in hours
utilization['Utilization_Duration'] = (utilization['InDate'] - utilization['OutDate']).dt.total_seconds() / 3600  # in hours

# Merge equipment_list and utilization data
merged_data = pd.merge(utilization, equipment_list, on='EquiID', how='left')

# Remove 'Production' from the 'Maintenance_Catagory'
maintenance_history_filtered = maintenance_history[maintenance_history['Maintenance_Catagory'] != 'Production'].copy()

# Create the Failure column based on 'Maintenance_Catagory'
maintenance_history_filtered.loc[:, 'Failure'] = np.where(
    maintenance_history_filtered['Maintenance_Catagory'] == 'Breakdown Repair', 1,
    np.where(
        maintenance_history_filtered['Maintenance_Catagory'] == 'Preventive Maintenance', 0,
        np.nan  # Handle other cases if any
    )
)

# Groupby 'EquiID' to aggregate the failure column
maintenance_aggregated = maintenance_history_filtered.groupby('EquiID')['Failure'].max().reset_index()

# Merge the aggregated failure data with the merged_data
merged_data = pd.merge(merged_data, maintenance_aggregated, on='EquiID', how='left')

# Fill any missing failure values with 0 (if no failure was reported for that equipment)
merged_data['Failure'].fillna(0, inplace=True)

In [10]:
# Prepare the dataset for PCA and modeling
features = merged_data.drop(columns=['Failure', 'JONO', 'Specification', 'E_AddedDate', 'Utilization_Duration'])
target = merged_data['Failure']

# Identify categorical and numerical columns
categorical_features = features.select_dtypes(include=['object']).columns.tolist()
numerical_features = features.select_dtypes(include=[np.number]).columns.tolist()

# Preprocessing for numerical data: Scaling
numerical_transformer = StandardScaler()

# Preprocessing for categorical data: Encoding
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=42)

# Apply preprocessing and dimensionality reduction separately
preprocessed_X_train = preprocessor.fit_transform(X_train)
preprocessed_X_test = preprocessor.transform(X_test)

# Choose dimensionality reduction technique based on the number of features
if preprocessed_X_train.shape[1] > 10:  # Arbitrary threshold, adjust if necessary
    # Use TruncatedSVD for sparse data
    dim_reduction = TruncatedSVD(n_components=min(preprocessed_X_train.shape[1], 50))  # Limit to 50 components or total features
else:
    # Use PCA for dense data
    dim_reduction = PCA(n_components=0.95)  # Retain 95% of variance

# Apply dimensionality reduction
X_train_reduced = dim_reduction.fit_transform(preprocessed_X_train)
X_test_reduced = dim_reduction.transform(preprocessed_X_test)


# Models

In [11]:


# Define models
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC()
}

# Train and evaluate models
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_reduced, y_train)
    y_pred = model.predict(X_test_reduced)

    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred))
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("-" * 40)


Training Logistic Regression...
Results for Logistic Regression:
Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00        25
         1.0       1.00      1.00      1.00      6867

    accuracy                           1.00      6892
   macro avg       0.50      0.50      0.50      6892
weighted avg       0.99      1.00      0.99      6892

Confusion Matrix:
[[   0   25]
 [   0 6867]]
----------------------------------------
Training Random Forest...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results for Random Forest:
Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      0.24      0.39        25
         1.0       1.00      1.00      1.00      6867

    accuracy                           1.00      6892
   macro avg       1.00      0.62      0.69      6892
weighted avg       1.00      1.00      1.00      6892

Confusion Matrix:
[[   6   19]
 [   0 6867]]
----------------------------------------
Training Gradient Boosting...
Results for Gradient Boosting:
Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

         0.0       0.60      0.60      0.60        25
         1.0       1.00      1.00      1.00      6867

    accuracy                           1.00      6892
   macro avg       0.80      0.80      0.80      6892
weighted avg       1.00      1.00      1.00      6892

Confusion Matrix:
[[  15   10]
 [  10 6857]]
----------------------------------------
Training A

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# (a). Hyperparameters tunning

In [12]:
# Hyperparameter tuning example for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30]
}

grid_search = GridSearchCV(estimator=RandomForestClassifier(), param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_reduced, y_train)
print(f"Best parameters for Random Forest: {grid_search.best_params_}")


Best parameters for Random Forest: {'max_depth': 30, 'n_estimators': 100}


# Feature importance

In [13]:
# Feature importance example with Random Forest
rf_model = RandomForestClassifier().fit(X_train_reduced, y_train)
importances = rf_model.feature_importances_
print(f"Feature importances: {importances}")

Feature importances: [0.04706408 0.0709594  0.0436221  0.02641602 0.03877285 0.01931933
 0.04541381 0.05617688 0.03949702 0.02026144 0.02333813 0.02666155
 0.03074268 0.01598437 0.01808953 0.01839496 0.01116465 0.02005652
 0.01992702 0.00936747 0.01149925 0.01443634 0.01618413 0.01430781
 0.0064125  0.00986024 0.01616258 0.01136851 0.01000059 0.01211781
 0.01192986 0.01250427 0.008366   0.01031701 0.01133907 0.02730531
 0.02143233 0.01119366 0.01097315 0.01684806 0.01201983 0.01917741
 0.01211871 0.01049479 0.00821553 0.01734473 0.01268951 0.01183555
 0.01739746 0.01291818]


# Update RandomForest model with best parameters

In [14]:
# Initialize the Random Forest model with best parameters
best_rf_model = RandomForestClassifier(max_depth=None, n_estimators=50)

# Train the updated model
best_rf_model.fit(X_train_reduced, y_train)

# Evaluation on Best Prams

In [15]:
# Predict on the test set
y_pred_rf = best_rf_model.predict(X_test_reduced)

# Print evaluation metrics
print("Results for Optimized Random Forest:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.2f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_rf))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))
print("-" * 40)


Results for Optimized Random Forest:
Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

         0.0       0.88      0.28      0.42        25
         1.0       1.00      1.00      1.00      6867

    accuracy                           1.00      6892
   macro avg       0.94      0.64      0.71      6892
weighted avg       1.00      1.00      1.00      6892

Confusion Matrix:
[[   7   18]
 [   1 6866]]
----------------------------------------


In [None]:
# Save best Random Forest model
with open('best_rf_model.pkl', 'wb') as f:
    pickle.dump(best_rf_model, f)

# Save dimensionality reduction model
with open('dim_reduction.pkl', 'wb') as f:
    pickle.dump(dim_reduction, f)

# Save preprocessor (scaler and encoder)
with open('preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

print("Models saved successfully!")