## Machine Learning Model Development

##### Import the necessary libraries 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, StratifiedShuffleSplit
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score,\
                            recall_score, f1_score, roc_auc_score
        

import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load the dataset
machine = pd.read_csv("../data/machine_downtime_cleaned.csv", parse_dates=['Date'])

# make a copy of the data 
machine_ori = machine.copy()
# print the first few rows
machine.head()

Unnamed: 0,Date,Machine_ID,Assembly_Line_No,Coolant_Temperature,Hydraulic_Oil_Temperature,Spindle_Bearing_Temperature,Spindle_Vibration,Tool_Vibration,Voltage(volts),Torque(Nm),Downtime,Hydraulic_Pressure(Pa),Coolant_Pressure(Pa),Air_System_Pressure(Pa),Cutting(N),Spindle_Speed(RPS)
0,2021-12-08,Makino-L2-Unit1-2015,Shopfloor-L2,4.5,47.9,31.2,1.225,35.214,381.0,23.091903,No_Machine_Failure,14115919.3,513860.1,612765.0,2870.0,253.6
1,2021-12-17,Makino-L2-Unit1-2015,Shopfloor-L2,21.7,47.5,35.8,1.078,29.198,367.0,31.620335,No_Machine_Failure,7246602.0,514111.3,662932.2,2970.0,295.4
2,2021-12-17,Makino-L1-Unit1-2013,Shopfloor-L1,5.2,49.4,34.2,1.266,30.206,340.0,15.900716,Machine_Failure,8828000.0,683941.3,656038.1,2700.0,466.0
3,2021-12-17,Makino-L1-Unit1-2013,Shopfloor-L1,24.4,48.1,36.6,0.778,25.048,307.0,23.923929,Machine_Failure,7454000.0,658019.5,652883.7,3590.0,466.0
4,2021-12-21,Makino-L2-Unit1-2015,Shopfloor-L2,14.1,51.8,32.4,0.969,31.491,380.0,16.964105,Machine_Failure,5326000.0,683941.3,602069.0,2860.0,460.2


### Preprocessing

we have to divide the numeric columns into those that are skewed and those that are normal in order to be able to apply the necessary standardization or normalization to avoid bias

In [3]:
# create an empty list to store columns that are normally or
# skewly distributed
normal_cols = []
skewed_cols = []

# loop through the numerical features
for col in machine_ori.select_dtypes(include=np.number):
    skewness = machine_ori[col].skew()
    kurtosis = machine_ori[col].kurt()

    # set a threshold for kurtosis and skewness and then append the necessary features
    if -0.2 <= skewness <= 0.3 and -0.2 <= kurtosis <= 0.2:  # Adjust thresholds as needed
        normal_cols.append(col)
        print(f"{col}: Skewness = {skewness:.2f}, Kurtosis = {kurtosis:.2f} (Approximately Normal)")
    else:
        skewed_cols.append(col)
        print(f"{col}: Skewness = {skewness:.2f}, Kurtosis = {kurtosis:.2f} (Not Normally Distributed)")


Coolant_Temperature: Skewness = -0.22, Kurtosis = -1.35 (Not Normally Distributed)
Hydraulic_Oil_Temperature: Skewness = -0.00, Kurtosis = 0.05 (Approximately Normal)
Spindle_Bearing_Temperature: Skewness = -0.03, Kurtosis = -0.05 (Approximately Normal)
Spindle_Vibration: Skewness = 0.03, Kurtosis = -0.11 (Approximately Normal)
Tool_Vibration: Skewness = -0.06, Kurtosis = 0.01 (Approximately Normal)
Voltage(volts): Skewness = -0.03, Kurtosis = -0.09 (Approximately Normal)
Torque(Nm): Skewness = 0.03, Kurtosis = -0.46 (Not Normally Distributed)
Hydraulic_Pressure(Pa): Skewness = 0.21, Kurtosis = -0.98 (Not Normally Distributed)
Coolant_Pressure(Pa): Skewness = -0.01, Kurtosis = -0.13 (Approximately Normal)
Air_System_Pressure(Pa): Skewness = -0.05, Kurtosis = 0.01 (Approximately Normal)
Cutting(N): Skewness = 0.12, Kurtosis = -1.09 (Not Normally Distributed)
Spindle_Speed(RPS): Skewness = 0.22, Kurtosis = -0.45 (Not Normally Distributed)


### Model Parameters Preparation

In [4]:
# Define target and features
X = machine_ori.drop(columns=["Downtime", "Machine_ID", "Date", "Assembly_Line_No"])  # Features

# define encoder
label_encode = LabelEncoder()
y = label_encode.fit_transform(machine_ori["Downtime"])  # Target variable

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Define transformers
preprocessor = ColumnTransformer([
    ("robust", RobustScaler(), skewed_cols),  # Skewed data
    ("standard", StandardScaler(), normal_cols)  # Normal data 
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    stratify = y, random_state=42)

# Define models
models = {
    "Bayesian Logistic Regression": LogisticRegression(solver="lbfgs"),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss", random_state = 42)
}


### Train the model 

**Key Performance Metrics and Their Meaning**

+ Precision: Measures how many of the predicted failures were actually failures. A high precision means fewer false positives.
+ Recall: Measures how many of the actual failures were correctly identified. A high recall means fewer false negatives.
+ F1-Score: Harmonic mean of precision and recall, balancing both. Higher is better.
+ ROC AUC: Measures the model’s ability to distinguish between classes. A value closer to 1 is better.

**Model Comparison and Best Performing Model**

Model Performance Interpretation:

1. Best Overall Model: XGBoost (0.9993 ROC AUC, 0.9869 F1-Score)

+ Highest ROC AUC (0.9993) → Best discrimination ability.
+ Very high precision (0.9934) → Almost all predicted failures were actual failures.
+ Very high recall (0.9805) → Nearly all actual failures were correctly identified.
+ Strong balance between precision & recall (F1-Score = 0.9869).

Likely the best choice for deployment.

2. Random Forest is also very strong (0.9989 ROC AUC, 0.9870 F1-Score)

> + Very similar performance to XGBoost.
> + If interpretability is needed, Random Forest may be preferable.

3. Gradient Boosting also performs well (0.9981 ROC AUC, 0.9853 F1-Score)

> + Close competitor but slightly lower recall than XGBoost.

4. Decision Tree (0.9647 ROC AUC, 0.9644 F1-Score)

Still good but lacks the power of ensemble methods.

5. SVM & Bayesian Logistic Regression are weaker

> + SVM (0.9469 ROC AUC, 0.8696 F1-Score) and Bayesian Logistic Regression (0.9125 ROC AUC, 0.8419 F1-Score) underperform compared to ensemble models.


In [5]:
# craete an empty list to store model result
model_results = []

# iterate through the models
for name, model in models.items():
    # create a pipeline
    pipeline = Pipeline([
        
        ('preprocessor', preprocessor),
        ('classifier', model)
        
        ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_prob = pipeline.predict_proba(X_test)[:, 1] if hasattr(model, 'predict_proba')\
             else None
    
    # evaluate Metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob) if y_prob is not None else 'N/A'
    
    # append result
    model_results.append({
        "Model": name,
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1-Score": round(f1, 4),
        "ROC AUC": round(roc_auc, 4) if roc_auc != "N/A" else "N/A"
    })
    
    # convert result to Datframe
    model_results_df = pd.DataFrame(model_results)
    

Parameters: { "use_label_encoder" } are not used.



In [6]:
model_results_df.head(10)

Unnamed: 0,Model,Precision,Recall,F1-Score,ROC AUC
0,Bayesian Logistic Regression,0.8365,0.8474,0.8419,0.9125
1,Random Forest,0.987,0.987,0.987,0.9989
2,Gradient Boosting,0.9934,0.9773,0.9853,0.9981
3,Decision Tree,0.9613,0.9675,0.9644,0.9647
4,SVM,0.8626,0.8766,0.8696,0.9469
5,XGBoost,0.9934,0.9805,0.9869,0.9993


### Check for Class Imbalance

I have trained all the models involved, and most of them exhibit exceptionally high evaluation metric values, reaching as high as 0.99. Given that this is a classification problem, one potential concern could be class imbalance, which often leads to inflated performance metrics. However, after thoroughly checking the class distribution, there doesn’t appear to be any significant imbalance. This suggests that the models might either be capturing strong patterns in the data or potentially overfitting. Further investigation, such as cross-validation performance consistency and feature importance analysis, may be necessary to ensure the models’ generalizability.

In [7]:
machine_ori['Downtime'].value_counts()

Downtime
Machine_Failure       1257
No_Machine_Failure    1230
Name: count, dtype: int64

### Hyperparameter Tuning

In [None]:
# Convert X_train and y_train to Pandas DataFrames/Series if needed
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train)
if isinstance(y_train, np.ndarray):
    y_train = pd.Series(y_train)

In [93]:
# define the objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'auc'
    }
    
    # instantiate the kfold
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    roc_auc_scores = []    # instatntiate an empty list to store the roc_auc scores

    for train_index, val_index in strat_kfold.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[train_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        # Ensure X_train and y_train are Pandas DataFrame/Series

        #print(X_train_fold.shape, X_val_fold.shape, y_train_fold.shape, y_val_fold.shape)
        y_train_fold = y_train_fold.values.ravel()
        y_val_fold = y_val_fold.values.ravel()

    
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold,
                    eval_set = [(X_val_fold, y_val_fold)],
                     verbose = False)
        y_pred = model.predict_proba(X_val_fold)[:, 1]
        roc_auc_scores.append(roc_auc_score(y_val_fold, y_pred))
    return mean(roc_auc_scores)




In [94]:
# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 50, timeout=1800) # run 50 trials or max 30mins

# print best params
print('Best Parameters found: ', study.best_params)

[I 2025-02-27 20:50:48,326] A new study created in memory with name: no-name-4e5f8afe-30d0-433a-ae09-2a5855e5942a
  'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
[W 2025-02-27 20:50:48,352] Trial 0 failed with parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.23965282989838496, 'subsample': 0.7722967895638201, 'colsample_bytree': 0.6739608956964448, 'gamma': 6.805526372132293, 'reg_alpha': 2.2893470812400696, 'reg_lambda': 0.17018386825966347} because of the following error: XGBoostError('[20:50:48] C:\\buildkite-agent\\builds\\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\\xgboost\\xgboost-ci-windows\\src\\data\\data.cc:508: Check failed: this->labels.Size() % this->num_row_ == 0 (373 vs. 0) : Incorrect size for labels.').
Traceback (most recent call last):
  File "c:\Users\Administrator\anaconda3\envs\machineind\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:

XGBoostError: [20:50:48] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-08cbc0333d8d4aae1-1\xgboost\xgboost-ci-windows\src\data\data.cc:508: Check failed: this->labels.Size() % this->num_row_ == 0 (373 vs. 0) : Incorrect size for labels.

In [None]:
X_train.shape

In [89]:
 # instantiate the kfold
strat_kfold = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
#roc_auc_scores = []    # instatntiate an empty list to store the roc_auc scores

for train_index, val_index in strat_kfold.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[train_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
print(X_train_fold.shape, X_val_fold.shape, y_train_fold.shape, y_val_fold.shape)


(933, 12) (933, 12) (933,) (932,)


In [90]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold

# Ensure X_train and y_train are Pandas DataFrame/Series
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train)
if isinstance(y_train, np.ndarray):
    y_train = pd.Series(y_train)

# Instantiate Stratified K-Fold
strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for train_index, val_index in strat_kfold.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

print(X_train_fold.shape, X_val_fold.shape, y_train_fold.shape, y_val_fold.shape)


(1492, 12) (373, 12) (1492,) (373,)
