## Machine Learning Model Development

##### Import the necessary libraries 

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, StratifiedShuffleSplit,\
                                    cross_val_predict
from sklearn.preprocessing import RobustScaler, StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score,\
                            recall_score, f1_score, roc_auc_score
        

import optuna

In [None]:
# load the dataset
machine = pd.read_csv("../data/machine_downtime_cleaned.csv", parse_dates=['Date'])

# make a copy of the data 
machine_ori = machine.copy()
# print the first few rows
machine.head()

### Preprocessing

we have to divide the numeric columns into those that are skewed and those that are normal in order to be able to apply the necessary standardization or normalization to avoid bias

In [None]:
# create an empty list to store columns that are normally or
# skewly distributed
normal_cols = []
skewed_cols = []

# loop through the numerical features
for col in machine_ori.select_dtypes(include=np.number):
    skewness = machine_ori[col].skew()
    kurtosis = machine_ori[col].kurt()

    # set a threshold for kurtosis and skewness and then append the necessary features
    if -0.2 <= skewness <= 0.3 and -0.2 <= kurtosis <= 0.2:  # Adjust thresholds as needed
        normal_cols.append(col)
        print(f"{col}: Skewness = {skewness:.2f}, Kurtosis = {kurtosis:.2f} (Approximately Normal)")
    else:
        skewed_cols.append(col)
        print(f"{col}: Skewness = {skewness:.2f}, Kurtosis = {kurtosis:.2f} (Not Normally Distributed)")


### Model Parameters Preparation

In [None]:
# Define target and features
X = machine_ori.drop(columns=["Downtime", "Machine_ID", "Date", "Assembly_Line_No"])  # Features

# define encoder
label_encode = LabelEncoder()
y = label_encode.fit_transform(machine_ori["Downtime"])  # Target variable

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns

# Define transformers
preprocessor = ColumnTransformer([
    ("robust", RobustScaler(), skewed_cols),  # Skewed data
    ("standard", StandardScaler(), normal_cols)  # Normal data 
])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    stratify = y, random_state=42)

# Define models
models = {
    "Bayesian Logistic Regression": LogisticRegression(solver="lbfgs"),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "SVM": SVC(kernel="rbf", probability=True, random_state=42),
    "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric="auc", random_state = 42)
}


### Train the model 
#### Cross Validation

Since our problem is a classification task, Stratified K-Fold (StratifiedKFold) will be use for the cross validation. 

Why Use Stratified K-Fold?

+ Preserves Class Distribution: Stratified K-Fold ensures that each fold maintains the same proportion of classes as the overall dataset, which is crucial when dealing with classification problems, even if there is no visible class imbalance.
+ More Reliable Performance Estimates: It provides a more stable and representative estimate of your model’s performance compared to ShuffleSplit, which may produce folds with different class distributions.
+ Better Generalization: Ensures that all classes are well represented in training and validation splits, reducing the risk of biased results.

**Key Performance Metrics and Their Meaning**

+ Precision: Measures how many of the predicted failures were actually failures. A high precision means fewer false positives.
+ Recall: Measures how many of the actual failures were correctly identified. A high recall means fewer false negatives.
+ F1-Score: Harmonic mean of precision and recall, balancing both. Higher is better.
+ ROC AUC: Measures the model’s ability to distinguish between classes. A value closer to 1 is better.

**Model Comparison and Best Performing Model**

Model Performance Interpretation:

1. Best Overall Model: XGBoost (0.9993 ROC AUC, 0.9869 F1-Score)

+ Highest ROC AUC (0.9993) → Best discrimination ability.
+ Very high precision (0.9934) → Almost all predicted failures were actual failures.
+ Very high recall (0.9805) → Nearly all actual failures were correctly identified.
+ Strong balance between precision & recall (F1-Score = 0.9869).

Likely the best choice for deployment.

2. Random Forest is also very strong (0.9989 ROC AUC, 0.9870 F1-Score)

> + Very similar performance to XGBoost.
> + If interpretability is needed, Random Forest may be preferable.

3. Gradient Boosting also performs well (0.9981 ROC AUC, 0.9853 F1-Score)

> + Close competitor but slightly lower recall than XGBoost.

4. Decision Tree (0.9647 ROC AUC, 0.9644 F1-Score)

Still good but lacks the power of ensemble methods.

5. SVM & Bayesian Logistic Regression are weaker

> + SVM (0.9469 ROC AUC, 0.8696 F1-Score) and Bayesian Logistic Regression (0.9125 ROC AUC, 0.8419 F1-Score) underperform compared to ensemble models.


In [20]:
# craete an empty list to store model result
model_results = []

# Initialize Stratified K-Fold
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# iterate through the models
for name, model in models.items():
    # create a pipeline
    pipeline = Pipeline([
        
        ('preprocessor', preprocessor),
        ('classifier', model)
        
        ])
    
    # Perform cross-validation
    y_pred_cv = cross_val_predict(pipeline, X_train, y_train, cv=cv)
    y_prob_cv = cross_val_predict(pipeline, X_train, y_train, cv=cv, method="predict_proba")\
                [:, 1] if hasattr(model, "predict_proba") else None
    
    # evaluate Metrics
    precision = precision_score(y_train, y_pred_cv)
    recall = recall_score(y_train, y_pred_cv)
    f1 = f1_score(y_train, y_pred_cv)
    roc_auc = roc_auc_score(y_train, y_prob_cv) if y_prob_cv is not None else 'N/A'
    
    # append result
    model_results.append({
        "Model": name,
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1-Score": round(f1, 4),
        "ROC AUC": round(roc_auc, 4) if roc_auc != "N/A" else "N/A"
    })
    
    # convert result to Datframe
    model_results_df = pd.DataFrame(model_results)
    

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [21]:
model_results_df.head(10)

Unnamed: 0,Model,Precision,Recall,F1-Score,ROC AUC
0,Bayesian Logistic Regression,0.8565,0.8547,0.8556,0.9287
1,Random Forest,0.9828,0.9902,0.9865,0.9988
2,Gradient Boosting,0.9892,0.9957,0.9924,0.9992
3,Decision Tree,0.9665,0.9696,0.9681,0.9684
4,SVM,0.8839,0.8753,0.8796,0.9457
5,XGBoost,0.9903,0.9946,0.9924,0.9992


### Check for Class Imbalance

I have trained all the models involved, and most of them exhibit exceptionally high evaluation metric values, reaching as high as 0.99. Given that this is a classification problem, one potential concern could be class imbalance, which often leads to inflated performance metrics. However, after thoroughly checking the class distribution, there doesn’t appear to be any significant imbalance. This suggests that the models might either be capturing strong patterns in the data or potentially overfitting. Further investigation, such as cross-validation performance consistency and feature importance analysis will be implemented to ensure the models’ generalizability.

In [None]:
machine_ori['Downtime'].value_counts()

In [None]:
# Define Stratified K-Fold Cross Validation
n_splits = 5
strat_kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# List to store AUC scores for each fold
roc_scores = []

y_train = pd.Series(y_train)

# Perform Cross-Validation
for train_idx, val_idx in strat_kfold.split(X_train, y_train):
    # Split data
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    # Define the model
    model_cross = xgb.XGBClassifier(
        n_estimators=300,  # Fixed number of trees
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        reg_alpha=0,
        reg_lambda=1,
       # use_label_encoder=False,
        eval_metric='auc',
        random_state=42
    )
    
    # Train the model
    model_cross.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], 
                verbose=False)
    
    # Predict probabilities
    y_pred = model_cross.predict_proba(X_val_fold)[:, 1]
    
    # Compute ROC-AUC score
    roc_auc = roc_auc_score(y_val_fold, y_pred)
    
    roc_scores.append(roc_auc)

# Print the average ROC-AUC score
mean_auc = np.mean(roc_scores)
print(f"Mean ROC AUC across {n_splits} folds: {mean_auc:.4f}")

### Hyperparameter Tuning

In [None]:
# Convert X_train and y_train to Pandas DataFrames/Series if needed
if isinstance(X_train, np.ndarray):
    X_train = pd.DataFrame(X_train)
if isinstance(y_train, np.ndarray):
    y_train = pd.Series(y_train)

In [None]:
# define the objective function for Optuna
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500, step=50),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'auc'
    }
    
    # instantiate the kfold
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    roc_auc_scores = []    # instatntiate an empty list to store the roc_auc scores

    for train_index, val_index in strat_kfold.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[train_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
        # Ensure X_train and y_train are Pandas DataFrame/Series

        #print(X_train_fold.shape, X_val_fold.shape, y_train_fold.shape, y_val_fold.shape)
        y_train_fold = y_train_fold.values.ravel()
        y_val_fold = y_val_fold.values.ravel()

    
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_fold, y_train_fold,
                    eval_set = [(X_val_fold, y_val_fold)],
                     verbose = False)
        y_pred = model.predict_proba(X_val_fold)[:, 1]
        roc_auc_scores.append(roc_auc_score(y_val_fold, y_pred))
    return mean(roc_auc_scores)




In [None]:
# Run Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials = 50, timeout=1800) # run 50 trials or max 30mins

# print best params
print('Best Parameters found: ', study.best_params)