<a href="https://colab.research.google.com/github/RoyTng/ADALL-Project/blob/main/My_ADALL_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Config File Readme

from google.colab import drive
drive.mount('/content/drive')  # authorise when prompted

BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/ADALL Project"
DATA_CSV  = f"{BASE_DIR}/Data/Ai4i.csv"

OUT_DIR   = f"{BASE_DIR}/Outputs"
MODEL_DIR = f"{BASE_DIR}/Models"

random_state = 42
target_col = "Machine failure"

drop_cols = [
    "UDI", "Product ID",          # identifiers (leakage risk)
    "TWF", "HDF", "PWF", "OSF", "RNF"  # failure modes (leakage vs target)
]

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import seaborn as sns
import os
# Visualisation
import matplotlib.pyplot as plt
# Modelling and preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, average_precision_score, confusion_matrix
)
import xgboost as xgb


from google.colab import drive
drive.mount('/content/drive')

# DATA_CSV = "/content/drive/MyDrive/Colab Notebooks/ADALL Project/Data/Ai4i.csv"
BASE_DIR = "/content/drive/MyDrive/Colab Notebooks/ADALL Project"
DATA_CSV = f"{BASE_DIR}/Data/Ai4i.csv"
OUT_DIR  = f"{BASE_DIR}/Outputs"
MODEL_DIR = f"{BASE_DIR}/Models"

os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

random_state = 42
target_col = "Machine failure"

df = pd.read_csv(DATA_CSV)
print("Raw shape:", df.shape)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Raw shape: (10000, 14)


**No need to run the code below**

In [12]:
# ====================
# Dataset Google Drive
# ====================
# df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/ADALL Project/Data/Ai4i.csv")

# ====================
# Dataset GitHub
# ====================
# Example: Replace this with the raw URL of your GitHub file
# github_raw_url = 'https://raw.githubusercontent.com/RoyTng/ADALL-Project/refs/heads/main/Ai4i.csv'

try:
    df = pd.read_csv(github_raw_url)
    print("Successfully loaded data from GitHub!")
    display(df.head())
except Exception as e:
    print(f"Error loading data: {e}")
    print("Please ensure the URL is correct and the file format is compatible with `pd.read_csv`.")


Successfully loaded data from GitHub!


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   UDI                      10000 non-null  int64  
 1   Product ID               10000 non-null  object 
 2   Type                     10000 non-null  object 
 3   Air temperature [K]      10000 non-null  float64
 4   Process temperature [K]  10000 non-null  float64
 5   Rotational speed [rpm]   10000 non-null  int64  
 6   Torque [Nm]              10000 non-null  float64
 7   Tool wear [min]          10000 non-null  int64  
 8   Machine failure          10000 non-null  int64  
 9   TWF                      10000 non-null  int64  
 10  HDF                      10000 non-null  int64  
 11  PWF                      10000 non-null  int64  
 12  OSF                      10000 non-null  int64  
 13  RNF                      10000 non-null  int64  
dtypes: float64(3), int64(9)

In [4]:
from google.colab import userdata
from openai import OpenAI

# Load key from Google Colab Secrets
api_key = userdata.get('OPENAI_API_KEY')

client = OpenAI(
    api_key=api_key,
)

In [5]:
#generate a preview of ten rows as text first, so that we can use it for sending to LLM API later.
data_preview = df.head(10).to_string()
print(data_preview)

   UDI Product ID Type  Air temperature [K]  Process temperature [K]  Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  TWF  HDF  PWF  OSF  RNF
0    1     M14860    M                298.1                    308.6                    1551         42.8                0                0    0    0    0    0    0
1    2     L47181    L                298.2                    308.7                    1408         46.3                3                0    0    0    0    0    0
2    3     L47182    L                298.1                    308.5                    1498         49.4                5                0    0    0    0    0    0
3    4     L47183    L                298.2                    308.6                    1433         39.5                7                0    0    0    0    0    0
4    5     L47184    L                298.2                    308.7                    1408         40.0                9                0    0    0    0    0    0
5    6    

### Data Profiling

In [3]:
import pandas as pd
import numpy as np
from io import StringIO

# ---------------------------
# Generate a full dataset profile
# ---------------------------

buffer = StringIO()

# dtypes
buffer.write("=== DTYPES ===\n")
buffer.write(df.dtypes.to_string())
buffer.write("\n\n")

# numeric describe
buffer.write("=== NUMERIC DESCRIBE ===\n")
buffer.write(df.describe().to_string())
buffer.write("\n\n")

# categorical describe
buffer.write("=== CATEGORICAL DESCRIBE ===\n")
try:
    buffer.write(df.describe(include='object').to_string())
except:
    buffer.write("No categorical columns")
buffer.write("\n\n")

# null summary
buffer.write("=== NULL SUMMARY ===\n")
null_summary = (
    df.isna().sum().to_frame("null_count")
    .assign(null_pct=lambda x: x["null_count"]/len(df))
)
buffer.write(null_summary.to_string())
buffer.write("\n\n")

# unique cardinality
buffer.write("=== UNIQUE VALUES PER COLUMN ===\n")
buffer.write(df.nunique().to_frame("unique_count").to_string())
buffer.write("\n\n")

# correlation matrix
buffer.write("=== CORRELATIONS (NUMERIC ONLY) ===\n")
buffer.write(df.corr(numeric_only=True).round(3).to_string())
buffer.write("\n\n")

# value counts for categoricals
buffer.write("=== VALUE COUNTS (TOP 20 PER CATEGORICAL COLUMN) ===\n")
cat_cols = df.select_dtypes(include='object').columns
if len(cat_cols) > 0:
    for col in cat_cols:
        buffer.write(f"\nColumn: {col}\n")
        vc = df[col].value_counts().head(20)
        buffer.write(vc.to_string())
        buffer.write("\n")
else:
    buffer.write("No categorical columns\n")
buffer.write("\n")

# --------- FIXED OUTLIER COMPUTATION (NO BOOLEANS) ---------
buffer.write("=== OUTLIER SUMMARY (IQR METHOD) ===\n")
num_cols = df.select_dtypes(include=['number']).columns  # exclude booleans
Q1 = df[num_cols].quantile(0.25)
Q3 = df[num_cols].quantile(0.75)
IQR = Q3 - Q1
outliers = ((df[num_cols] < (Q1 - 1.5*IQR)) | (df[num_cols] > (Q3 + 1.5*IQR))).sum()
buffer.write(outliers.to_string())
buffer.write("\n\n")

# leakage scan: columns with all unique values
buffer.write("=== POSSIBLE LEAKAGE COLUMNS (UNIQUE FOR EACH ROW) ===\n")
leak_cols = df.columns[df.nunique() == len(df)]
buffer.write(str(list(leak_cols)))
buffer.write("\n\n")

# shape, duplicates, constant cols
buffer.write("=== SHAPE / DUPLICATES / CONSTANT COLUMNS ===\n")
dup_count = df.duplicated().sum()
constant_cols = df.columns[df.nunique() == 1].tolist()
buffer.write(f"Rows: {len(df)}, Columns: {df.shape[1]}\n")
buffer.write(f"Duplicate rows: {dup_count}\n")
buffer.write(f"Constant columns: {constant_cols}\n\n")

# Final text
payload_text = buffer.getvalue()

print(payload_text)


=== DTYPES ===
UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Machine failure              int64
TWF                          int64
HDF                          int64
PWF                          int64
OSF                          int64
RNF                          int64

=== NUMERIC DESCRIBE ===
               UDI  Air temperature [K]  Process temperature [K]  Rotational speed [rpm]   Torque [Nm]  Tool wear [min]  Machine failure           TWF           HDF           PWF           OSF          RNF
count  10000.00000         10000.000000             10000.000000            10000.000000  10000.000000     10000.000000     10000.000000  10000.000000  10000.000000  10000.000000  10000.000000  10000.00000
mean    5000.50000           300.004930               310

### LLM-assisted Data Profiling

In [None]:
response = client.responses.create(
    model="gpt-5-mini",
    instructions="""
You are an expert data scientist with extensive knowledge of tree-based models.
Always justify recommendations using reasoning trace based ONLY on the dataset profile.
""",
    input=f"""
Dataset info: {payload_text}\n
Questions:\n
1. Based on the dataset profile, what data quality issues should be resolved before modelling?
Provide a priority list and justify each item. \n
2. Which columns appear redundant, correlated, or likely to cause leakage?
Explain why each is problematic. \n
Next: Provide a python script to handle the identified issues.
Define one helper function for each issue.
Then define a wrapper function that calls these helper with true false option as user choice
Provide a single line of code to run the overall wrapper function.
Do not encode categorical columns or model first.
""")

print(response.output_text)


Below I first summarize the data-quality issues (prioritized) and explain which columns are problematic (redundant / correlated / leakage) using only the dataset profile you provided. After that I give a self-contained Python preprocessing script with one helper function per issue and a wrapper that calls them according to user-specified booleans. The code does not encode categorical columns or train any model.

1) Priority list of data-quality issues to resolve before modelling (priority descending)
1. Remove clear leakage columns (high priority)
   - Why: TWF, HDF, PWF, OSF, RNF are binary columns that correlate strongly with the target Machine failure (correlations: TWF 0.363, HDF 0.576, PWF 0.523, OSF 0.531). These are likely post-hoc indicators or decompositions of failure (i.e., they directly indicate specific failure types). Keeping them would leak information about the label into training and produce overly optimistic models that won’t generalize. Remove or reserve them only fo

### <font color=gold> Data Handling & Assumptions

<font color=purple>(**Below is the Output from the above Query [payload_text] to GenAI**)

I first summarize the data-quality issues (prioritized) and explain which columns are problematic (redundant / correlated / leakage) using only the dataset profile you provided. After that I give a self-contained Python preprocessing script with one helper function per issue and a wrapper that calls them according to user-specified booleans. The code does not encode categorical columns or train any model.

<font color=gold>A) Priority list of data-quality issues to resolve before modelling (priority descending)
1. Remove clear leakage columns (high priority)
   - Why: TWF, HDF, PWF, OSF, RNF are binary columns that correlate strongly with the target Machine failure (correlations: TWF 0.363, HDF 0.576, PWF 0.523, OSF 0.531). These are likely post-hoc indicators or decompositions of failure (i.e., they directly indicate specific failure types). Keeping them would leak information about the label into training and produce overly optimistic models that won’t generalize. Remove or reserve them only for post-hoc analysis if the task is to predict Machine failure.
2. Remove unique identifiers (high priority)
   - Why: UDI and Product ID are unique per row (UDI unique_count 10000; Product ID unique_count 10000). These are identifiers and cannot generalize; they represent leakage/overfitting risk and should be dropped before modelling.
3. Address extreme class imbalance (high priority)
   - Why: Machine failure mean = 0.0339 (~3.4% positives). Such imbalance requires special handling at modelling time (class weighting, resampling, appropriate evaluation metrics). Without handling, models will be biased to majority class.
4. Handle outliers in numeric predictors (medium priority)
   - Why: The IQR-based outlier summary flags substantial outliers for Rotational speed [rpm] (418 points) and Torque [Nm] (69 points). Outliers can affect some preprocessing steps and may indicate measurement errors or rare operating modes; handle via winsorization/clipping or targeted removal depending on domain needs.
5. Consider highly correlated / redundant features (low-to-medium priority for tree models)
   - Why: Strong correlations exist between:
     - Rotational speed [rpm] and Torque [Nm] (corr = -0.875): these are almost deterministic of each other — one may be redundant.
     - Air temperature [K] and Process temperature [K] (corr = 0.876): large overlap.
     Tree-based models (Random Forest / Gradient Boosting) are robust to multicollinearity, so dropping correlated features is not mandatory; but removing truly redundant features can reduce model complexity and speed up training.
6. Convert object columns to appropriate dtypes but do not one-hot/ordinal-encode yet (low priority)
   - Why: 'Type' is categorical with 3 levels (counts: L 6000, M 2997, H 1003) — convert to pandas 'category' dtype for memory efficiency, but do not encode until modelling decisions are made.
7. Confirm there are no duplicates or missing values (low priority)
   - Why: Profile shows 0 nulls and 0 duplicate rows; nothing to fix here.

<font color=gold>B) Which columns appear redundant, correlated, or likely to cause leakage (and why)
- Leakage (drop prior to model training when predicting Machine failure):
  - TWF, HDF, PWF, OSF, RNF — all binary and have moderate-to-strong correlations with Machine failure (0.363, 0.576, 0.523, 0.531, 0.005 respectively). These are very likely derived failure-mode indicators tied to the label.
  - Product ID and UDI — unique per row; identifiers will leak and not generalize.
- Redundant / Highly correlated:
  - Rotational speed [rpm] and Torque [Nm] (corr = -0.875) — almost collinear; one may be redundant or both carry the same signal with opposite sign.
  - Air temperature [K] and Process temperature [K] (corr = 0.876) — high redundancy.
  - Note: Tree models tolerate correlation, so these can be kept, but if you want fewer features or simpler models, drop one of each high-correlation pair.
- Others to check / caution:
  - Tool wear [min] has many unique values (246) and some weak correlation to Machine failure (0.105) — keep, but consider domain relevance.
  - RNF has negligible correlation and very low prevalence (unique=2, outlier count 19) — could be noisy; depends on domain.

Python preprocessing script
- Each issue is handled by a single helper function.
- A wrapper preprocess_pipeline calls helpers in a logical order and respects user True/False choices.
- The script leaves categorical columns (Type) unencoded; it only optionally casts them to pandas 'category'.
- The wrapper returns the processed DataFrame. It does not model or encode categories.

Paste the script into your environment and run the single-line example provided at the bottom to apply the chosen steps.

##<font color=orange> **Feature Engineering & Transformation**

### Below the Sample Helper's Code:
Helper's code is generated by using the "Payload_Text" with ChatGPT API.

In [4]:
import pandas as pd
import numpy as np
from sklearn.utils import resample

# Helper 1: remove identifier columns that are unique per-row (leakage)
def remove_identifiers(df, drop_udi=True, drop_product_id=True, inplace=False):
    """
    Drops UDI and/or Product ID if present. These are unique identifiers and will cause leakage.
    Returns a new DataFrame (or mutates if inplace=True).
    """
    df_out = df if inplace else df.copy()
    drop_cols = []
    if drop_udi and 'UDI' in df_out.columns:
        drop_cols.append('UDI')
    if drop_product_id and 'Product ID' in df_out.columns:
        drop_cols.append('Product ID')
    if drop_cols:
        df_out = df_out.drop(columns=drop_cols)
    return df_out

# Helper 2: remove failure-mode columns that leak the target
def remove_failure_mode_columns(df, drop_failure_modes=True, inplace=False):
    """
    Drops binary failure-mode columns ['TWF','HDF','PWF','OSF','RNF'] which are likely derived from the target.
    Keep 'Machine failure' (the label) unless explicitly removed by user outside.
    """
    df_out = df if inplace else df.copy()
    cols = ['TWF', 'HDF', 'PWF', 'OSF', 'RNF']
    to_drop = [c for c in cols if c in df_out.columns]
    if drop_failure_modes and to_drop:
        df_out = df_out.drop(columns=to_drop)
    return df_out

# Helper 3: handle outliers with IQR clipping/winsorization or removal
def handle_outliers_iqr(df, columns=None, method='clip', iqr_multiplier=1.5, inplace=False):
    """
    columns: list of numeric column names to inspect; if None, defaults to ['Rotational speed [rpm]','Torque [Nm]']
    method: 'clip' (winsorize at bounds), 'remove' (drop rows outside bounds), or 'none'
    Returns DataFrame after operation.
    """
    df_out = df if inplace else df.copy()
    if columns is None:
        columns = [c for c in ['Rotational speed [rpm]', 'Torque [Nm]'] if c in df_out.columns]
    for col in columns:
        if col not in df_out.columns:
            continue
        series = df_out[col].dropna()
        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - iqr_multiplier * iqr
        upper = q3 + iqr_multiplier * iqr
        if method == 'clip':
            df_out[col] = df_out[col].clip(lower=lower, upper=upper)
        elif method == 'winsorize':
            # Explicit winsorize by clipping (same as clip)
            df_out[col] = df_out[col].clip(lower=lower, upper=upper)
        elif method == 'remove':
            mask = df_out[col].between(lower, upper, inclusive='both')
            df_out = df_out[mask].reset_index(drop=True)
        elif method == 'none':
            pass
        else:
            raise ValueError("method must be one of ['clip','winsorize','remove','none']")
    return df_out

# Helper 4: drop highly correlated numeric features (optional)
def drop_highly_correlated(df, threshold=0.95, target_column='Machine failure', inplace=False):
    """
    Drops one column from each pair of numeric features with absolute correlation >= threshold.
    Strategy: for a pair (i,j), drop the column with larger mean absolute correlation with others.
    The target_column (if present) will never be dropped.
    Returns DataFrame with columns dropped.
    """
    df_out = df if inplace else df.copy()
    numeric = df_out.select_dtypes(include=[np.number])
    if numeric.shape[1] <= 1:
        return df_out
    if target_column in numeric.columns:
        numeric_for_corr = numeric.drop(columns=[target_column])
    else:
        numeric_for_corr = numeric
    corr = numeric_for_corr.corr().abs()
    # Upper triangle mask
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = set()
    # Find pairs to drop
    for col in upper.columns:
        high_pairs = upper.index[upper[col] >= threshold].tolist()
        for other in high_pairs:
            if other in to_drop or col in to_drop:
                continue
            # compute mean abs correlation to decide which to drop
            mean_corr_col = corr[col].mean()
            mean_corr_other = corr[other].mean()
            if mean_corr_col >= mean_corr_other:
                to_drop.add(col)
            else:
                to_drop.add(other)
    # Remove only those existing in df_out
    to_drop = [c for c in to_drop if c in df_out.columns]
    if to_drop:
        df_out = df_out.drop(columns=to_drop)
    return df_out

# Helper 5: set categorical dtype for Type (no encoding)
def set_categorical_dtype(df, columns=None, inplace=False):
    """
    Convert specified columns to pandas 'category' dtype. Does not encode to numbers.
    If columns is None, default to ['Type'] if present.
    """
    df_out = df if inplace else df.copy()
    if columns is None:
        columns = ['Type'] if 'Type' in df_out.columns else []
    for col in columns:
        if col in df_out.columns:
            df_out[col] = df_out[col].astype('category')
    return df_out

# Helper 6: handle class imbalance by simple random oversampling or undersampling
def resample_for_balance(df, target='Machine failure', method=None, minority_oversample_ratio=1.0, random_state=42, inplace=False):
    """
    method options:
      - None: do nothing (default)
      - 'undersample': downsample majority class to minority_count * minority_oversample_ratio
      - 'oversample': upsample minority class to majority_count * minority_oversample_ratio (simple random oversampling with replacement)
    Returns resampled DataFrame.
    Note: This is a simple resampling; for advanced methods consider SMOTE or class_weight at model training.
    """
    df_out = df if inplace else df.copy()
    if method is None:
        return df_out
    if target not in df_out.columns:
        raise KeyError(f"target column '{target}' not found in DataFrame")
    df_pos = df_out[df_out[target] == 1]
    df_neg = df_out[df_out[target] == 0]
    n_pos = len(df_pos)
    n_neg = len(df_neg)
    if n_pos == 0 or n_neg == 0:
        return df_out
    if method == 'undersample':
        target_count = int(n_pos * minority_oversample_ratio)
        if target_count < 1:
            raise ValueError("Resulting undersampled majority size is less than 1; adjust minority_oversample_ratio")
        df_neg_down = resample(df_neg, replace=False, n_samples=target_count, random_state=random_state)
        df_resampled = pd.concat([df_neg_down, df_pos], axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)
        return df_resampled
    elif method == 'oversample':
        target_count = int(n_neg * minority_oversample_ratio)
        if target_count < 1:
            raise ValueError("Resulting oversampled minority size is less than 1; adjust minority_oversample_ratio")
        df_pos_up = resample(df_pos, replace=True, n_samples=target_count, random_state=random_state)
        df_resampled = pd.concat([df_neg, df_pos_up], axis=0).sample(frac=1, random_state=random_state).reset_index(drop=True)
        return df_resampled
    else:
        raise ValueError("method must be one of [None, 'undersample', 'oversample']")

# Helper 7: remove duplicate rows if any (simple)
def remove_duplicate_rows(df, subset=None, keep='first', inplace=False):
    """
    Removes duplicate rows. subset: list of columns to consider; if None uses all columns.
    """
    df_out = df if inplace else df.copy()
    df_out = df_out.drop_duplicates(subset=subset, keep=keep).reset_index(drop=True)
    return df_out

# Wrapper pipeline that calls helpers with boolean options
def preprocess_pipeline(
    df,
    drop_udi=True,
    drop_product_id=True,
    drop_failure_modes=True,
    outlier_method=None,            # options: 'clip','winsorize','remove','none'
    outlier_columns=None,            # default uses Rotational speed and Torque if present
    drop_correlated=False,
    corr_threshold=0.95,
    set_cat=True,
    cat_columns=None,
    balance_method=None,             # options: None, 'undersample', 'oversample'
    balance_ratio=1.0,
    random_state=42,
    inplace=False
):
    """
    Applies a sequence of preprocessing steps in sensible order and returns processed DataFrame.
    Steps (order):
      1. cast categorical dtypes (Type) if requested
      2. remove identifiers (UDI, Product ID)
      3. remove failure modes (TWF,HDF,PWF,OSF,RNF) if requested
      4. handle outliers in specified columns
      5. drop highly correlated numeric features if requested
      6. resample for class balance (if requested)
      7. remove duplicate rows
    """
    df_out = df if inplace else df.copy()
    # 1: categorical dtype cast
    if set_cat:
        df_out = set_categorical_dtype(df_out, columns=cat_columns)
    # 2: identifiers
    df_out = remove_identifiers(df_out, drop_udi=drop_udi, drop_product_id=drop_product_id)
    # 3: failure modes
    df_out = remove_failure_mode_columns(df_out, drop_failure_modes=drop_failure_modes)
    # 4: outliers
    if outlier_method is not None:
      df_out = handle_outliers_iqr(df_out, columns=outlier_columns, method=outlier_method)
    # 5: drop correlated (do not drop the target)
    if drop_correlated:
        df_out = drop_highly_correlated(df_out, threshold=corr_threshold, target_column='Machine failure')
    # 6: resample for balance
    if balance_method is not None:
        df_out = resample_for_balance(df_out, target='Machine failure', method=balance_method,
                                      minority_oversample_ratio=balance_ratio, random_state=random_state)
    # 7: remove duplicates just in case
    df_out = remove_duplicate_rows(df_out)
    return df_out

# Example single-line run (modify options as desired):
# processed_df = preprocess_pipeline(df, drop_udi=True, drop_product_id=True, drop_failure_modes=True, outlier_method='clip', drop_correlated=True, corr_threshold=0.95, set_cat=True, balance_method=None)


Single line to run the overall wrapper (example):
- If your original DataFrame is named df, run:

processed_df = preprocess_pipeline(df, drop_udi=True, drop_product_id=True, drop_failure_modes=True, outlier_method='clip', drop_correlated=True, corr_threshold=0.95, set_cat=True, balance_method=None)

Notes and recommended settings (based ONLY on the dataset profile):
- Definitely set drop_failure_modes=True and drop_udi/drop_product_id=True before training any model to avoid leakage.
- For outliers, I recommend method='clip' (winsorize) as a safe default — it mitigates extreme values without dropping rows. Use 'remove' only if domain experts confirm those rows are erroneous.
- For correlated features: because tree-based models are robust, drop_correlated=False is acceptable. If you want a compact feature set, use drop_correlated=True with threshold around 0.90–0.95. Here the report shows two strong ~0.875 and ~0.876 correlations — these are under 0.95 but you may still choose to drop one from each pair manually if you want.
- For class imbalance: Do not blindly oversample without a plan. I suggest initially training with class_weight (in model) or using careful resampling with cross validation. If you prefer simple resampling in preprocessing, set balance_method='oversample' (or 'undersample') — the function provides simple random resampling.
- Keep 'Type' as a categorical column (set_cat=True) but defer encoding to modelling stage so you can choose the best encoding strategy (target encoding, ordinal, one-hot, embeddings, etc.).

If you want, I can:
- Provide recommended feature subsets for tree-based models (given the correlations).
- Suggest sample hyperparameters (class_weight, evaluation metrics) tuned for this 3.4% positive class.
- Provide a ready-to-run training snippet for a RandomForest/LightGBM with class weighting and cross-validation (after you confirm which preprocessing steps you want applied).

### <font color=cyan> Leakage policy (Locked)

- Drop identifiers (UDI, Product ID) and failure-mode columns (TWF, HDF, PWF, OSF, RNF) before modeling to avoid target leakage.
- We used a stratified 60/20/20 split. All model choices and threshold tuning were done using Train + Validation only.
- The Test set was kept untouched and used once at the end as the final exam to estimate real-world performance.
- If resampling is used later, apply it inside CV folds only (never before the split).


### <font color=gold>Leakage Drop:

In [5]:
# Leakage policy enforcement ONLY (no resampling, no correlated dropping, no outlier clipping yet)
df_proc = preprocess_pipeline(
    df,
    drop_udi=True,
    drop_product_id=True,
    drop_failure_modes=True,
    outlier_method=None,        # defer
    drop_correlated=False,      # defer
    set_cat=True,               # OK: dtype only
    balance_method=None,        # MUST be None (no resampling before split)
    random_state=random_state
)

print("After leakage drop shape:", df_proc.shape)


After leakage drop shape: (10000, 7)


### <font color=gold>Stratified 60/20/20 split

In [6]:
from sklearn.model_selection import train_test_split

y = df_proc[target_col].astype(int)
X = df_proc.drop(columns=[target_col])

# 60/20/20 stratified split, random_state=42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.40, stratify=y, random_state=random_state
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=random_state
)

print(X_train.shape, X_valid.shape, X_test.shape)


(6000, 6) (2000, 6) (2000, 6)


<font color=red>Below the code to check for Column "Name"

There is no need to run every time, just a check.

In [8]:
def check_type_column_quick(X):
    print("First 20 columns:", X.columns.tolist()[:20])
    print("Contains 'Type' exactly?", "Type" in X.columns)

check_type_column_quick(X)


First 20 columns: ['Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
Contains 'Type' exactly? True


### <font color=gold>Audit + save to Drive

In [7]:
def split_audit(name, y_part):
    n = len(y_part)
    pos = int(y_part.sum())
    rate = pos / n if n else 0
    return {"split": name, "n": n, "failures": pos, "failure_rate": round(rate, 4)}

audit = pd.DataFrame([
    split_audit("train", y_train),
    split_audit("valid", y_valid),
    split_audit("test",  y_test),
])

print(audit)

# Save audit artifact (Drive-only)
audit_path = f"{OUT_DIR}/split_audit_60_20_20.csv"
audit.to_csv(audit_path, index=False)
print('=======================================')
print()
print("Saved:", audit_path)

print("\nPredictor columns:", X.columns.tolist())
print("Leakage columns dropped: UDI, Product ID, TWF, HDF, PWF, OSF, RNF")


   split     n  failures  failure_rate
0  train  6000       203        0.0338
1  valid  2000        68        0.0340
2   test  2000        68        0.0340

Saved: /content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/split_audit_60_20_20.csv

Predictor columns: ['Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']
Leakage columns dropped: UDI, Product ID, TWF, HDF, PWF, OSF, RNF


Test set frozen; model selection + threshold tuning use Train + Validation only; **evaluate Test once at the end**.

In [11]:
import pandas as pd
import numpy as np

split_idx = pd.DataFrame({
    "index": np.concatenate([X_train.index, X_valid.index, X_test.index]),
    "split": (["train"] * len(X_train)) + (["valid"] * len(X_valid)) + (["test"] * len(X_test))
}).sort_values("index").reset_index(drop=True)

split_idx_path = f"{OUT_DIR}/split_indices_60_20_20.csv"
split_idx.to_csv(split_idx_path, index=False)
print("Saved:", split_idx_path)


Saved: /content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/split_indices_60_20_20.csv


This gives you a durable record of which original rows went into train/valid/test, saved to Drive under Outputs/.

If later you do introduce any row-dropping (e.g., handle_outliers_iqr(... method='remove')), switch to a row_id column before preprocessing—but based on your plan, you can ignore that for now.

## <font color=gold>**Logistic Regression** (Baseline Modelling)

Sanity check (inputs exist)

In [9]:
def step0_sanity_check_inputs(X_train, y_train, X_valid, y_valid):
    print("X_train:", X_train.shape, "y_train:", y_train.shape)
    print("X_valid:", X_valid.shape, "y_valid:", y_valid.shape)
    print("Has 'Type' column?", "Type" in X_train.columns)

step0_sanity_check_inputs(X_train, y_train, X_valid, y_valid)


X_train: (6000, 6) y_train: (6000,)
X_valid: (2000, 6) y_valid: (2000,)
Has 'Type' column? True


Define output paths (Drive-only)

In [10]:
def step1_define_output_paths(OUT_DIR):
    return {
        "metrics_valid": f"{OUT_DIR}/metrics_baseline_lr_validation.csv",
        "preds_valid":   f"{OUT_DIR}/predictions_baseline_lr_validation.csv",
        "cm_valid":      f"{OUT_DIR}/confusion_matrix_baseline_lr_validation.csv",
    }

paths = step1_define_output_paths(OUT_DIR)
paths


{'metrics_valid': '/content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/metrics_baseline_lr_validation.csv',
 'preds_valid': '/content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/predictions_baseline_lr_validation.csv',
 'cm_valid': '/content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/confusion_matrix_baseline_lr_validation.csv'}

Build the LR preprocessor (one-hot Type + scale numerics)

This follows the “two-preprocessor” idea: LR gets scaling; tree models later won’t.

In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

def step2_build_preprocessor_lr(X_train, cat_features=("Type",)):
    cat_features = list(cat_features)
    num_features = [c for c in X_train.columns if c not in cat_features]

    preprocessor = ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
            ("num", StandardScaler(), num_features),
        ],
        remainder="drop"
    )
    return preprocessor, cat_features, num_features

preprocessor_lr, cat_cols, num_cols = step2_build_preprocessor_lr(X_train, cat_features=("Type",))
cat_cols, num_cols


(['Type'],
 ['Air temperature [K]',
  'Process temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]',
  'Tool wear [min]'])

Build the baseline model (Logistic Regression, balanced)

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

def step3_build_lr_model(preprocessor, random_state=42):
    lr = LogisticRegression(
        solver="liblinear",
        max_iter=2000,
        class_weight="balanced",
        random_state=random_state
    )
    pipe = Pipeline(steps=[("prep", preprocessor), ("lr", lr)])
    return pipe

pipe_lr = step3_build_lr_model(preprocessor_lr, random_state=random_state)
pipe_lr


Fit on Train only

In [13]:
def step4_fit_model(pipe, X_train, y_train):
    pipe.fit(X_train, y_train)
    return pipe

pipe_lr = step4_fit_model(pipe_lr, X_train, y_train)


 Predict on Validation (probabilities + thresholded labels)

In [14]:
import numpy as np

def step5_predict_valid(pipe, X_valid, threshold=0.50):
    proba = pipe.predict_proba(X_valid)[:, 1]
    pred = (proba >= threshold).astype(int)
    return proba, pred

threshold = 0.50
valid_proba, valid_pred = step5_predict_valid(pipe_lr, X_valid, threshold=threshold)
(valid_proba[:5], valid_pred[:5])


(array([0.13274806, 0.03507193, 0.12251955, 0.13024163, 0.63313876]),
 array([0, 0, 0, 0, 1]))

Evaluate metrics (PR-AUC/AP, precision, recall) + confusion matrix

PR-AUC/AP is the primary metric in your modelling plan, and precision/recall support your later risk-banding decision.

In [20]:
import pandas as pd
from sklearn.metrics import average_precision_score, precision_score, recall_score, confusion_matrix

def step6_eval_valid(y_valid, valid_proba, valid_pred, threshold=0.50):
    metrics_df = pd.DataFrame([{
        "model": "LogisticRegression_balanced",
        "split": "valid",
        "n": int(len(y_valid)),
        "failures": int(pd.Series(y_valid).sum()),
        "threshold": float(threshold),
        "pr_auc_ap": float(average_precision_score(y_valid, valid_proba)),
        "precision": float(precision_score(y_valid, valid_pred, zero_division=0)),
        "recall": float(recall_score(y_valid, valid_pred, zero_division=0)),
    }])

    cm = confusion_matrix(y_valid, valid_pred)
    cm_df = pd.DataFrame(cm, index=["actual_0", "actual_1"], columns=["pred_0", "pred_1"])
    return metrics_df, cm_df

metrics_df, cm_df = step6_eval_valid(y_valid, valid_proba, valid_pred, threshold=threshold)
metrics_df, cm_df


(                         model  split     n  failures  threshold  pr_auc_ap  \
 0  LogisticRegression_balanced  valid  2000        68        0.5   0.407236   
 
    precision    recall  
 0       0.15  0.794118  ,
           pred_0  pred_1
 actual_0    1626     306
 actual_1      14      54)

<font color=cyan> **Code for printing nice table**

In [21]:
import pandas as pd

def pretty_print_tables(metrics_df, cm_df):
    # Make pandas show full width nicely in Colab
    pd.set_option("display.max_columns", 50)
    pd.set_option("display.width", 120)

    # Format numbers
    metrics_show = metrics_df.copy()
    float_cols = [c for c in metrics_show.columns if metrics_show[c].dtype.kind == "f"]
    for c in float_cols:
        metrics_show[c] = metrics_show[c].map(lambda x: f"{x:.4f}")

    print("Validation metrics")
    display(metrics_show)

    print("\nConfusion matrix (rows=actual, cols=predicted)")
    display(cm_df)


In [27]:
pretty_print_tables(metrics_df, cm_df)


Validation metrics


Unnamed: 0,model,split,n,failures,threshold,pr_auc_ap,precision,recall
0,LogisticRegression_balanced,valid,2000,68,0.5,0.4072,0.15,0.7941



Confusion matrix (rows=actual, cols=predicted)


Unnamed: 0,pred_0,pred_1
actual_0,1626,306
actual_1,14,54


<font color=cyan> **Clean Table for Slides/Notes**

In [24]:
def export_readable_tables(metrics_df, cm_df):
    print("Metrics (text)")
    print(metrics_df.to_string(index=False))

    print("\nMetrics (markdown)")
    print(metrics_df.to_markdown(index=False))

    print("\nConfusion matrix (markdown)")
    print(cm_df.to_markdown())


In [25]:
export_readable_tables(metrics_df, cm_df)

Metrics (text)
                      model split    n  failures  threshold  pr_auc_ap  precision   recall
LogisticRegression_balanced valid 2000        68        0.5   0.407236       0.15 0.794118

Metrics (markdown)
| model                       | split   |    n |   failures |   threshold |   pr_auc_ap |   precision |   recall |
|:----------------------------|:--------|-----:|-----------:|------------:|------------:|------------:|---------:|
| LogisticRegression_balanced | valid   | 2000 |         68 |         0.5 |    0.407236 |        0.15 | 0.794118 |

Confusion matrix (markdown)
|          |   pred_0 |   pred_1 |
|:---------|---------:|---------:|
| actual_0 |     1626 |      306 |
| actual_1 |       14 |       54 |


Confusion Matrix

In [28]:
def print_confusion_counts(cm_df):
    tn = int(cm_df.loc["actual_0", "pred_0"])
    fp = int(cm_df.loc["actual_0", "pred_1"])
    fn = int(cm_df.loc["actual_1", "pred_0"])
    tp = int(cm_df.loc["actual_1", "pred_1"])

    print(f"TN (actual 0, predicted 0): {tn}")
    print(f"FP (actual 0, predicted 1): {fp}")
    print(f"FN (actual 1, predicted 0): {fn}")
    print(f"TP (actual 1, predicted 1): {tp}")

print_confusion_counts(cm_df)


TN (actual 0, predicted 0): 1626
FP (actual 0, predicted 1): 306
FN (actual 1, predicted 0): 14
TP (actual 1, predicted 1): 54


Save Validation Predictions (for risk banding later) + metrics to Drive

Your unified plan explicitly calls for saving predictions_validation.csv-type artifacts (scores + true label).

In [16]:
def step7_save_valid_artifacts(paths, X_valid, y_valid, valid_proba, valid_pred, metrics_df, cm_df):
    preds_valid = X_valid.copy()
    preds_valid["y_true"] = y_valid.values if hasattr(y_valid, "values") else y_valid
    preds_valid["y_proba"] = valid_proba
    preds_valid[f"y_pred@{0.50:.2f}"] = valid_pred

    metrics_df.to_csv(paths["metrics_valid"], index=False)
    preds_valid.to_csv(paths["preds_valid"], index=False)
    cm_df.to_csv(paths["cm_valid"], index=True)

    print("Saved:", paths["metrics_valid"])
    print("Saved:", paths["preds_valid"])
    print("Saved:", paths["cm_valid"])
    return preds_valid

preds_valid_df = step7_save_valid_artifacts(
    paths, X_valid, y_valid, valid_proba, valid_pred, metrics_df, cm_df
)

preds_valid_df.head()


Saved: /content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/metrics_baseline_lr_validation.csv
Saved: /content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/predictions_baseline_lr_validation.csv
Saved: /content/drive/MyDrive/Colab Notebooks/ADALL Project/Outputs/confusion_matrix_baseline_lr_validation.csv


Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],y_true,y_proba,y_pred@0.50
409,L,297.4,308.6,1459,41.3,196,0,0.132748,0
1780,L,298.5,308.2,1595,32.3,46,0,0.035072,0
3873,L,302.6,311.8,1638,35.1,17,0,0.12252,0
9954,M,298.1,307.9,1446,42.8,121,0,0.130242,0
2743,L,299.6,309.1,1476,45.3,166,0,0.633139,1


### <font color=gold>Place Holder

### <font color=gold>Place Holder