In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Load processed feature sets
X_train = pd.read_csv("../data/processed/X_train_scaled.csv")
X_val   = pd.read_csv("../data/processed/X_val_scaled.csv")

# Load processed label sets (contain Machine failure + TWF/HDF/etc.)
y_train_raw = pd.read_csv("../data/processed/y_train.csv")
y_val_raw   = pd.read_csv("../data/processed/y_val.csv")

# Collapse one-hot failure subtype columns into a single multiclass target
def create_multiclass_target(df):
    failure_cols = ["TWF", "HDF", "PWF", "OSF", "RNF"]
    # If all zero → no failure → class 0
    target = df[failure_cols].idxmax(axis=1)
    target = target.replace({
        "TWF": 1,
        "HDF": 2,
        "PWF": 3,
        "OSF": 4,
        "RNF": 5
    }).fillna(0).astype(int)
    return target.values

y_train = create_multiclass_target(y_train_raw)
y_val   = create_multiclass_target(y_val_raw)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

# Baseline multinomial logistic regression
model = LogisticRegression(
    multi_class="multinomial",
    class_weight="balanced",
    max_iter=1000,
    random_state=123
)

model.fit(X_train, y_train)

# Evaluate
preds = model.predict(X_val)
print(classification_report(y_val, preds))


(7499, 6) (7499,)
(1501, 6) (1501,)
              precision    recall  f1-score   support

           1       1.00      0.63      0.77      1455
           2       0.28      0.94      0.44        18
           3       0.32      0.90      0.47        10
           4       0.41      1.00      0.58        15
           5       0.00      0.67      0.01         3

    accuracy                           0.64      1501
   macro avg       0.40      0.83      0.45      1501
weighted avg       0.98      0.64      0.76      1501



  target = target.replace({
  target = target.replace({
