In [24]:
# --- Core Libraries ---
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# --- Core ML Libraries ---
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [25]:
# --- Data Loading ---
file_path = '../data/predictive_maintenance.csv'
df_maint = pd.read_csv(file_path)

# Remove the two identifier columns and 'Failure Type' before modelling.
# NOTE(review): 'Failure Type' presumably encodes the outcome and would leak the
# target if kept as a feature -- confirm against the dataset description.
df_maint_cleaned = df_maint.drop(columns=["UDI", "Product ID", "Failure Type"])

In [26]:
# Preview the cleaned frame to check that only 'Type', the sensor readings and 'Target' remain.
df_maint_cleaned

Unnamed: 0,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0
...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,0
9996,H,298.9,308.4,1632,31.8,17,0
9997,M,299.0,308.6,1645,33.4,22,0
9998,H,299.0,308.7,1408,48.5,25,0


In [27]:
# Separate the label ('Target') from the predictor columns.
y = df_maint_cleaned['Target']
X = df_maint_cleaned.drop(columns='Target')

In [28]:
# Stratify on y so that the class imbalance is preserved in both the
# training set and the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [29]:
# Numeric columns that are routed to the numeric (scaling) transformer.
# NOTE: the cleaned frame displayed earlier in this notebook contains no binary
# failure-mode flag columns (TWF, HDF, ...), so these five are all of its numeric predictors.
numeric_features = ['Air temperature [K]', 'Process temperature [K]', 
                    'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

In [30]:
# Categorical columns that are routed to the one-hot encoder.
categorical_features = ['Type']

In [31]:
# Numeric transformer: standardization only.
numeric_transformer = StandardScaler()

In [32]:
# Categorical transformer: one-hot encoding. handle_unknown='ignore' encodes a
# category that was not seen during fit as all zeros instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [33]:
# Combine both transformers into a single ColumnTransformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    # Columns not listed above are forwarded untransformed. For the predictors shown
    # in this notebook every column is already listed, so nothing is passed through.
    remainder='passthrough'
)

In [34]:
# Baseline pipeline, two stages:
#   1. 'preprocessor' -- the ColumnTransformer defined earlier in this notebook
#   2. 'classifier'   -- a class-weighted logistic regression
clf_log_reg = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        (
            'classifier',
            LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000),
        ),
    ]
)

In [36]:
# Fit the baseline pipeline on the training split.
clf_log_reg.fit(X_train, y_train)

# --- 4.5 Baseline Evaluation ---
y_pred_log_reg = clf_log_reg.predict(X_test)

# Report the per-class metrics (the original author considers accuracy irrelevant here).
print("--- Baseline: Logistic Regression Results ---")
print(
    classification_report(
        y_test,
        y_pred_log_reg,
        target_names=['0 (No Falla)', '1 (Falla)'],
    )
)

--- Baseline: Logistic Regression Results ---
              precision    recall  f1-score   support

0 (No Falla)       0.99      0.82      0.90      1932
   1 (Falla)       0.14      0.82      0.24        68

    accuracy                           0.82      2000
   macro avg       0.57      0.82      0.57      2000
weighted avg       0.96      0.82      0.88      2000



In [37]:
# --- 4.6 Model: Decision Tree ---
# Pipeline for the decision tree.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# stores references to its steps rather than copies, so passing `preprocessor`
# itself would make this fit() refit the very object the baseline pipeline holds.
clf_tree = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', DecisionTreeClassifier(random_state=42,
                                          class_weight='balanced'))
])

# Fit the pipeline on the training split.
clf_tree.fit(X_train, y_train)

# --- 4.7 Decision Tree Evaluation ---
y_pred_tree = clf_tree.predict(X_test)

print("--- Model 2: Decision Tree Results ---")
print(classification_report(y_test, y_pred_tree, target_names=['0 (No Falla)', '1 (Falla)']))

--- Model 2: Decision Tree Results ---
              precision    recall  f1-score   support

0 (No Falla)       0.99      0.99      0.99      1932
   1 (Falla)       0.69      0.60      0.65        68

    accuracy                           0.98      2000
   macro avg       0.84      0.80      0.82      2000
weighted avg       0.98      0.98      0.98      2000



In [38]:
# --- 4.8 Model: K-Nearest Neighbors (KNN) ---
# k=5 is scikit-learn's default number of neighbors.
# clone() gives this pipeline its own unfitted copy of the preprocessor. Pipeline
# stores references to its steps rather than copies, so passing `preprocessor`
# itself would make this fit() refit the very object the baseline pipeline holds.
clf_knn = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

# Fit the pipeline on the training split.
clf_knn.fit(X_train, y_train)

# --- 4.9 KNN Evaluation ---
y_pred_knn = clf_knn.predict(X_test)

print("--- Model 3: KNN Results ---")
print(classification_report(y_test, y_pred_knn, target_names=['0 (No Falla)', '1 (Falla)']))

--- Model 3: KNN Results ---
              precision    recall  f1-score   support

0 (No Falla)       0.98      1.00      0.99      1932
   1 (Falla)       0.83      0.29      0.43        68

    accuracy                           0.97      2000
   macro avg       0.90      0.65      0.71      2000
weighted avg       0.97      0.97      0.97      2000

