##### KNN


# FailureSense – KNN and SVM Models

Objective:
- Implement distance-based classifiers
- Evaluate performance under class imbalance
- Compare with earlier supervised models


In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC


In [2]:
# Location of the raw CSV (ai4i2020.csv). The default is the original
# machine-specific path; set the FAILURESENSE_DATA environment variable to
# point at the file on any other machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "FAILURESENSE_DATA",
    r"D:\FailureSense_MLProj\failuresense\data\raw\ai4i2020.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Name of the label column (the classification reports below show it takes
# the values 0 and 1).
TARGET = "Machine failure"
# Columns removed from the feature matrix. Presumably an ID column plus the
# per-failure-mode flags, which would leak the label — confirm against the
# dataset description.
DROP_COLUMNS = ["UDI", "TWF", "HDF", "PWF", "OSF", "RNF"]

# Labels first, then every remaining column except the dropped ones and the
# label itself.
y = df[TARGET]
X = df.drop(columns=[*DROP_COLUMNS, TARGET])


In [4]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [5]:
# Column groups, each routed to its own transformer below.
categorical_features = ["Type"]
numeric_features = [
    "Air temperature [K]",
    "Process temperature [K]",
    "Rotational speed [rpm]",
    "Torque [Nm]",
    "Tool wear [min]",
]

# Standardise the numeric columns and one-hot encode the categorical one.
# handle_unknown="ignore" makes categories unseen during fit encode as all
# zeros instead of raising. Columns in neither group are dropped, which is
# the ColumnTransformer default (remainder="drop").
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
    ]
)


In [6]:
# KNN baseline: preprocessing followed by a 5-nearest-neighbour classifier.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor.
# Pipeline fits its steps in place, so embedding the shared `preprocessor`
# object directly would let any other pipeline built on the same object
# refit this model's scaler and encoder behind its back.
knn_model = Pipeline(
    steps=[
        ("preprocessing", clone(preprocessor)),
        ("model", KNeighborsClassifier(n_neighbors=5)),
    ]
)

# fit() returns the fitted pipeline; as the last expression of the cell it is
# also what gets displayed.
knn_model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [7]:
# Hard class predictions from the fitted KNN pipeline on the held-out split.
y_pred_knn = knn_model.predict(X=X_test)


In [8]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred_knn)


array([[1928,    4],
       [  48,   20]])

In [9]:
# Per-class precision / recall / F1 for KNN; print() keeps the text table aligned.
print(classification_report(y_true=y_test, y_pred=y_pred_knn))


              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1932
           1       0.83      0.29      0.43        68

    accuracy                           0.97      2000
   macro avg       0.90      0.65      0.71      2000
weighted avg       0.97      0.97      0.97      2000



## KNN Observations

- Sensitive to feature scaling
- Performance affected by class imbalance: accuracy is 0.97, yet recall on the failure class is only 0.29 (20 of 68 failures detected)
- Computationally expensive for large datasets
- Provides local decision boundaries


## Support Vector Machine (SVM)

In [10]:
# RBF-kernel SVM on the same preprocessing recipe as the KNN model.
#
# clone() gives this pipeline its own unfitted copy of the preprocessor, so
# fitting it does not refit the object embedded in any other pipeline.
#
# probability=True enables predict_proba via an internal cross-validated
# calibration, which shuffles the data. random_state pins that shuffling so
# the probabilities (and the ROC AUC computed from them) are reproducible
# across runs.
svm_model = Pipeline(
    steps=[
        ("preprocessing", clone(preprocessor)),
        ("model", SVC(kernel="rbf", probability=True, random_state=42)),
    ]
)

# fit() returns the fitted pipeline; as the last expression of the cell it is
# also what gets displayed.
svm_model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,True
,tol,0.001
,cache_size,200
,class_weight,


In [11]:
# Probability of class 1: predict_proba orders its columns by class label,
# so with labels 0/1 the second column is the positive class.
y_prob_svm = svm_model.predict_proba(X=X_test)[:, 1]
# Hard class predictions on the same held-out rows.
y_pred_svm = svm_model.predict(X=X_test)


In [12]:
# Rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred_svm)


array([[1930,    2],
       [  54,   14]])

In [13]:
# Per-class precision / recall / F1 for the SVM; print() keeps the text table aligned.
print(classification_report(y_true=y_test, y_pred=y_pred_svm))


              precision    recall  f1-score   support

           0       0.97      1.00      0.99      1932
           1       0.88      0.21      0.33        68

    accuracy                           0.97      2000
   macro avg       0.92      0.60      0.66      2000
weighted avg       0.97      0.97      0.96      2000



In [14]:
# Threshold-independent ranking quality, computed from the class-1 probabilities.
roc_auc_score(y_true=y_test, y_score=y_prob_svm)

0.9468167701863354

## SVM Observations

- Effective for non-linear decision boundaries
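- Sensitive to class imbalance: with default settings, recall on the failure class is only 0.21 (14 of 68 failures detected) even though ROC AUC is about 0.95, i.e. the model ranks failures well but the default decision threshold misses most of them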
- Requires careful kernel and parameter tuning
- Provides strong generalization when properly configured
