In [1]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_iris
import numpy as np
# Load the Iris dataset and unpack it into the feature matrix and class labels.
iris = load_iris()
X, y = iris.data, iris.target

Using feature engineering to train the model on a new feature, the Sepal Area Ratio (SAR), computed as sepal width divided by sepal length

In [2]:
import numpy as np
def add_feature_engineering(X):
    """Append the sepal width-to-length ratio as an extra feature column.

    The notebook calls this feature "Sepal Area Ratio" (SAR); it is computed
    as ``sepal_width / sepal_length`` (column 1 divided by column 0).

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix whose column 0 is sepal length and column 1 is sepal
        width. Any 2-D array-like is accepted (ndarray, nested lists, ...).

    Returns
    -------
    numpy.ndarray of shape (n_samples, n_features + 1)
        The original columns, unchanged and in order, followed by the ratio.
        The input is not modified.
    """
    # Coerce to an ndarray so the tuple indexing below also works for plain
    # Python sequences and other array-likes, not only ndarrays.
    X = np.asarray(X)

    sepal_length = X[:, 0]
    sepal_width = X[:, 1]

    # NumPy true division: a zero sepal length produces inf/nan (with a
    # RuntimeWarning) rather than raising.
    sepal_ratio = sepal_width / sepal_length

    # Keep the original features and put the new one in the last column.
    return np.column_stack((X, sepal_ratio))


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# 5-nearest-neighbour classifier fitted on the raw training features
# (add_feature_engineering is not applied to X_train here).
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X=X_train, y=y_train)

Comparing against the accuracy of the baseline model (without SAR)

In [5]:
# Baseline: 5-NN trained and evaluated on the first four feature columns only.
knn_baseline = KNeighborsClassifier(n_neighbors=5).fit(X_train[:, :4], y_train)

# Score the baseline on the held-out split using the same column subset.
y_pred_baseline = knn_baseline.predict(X_test[:, :4])
accuracy_baseline = accuracy_score(y_true=y_test, y_pred=y_pred_baseline)
print(f"Model Accuracy (Baseline): {accuracy_baseline:.2f}")

Model Accuracy (Baseline): 1.00


This approach performs K-Fold cross-validation while simultaneously tuning the n_neighbors parameter of the KNN model. GridSearchCV will evaluate different combinations of hyperparameter values within the defined grid and select the one that achieves the best average accuracy across the folds.

In [6]:
# 5-fold splitter; shuffling with a fixed seed keeps the folds reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Candidate neighbourhood sizes to search over.
param_grid = {'n_neighbors': [3, 5, 7]}

# Grid search scored by mean accuracy across the folds, fitted on the
# training split with the engineered feature appended.
knn_cv = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
)
knn_cv.fit(add_feature_engineering(X_train), y_train)


In [7]:
from sklearn.metrics import accuracy_score

# Evaluate a 5-NN model that actually uses the engineered feature.
# The `knn` estimator fitted earlier was trained on the raw 4-column X_train,
# so scoring it on raw X_test would only reproduce the baseline under a
# "with Feature Engineering" label. Fit a dedicated model on the augmented
# training data and predict on the identically augmented test data instead.
knn_fe = KNeighborsClassifier(n_neighbors=5)
knn_fe.fit(add_feature_engineering(X_train), y_train)

y_pred = knn_fe.predict(add_feature_engineering(X_test))

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy with Feature Engineering: {accuracy:.2f}")


Model Accuracy with Feature Engineering: 1.00


In [8]:
# Winning hyperparameters and the estimator refit with them by the grid search.
best_params = knn_cv.best_params_
best_model = knn_cv.best_estimator_

# Score the tuned model on the held-out split, with the same engineered
# feature appended that was used during the search.
y_pred = best_model.predict(add_feature_engineering(X_test))
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Model Accuracy with Feature Engineering and CV+Hyperparameter Tuning: {accuracy:.2f}")
print(f"Best KNN hyperparameter: {best_params}")


Model Accuracy with Feature Engineering and CV+Hyperparameter Tuning: 1.00
Best KNN hyperparameter: {'n_neighbors': 5}
