## KNN-Based Two-Stage Fruit & Vegetable Classifier

This notebook implements a two-stage classification pipeline using
K-Nearest Neighbors (KNN). The same MobileNetV2 feature extractor and
PCA dimensionality reduction used in the SVM experiment are reused here
to ensure a fair comparison.


In [3]:
import torch
from torchvision import models, transforms
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader
import numpy as np

from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import os 



In [6]:
# -------------------- Image preprocessing --------------------
# Per-channel mean / std handed to Normalize (presumably the ImageNet
# statistics the pretrained backbone was trained with -- TODO confirm).
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Every image is resized to 224x224, converted to a tensor, then normalized.
preprocessing_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]
transform = transforms.Compose(preprocessing_steps)

# -------------------- Paths --------------------
# The notebook is expected to run from inside 'notebooks'; the project root
# is therefore the parent of the current working directory.
base_dir = os.path.split(os.getcwd())[0]
print("Base directory:", base_dir)
# Sanity check: list what actually lives in the project root.
print("Contents:", os.listdir(base_dir))

# -------------------- Load dataset --------------------
BATCH_SIZE = 32


def _load_split(split_name):
    """Return ``(dataset, loader)`` for one split folder under ``base_dir``.

    The loader iterates in dataset order (``shuffle=False``), so features
    extracted from it stay aligned with the dataset's labels.
    """
    dataset = ImageFolder(os.path.join(base_dir, split_name), transform=transform)
    return dataset, DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)


# Only the 'train' and 'test' splits are loaded; the validation split is disabled.

# Stage 1: Fruit vs Vegetable
train_dataset_stage1, train_loader_stage1 = _load_split("train")
test_dataset_stage1, test_loader_stage1 = _load_split("test")

# Stage 2: intended to be the fine-grained classes.
# NOTE(review): these read exactly the same root folders as Stage 1, so both
# stages end up with the same class list -- confirm whether Stage 2 should
# point at the nested per-item folders instead.
train_dataset_stage2, train_loader_stage2 = _load_split("train")
test_dataset_stage2, test_loader_stage2 = _load_split("test")

# -------------------- Class names --------------------
print("Stage 1 classes (Fruit vs Vegetable):", train_dataset_stage1.classes)
print("Stage 2 classes (Fine-grained):", train_dataset_stage2.classes)


Base directory: c:\Users\Hams\Downloads\machine-learning\MLproject\fruit_veg_classifier
Contents: ['.gitignore', 'notebooks', 'test', 'train', 'venv']
Stage 1 classes (Fruit vs Vegetable): ['fruit', 'vegetable']
Stage 2 classes (Fine-grained): ['fruit', 'vegetable']


In [7]:
# Load MobileNetV2 with its ImageNet checkpoint. The explicit `weights=` enum
# replaces the deprecated `pretrained=True` flag and selects the same weights.
model = models.mobilenet_v2(weights=models.MobileNet_V2_Weights.IMAGENET1K_V1)

# Keep every child module except the last one and chain them into a
# stand-alone feature extractor.
# NOTE(review): if the only remaining child is the convolutional trunk, the
# output is an un-pooled spatial map that gets flattened downstream --
# confirm whether global average pooling was intended here.
feature_extractor = torch.nn.Sequential(*list(model.children())[:-1])

# eval() switches layers such as dropout / batch-norm to inference behaviour.
# It does NOT freeze the weights; gradient tracking is disabled separately by
# the torch.no_grad() context used during feature extraction.
feature_extractor.eval()

# Function to extract features
def extract_features(loader, extractor=None):
    """Run every batch of ``loader`` through a feature extractor.

    Parameters
    ----------
    loader : iterable
        Yields ``(images, labels)`` pairs of tensors, e.g. a ``DataLoader``.
    extractor : callable, optional
        Module applied to each image batch. When omitted, the notebook-level
        ``feature_extractor`` is used, which matches the original behaviour.

    Returns
    -------
    tuple of np.ndarray
        ``(features, labels)``. ``features`` has one row per sample with all
        non-batch dimensions flattened; ``labels`` is the concatenation of
        the label batches in iteration order.

    Raises
    ------
    ValueError
        If ``loader`` yields no batches.
    """
    net = feature_extractor if extractor is None else extractor
    features_list, labels_list = [], []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for images, labels in loader:
            outputs = net(images)
            # Collapse everything after the batch dimension into one vector.
            outputs = outputs.reshape(outputs.size(0), -1)
            # .cpu() is a no-op for CPU tensors and keeps .numpy() valid if
            # the tensors ever live on an accelerator.
            features_list.append(outputs.cpu().numpy())
            labels_list.append(labels.cpu().numpy())
    if not features_list:
        raise ValueError("extract_features received a loader with no batches")
    return np.concatenate(features_list), np.concatenate(labels_list)

# Stage 1 features: train split first, then test split.
(X_train_stage1, y_train_stage1), (X_test_stage1, y_test_stage1) = (
    extract_features(split_loader)
    for split_loader in (train_loader_stage1, test_loader_stage1)
)

# Stage 2 features, extracted independently from the Stage 2 loaders.
(X_train_stage2, y_train_stage2), (X_test_stage2, y_test_stage2) = (
    extract_features(split_loader)
    for split_loader in (train_loader_stage2, test_loader_stage2)
)



In [8]:
# Dimensionality reduction settings shared by both stages.
PCA_COMPONENTS = 512
PCA_SEED = 42  # fixed so repeated fits give the same projection

# Each stage gets its own PCA instance. Refitting one shared object would
# overwrite the Stage 1 projection, leaving it unusable for transforming
# further Stage 1 data later on.
pca_stage1 = PCA(n_components=PCA_COMPONENTS, random_state=PCA_SEED)
X_train_stage1_pca = pca_stage1.fit_transform(X_train_stage1)
X_test_stage1_pca = pca_stage1.transform(X_test_stage1)  # fit on train only

pca_stage2 = PCA(n_components=PCA_COMPONENTS, random_state=PCA_SEED)
X_train_stage2_pca = pca_stage2.fit_transform(X_train_stage2)
X_test_stage2_pca = pca_stage2.transform(X_test_stage2)  # fit on train only

# Backward-compatible alias: `pca` previously ended up fitted on Stage 2 data.
pca = pca_stage2

In [9]:
# Stage 1 classifier: 5 nearest neighbours, votes weighted by inverse
# distance, Euclidean metric in the PCA-reduced space.
stage1_knn_params = {
    "n_neighbors": 5,
    "weights": "distance",
    "metric": "euclidean",
}
knn_stage1 = KNeighborsClassifier(**stage1_knn_params)

# fit() returns the estimator, so the cell still displays its repr.
knn_stage1.fit(X_train_stage1_pca, y_train_stage1)


0,1,2
,n_neighbors,5
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,


In [10]:
# Predict the Stage 1 label for every test sample.
y_pred_stage1 = knn_stage1.predict(X_test_stage1_pca)

# Compute all metrics first, then report them.
stage1_accuracy = accuracy_score(y_test_stage1, y_pred_stage1)
stage1_confusion = confusion_matrix(y_test_stage1, y_pred_stage1)
stage1_report = classification_report(
    y_test_stage1,
    y_pred_stage1,
    target_names=["Fruit", "Vegetable"],
)

print("Stage 1 Accuracy (Fruit vs Vegetable):", stage1_accuracy)
print("Stage 1 Confusion Matrix:\n", stage1_confusion)
print(stage1_report)


Stage 1 Accuracy (Fruit vs Vegetable): 0.9637883008356546
Stage 1 Confusion Matrix:
 [[ 76  13]
 [  0 270]]
              precision    recall  f1-score   support

       Fruit       1.00      0.85      0.92        89
   Vegetable       0.95      1.00      0.98       270

    accuracy                           0.96       359
   macro avg       0.98      0.93      0.95       359
weighted avg       0.97      0.96      0.96       359



In [11]:
# Stage 2 classifier, configured identically to Stage 1: 5 neighbours,
# inverse-distance weighting, Euclidean metric.
stage2_knn_params = {
    "n_neighbors": 5,
    "weights": "distance",
    "metric": "euclidean",
}
knn_stage2 = KNeighborsClassifier(**stage2_knn_params)

# fit() returns the estimator, so the cell still displays its repr.
knn_stage2.fit(X_train_stage2_pca, y_train_stage2)


0,1,2
,n_neighbors,5
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'euclidean'
,metric_params,
,n_jobs,


In [12]:
# Stage 2 prediction for the whole test set in a single batched call.
# This replaces a per-sample loop that called predict() once per row and
# yields the same labels.
#
# NOTE(review): the Stage 1 predictions are not used to route or gate
# Stage 2 (the earlier loop read y_pred_stage1[i] and discarded it), so the
# number below is the Stage 2 classifier's stand-alone accuracy.
y_final_pred = np.asarray(knn_stage2.predict(X_test_stage2_pca))

print("Stage 2 Accuracy (Fine-Grained):",
      accuracy_score(y_test_stage2, y_final_pred))

Stage 2 Accuracy (Fine-Grained): 0.958217270194986


In [13]:
# Overall accuracy of the final predictions against the Stage 2 labels.
final_accuracy = accuracy_score(y_test_stage2, y_final_pred)
print("Final Two-Stage KNN Accuracy:", final_accuracy)

print("Final Two-Stage Classification Report:\n")

# Per-class precision / recall / F1, labelled with the dataset's class names.
final_report = classification_report(
    y_test_stage2,
    y_final_pred,
    target_names=train_dataset_stage2.classes,
)
print(final_report)


Final Two-Stage KNN Accuracy: 0.958217270194986
Final Two-Stage Classification Report:

              precision    recall  f1-score   support

       fruit       0.99      0.84      0.91        89
   vegetable       0.95      1.00      0.97       270

    accuracy                           0.96       359
   macro avg       0.97      0.92      0.94       359
weighted avg       0.96      0.96      0.96       359



In [14]:
# Show predicted vs. true class names for the first few test samples.
# Slicing (rather than indexing through range(10)) keeps this safe when the
# test set holds fewer than 10 samples.
preview_class_names = train_dataset_stage2.classes
for predicted_idx, true_idx in zip(y_final_pred[:10], y_test_stage2[:10]):
    print(
        f"Predicted: {preview_class_names[predicted_idx]}, "
        f"True: {preview_class_names[true_idx]}"
    )


Predicted: fruit, True: fruit
Predicted: fruit, True: fruit
Predicted: fruit, True: fruit
Predicted: fruit, True: fruit
Predicted: vegetable, True: fruit
Predicted: fruit, True: fruit
Predicted: fruit, True: fruit
Predicted: fruit, True: fruit
Predicted: fruit, True: fruit
Predicted: vegetable, True: fruit
