In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import cv2
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
from pathlib import Path

data_color = "/kaggle/input/plantvillage-dataset/color"
data_grayscale = "/kaggle/input/plantvillage-dataset/grayscale"
data_segmented = "/kaggle/input/plantvillage-dataset/segmented"

paths = [path.parts[-2:] for path in Path(data_color).rglob("*.*")]

df = pd.DataFrame(paths, columns=["Class", "Images"])
df = df.sort_values("Class").reset_index(drop=True)

df.head()

Unnamed: 0,Class,Images
0,Apple___Apple_scab,f0d56524-1296-4a54-bafa-9b620baf9f1d___FREC_Sc...
1,Apple___Apple_scab,6643566f-d980-4bdb-88d7-4d3ab3c771fa___FREC_Sc...
2,Apple___Apple_scab,0b1e31fa-cbc0-41ed-9139-c794e6855e82___FREC_Sc...
3,Apple___Apple_scab,1a21aabb-6f74-4644-8d9e-a517568b7e9c___FREC_Sc...
4,Apple___Apple_scab,258ce9eb-2b67-475b-b09c-0bc83b0987f1___FREC_Sc...


In [4]:
dir_path = "/kaggle/input/plantvillage-dataset/color"

images = []
y = []

for class_name in os.listdir(dir_path):
    class_path = os.path.join(dir_path, class_name)

    if not os.path.isdir(class_path):
        continue

    # Label: healthy = 0, diseased = 1
    if "healthy" in class_name.lower():
        k = 0
    else:
        k = 1

    for img_name in os.listdir(class_path):
        img_path = os.path.join(class_path, img_name)

        img = cv2.imread(img_path)
        if img is None:
            continue

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (64, 64))

        images.append(img)
        y.append(k)

images = np.array(images)
y = np.array(y)

# Flatten images
X = images.reshape(images.shape[0], -1)

# Standardize
X = StandardScaler().fit_transform(X)

print(X.shape, y.shape)

(54305, 12288) (54305,)


In [5]:
features = pd.DataFrame(X)
features.to_csv("/kaggle/working/features.csv", index=False)

labels = pd.DataFrame(y, columns=["label"])
labels.to_csv("/kaggle/working/labels.csv", index=False)

print("Saved files:")
print("/kaggle/working/features.csv")
print("/kaggle/working/labels.csv")

Saved files:
/kaggle/working/features.csv
/kaggle/working/labels.csv


In [6]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(f'Accuracy: {model.score(X_test, y_test)}')

              precision    recall  f1-score   support

           0       0.92      0.68      0.78      3013
           1       0.89      0.98      0.93      7848

    accuracy                           0.89     10861
   macro avg       0.90      0.83      0.85     10861
weighted avg       0.90      0.89      0.89     10861

[[2045  968]
 [ 189 7659]]
Accuracy: 0.8934720559801124


2045 healthy leaves correctly identified (True Negatives – TN)
→ Healthy samples correctly predicted as healthy.

968 healthy leaves misclassified as diseased (False Positives – FP)
→ Healthy samples wrongly flagged as diseased (false alarm).

7659 diseased leaves correctly identified (True Positives – TP)
→ Diseased samples correctly detected by the model.

189 diseased leaves missed (False Negatives – FN)
→ Diseased samples incorrectly predicted as healthy.

Overall Accuracy

Accuracy: 89.3%
→ The model correctly classifies most test samples.

Class-wise Performance
Class 0 – Healthy leaves

Precision = 0.92
→ When the model predicts healthy, it is usually correct (low FP for healthy).

Recall = 0.68
→ Some healthy leaves are misclassified as diseased (higher FP).

Class 1 – Diseased leaves

Precision = 0.89
→ Most disease predictions are correct.

Recall = 0.98
→ Very few diseased leaves are missed (low FN).

Key Insight

False Positives (968): Healthy leaves flagged as diseased

False Negatives (189): Diseased leaves missed

Conclusion

→ The model prioritizes minimizing false negatives, making it highly effective for disease detection, which is desirable since missing diseased plants is more harmful than false alarms.