# Run the Model in Google Colab

## Library Setup

In [None]:
!pip -q install kaggle joblib scikit-image opencv-python scikit-learn numpy

import os, glob, zipfile, shutil, numpy as np
from pathlib import Path



[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


## Adding Kaggle.json file

In [None]:
# Install the Kaggle API token: create /root/.kaggle, copy the token file into it,
# and restrict the copy to owner read/write (chmod 600).
# Assumes kaggle.json was uploaded to /content beforehand; the copy fails otherwise.
os.makedirs("/root/.kaggle", exist_ok=True)
!cp /content/kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json


## Download Necessary Files

In [None]:
# Create the working folders: datasets are downloaded into /content/data,
# trained model bundles are written to /content/models_out by the cells below.
os.makedirs("/content/data", exist_ok=True)
os.makedirs("/content/models_out", exist_ok=True)

# Every dataset is downloaded into the same folder (-p /content/data) and
# extracted in place (--unzip), so their top-level folders end up side by side.

# Rice (Bangladesh)
!kaggle datasets download -d raihan150146/rice-leaf-diseases-dataset -p /content/data --unzip

# Jute (Bangladesh)
!kaggle datasets download -d mdsaimunalam/jute-leaf-disease-detection -p /content/data --unzip

# Potato (PlantVillage)
!kaggle datasets download -d aarishasifkhan/plantvillage-potato-disease-dataset -p /content/data --unzip

# Tomato (any Kaggle tomato dataset can be substituted; this notebook uses luisolazo/tomato-diseases)
!kaggle datasets download -d luisolazo/tomato-diseases -p /content/data --unzip

# List what was extracted; the training cells below refer to these folder names.
!ls -lah /content/data


## Detecting Dataset roots

In [None]:
from pathlib import Path

# File-name endings (lower-case) that are treated as images.
IMG_EXT = (".jpg",".jpeg",".png",".bmp",".webp")

def is_class_folder(p: Path) -> bool:
    """Return True when `p` is a directory that directly contains an image file.

    Only the directory's immediate children are inspected (no recursion).
    File names are compared case-insensitively against `IMG_EXT`, so
    "leaf.JPG" counts just like "leaf.jpg".

    Args:
        p: Path to test.

    Returns:
        True if `p` is a directory holding at least one image file, else False.
    """
    if not p.is_dir():
        return False
    # A glob such as "*.jpg" is case-sensitive on Linux and silently skips
    # upper-case extensions; comparing lower-cased names avoids that.
    endings = tuple(IMG_EXT)
    return any(
        child.is_file() and child.name.lower().endswith(endings)
        for child in p.iterdir()
    )

def find_dataset_roots(base_dir: str):
    """Find the top-most directories that look like image-classification datasets.

    A directory is a dataset root when at least two of its immediate
    sub-folders are class folders according to `is_class_folder`.
    `base_dir` itself is checked as well as everything below it. When one
    root is nested inside another, only the outer one is kept.

    Args:
        base_dir: Directory to search.

    Returns:
        List of `Path` objects, shallowest first. Empty when `base_dir` is not
        a directory or nothing matches.
    """
    base = Path(base_dir)
    if not base.is_dir():
        return []

    # rglob("*") only yields descendants, so the base folder is added explicitly.
    candidates = [base]
    candidates.extend(d for d in base.rglob("*") if d.is_dir())

    roots = []
    for d in candidates:
        class_like = [s for s in d.iterdir() if is_class_folder(s)]
        if len(class_like) >= 2:  # at least 2 classes
            roots.append(d)

    # Shallowest first, so an ancestor is always kept before its descendants
    # are examined; the path string only breaks ties deterministically.
    roots.sort(key=lambda r: (len(r.parts), str(r)))

    filtered = []
    for r in roots:
        # Path.parents compares path components, so this does not depend on
        # the platform's separator the way a string-prefix test does.
        if not any(kept in r.parents for kept in filtered):
            filtered.append(r)
    return filtered

# Scan the download folder and print every detected dataset root with a
# preview (at most 10) of its class-folder names.
roots = find_dataset_roots("/content/data")
print("Detected dataset roots:\n")
for r in roots:
    print("-", r)
    # collect the names of the sub-folders that directly contain images
    classes = []
    for c in r.iterdir():
        if c.is_dir() and is_class_folder(c):
            classes.append(c.name)
    more_marker = "..." if len(classes) > 10 else ""
    print("  classes:", classes[:10], more_marker)


In [None]:
# Show the PlantVillage folder and its direct sub-folders (directories only, depth <= 2)
# to see which class folders the download actually contains.
!find "/content/data/PlantVillage" -maxdepth 2 -type d


/content/data/PlantVillage
/content/data/PlantVillage/Potato___healthy
/content/data/PlantVillage/Potato___Early_blight
/content/data/PlantVillage/Potato___Late_blight


## Model For rice

In [None]:
import os
import numpy as np
import cv2
from pathlib import Path
from skimage.feature import hog

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

IMG_EXT = (".jpg",".jpeg",".png",".bmp",".webp")

def extract_features(img_bgr, size=(128,128)):
    """Turn one BGR image into a single float32 feature vector.

    The image is resized to `size`; the vector is the HOG descriptor of the
    grayscale image followed by an 8x8x8 HSV colour histogram normalised by
    its sum.

    Args:
        img_bgr: Image array in OpenCV BGR channel order.
        size: Target size passed to `cv2.resize`.

    Returns:
        1-D float32 array (HOG values first, then the 512 histogram values).
    """
    resized = cv2.resize(img_bgr, size, interpolation=cv2.INTER_AREA)

    # Shape/texture part: HOG computed on the grayscale image.
    hog_vector = hog(
        cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY),
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm="L2-Hys",
        feature_vector=True,
    )

    # Colour part: joint histogram over H, S and V with 8 bins per channel.
    hsv_image = cv2.cvtColor(resized, cv2.COLOR_BGR2HSV)
    colour_hist = cv2.calcHist(
        [hsv_image], [0, 1, 2], None, [8, 8, 8], [0, 180, 0, 256, 0, 256]
    ).flatten()
    # The small epsilon keeps the division defined for an all-zero histogram.
    colour_hist = colour_hist / (colour_hist.sum() + 1e-8)

    return np.concatenate([hog_vector, colour_hist]).astype(np.float32)

def folder_has_images(p: Path) -> bool:
    """Return True when `p` is a directory that directly contains an image file.

    Only immediate children are inspected. File names are matched
    case-insensitively against `IMG_EXT`.

    Args:
        p: Path to test.

    Returns:
        True if at least one image file sits directly inside `p`, else False.
    """
    if not p.is_dir():
        return False
    # Globbing "*.jpg" is case-sensitive on Linux and misses names such as
    # "leaf.JPG"; comparing lower-cased names accepts any letter case.
    endings = tuple(IMG_EXT)
    return any(
        child.is_file() and child.name.lower().endswith(endings)
        for child in p.iterdir()
    )

def find_best_dataset_root(base_dir: str, crop_prefix=None):
    """Pick the directory under `base_dir` that holds the most class-folder images.

    A directory qualifies when at least two of its immediate sub-folders
    directly contain images. `base_dir` itself is a candidate too, so a
    dataset whose class folders sit directly inside `base_dir` is found.
    Among the candidates, the one with the largest total image count wins.

    Args:
        base_dir: Directory to search (itself and all nested directories).
        crop_prefix: Optional class-folder name prefix, compared
            case-insensitively; only matching sub-folders are considered.

    Returns:
        The chosen directory as a string, or None when nothing qualifies or
        `base_dir` is not a directory.
    """
    base = Path(base_dir)
    if not base.is_dir():
        return None

    endings = tuple(IMG_EXT)
    prefix = crop_prefix.lower() if crop_prefix else None

    def count_images(folder: Path) -> int:
        # Case-insensitive match, so upper-case extensions (".JPG") are counted.
        return sum(
            1 for f in folder.iterdir()
            if f.is_file() and f.name.lower().endswith(endings)
        )

    # rglob("*") yields descendants only; the base folder must be added
    # explicitly or datasets located directly in base_dir are never found.
    search_dirs = [base]
    search_dirs.extend(d for d in base.rglob("*") if d.is_dir())

    candidates = []
    for d in search_dirs:
        subdirs = [s for s in d.iterdir() if s.is_dir()]
        if prefix:
            subdirs = [s for s in subdirs if s.name.lower().startswith(prefix)]
        counted = [(s, count_images(s)) for s in subdirs]
        class_like = [s for s, n in counted if n > 0]
        if len(class_like) >= 2:
            # score: total images in class folders (bigger is better)
            total_imgs = sum(n for _, n in counted)
            candidates.append((total_imgs, d, class_like))

    if not candidates:
        return None

    # Stable sort: on equal scores the directory found first (shallower) wins.
    candidates.sort(key=lambda x: x[0], reverse=True)
    total_imgs, best_root, best_classes = candidates[0]
    print("Auto-selected dataset root:", best_root)
    print("total images:", total_imgs)
    print("example class folders:", [c.name for c in best_classes[:10]])
    return str(best_root)

def load_dataset(root_dir: str, crop_prefix=None, limit_per_class=None):
    """Load every class folder under `root_dir` into a feature matrix and labels.

    Each immediate sub-folder of `root_dir` is one class; its name is the
    label. Images are read with `cv2.imread` and converted with
    `extract_features`. Files OpenCV cannot decode are skipped.

    Args:
        root_dir: Directory whose sub-folders are the classes.
        crop_prefix: Optional class-folder name prefix (case-insensitive);
            folders that do not match are ignored.
        limit_per_class: Optional cap on the number of files taken per class
            (a falsy value means no cap).

    Returns:
        Tuple `(X, y)`: float32 array of feature vectors and an array of
        class-folder names, one entry per loaded image.
    """
    root = Path(root_dir)
    # Directory listings come back in arbitrary order; sorting makes the
    # sample order, the per-class cap and the later seeded split reproducible.
    class_dirs = sorted(d for d in root.iterdir() if d.is_dir())
    if crop_prefix:
        prefix = crop_prefix.lower()
        class_dirs = [d for d in class_dirs if d.name.lower().startswith(prefix)]

    endings = tuple(IMG_EXT)
    X, y = [], []
    for cdir in class_dirs:
        # Case-insensitive extension match so "x.JPG" is loaded as well.
        files = sorted(
            f for f in cdir.iterdir()
            if f.is_file() and f.name.lower().endswith(endings)
        )
        if not files:
            continue
        if limit_per_class:
            files = files[:limit_per_class]
        for fp in files:
            img = cv2.imread(str(fp))
            if img is None:
                # imread returns None for files it cannot decode
                continue
            X.append(extract_features(img))
            y.append(cdir.name)

    return np.array(X, dtype=np.float32), np.array(y)

def train_one_crop_auto(crop_name, base_dir, out_pkl, crop_prefix=None):
    """Train several classifiers for one crop and save the most accurate one.

    The dataset root is located with `find_best_dataset_root`, features are
    loaded with `load_dataset`, and an 80/20 stratified split is used to
    compare LinearSVC, LogisticRegression, RandomForest and KNN. The winner
    is stored with the scaler and label encoder as a joblib bundle.

    Args:
        crop_name: Stored in the bundle under "crop" and used in log lines.
        base_dir: Folder searched for class sub-folders.
        out_pkl: Destination path of the joblib bundle.
        crop_prefix: Optional class-folder name prefix used to keep only one
            crop's classes.

    Raises:
        ValueError: If no dataset root is found or fewer than 10 images load.

    NOTE(review): the winner is chosen on the same 20% split its accuracy is
    reported on, so the printed score is an optimistic estimate.
    """
    print(f"\n===== Training: {crop_name} =====")
    print("base_dir:", base_dir, "prefix:", crop_prefix)

    root = find_best_dataset_root(base_dir, crop_prefix=crop_prefix)
    if root is None:
        raise ValueError(f"Could not find class folders with images inside: {base_dir}")

    X, y = load_dataset(root, crop_prefix=crop_prefix)
    print("Samples:", len(y))
    if len(y) < 10:
        raise ValueError("Too few images found. Check dataset extraction path.")

    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    # The scaler is fitted on the training part only, then applied to both parts.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    models = {
        # Seeded like RandomForest so repeated runs give the same LinearSVC fit.
        "LinearSVC": LinearSVC(random_state=42),
        "LogReg": LogisticRegression(max_iter=5000),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "KNN": KNeighborsClassifier(n_neighbors=7),
    }

    best_name, best_acc, best_model = None, -1, None
    for name, m in models.items():
        # RandomForest is fitted on the raw features, every other model on the
        # standardised ones; whoever loads the bundle must mirror this choice.
        if name == "RandomForest":
            m.fit(X_train, y_train)
            pred = m.predict(X_test)
        else:
            m.fit(X_train_s, y_train)
            pred = m.predict(X_test_s)

        acc = accuracy_score(y_test, pred)
        print(f"{name}: {acc:.4f}")
        if acc > best_acc:
            best_acc, best_name, best_model = acc, name, m

    bundle = {
        "crop": crop_name,
        "model_name": best_name,
        "model": best_model,
        "scaler": scaler,
        "label_encoder": le,
        "dataset_root": root,
        "feature_info": {"img_size": (128,128), "hog": "9 ori, 8x8, 2x2", "hsv_hist": (8,8,8)},
    }
    # dirname is "" for a bare file name, and os.makedirs("") raises.
    out_dir = os.path.dirname(out_pkl)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(bundle, out_pkl)
    print("Saved:", out_pkl)
    print("Classes:", list(le.classes_))

# Train every crop from its download folder (the dataset root may be nested inside it).
# PlantVillage-style folders are filtered by class-name prefix so that only the
# folders of one crop are used.
# Each entry: (crop name, folder to search, output bundle path, class-folder prefix or None)
CROP_JOBS = [
    ("rice",   "/content/data/Rice leaf disease", "/content/models_out/rice_model.pkl", None),
    ("jute",   "/content/data/Jute Leaf Disease Detection", "/content/models_out/jute_model.pkl", None),
    ("potato", "/content/data/PlantVillage", "/content/models_out/potato_model.pkl", "Potato___"),
    ("tomato", "/content/data/PlantVillage", "/content/models_out/tomato_model.pkl", "Tomato___"),
]

for crop_name, base_dir, out_pkl, crop_prefix in CROP_JOBS:
    try:
        train_one_crop_auto(crop_name, base_dir, out_pkl, crop_prefix=crop_prefix)
    except ValueError as err:
        # train_one_crop_auto raises ValueError when it finds no usable class
        # folders or too few images. Report it and continue, so one missing
        # dataset does not abort the remaining crops; the dedicated jute,
        # potato and tomato sections further down train those crops anyway.
        print(f"Skipping {crop_name}: {err}")


## Path for Jute Model

In [None]:
# Inspect the jute download: first its directory tree (depth <= 3), then the first
# 20 files, to confirm that the class folders sit directly inside this folder.
!find "/content/data/Jute Leaf Disease Detection" -maxdepth 3 -type d
!find "/content/data/Jute Leaf Disease Detection" -type f | head -n 20


/content/data/Jute Leaf Disease Detection
/content/data/Jute Leaf Disease Detection/Golden Mosaic
/content/data/Jute Leaf Disease Detection/Healthy Leaf
/content/data/Jute Leaf Disease Detection/Cescospora Leaf Spot
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (27).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (106).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (210).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (137).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (191).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (180).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (146).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (230).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mosaic (155).jpg
/content/data/Jute Leaf Disease Detection/Golden Mosaic/Golden_Mo

## Model for Jute

In [None]:
import os
import numpy as np
import cv2
from pathlib import Path
from skimage.feature import hog

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

EXTS = {".jpg",".jpeg",".png",".bmp",".webp"}

def extract_features(img_bgr, size=(128,128)):
    """Compute the HOG + HSV-histogram feature vector of one BGR image.

    Args:
        img_bgr: Image array in OpenCV BGR channel order.
        size: Size the image is resized to before features are computed.

    Returns:
        1-D float32 array: HOG descriptor followed by the sum-normalised
        8x8x8 HSV histogram.
    """
    patch = cv2.resize(img_bgr, size, interpolation=cv2.INTER_AREA)
    grayscale = cv2.cvtColor(patch, cv2.COLOR_BGR2GRAY)
    hsv_patch = cv2.cvtColor(patch, cv2.COLOR_BGR2HSV)

    # texture descriptor
    hog_options = dict(
        orientations=9,
        pixels_per_cell=(8,8),
        cells_per_block=(2,2),
        block_norm="L2-Hys",
        feature_vector=True,
    )
    texture = hog(grayscale, **hog_options)

    # colour descriptor: joint histogram over the three HSV channels
    bins = [8,8,8]
    value_ranges = [0,180,0,256,0,256]
    colour = cv2.calcHist([hsv_patch], [0,1,2], None, bins, value_ranges).flatten()
    colour = colour / (colour.sum() + 1e-8)  # epsilon guards against a zero sum

    parts = [texture, colour]
    return np.concatenate(parts).astype(np.float32)

def load_class_folder_dataset(root_dir, limit_per_class=None):
    """Load a folder-per-class image dataset into features and labels.

    Every immediate sub-folder of `root_dir` is treated as one class and its
    name becomes the label. Images directly inside a class folder are used;
    when there are none, the folder is searched recursively instead.
    Files that OpenCV cannot decode are skipped.

    Args:
        root_dir: Directory whose sub-folders are the classes.
        limit_per_class: Optional cap on images per class (falsy = no cap).

    Returns:
        Tuple `(X, y)`: float32 feature matrix and array of class names.

    Raises:
        FileNotFoundError: If `root_dir` does not exist.
        ValueError: If fewer than two sub-folders are found.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(root_dir)

    # iterdir/glob return entries in arbitrary order; sorting makes the sample
    # order, the per-class cap and the later seeded split reproducible.
    class_dirs = sorted(d for d in root.iterdir() if d.is_dir())
    if len(class_dirs) < 2:
        raise ValueError("Not enough class folders found!")

    def is_image(path):
        return path.is_file() and path.suffix.lower() in EXTS

    X, y = [], []
    for cdir in class_dirs:
        imgs = sorted(f for f in cdir.glob("*") if is_image(f))
        if not imgs:
            # if images are nested deeper, use rglob
            imgs = sorted(f for f in cdir.rglob("*") if is_image(f))

        if limit_per_class:
            imgs = imgs[:limit_per_class]

        for fp in imgs:
            img = cv2.imread(str(fp))
            if img is None:
                # imread returns None for files it cannot decode
                continue
            X.append(extract_features(img))
            y.append(cdir.name)

    X = np.array(X, dtype=np.float32)
    y = np.array(y)
    return X, y

def train_save(root_dir, out_pkl, crop_name="jute"):
    """Train four classifiers on a folder-per-class dataset and save the best one.

    Features come from `load_class_folder_dataset`. An 80/20 stratified split
    is used to compare LinearSVC, LogisticRegression, RandomForest and KNN;
    a classification report is printed for the winner, which is saved with
    the scaler and label encoder as a joblib bundle.

    Args:
        root_dir: Directory whose sub-folders are the classes.
        out_pkl: Destination path of the joblib bundle.
        crop_name: Stored in the bundle under "crop".

    Raises:
        ValueError: If no image could be loaded.

    NOTE(review): the winner is selected on the same split its report is
    computed on, so the reported numbers are optimistic.
    """
    X, y = load_class_folder_dataset(root_dir)
    if len(y) == 0:
        # Without this check, X.shape[1] below fails with an unhelpful IndexError.
        raise ValueError(f"No readable images found under: {root_dir}")
    print("Samples:", len(y), "Feature dim:", X.shape[1])
    print("Classes:", sorted(set(y)))

    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    # The scaler is fitted on the training part only, then applied to both parts.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    models = {
        # Seeded like RandomForest so repeated runs give the same LinearSVC fit.
        "LinearSVC": LinearSVC(random_state=42),
        "LogReg": LogisticRegression(max_iter=5000),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "KNN": KNeighborsClassifier(n_neighbors=7),
    }

    best_name, best_acc, best_model, best_pred = None, -1, None, None
    for name, m in models.items():
        # RandomForest uses the raw features, the other models the standardised ones.
        if name == "RandomForest":
            m.fit(X_train, y_train)
            pred = m.predict(X_test)
        else:
            m.fit(X_train_s, y_train)
            pred = m.predict(X_test_s)

        acc = accuracy_score(y_test, pred)
        print(f"{name}: {acc:.4f}")
        if acc > best_acc:
            # Keep the winner's predictions so the report needs no second predict().
            best_acc, best_name, best_model, best_pred = acc, name, m, pred

    # report best
    print("\nBest model:", best_name, "Acc:", best_acc)
    print(classification_report(y_test, best_pred, target_names=le.classes_))

    bundle = {
        "crop": crop_name,
        "model_name": best_name,
        "model": best_model,
        "scaler": scaler,
        "label_encoder": le,
        "dataset_root": root_dir,
        "feature_info": {"img_size": (128,128), "hog": "9 ori, 8x8, 2x2", "hsv_hist": (8,8,8)},
    }

    # dirname is "" for a bare file name, and os.makedirs("") raises.
    out_dir = os.path.dirname(out_pkl)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    joblib.dump(bundle, out_pkl)
    print("Saved:", out_pkl)

# Train the jute model straight from the folder whose sub-folders are the classes
# and write the bundle to /content/models_out/jute_model.pkl.
train_save(
    "/content/data/Jute Leaf Disease Detection",
    "/content/models_out/jute_model.pkl",
    crop_name="jute"
)


Samples: 920 Feature dim: 8612
Classes: [np.str_('Cescospora Leaf Spot'), np.str_('Golden Mosaic'), np.str_('Healthy Leaf')]
LinearSVC: 0.6902
LogReg: 0.6848
RandomForest: 0.7663
KNN: 0.6250

Best model: RandomForest Acc: 0.7663043478260869
                      precision    recall  f1-score   support

Cescospora Leaf Spot       0.70      0.53      0.61        62
       Golden Mosaic       0.73      0.80      0.76        69
        Healthy Leaf       0.85      1.00      0.92        53

            accuracy                           0.77       184
           macro avg       0.76      0.78      0.76       184
        weighted avg       0.76      0.77      0.76       184

✅ Saved: /content/models_out/jute_model.pkl


## Model for Potato

In [None]:
import os
import numpy as np
import cv2
from pathlib import Path
from skimage.feature import hog

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

# -------------------------
# Settings
# -------------------------
POTATO_ROOT = "/content/data/PlantVillage"     
CROP_PREFIX = "Potato___"
OUT_PKL = "/content/models_out/potato_model.pkl"
os.makedirs("/content/models_out", exist_ok=True)

EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}

# -------------------------
# Feature extraction
# -------------------------
def extract_features(img_bgr, size=(128,128)):
    """Describe one BGR image by its HOG descriptor plus an HSV colour histogram.

    Args:
        img_bgr: Image array in OpenCV BGR channel order.
        size: Size the image is resized to first.

    Returns:
        1-D float32 vector: HOG values, then the 8x8x8 HSV histogram divided
        by its sum.
    """
    small = cv2.resize(img_bgr, size, interpolation=cv2.INTER_AREA)

    # HOG on the grayscale version of the resized image
    gray_small = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    hog_part = hog(gray_small, orientations=9, pixels_per_cell=(8,8),
                   cells_per_block=(2,2), block_norm="L2-Hys", feature_vector=True)

    # 3-D histogram over H (0-180), S (0-256) and V (0-256), 8 bins each
    hsv_small = cv2.cvtColor(small, cv2.COLOR_BGR2HSV)
    hist_part = cv2.calcHist([hsv_small],[0,1,2],None,[8,8,8],[0,180,0,256,0,256])
    hist_part = hist_part.flatten()
    hist_total = hist_part.sum() + 1e-8   # epsilon avoids division by zero
    hist_part = hist_part / hist_total

    combined = np.concatenate([hog_part, hist_part])
    return combined.astype(np.float32)

# -------------------------
# Load Potato classes
# -------------------------
def load_potato_dataset(root_dir, prefix="Potato___", limit_per_class=None):
    """Load the class folders whose names start with `prefix` into features and labels.

    Each matching sub-folder of `root_dir` is one class (its name is the
    label); images are collected recursively below it. Files that OpenCV
    cannot decode are skipped.

    Args:
        root_dir: Directory containing the class folders.
        prefix: Class-folder name prefix, compared case-insensitively.
        limit_per_class: Optional cap on images per class (falsy = no cap).

    Returns:
        Tuple `(X, y)`: float32 feature matrix and array of class-folder names.

    Raises:
        FileNotFoundError: If `root_dir` does not exist.
        ValueError: If fewer than two matching class folders are found.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(root_dir)

    wanted = prefix.lower()
    # Sorted, because iterdir order is arbitrary and would otherwise change the
    # sample order (and with it the seeded train/test split) between runs.
    class_dirs = sorted(
        d for d in root.iterdir()
        if d.is_dir() and d.name.lower().startswith(wanted)
    )
    if len(class_dirs) < 2:
        raise ValueError(f"Not enough Potato classes found in {root_dir}")

    print("Detected Potato classes:")
    for d in class_dirs:
        print(" -", d.name)

    X, y = [], []
    for cdir in class_dirs:
        # Sorted so that limit_per_class always keeps the same files.
        imgs = sorted(
            f for f in cdir.rglob("*")
            if f.is_file() and f.suffix.lower() in EXTS
        )
        if limit_per_class:
            imgs = imgs[:limit_per_class]

        for fp in imgs:
            img = cv2.imread(str(fp))
            if img is None:
                # imread returns None for files it cannot decode
                continue
            X.append(extract_features(img))
            y.append(cdir.name)

    return np.array(X, dtype=np.float32), np.array(y)

# -------------------------
# Train + save best ML model
# -------------------------
def train_potato_model():
    """Train four classifiers on the potato classes and save the best one to OUT_PKL.

    Data comes from `load_potato_dataset(POTATO_ROOT, CROP_PREFIX)`. An 80/20
    stratified split is used to compare LinearSVC, LogisticRegression,
    RandomForest and KNN; the winner's classification report is printed and
    the model is saved with the scaler and label encoder as a joblib bundle.

    Raises:
        ValueError: If no image could be loaded.

    NOTE(review): the winner is selected on the same split its report is
    computed on, so the reported numbers are optimistic.
    """
    X, y = load_potato_dataset(POTATO_ROOT, CROP_PREFIX, limit_per_class=None)
    if len(y) == 0:
        # Without this check, X.shape[1] below fails with an unhelpful IndexError.
        raise ValueError(f"No readable images found under: {POTATO_ROOT}")
    print("\nTotal samples:", len(y), "Feature dim:", X.shape[1])

    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    # The scaler is fitted on the training part only, then applied to both parts.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s  = scaler.transform(X_test)

    models = {
        # Seeded like RandomForest so repeated runs give the same LinearSVC fit.
        "LinearSVC": LinearSVC(random_state=42),
        "LogReg": LogisticRegression(max_iter=5000),
        "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1),
        "KNN": KNeighborsClassifier(n_neighbors=7),
    }

    best_name, best_acc, best_model, best_pred = None, -1, None, None

    for name, m in models.items():
        # RandomForest uses the raw features, the other models the standardised ones.
        if name == "RandomForest":
            m.fit(X_train, y_train)
            pred = m.predict(X_test)
        else:
            m.fit(X_train_s, y_train)
            pred = m.predict(X_test_s)

        acc = accuracy_score(y_test, pred)
        print(f"{name} accuracy: {acc:.4f}")
        if acc > best_acc:
            # Keep the winner's predictions so the report needs no second predict().
            best_acc, best_name, best_model, best_pred = acc, name, m, pred

    # Best model report
    print("\nBest model:", best_name, "Acc:", best_acc)
    print(classification_report(y_test, best_pred, target_names=le.classes_))

    bundle = {
        "crop": "potato",
        "model_name": best_name,
        "model": best_model,
        "scaler": scaler,
        "label_encoder": le,
        "dataset_root": POTATO_ROOT,
        "class_prefix": CROP_PREFIX,
        "feature_info": {"img_size": (128,128), "hog": "9 ori, 8x8, 2x2", "hsv_hist": (8,8,8)},
    }

    joblib.dump(bundle, OUT_PKL)
    print("\nSaved potato model to:", OUT_PKL)

# Run the potato training with the settings defined at the top of this cell
# (POTATO_ROOT, CROP_PREFIX, OUT_PKL).
train_potato_model()


Detected Potato classes:
 - Potato___healthy
 - Potato___Early_blight
 - Potato___Late_blight

Total samples: 2152 Feature dim: 8612
LinearSVC accuracy: 0.9582
LogReg accuracy: 0.9513
RandomForest accuracy: 0.9629
KNN accuracy: 0.7564

Best model: RandomForest Acc: 0.962877030162413
                       precision    recall  f1-score   support

Potato___Early_blight       0.99      0.96      0.98       200
 Potato___Late_blight       0.93      0.99      0.96       200
     Potato___healthy       1.00      0.77      0.87        31

             accuracy                           0.96       431
            macro avg       0.97      0.91      0.94       431
         weighted avg       0.97      0.96      0.96       431


✅ Saved potato model to: /content/models_out/potato_model.pkl


## Model for Tomato

In [None]:
import os
import numpy as np
import cv2
from pathlib import Path
from skimage.feature import hog
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

EXTS = {".jpg",".jpeg",".png",".bmp",".webp"}

# SET THIS base folder (must contain train/ and test/)
TOMATO_BASE = "/content/data"    # your screenshot shows /content/data/train and /content/data/test
TRAIN_DIR = os.path.join(TOMATO_BASE, "train")
TEST_DIR  = os.path.join(TOMATO_BASE, "test")

OUT_PKL = "/content/models_out/tomato_model.pkl"
CACHE_TRAIN = "/content/models_out/tomato_train_feats.npz"
CACHE_TEST  = "/content/models_out/tomato_test_feats.npz"
os.makedirs("/content/models_out", exist_ok=True)

# ---------- Feature extraction ----------
def extract_features(img_bgr, size=(96, 96)):
    """Build the HOG + HSV-histogram feature vector of one BGR image.

    This cell defaults to 96x96 instead of the 128x128 used in the other
    cells: a smaller image is faster to process.

    Args:
        img_bgr: Image array in OpenCV BGR channel order.
        size: Size the image is resized to first.

    Returns:
        1-D float32 vector: HOG values followed by the sum-normalised 8x8x8
        HSV histogram.
    """
    scaled = cv2.resize(img_bgr, size, interpolation=cv2.INTER_AREA)
    as_gray = cv2.cvtColor(scaled, cv2.COLOR_BGR2GRAY)
    as_hsv = cv2.cvtColor(scaled, cv2.COLOR_BGR2HSV)

    # gradient-orientation descriptor of the grayscale image
    gradients = hog(
        as_gray,
        orientations=9,
        pixels_per_cell=(8,8),
        cells_per_block=(2,2),
        block_norm="L2-Hys",
        feature_vector=True,
    )

    # colour distribution over the three HSV channels, 8 bins per channel
    colours = cv2.calcHist(
        [as_hsv], [0,1,2], None, [8,8,8], [0,180,0,256,0,256]
    ).flatten()
    colours = colours / (colours.sum() + 1e-8)  # epsilon guards against a zero sum

    return np.concatenate([gradients, colours]).astype(np.float32)

def list_class_folders(root_dir):
    """Return the immediate sub-folders of `root_dir`, sorted by path.

    Each sub-folder is treated as one class by the callers. The result is
    sorted because directory listings come back in arbitrary order, which
    would make everything built on top of it vary between runs.

    Args:
        root_dir: Directory that holds one sub-folder per class.

    Returns:
        Sorted list of `Path` objects.

    Raises:
        FileNotFoundError: If `root_dir` does not exist.
        ValueError: If fewer than two sub-folders are found.
    """
    root = Path(root_dir)
    if not root.exists():
        raise FileNotFoundError(root_dir)
    classes = sorted(d for d in root.iterdir() if d.is_dir())
    if len(classes) < 2:
        raise ValueError(f"Not enough class folders in: {root_dir}")
    return classes

def load_folder_dataset_with_progress(root_dir, limit_per_class=None):
    """Extract features for a folder-per-class dataset while showing a progress bar.

    The class folders come from `list_class_folders`; images are collected
    recursively below each one. The complete file list is built first so the
    progress bar can show a global total. Files that OpenCV cannot decode are
    skipped.

    Args:
        root_dir: Directory that holds one sub-folder per class.
        limit_per_class: Optional cap on images per class (falsy = no cap).

    Returns:
        Tuple `(X, y)`: float32 feature matrix and array of class-folder names.

    Raises:
        ValueError: If no image file is found (besides whatever
            `list_class_folders` raises).
    """
    # Sorted here and below: directory listings are in arbitrary order, which
    # would make the per-class cap pick different files on every machine.
    class_dirs = sorted(list_class_folders(root_dir))

    # Build file list first (so we can show global progress)
    items = []
    for cdir in class_dirs:
        files = sorted(
            f for f in cdir.rglob("*")
            if f.is_file() and f.suffix.lower() in EXTS
        )
        if limit_per_class:
            files = files[:limit_per_class]
        items.extend((str(fp), cdir.name) for fp in files)

    if len(items) == 0:
        raise ValueError(f"No images found in: {root_dir}")

    X, y = [], []
    for fp, label in tqdm(items, desc=f"Extracting features from {Path(root_dir).name}", total=len(items)):
        img = cv2.imread(fp)
        if img is None:
            # imread returns None for files it cannot decode
            continue
        X.append(extract_features(img))
        y.append(label)

    return np.array(X, dtype=np.float32), np.array(y)

def load_or_build_cache(cache_path, folder, limit_per_class=None):
    """Return `(X, y)` from `cache_path` if it exists, else compute and cache them.

    On a cache miss the features are extracted with
    `load_folder_dataset_with_progress` and written to `cache_path` as a
    compressed .npz archive with the arrays "X" and "y".

    NOTE: the cache is keyed by file name only. It is reused even if
    `folder`, `limit_per_class` or the feature extraction changed; delete the
    file to force a rebuild.

    Args:
        cache_path: Path of the .npz cache file.
        folder: Dataset folder used on a cache miss.
        limit_per_class: Forwarded to the loader on a cache miss.

    Returns:
        Tuple `(X, y)` of numpy arrays.
    """
    if os.path.exists(cache_path):
        # Both arrays are plain numeric/string arrays, so pickle support is not
        # needed; keeping it off avoids unpickling objects from a tampered file.
        # The context manager closes the archive once the arrays are read.
        with np.load(cache_path, allow_pickle=False) as data:
            X, y = data["X"], data["y"]
        print(f"Loaded cache: {cache_path}  X={X.shape}, y={y.shape}")
        return X, y

    X, y = load_folder_dataset_with_progress(folder, limit_per_class=limit_per_class)
    np.savez_compressed(cache_path, X=X, y=y)
    print(f"Saved cache: {cache_path}  X={X.shape}, y={y.shape}")
    return X, y

# ---------- Load data (with cache) ----------
# Per-class caps keep feature extraction fast; set one to None to use every image.
# NOTE: features are cached in CACHE_TRAIN / CACHE_TEST. After changing a cap or
# extract_features, delete those files, otherwise the old features are reused.
LIMIT_TRAIN_PER_CLASS = 800
LIMIT_TEST_PER_CLASS  = 200

print("TRAIN_DIR:", TRAIN_DIR)
print("TEST_DIR :", TEST_DIR)

X_train, y_train = load_or_build_cache(CACHE_TRAIN, TRAIN_DIR, limit_per_class=LIMIT_TRAIN_PER_CLASS)
X_test,  y_test  = load_or_build_cache(CACHE_TEST,  TEST_DIR,  limit_per_class=LIMIT_TEST_PER_CLASS)

print("Train samples:", len(y_train))
print("Test samples :", len(y_test))

# ---------- Encode labels ----------
# The encoder is fitted on the training labels only; transform() raises if the
# test folder contains a class name that never occurs in train.
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)

# ---------- Scale for linear models ----------
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s  = scaler.transform(X_test)

# ---------- Train models ----------
models = {
    # Seeded like RandomForest so repeated runs give the same LinearSVC fit.
    "LinearSVC": LinearSVC(random_state=42),
    "LogReg": LogisticRegression(max_iter=5000),
    "RandomForest": RandomForestClassifier(n_estimators=250, random_state=42, n_jobs=-1),
    "KNN": KNeighborsClassifier(n_neighbors=7),
}

best_name, best_acc, best_model, final_pred = None, -1, None, None

for name, m in models.items():
    print(f"\n--- Training {name} ---")
    # RandomForest uses the raw features, the other models the standardised ones.
    if name == "RandomForest":
        m.fit(X_train, y_train_enc)
        pred = m.predict(X_test)
    else:
        m.fit(X_train_s, y_train_enc)
        pred = m.predict(X_test_s)

    acc = accuracy_score(y_test_enc, pred)
    print(f"{name} test accuracy: {acc:.4f}")

    if acc > best_acc:
        # Keep the winner's predictions so the report needs no second predict().
        best_acc, best_name, best_model, final_pred = acc, name, m, pred

# ---------- Final report ----------
# NOTE(review): the winner is chosen on the test set it is reported on, so the
# printed accuracy is an optimistic estimate.
print("\nBest model:", best_name, "Test Acc:", best_acc)
print(classification_report(y_test_enc, final_pred, target_names=le.classes_))

# ---------- Save bundle ----------
# img_size records the 96x96 default of this cell's extract_features; inference
# has to resize to the same size to get matching feature vectors.
bundle = {
    "crop": "tomato",
    "model_name": best_name,
    "model": best_model,
    "scaler": scaler,
    "label_encoder": le,
    "dataset_root": TOMATO_BASE,
    "feature_info": {"img_size": (96,96), "hog": "9 ori, 8x8, 2x2", "hsv_hist": (8,8,8)},
}

joblib.dump(bundle, OUT_PKL)
print("Saved:", OUT_PKL)


TRAIN_DIR: /content/data/train
TEST_DIR : /content/data/test


Extracting features from train: 100%|██████████| 8000/8000 [01:09<00:00, 115.67it/s]


✅ Saved cache: /content/models_out/tomato_train_feats.npz  X=(8000, 4868), y=(8000,)


Extracting features from test: 100%|██████████| 2000/2000 [00:15<00:00, 132.89it/s]


✅ Saved cache: /content/models_out/tomato_test_feats.npz  X=(2000, 4868), y=(2000,)
Train samples: 8000
Test samples : 2000

--- Training LinearSVC ---




LinearSVC test accuracy: 0.8340

--- Training LogReg ---
LogReg test accuracy: 0.8665

--- Training RandomForest ---
RandomForest test accuracy: 0.9160

--- Training KNN ---
KNN test accuracy: 0.5020

✅ Best model: RandomForest Test Acc: 0.916
                        precision    recall  f1-score   support

        bacterial_spot       0.92      0.94      0.93       200
          early_blight       0.87      0.65      0.74       200
               healthy       0.97      0.95      0.96       200
           late_blight       0.82      0.84      0.83       200
             leaf_mold       0.96      0.98      0.97       200
          mosaic_virus       0.98      0.98      0.98       200
    septoria_leaf_spot       0.89      0.97      0.93       200
           target_spot       0.91      0.90      0.90       200
twospotted_spider_mite       0.90      0.97      0.93       200
yellow_leaf_curl_virus       0.93      0.98      0.96       200

              accuracy                           0

## Download the pkl zip files

In [None]:
# Pack every saved model bundle (*.pkl in /content/models_out) into models_out.zip
# in the current working directory.
!zip -r models_out.zip /content/models_out/*.pkl
# google.colab is imported here because it is only needed for the download step;
# presumably it is only importable inside a Colab runtime — TODO confirm.
from google.colab import files
files.download("models_out.zip")

  adding: content/models_out/jute_model.pkl (deflated 76%)
  adding: content/models_out/potato_model.pkl (deflated 75%)
  adding: content/models_out/rice_model.pkl (deflated 82%)
  adding: content/models_out/tomato_model.pkl (deflated 85%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>