<a href="https://colab.research.google.com/github/Ruchithasunkoji/Dog-Breed-Classification-and-Info-Generation/blob/main/model_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Training on a subset of the dataset: 50 images from each of 10 breeds

In [None]:
# STEP 1 — Mount Google Drive
# Colab-only: exposes the user's Drive under /content/drive, which is where
# the dataset ZIP path used later in this notebook points.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os
import cv2
import zipfile
import numpy as np
import random
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# ✅ 2. Path to your dataset ZIP file (change this to your actual path)
# zip_path     — archive on the mounted Drive that holds the dataset.
# extract_path — local directory the archive is unpacked into.
zip_path = "/content/drive/MyDrive/StanfordDogs/stanford_dogs_cleaned.zip"
extract_path = "/content/dogs_dataset"

In [None]:
# ✅ 3. Unzip dataset if not already extracted
# The presence of extract_path is the only "already done" marker, so the
# archive is opened just once per runtime.
needs_extraction = not os.path.exists(extract_path)
if needs_extraction:
    with zipfile.ZipFile(zip_path, mode='r') as archive:
        archive.extractall(path=extract_path)

In [None]:
import os
import cv2
import numpy as np
import random
from skimage.feature import hog
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")


#  Dataset settings

CLEANED_DIR = "/content/dogs_dataset"
num_breeds = 10
images_per_breed = 50
img_size = (128,128)  # larger image → more detailed HOG
SEED = 42  # seeds Python's RNG so the random.sample() draws in this cell are repeatable


# Select random 10 breeds

# Without a seed every run picks different breeds, so reported accuracies
# cannot be compared between runs.
random.seed(SEED)

# os.listdir returns entries in arbitrary order; sorting gives random.sample a
# stable population so the same seed always yields the same breeds.
all_breeds = sorted(
    b for b in os.listdir(CLEANED_DIR)
    if os.path.isdir(os.path.join(CLEANED_DIR, b))
)
selected_breeds = random.sample(all_breeds, num_breeds)
print("Selected breeds:", selected_breeds)


# Feature extraction: HOG + Color Histogram

def extract_features(img_path):
    """Build one feature vector (colour histograms + HOG) for an image file.

    The image is read with OpenCV, resized to the notebook-level ``img_size``,
    and described by a 32-bin intensity histogram of each of its three colour
    channels followed by the HOG descriptor of its grayscale version.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray or None
        1-D array holding 96 histogram counts (blue, green, red — 32 bins
        each, in that order) followed by the HOG values, or ``None`` when
        OpenCV cannot read the file.
    """
    img = cv2.imread(img_path)
    if img is None:
        return None
    img = cv2.resize(img, img_size)

    # Color histogram.
    # cv2.imread returns pixels in BGR order, so channel 0 is blue, 1 is green
    # and 2 is red. The concatenation order (channel 0, 1, 2) is kept as-is so
    # the feature layout does not change.
    channel_hists = [
        cv2.calcHist([img], [channel], None, [32], [0, 256])
        for channel in range(3)
    ]
    color_features = np.concatenate(channel_hists).flatten()

    # HOG features
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    hog_features = hog(gray, orientations=9, pixels_per_cell=(8,8),
                       cells_per_block=(2,2), block_norm='L2-Hys')

    # Combine features
    return np.concatenate([color_features, hog_features])

# Collect one feature vector and one breed label per sampled image.
X, y = [], []

for breed in selected_breeds:
    breed_path = os.path.join(CLEANED_DIR, breed)
    # os.listdir order is arbitrary; sorting fixes the population so that
    # random.sample picks the same files whenever the RNG has been seeded.
    # ".jpeg" is accepted alongside ".jpg"/".png" so such files are not dropped.
    files = sorted(
        f for f in os.listdir(breed_path)
        if f.lower().endswith((".jpg", ".jpeg", ".png"))
    )
    # Breeds with fewer than images_per_breed files contribute all they have.
    selected_files = random.sample(files, min(len(files), images_per_breed))

    for f in selected_files:
        features = extract_features(os.path.join(breed_path, f))
        # extract_features returns None for files OpenCV cannot read; skip them.
        if features is not None:
            X.append(features)
            y.append(breed)

X = np.array(X)
y = np.array(y)
print("Feature matrix shape:", X.shape)


#  Encode labels, split, then scale + PCA (fitted on the training rows only)

encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)


#  Train-test split
# The split happens BEFORE any fitted preprocessing: fitting the scaler or PCA
# on all rows would let test-set statistics leak into the training features
# and make the reported test accuracy optimistic.

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)


#  Standardise using training statistics only

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)


#  PCA dimensionality reduction (components learned from training rows only)

pca = PCA(n_components=0.95, svd_solver='full')
X_train = pca.fit_transform(X_train_scaled)
X_test = pca.transform(X_test_scaled)

# Whole-dataset projections through the training-fitted transformers, kept
# under their original names for reporting and for any code that still
# refers to them. Do not fit models on these: they include the test rows.
X_scaled = scaler.transform(X)
X_pca = pca.transform(X_scaled)
print("Shape after PCA:", X_pca.shape)


# Logistic Regression fine-tuning
# Exhaustive 3-fold cross-validated search over regularisation strength,
# solver and iteration budget, scored by accuracy.

param_grid = {
    "C": [0.1, 1, 10, 100],
    "solver": ["lbfgs", "liblinear", "saga"],
    "max_iter": [1000, 2000]
}

# multi_class is left at its default ("auto"): passing it explicitly is
# deprecated in recent scikit-learn releases and adds nothing here.
# random_state pins the data shuffling done by the liblinear/saga solvers so
# the selected parameters are repeatable.
# NOTE: warnings are globally ignored earlier in this cell, so a solver that
# stops at max_iter without converging may go unreported.
grid = GridSearchCV(
    LogisticRegression(random_state=42),
    param_grid,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print(f"Best Logistic Regression params: {grid.best_params_}")


#  Evaluate
# Score the best estimator found by the grid search on the held-out test split
# (test_size=0.2 above); none of these rows were used for model fitting.

y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)  # fraction of test labels predicted correctly
print(f"✅ Logistic Regression Test Accuracy: {acc*100:.2f}%")


Selected breeds: ['n02086910-papillon', 'n02088364-beagle', 'n02096585-Boston_bull', 'n02112706-Brabancon_griffon', 'n02107142-Doberman', 'n02091831-Saluki', 'n02102973-Irish_water_spaniel', 'n02110627-affenpinscher', 'n02113186-Cardigan', 'n02116738-African_hunting_dog']
Feature matrix shape: (500, 8196)
Shape after PCA: (500, 405)
Best Logistic Regression params: {'C': 10, 'max_iter': 2000, 'solver': 'saga'}
✅ Logistic Regression Test Accuracy: 23.00%
