<a href="https://colab.research.google.com/github/Ruchithasunkoji/Dog-Breed-Classification-and-Info-Generation/blob/main/infosys_week_1_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import files
uploaded = files.upload()  # Choose kaggle.json (the Kaggle API credentials file)


Saving kaggle.json to kaggle.json


In [None]:
import os

# Tell the Kaggle CLI to look for kaggle.json in /content.
os.environ['KAGGLE_CONFIG_DIR'] = "/content"  # point to the folder containing kaggle.json

# kaggle.json holds an API secret: restrict it to the owner. This also avoids
# the Kaggle client's warning about the key being readable by other users.
# The existence check keeps this cell harmless if the upload was skipped.
_kaggle_token_path = os.path.join(os.environ['KAGGLE_CONFIG_DIR'], "kaggle.json")
if os.path.exists(_kaggle_token_path):
    os.chmod(_kaggle_token_path, 0o600)


In [None]:
!ls /content/kaggle.json  # Sanity check: should print /content/kaggle.json if the upload succeeded


/content/kaggle.json


In [None]:
!pip install kaggle --upgrade




In [None]:
# download the Stanford Dogs dataset from Kaggle using the Kaggle API
!kaggle datasets download -d jessicali9530/stanford-dogs-dataset


Dataset URL: https://www.kaggle.com/datasets/jessicali9530/stanford-dogs-dataset
License(s): other
Downloading stanford-dogs-dataset.zip to /content
 98% 732M/750M [00:11<00:00, 39.6MB/s]
100% 750M/750M [00:12<00:00, 65.5MB/s]


In [None]:
# unzip the dataset quietly (-q) into /content/stanford_dogs
!unzip -q stanford-dogs-dataset.zip -d /content/stanford_dogs

In [None]:
# list the top-level contents of the extracted dataset
!ls /content/stanford_dogs

annotations  images


In [None]:
# remove the downloaded zip archive now that it has been extracted
!rm stanford-dogs-dataset.zip


In [None]:
# number of breed folders (one per class) -- this counts directories, not images
!ls /content/stanford_dogs/images/Images | wc -l

120


In [None]:
# preview the first few breed folder names
!ls /content/stanford_dogs/images/Images | head


n02085620-Chihuahua
n02085782-Japanese_spaniel
n02085936-Maltese_dog
n02086079-Pekinese
n02086240-Shih-Tzu
n02086646-Blenheim_spaniel
n02086910-papillon
n02087046-toy_terrier
n02087394-Rhodesian_ridgeback
n02088094-Afghan_hound


In [None]:
# Root folder of the extracted dataset; it contains one sub-folder per breed.
DATASET_PATH = "/content/stanford_dogs/images/Images"


In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.203-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Downloading ultralytics-8.3.203-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m20.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.17-py3-none-any.whl (28 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.203 ultralytics-thop-2.0.17


# **Cleaning the dataset**

In [None]:
import os
import cv2
import hashlib
from PIL import Image
from ultralytics import YOLO
import pandas as pd


# Dataset Paths
# INPUT_DIR holds one sub-folder per breed; cleaned images are written to
# OUTPUT_DIR using the same sub-folder names.

INPUT_DIR = "/content/stanford_dogs/images/Images"   # original breed folders
OUTPUT_DIR = "/content/stanford_dogs_cleaned"        # cleaned output
os.makedirs(OUTPUT_DIR, exist_ok=True)  # exist_ok: re-running the cell does not fail


# Load YOLOv8 Model (pretrained on COCO)
# The detector is used below to keep only images with a "dog" detection and
# no "person" detection.

model = YOLO("yolov8n.pt")  # small & fast model


# Helper Functions

def is_corrupted(image_path):
    """Return True if Pillow cannot open or verify the file at ``image_path``.

    Args:
        image_path: Path to the image file to check.

    Returns:
        False when ``Image.open`` and ``verify()`` both succeed, True when
        either of them raises.
    """
    try:
        # The context manager closes the file handle even when verify() raises,
        # so scanning thousands of files does not leak descriptors.
        with Image.open(image_path) as img:
            img.verify()
        return False
    except Exception:
        # Any open/verify failure marks the file as unusable. Unlike a bare
        # ``except:``, KeyboardInterrupt and SystemExit still propagate, so a
        # long cleaning run can be interrupted.
        return True

def is_blurry(image_path, threshold=100.0):
    """Tell whether an image counts as blurry.

    The image is read as grayscale and the variance of its Laplacian is
    compared with ``threshold``; a variance below the threshold means blurry.
    A file that OpenCV cannot read is also reported as blurry.

    Args:
        image_path: Path to the image file.
        threshold: Variance below which the image is reported as blurry.

    Returns:
        True when the image is unreadable or its Laplacian variance is below
        ``threshold``; otherwise False.
    """
    gray = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # cv2.imread signals an unreadable file by returning None; short-circuit on it.
    return gray is None or cv2.Laplacian(gray, cv2.CV_64F).var() < threshold

def get_image_hash(image_path):
    """Return the hexadecimal MD5 digest of the raw bytes of ``image_path``.

    The digest is computed over the file contents as stored on disk, so only
    byte-identical files produce the same value.
    """
    digest = hashlib.md5()
    with open(image_path, "rb") as stream:
        digest.update(stream.read())
    return digest.hexdigest()

def contains_clear_dog_only(image_path, conf_threshold=0.3):
    """Return True if the detector finds a dog and no person in the image.

    Runs the module-level YOLO ``model`` on ``image_path`` and inspects the
    class labels of the detections in the first result.

    Args:
        image_path: Path to the image file.
        conf_threshold: Minimum detection confidence passed to the model.

    Returns:
        True when at least one detection is labelled "dog" and none is
        labelled "person"; otherwise False (including when nothing is detected).
    """
    # verbose=False silences the per-image log lines, which otherwise flood
    # the cell output when thousands of images are processed.
    results = model(image_path, conf=conf_threshold, verbose=False)

    # Collect the distinct class names detected in this image.
    detected = {model.names[int(cls_id)] for cls_id in results[0].boxes.cls.tolist()}

    return "dog" in detected and "person" not in detected  # keep only dog-only images

def resize_and_save(image_path, output_path, size=(224,224)):
    """Resize an image to ``size`` and save it as RGB to ``output_path``.

    Images narrower or shorter than 100 pixels are rejected and not saved.

    Args:
        image_path: Path of the source image.
        output_path: Destination path; the file extension selects the format.
        size: Target ``(width, height)`` passed to ``Image.resize``.

    Returns:
        True if the resized image was written, False if the image was too
        small or any step (open, convert, resize, save) failed.
    """
    try:
        # The context manager releases the source file handle in every case.
        with Image.open(image_path) as src:
            width, height = src.size
            if width < 100 or height < 100:  # remove very low-res
                return False
            resized = src.convert("RGB").resize(size, Image.LANCZOS)
            resized.save(output_path)
        return True
    except Exception:
        # Best effort: a file that cannot be processed is skipped. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        return False


# Main Cleaning Function

def clean_dataset_fast(input_dir, output_dir, resize_size=(224,224), blur_threshold=100):
    """
    Clean a folder-per-breed image dataset and write the kept images to output_dir.

    For every breed sub-folder of ``input_dir`` each image is dropped if it is
    corrupted, a byte-identical duplicate of an earlier image, blurry, not a
    dog-only image, or too small to resize. Kept images are resized to
    ``resize_size`` and saved under ``output_dir/<breed>/`` with their original
    file name. A per-breed summary is saved as ``cleaning_report.csv`` in
    ``output_dir``.

    Args:
        input_dir: Folder containing one sub-folder per breed.
        output_dir: Folder that receives the cleaned images and the report
            (created if missing).
        resize_size: Target size passed to ``resize_and_save``.
        blur_threshold: Threshold passed to ``is_blurry``.

    Returns:
        pandas.DataFrame with columns Breed, Original, Cleaned, Removed
        (empty, but with these columns, when no breed folder is found).
    """
    image_extensions = (".jpg", ".jpeg", ".png")
    seen_hashes = set()
    stats = []

    # The report is written here even when there is no breed folder to create.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir order is arbitrary; sorting makes the run order, and therefore
    # which copy of a duplicate is kept, reproducible.
    breeds = sorted(
        b for b in os.listdir(input_dir) if os.path.isdir(os.path.join(input_dir, b))
    )
    print(f"Found {len(breeds)} breeds. Starting cleaning...\n")

    for breed in breeds:
        breed_path = os.path.join(input_dir, breed)
        breed_output_dir = os.path.join(output_dir, breed)
        os.makedirs(breed_output_dir, exist_ok=True)

        image_names = sorted(
            f for f in os.listdir(breed_path) if f.lower().endswith(image_extensions)
        )
        total = len(image_names)
        kept = 0

        for image_name in image_names:
            file_path = os.path.join(breed_path, image_name)

            # Skip corrupted
            if is_corrupted(file_path):
                continue

            # Skip duplicates (hashes are shared across breeds)
            img_hash = get_image_hash(file_path)
            if img_hash in seen_hashes:
                continue
            seen_hashes.add(img_hash)

            # Skip blurry
            if is_blurry(file_path, threshold=blur_threshold):
                continue

            # Skip if not dog-only
            if not contains_clear_dog_only(file_path):
                continue

            # Resize & save (also rejects very low-resolution images)
            if resize_and_save(file_path, os.path.join(breed_output_dir, image_name), size=resize_size):
                kept += 1

        stats.append({"Breed": breed, "Original": total, "Cleaned": kept})
        print(f"Breed: {breed} | Original: {total} | Cleaned: {kept} | Removed: {total-kept}")

    # Save CSV report. Explicit columns keep this working when stats is empty.
    df = pd.DataFrame(stats, columns=["Breed", "Original", "Cleaned"])
    df["Removed"] = df["Original"] - df["Cleaned"]
    df.to_csv(os.path.join(output_dir, "cleaning_report.csv"), index=False)
    print("\n✅ Dataset cleaning complete! Report saved as 'cleaning_report.csv'.")

    return df

# Run Cleaning
# Cleans every breed folder under INPUT_DIR into OUTPUT_DIR and keeps the
# per-breed report as a DataFrame.
report_df = clean_dataset_fast(INPUT_DIR, OUTPUT_DIR)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
image 1/1 /content/stanford_dogs/images/Images/n02107908-Appenzeller/n02107908_7122.jpg: 480x640 2 dogs, 180.9ms
Speed: 4.5ms preprocess, 180.9ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /content/stanford_dogs/images/Images/n02107908-Appenzeller/n02107908_5964.jpg: 640x448 1 dog, 172.0ms
Speed: 3.7ms preprocess, 172.0ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 448)

image 1/1 /content/stanford_dogs/images/Images/n02107908-Appenzeller/n02107908_5162.jpg: 480x640 1 dog, 178.0ms
Speed: 4.1ms preprocess, 178.0ms inference, 1.6ms postprocess per image at shape (1, 3, 480, 640)

image 1/1 /content/stanford_dogs/images/Images/n02107908-Appenzeller/n02107908_7700.jpg: 448x640 1 cat, 166.0ms
Speed: 3.6ms preprocess, 166.0ms inference, 1.4ms postprocess per image at shape (1, 3, 448, 640)

image 1/1 /content/stanford_dogs/images/Images/n02107908-Appenzeller/n02107908_1855.jpg: 4

After cleaning, the cleaned images are stored in the "stanford_dogs_cleaned" folder, which is archived below as "stanford_dogs_cleaned.zip".
The cleaning step also creates "cleaning_report.csv", which records how many images were kept and removed for each breed.

In [None]:
# Mount Google Drive under /content/drive.
# NOTE(review): none of the visible cells read from or write to /content/drive;
# confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# zip the cleaned dataset folder into stanford_dogs_cleaned.zip
!zip -r stanford_dogs_cleaned.zip /content/stanford_dogs_cleaned

In [None]:
# download the zipped cleaned dataset through the browser
files.download('stanford_dogs_cleaned.zip')

## Model training with 500 images (10 breeds × 50 images)

In [None]:
#Import Required Libraries
import os
import random

import numpy as np
from skimage.feature import hog
from skimage.io import imread
from skimage.transform import resize
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


In [None]:
#Set Parameters
# Root folder with one sub-folder per breed.
# NOTE(review): the cleaning step writes to /content/stanford_dogs_cleaned;
# confirm how /content/dog_dataset is populated from that output.
DATA_DIR = "/content/dog_dataset"
# Intended image size for feature extraction.
# NOTE(review): clean_dataset_fast resizes to 224x224 by default; confirm the
# images under DATA_DIR really are 64x64.
IMG_SIZE = (64, 64)
# Upper bound on the number of images sampled from each breed folder.
IMAGES_PER_BREED = 50
# Seed Python's global RNG so the shuffles that follow are repeatable.
random.seed(42)


In [None]:
#Load Dataset (10 Breeds × 50 Images) and Extract HOG Features
X, y = [], []
skipped = 0

# Local RNG with a fixed seed (42): re-running this cell on its own selects
# the same images instead of continuing the global random stream.
rng = random.Random(42)

# Select only 10 directories (breeds) and ignore files like CSV
breeds = [b for b in sorted(os.listdir(DATA_DIR)) if os.path.isdir(os.path.join(DATA_DIR, b))][:10]

for breed in breeds:
    folder_path = os.path.join(DATA_DIR, breed)
    # os.listdir order is arbitrary; sort first so the seeded shuffle always
    # starts from the same sequence.
    images = sorted(os.listdir(folder_path))
    rng.shuffle(images)
    images = images[:IMAGES_PER_BREED]  # take 50 images per breed

    for img_name in images:
        img_path = os.path.join(folder_path, img_name)
        try:
            img = imread(img_path)
            # Convert to grayscale: 2-D images are already single-channel;
            # colour images average the first three channels (alpha is dropped).
            gray = img.astype(float) if img.ndim == 2 else np.mean(img[..., :3], axis=2)
            # HOG length depends on the image size, so force a common size to
            # guarantee every feature vector has the same length.
            if gray.shape != tuple(IMG_SIZE):
                gray = resize(gray, IMG_SIZE, anti_aliasing=True)
            # Extract HOG features
            features = hog(gray, pixels_per_cell=(16,16), cells_per_block=(2,2), channel_axis=None)
            X.append(features)
            y.append(breed)
        except Exception as exc:
            # Skip problematic images, but report them instead of hiding them.
            skipped += 1
            print(f"Skipping {img_path}: {exc}")

X = np.array(X)
y = np.array(y)
print(f"Dataset loaded: {X.shape[0]} samples, {len(breeds)} breeds")
if skipped:
    print(f"Skipped {skipped} unreadable images")


Dataset loaded: 500 samples, 10 breeds


In [None]:
#Train-Test Split and Scaling
# Hold out 20% of the samples; stratifying on y keeps the breed proportions
# the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training part only, then apply the same transform to
# both parts so no test-set statistics leak into training.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

print("Train/test split and scaling done")


Train/test split and scaling done


In [None]:
#Train and Evaluate SVM
# RBF-kernel SVM fitted on the scaled training features.
svm_clf = SVC(C=10, kernel='rbf', gamma='scale').fit(X_train, y_train)
y_pred_svm = svm_clf.predict(X_test)

# Accuracy as a percentage rounded to two decimals, then the per-breed report.
svm_accuracy_pct = round(accuracy_score(y_test, y_pred_svm) * 100, 2)
print("=== SVM ===")
print(f"Accuracy: {svm_accuracy_pct} %")
print(classification_report(y_test, y_pred_svm))


=== SVM ===
Accuracy: 23.0 %
                               precision    recall  f1-score   support

          n02085620-Chihuahua       0.17      0.30      0.21        10
   n02085782-Japanese_spaniel       0.18      0.20      0.19        10
        n02085936-Maltese_dog       0.00      0.00      0.00        10
           n02086079-Pekinese       0.12      0.10      0.11        10
           n02086240-Shih-Tzu       0.00      0.00      0.00        10
   n02086646-Blenheim_spaniel       0.23      0.30      0.26        10
           n02086910-papillon       0.42      0.50      0.45        10
        n02087046-toy_terrier       0.29      0.20      0.24        10
n02087394-Rhodesian_ridgeback       0.29      0.20      0.24        10
       n02088094-Afghan_hound       0.56      0.50      0.53        10

                     accuracy                           0.23       100
                    macro avg       0.22      0.23      0.22       100
                 weighted avg       0.22      

In [None]:
#Train and Evaluate Logistic Regression
# One-vs-rest logistic regression: one binary classifier per breed.
# LogisticRegression's multi_class='ovr' argument is deprecated in recent
# scikit-learn releases; OneVsRestClassifier is the documented replacement
# and keeps the same one-vs-rest behaviour.
log_reg = OneVsRestClassifier(LogisticRegression(max_iter=1000, solver='lbfgs'))
log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

print("=== Logistic Regression ===")
print("Accuracy:", round(accuracy_score(y_test, y_pred_lr)*100,2), "%")
print(classification_report(y_test, y_pred_lr))




=== Logistic Regression ===
Accuracy: 22.0 %
                               precision    recall  f1-score   support

          n02085620-Chihuahua       0.20      0.20      0.20        10
   n02085782-Japanese_spaniel       0.22      0.20      0.21        10
        n02085936-Maltese_dog       0.14      0.10      0.12        10
           n02086079-Pekinese       0.15      0.20      0.17        10
           n02086240-Shih-Tzu       0.00      0.00      0.00        10
   n02086646-Blenheim_spaniel       0.13      0.20      0.16        10
           n02086910-papillon       0.57      0.40      0.47        10
        n02087046-toy_terrier       0.38      0.30      0.33        10
n02087394-Rhodesian_ridgeback       0.19      0.30      0.23        10
       n02088094-Afghan_hound       0.30      0.30      0.30        10

                     accuracy                           0.22       100
                    macro avg       0.23      0.22      0.22       100
                 weighted avg 

In [None]:
#Train and Evaluate Decision Tree
# Decision tree with a fixed random_state so the fitted tree is repeatable.
dt_clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_test)

# Accuracy as a percentage rounded to two decimals, then the per-breed report.
dt_accuracy_pct = round(accuracy_score(y_test, y_pred_dt) * 100, 2)
print("=== Decision Tree ===")
print(f"Accuracy: {dt_accuracy_pct} %")
print(classification_report(y_test, y_pred_dt))


=== Decision Tree ===
Accuracy: 13.0 %
                               precision    recall  f1-score   support

          n02085620-Chihuahua       0.29      0.20      0.24        10
   n02085782-Japanese_spaniel       0.27      0.30      0.29        10
        n02085936-Maltese_dog       0.00      0.00      0.00        10
           n02086079-Pekinese       0.00      0.00      0.00        10
           n02086240-Shih-Tzu       0.09      0.10      0.10        10
   n02086646-Blenheim_spaniel       0.25      0.10      0.14        10
           n02086910-papillon       0.20      0.20      0.20        10
        n02087046-toy_terrier       0.15      0.30      0.20        10
n02087394-Rhodesian_ridgeback       0.00      0.00      0.00        10
       n02088094-Afghan_hound       0.08      0.10      0.09        10

                     accuracy                           0.13       100
                    macro avg       0.13      0.13      0.13       100
                 weighted avg       

In [None]:
#Compare Model Accuracies
# Test-set predictions of each model, in the order they are reported.
model_predictions = {
    "SVM": y_pred_svm,
    "Logistic Regression": y_pred_lr,
    "Decision Tree": y_pred_dt,
}

print("\nSummary of Model Accuracies:")
for model_name, predictions in model_predictions.items():
    print(f"{model_name}: {round(accuracy_score(y_test, predictions)*100,2)}%")



Summary of Model Accuracies:
SVM: 23.0%
Logistic Regression: 22.0%
Decision Tree: 13.0%
