In [1]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"racist20","key":"77ac2b78e1fddb50f02a36e819b11815"}'}

In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c petfinder-adoption-prediction
!unzip -q petfinder-adoption-prediction.zip -d /content/petfinder_data

Downloading petfinder-adoption-prediction.zip to /content
 97% 1.88G/1.94G [00:01<00:00, 1.15GB/s]
100% 1.94G/1.94G [00:01<00:00, 1.15GB/s]


In [2]:
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar -O /content/stanford_images.tar
!mkdir -p /content/stanford_dogs
!tar -xf /content/stanford_images.tar -C /content/stanford_dogs

--2025-05-19 15:10:49--  http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
Resolving vision.stanford.edu (vision.stanford.edu)... 171.64.68.10
Connecting to vision.stanford.edu (vision.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 793579520 (757M) [application/x-tar]
Saving to: ‘/content/stanford_images.tar’


2025-05-19 15:11:05 (48.6 MB/s) - ‘/content/stanford_images.tar’ saved [793579520/793579520]



In [3]:
!wget -q http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz
!wget -q http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz

!mkdir /content/oxford_pet_data
!tar -xzf images.tar.gz -C /content/oxford_pet_data
!tar -xzf annotations.tar.gz -C /content/oxford_pet_data

In [4]:
import os, json, shutil
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
# Load breed labels from PetFinder
breed_labels = pd.read_csv("/content/petfinder_data/PetFinder-BreedLabels.csv")
petfinder_meta = "/content/petfinder_data/train_metadata"
petfinder_images = "/content/petfinder_data/train_images"

def extract_petfinder():
    data = []
    for file in tqdm(os.listdir(petfinder_meta)):
        with open(os.path.join(petfinder_meta, file)) as f:
            meta = json.load(f)
        pet_id = file.replace('.json', '')
        breed_id = meta.get("Breed1")
        breed_match = breed_labels[breed_labels["BreedID"] == breed_id]
        if breed_match.empty:
            continue
        breed = breed_match["BreedName"].values[0]
        img_path = os.path.join(petfinder_images, f"{pet_id}-1.jpg")
        if os.path.exists(img_path):
            data.append((img_path, breed.lower().strip()))
    return data

def extract_oxford():
    anno_path = "/content/oxford_pet_data/annotations/list.txt"
    anno_df = pd.read_csv(anno_path, skiprows=6, sep=" ", header=None)
    anno_df.columns = ["file", "class_id", "species", "breed_id"]
    data = []
    for _, row in anno_df.iterrows():
        img_path = f"/content/oxford_pet_data/images/{row['file']}.jpg"
        if os.path.exists(img_path):
            breed = row["file"].split("_")[0]
            data.append((img_path, breed.lower().strip()))
    return data

def extract_stanford():
    root = "/content/stanford_dogs/Images"
    data = []
    for breed_dir in os.listdir(root):
        breed_name = breed_dir.split("-")[-1].replace("_", " ").lower().strip()
        for img_file in os.listdir(os.path.join(root, breed_dir)):
            img_path = os.path.join(root, breed_dir, img_file)
            data.append((img_path, breed_name))
    return data

petfinder_data = extract_petfinder()
oxford_data = extract_oxford()
stanford_data = extract_stanford()

100%|██████████| 58311/58311 [00:19<00:00, 3038.41it/s]


In [6]:
combined_data = petfinder_data + stanford_data + oxford_data

# Convert to DataFrame
df = pd.DataFrame(combined_data, columns=["image_path", "breed_name"])

# Normalize breed names: lowercase and strip whitespace
df["breed_name"] = df["breed_name"].str.lower().str.strip()

# Drop duplicates
df = df.drop_duplicates(subset=["image_path", "breed_name"])

# Count breed occurrences
breed_counts = df["breed_name"].value_counts()

print(f"✅ Total images (raw): {len(df)}")
print(f"✅ Unique breeds (raw): {df['breed_name'].nunique()}")
print("📊 Sample breed counts:\n", breed_counts.head(10))

✅ Total images (raw): 27929
✅ Unique breeds (raw): 145
📊 Sample breed counts:
 breed_name
pomeranian      419
samoyed         418
american        400
pug             400
english         396
beagle          395
newfoundland    391
basset          375
keeshond        357
chihuahua       352
Name: count, dtype: int64


In [7]:
from PIL import Image, ImageEnhance
import random

balanced_data = []

def augment_image(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        operations = [
            lambda x: x.rotate(random.randint(-15, 15)),
            lambda x: ImageEnhance.Color(x).enhance(random.uniform(0.8, 1.2)),
            lambda x: ImageEnhance.Brightness(x).enhance(random.uniform(0.7, 1.3)),
            lambda x: ImageEnhance.Contrast(x).enhance(random.uniform(0.8, 1.2))
        ]
        op = random.choice(operations)
        return op(image)
    except:
        return None

# Group by breed
grouped = df.groupby('breed_name')

for breed, group in grouped:
    images = group['image_path'].tolist()
    count = len(images)

    if count >= 200:
        # Downsample to 200
        selected = random.sample(images, 200)
        balanced_data.extend([(img, breed) for img in selected])

    else:
        # Use all existing images
        balanced_data.extend([(img, breed) for img in images])
        needed = 200 - count

        for i in range(needed):
            original = random.choice(images)
            augmented = augment_image(original)
            if augmented:
                # Save augmented to a temporary path
                aug_path = f"/content/augmented/{breed}_{i}.jpg"
                os.makedirs(os.path.dirname(aug_path), exist_ok=True)
                augmented.save(aug_path)
                balanced_data.append((aug_path, breed))

# Final DataFrame
balanced_df = pd.DataFrame(balanced_data, columns=["image_path", "breed_name"])

# Check
print("✅ Final dataset size:", len(balanced_df))
print("✅ Unique breeds:", balanced_df["breed_name"].nunique())
print("✅ Images per breed (should all be 200):\n")
print(balanced_df['breed_name'].value_counts().head())

✅ Final dataset size: 29000
✅ Unique breeds: 145
✅ Images per breed (should all be 200):

breed_name
abyssinian             200
affenpinscher          200
afghan hound           200
african hunting dog    200
airedale               200
Name: count, dtype: int64


In [8]:
from sklearn.preprocessing import LabelEncoder
import pickle

# Encode breed labels
le = LabelEncoder()
balanced_df["label"] = le.fit_transform(balanced_df["breed_name"])

# Save encoder
with open("/content/label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

print("✅ LabelEncoder saved at: /content/label_encoder.pkl")
print("✅ Sample class labels:\n", list(le.classes_)[:10])

✅ LabelEncoder saved at: /content/label_encoder.pkl
✅ Sample class labels:
 ['abyssinian', 'affenpinscher', 'afghan hound', 'african hunting dog', 'airedale', 'american', 'american staffordshire terrier', 'appenzeller', 'australian terrier', 'basenji']


In [9]:
from sklearn.model_selection import train_test_split

# Train/Test Split (80% train, 20% temp)
train_df, temp_df = train_test_split(
    balanced_df,
    test_size=0.2,
    stratify=balanced_df["breed_name"],
    random_state=42
)

# Split temp into val/test (50/50 of remaining 20% = 10% each)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["breed_name"],
    random_state=42
)

print("✅ Split Sizes:")
print(f"Train: {len(train_df)}")
print(f"Val: {len(val_df)}")
print(f"Test: {len(test_df)}")

✅ Split Sizes:
Train: 23200
Val: 2900
Test: 2900


In [10]:
import shutil

# Set output directory
output_dir = "/content/petify_data"
os.makedirs(output_dir, exist_ok=True)

def copy_images(df, subset):
    for _, row in df.iterrows():
        label = row["breed_name"]
        src = row["image_path"]
        dst_dir = os.path.join(output_dir, subset, label)
        os.makedirs(dst_dir, exist_ok=True)
        dst_path = os.path.join(dst_dir, os.path.basename(src))
        try:
            shutil.copy(src, dst_path)
        except Exception as e:
            print(f"⚠️ Failed to copy {src}: {e}")

# Copy images to folders
copy_images(train_df, "train")
copy_images(val_df, "val")
copy_images(test_df, "test")

print("✅ Image folders created under /content/petify_data")

✅ Image folders created under /content/petify_data


In [11]:
import os

train_breeds = os.listdir(f"{output_dir}/train")
print(f"✅ Total Breeds in Training Folder: {len(train_breeds)}")

# Check breed sample
print("📂 Sample Breed Folders:", train_breeds[:5])

# Check number of images in one breed
sample_breed = train_breeds[0]
num_images = len(os.listdir(f"{output_dir}/train/{sample_breed}"))
print(f"✅ Images in breed '{sample_breed}': {num_images}")

✅ Total Breeds in Training Folder: 145
📂 Sample Breed Folders: ['eskimo dog', 'bedlington terrier', 'bernese mountain dog', 'miniature', 'basenji']
✅ Images in breed 'eskimo dog': 160


In [12]:
# prompt: download tensorflow

!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.19.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.19.0 (from tensorflow)
  Downloading tensorboard-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow-io-gcs-filesystem>=0.23.1 (from tensorflow)
  Downloading tensorflow_io_gcs_filesystem-0.37.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (14 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow

In [13]:
import tensorflow as tf

# Constants
BATCH_SIZE = 32
IMG_SIZE = (224, 224)
DATA_DIR = "/content/petify_data"

# Load datasets
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(DATA_DIR, "train"),
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="categorical"
)

val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(DATA_DIR, "val"),
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="categorical"
)

test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    os.path.join(DATA_DIR, "test"),
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    label_mode="categorical"
)


Found 23200 files belonging to 145 classes.
Found 2900 files belonging to 145 classes.
Found 2900 files belonging to 145 classes.


In [14]:
import numpy as np
import tensorflow as tf # Ensure tf is imported

# Helper function to convert tf.data.Dataset to numpy arrays
def dataset_to_numpy(ds):
    X = []
    y = []
    print(f"Processing dataset with element spec: {ds.element_spec}") # Print element spec for debugging
    expected_flat_shape = None
    for images, labels in ds:
        # Process each image in the batch
        for i in range(images.shape[0]):
            try:
                image = images[i]
                label = labels[i]
                flattened_image = image.numpy().flatten()  # flatten image

                if expected_flat_shape is None:
                    expected_flat_shape = flattened_image.shape[0]
                    print(f"Expected flattened image shape: {expected_flat_shape}")

                # Check if the flattened shape matches the expected shape
                if flattened_image.shape[0] == expected_flat_shape:
                    X.append(flattened_image)
                    # convert one-hot to class index if label_mode was 'categorical'
                    if label.shape.rank > 0:
                         y.append(np.argmax(label.numpy()))
                    else: # Handle scalar labels if label_mode was 'int'
                         y.append(label.numpy())
                else:
                    print(f"⚠️ Skipping image due to inconsistent shape. Expected {expected_flat_shape}, got {flattened_image.shape[0]}")

            except Exception as e:
                print(f"⚠️ Error processing image in batch: {e}")

    # Convert lists to numpy arrays. Use dtype=object if necessary to handle potential shape differences that weren't caught
    # This might indicate a more fundamental issue if it's still needed, but can help in debugging.
    # np.array(X, dtype=object)
    try:
        X_array = np.array(X)
        y_array = np.array(y)
        return X_array, y_array
    except ValueError as e:
        print(f"Fatal error creating numpy array: {e}")
        print(f"Shape of collected X elements: {[item.shape for item in X]}")
        print(f"Number of collected X elements: {len(X)}")
        raise e # Re-raise the error after printing debug info


X_train, y_train = dataset_to_numpy(train_ds)
X_val, y_val = dataset_to_numpy(val_ds)
X_test, y_test = dataset_to_numpy(test_ds)

print("✅ Converted datasets to flat arrays.")
print(f"Shape: X_train = {X_train.shape}, y_train = {y_train.shape}")
print(f"Shape: X_val = {X_val.shape}, y_val = {y_val.shape}")
print(f"Shape: X_test = {X_test.shape}, y_test = {y_test.shape}")

Processing dataset with element spec: (TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 145), dtype=tf.float32, name=None))
Expected flattened image shape: 150528
Processing dataset with element spec: (TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 145), dtype=tf.float32, name=None))
Expected flattened image shape: 150528
Processing dataset with element spec: (TensorSpec(shape=(None, 224, 224, 3), dtype=tf.float32, name=None), TensorSpec(shape=(None, 145), dtype=tf.float32, name=None))
Expected flattened image shape: 150528
✅ Converted datasets to flat arrays.
Shape: X_train = (23200, 150528), y_train = (23200,)
Shape: X_val = (2900, 150528), y_val = (2900,)
Shape: X_test = (2900, 150528), y_test = (2900,)


In [15]:
from sklearn.decomposition import PCA

# Reduce to 500 features (tunable)
pca = PCA(n_components=500)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)
X_test_pca = pca.transform(X_test)


In [17]:
# prompt: install XGBoost

!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.1-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.1-py3-none-manylinux_2_28_x86_64.whl (253.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.9/253.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.26.5-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (318.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m318.1/318.1 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.26.5 xgboost-3.0.1


In [22]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define model
xgb = XGBClassifier(
    n_estimators=100,
    max_depth=10,
    learning_rate=0.1,
    objective='multi:softprob',
    num_class=len(le.classes_),  # number of classes = num breeds
    eval_metric='mlogloss',
    use_label_encoder=False
)

# Train
xgb.fit(X_train_pca, y_train, eval_set=[(X_val_pca, y_val)], verbose=True)

# Evaluate
y_pred = xgb.predict(X_test_pca)
acc = accuracy_score(y_test, y_pred)
print(f"✅ XGBoost Test Accuracy: {acc:.2%}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-mlogloss:4.91390
[1]	validation_0-mlogloss:4.86186
[2]	validation_0-mlogloss:4.81361
[3]	validation_0-mlogloss:4.76795
[4]	validation_0-mlogloss:4.73020
[5]	validation_0-mlogloss:4.69216
[6]	validation_0-mlogloss:4.66002
[7]	validation_0-mlogloss:4.62769
[8]	validation_0-mlogloss:4.59747
[9]	validation_0-mlogloss:4.57073
[10]	validation_0-mlogloss:4.54190
[11]	validation_0-mlogloss:4.51709
[12]	validation_0-mlogloss:4.49502
[13]	validation_0-mlogloss:4.47315
[14]	validation_0-mlogloss:4.45160
[15]	validation_0-mlogloss:4.43265
[16]	validation_0-mlogloss:4.41466
[17]	validation_0-mlogloss:4.39722
[18]	validation_0-mlogloss:4.38249
[19]	validation_0-mlogloss:4.36660
[20]	validation_0-mlogloss:4.35238
[21]	validation_0-mlogloss:4.33937
[22]	validation_0-mlogloss:4.32679
[23]	validation_0-mlogloss:4.31452
[24]	validation_0-mlogloss:4.30280
[25]	validation_0-mlogloss:4.29277
[26]	validation_0-mlogloss:4.28232
[27]	validation_0-mlogloss:4.27228
[28]	validation_0-mlogloss:4.2

In [24]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [25]:
import joblib
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

joblib.dump(xgb, '/content/drive/MyDrive/xgboost_petbreed_model.pkl')
joblib.dump(pca, '/content/drive/MyDrive/pca_transformer.pkl')
joblib.dump(le, '/content/drive/MyDrive/label_encoder.pkl')

print("✅ Saved XGBoost model, PCA, and Label Encoder.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Saved XGBoost model, PCA, and Label Encoder.
