# Dogs vs. Cats: dataset download and train/validation split

In [4]:
!pip install kagglehub
import kagglehub

path = kagglehub.dataset_download("salader/dogs-vs-cats")

print("Path to dataset files:", path)

Defaulting to user installation because normal site-packages is not writeable
Path to dataset files: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1


In [5]:
!pip install scipy
!pip install matplotlib>=3.7.0
!pip install tensorflow
import os
import shutil
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import zipfile 


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


# --- Configuration ---

In [6]:
# Tunable run parameters. How they are consumed (image resizing, batching,
# number of training epochs) is outside this section -- presumably by the
# data generators and the training call; confirm against the later cells.
IMAGE_WIDTH = 200
IMAGE_HEIGHT = 200
BATCH_SIZE = 8
EPOCHS = 5

# Root of the locally structured dataset, with one sub-directory per split.
BASE_DATA_DIR = 'dogs_vs_cats_dataset'
TRAIN_DIR, VALIDATION_DIR = (
    os.path.join(BASE_DATA_DIR, split_name)
    for split_name in ('train', 'validation')
)

# Output locations for the final model file and the checkpoint file.
CHECKPOINT_FILEPATH = 'trained_models/best_model_checkpoint.h5'
MODEL_SAVE_PATH = 'trained_models/dogs_vs_cats_vgg_like.h5'

# --- Step 1: Loading the dataset using kagglehub ---

In [7]:
# Resolve the KaggleHub cache location of the dataset and the two per-class
# source folders that the following code copies images from.
# Defines: download_path, SOURCE_TRAIN_CATS_DIR, SOURCE_TRAIN_DOGS_DIR.
#
# Failures are raised as exceptions instead of calling exit(): exit() is meant
# for interactive sessions and in a notebook kernel it does not reliably stop
# the cell, which would let the destructive directory clean-up below run
# against an unresolved source.
download_path = None

try:
    download_path = kagglehub.dataset_download("salader/dogs-vs-cats")
except Exception as e:
    # Only the download itself is guarded, so the credentials hint is shown
    # exclusively for download problems and the original error is re-raised.
    print(f"Error during dataset download or initial path identification: {e}")
    print("Please ensure 'kagglehub' is installed and your Kaggle API credentials are correctly set up (kaggle.json in ~/.kaggle/).")
    raise

print(f"Dataset downloaded to: {download_path}")

# The raw images are expected under 'dogs_vs_cats/train/' inside the download,
# already organised into 'cats/' and 'dogs/' sub-folders. That 'train' folder
# in the cache is used as the source for our own train/validation split.
SOURCE_TRAIN_CATS_DIR = os.path.join(download_path, 'dogs_vs_cats', 'train', 'cats')
SOURCE_TRAIN_DOGS_DIR = os.path.join(download_path, 'dogs_vs_cats', 'train', 'dogs')

if not os.path.isdir(SOURCE_TRAIN_CATS_DIR) or not os.path.isdir(SOURCE_TRAIN_DOGS_DIR):
    raise FileNotFoundError(
        "CRITICAL ERROR: Expected source directories not found or incomplete:\n"
        f"  Cats source: {SOURCE_TRAIN_CATS_DIR}\n"
        f"  Dogs source: {SOURCE_TRAIN_DOGS_DIR}\n"
        "Please manually verify the structure of the KaggleHub downloaded dataset."
    )

print("Identified source directories for images:")
print(f"  Cats: {SOURCE_TRAIN_CATS_DIR}")
print(f"  Dogs: {SOURCE_TRAIN_DOGS_DIR}")

# --- Prepare local directory structure for ImageDataGenerator ---
# Rebuilds BASE_DATA_DIR from scratch as <split>/<class>/ folders and copies
# every source image into either the train or the validation split.
print("\n--- Preparing local directory structure for ImageDataGenerator ---")

# Fixed seed so train/validation membership is identical on every run.
SPLIT_SEED = 42
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def _list_shuffled_images(source_dir, rng):
    """Return the image file names found in `source_dir`, shuffled with `rng`.

    The names are sorted before shuffling because os.listdir() returns entries
    in arbitrary order; without the sort a seeded shuffle would still not be
    reproducible.
    """
    names = sorted(f for f in os.listdir(source_dir) if f.lower().endswith(IMAGE_EXTENSIONS))
    rng.shuffle(names)
    return names


def _copy_split(file_names, num_train, source_dir, class_dir, label):
    """Copy one class's images into the train and validation folders.

    The first `num_train` entries of `file_names` go to TRAIN_DIR/<class_dir>,
    the remainder to VALIDATION_DIR/<class_dir>. A failed copy is reported and
    counted instead of aborting the loop.

    Returns the number of files that could not be copied.
    """
    failures = 0
    for i, img_name in enumerate(file_names):
        target_root = TRAIN_DIR if i < num_train else VALIDATION_DIR
        src_path = os.path.join(source_dir, img_name)
        dst_path = os.path.join(target_root, class_dir, img_name)
        try:
            shutil.copy(src_path, dst_path)
        except OSError as e:
            failures += 1
            print(f"ERROR: Could not copy {label} image '{img_name}': {e}")
    return failures


# Clean up existing structure if it exists to avoid old files
if os.path.exists(BASE_DATA_DIR):
    print(f"Removing existing '{BASE_DATA_DIR}' directory for a clean start...")
    shutil.rmtree(BASE_DATA_DIR)

for split_dir in (TRAIN_DIR, VALIDATION_DIR):
    for class_dir in ('dogs', 'cats'):
        os.makedirs(os.path.join(split_dir, class_dir), exist_ok=True)

# A private generator keeps the split reproducible without reseeding the
# global `random` state that other code may rely on.
split_rng = random.Random(SPLIT_SEED)

# List all image files from the source 'cats' and 'dogs' directories
all_cat_files = _list_shuffled_images(SOURCE_TRAIN_CATS_DIR, split_rng)
all_dog_files = _list_shuffled_images(SOURCE_TRAIN_DOGS_DIR, split_rng)

print(f"Found {len(all_cat_files)} cat images and {len(all_dog_files)} dog images in source.")

# Define split ratio (e.g., 80% train, 20% validation)
train_split_ratio = 0.8

# Distribute cat images
num_train_cats = int(len(all_cat_files) * train_split_ratio)
failed_cat_copies = _copy_split(all_cat_files, num_train_cats, SOURCE_TRAIN_CATS_DIR, 'cats', 'cat')

# Distribute dog images
num_train_dogs = int(len(all_dog_files) * train_split_ratio)
failed_dog_copies = _copy_split(all_dog_files, num_train_dogs, SOURCE_TRAIN_DOGS_DIR, 'dogs', 'dog')

# Only claim success when every file was actually copied.
failed_copies = failed_cat_copies + failed_dog_copies
if failed_copies:
    print(f"WARNING: {failed_copies} image(s) could not be copied; the local dataset is incomplete.")
else:
    print("Dataset successfully structured into local train/validation directories.")
print(f"  Training Cats: {len(os.listdir(os.path.join(TRAIN_DIR, 'cats')))}")
print(f"  Training Dogs: {len(os.listdir(os.path.join(TRAIN_DIR, 'dogs')))}")
print(f"  Validation Cats: {len(os.listdir(os.path.join(VALIDATION_DIR, 'cats')))}")
print(f"  Validation Dogs: {len(os.listdir(os.path.join(VALIDATION_DIR, 'dogs')))}")


Dataset downloaded to: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1
Identified source directories for images:
  Cats: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1/dogs_vs_cats/train/cats
  Dogs: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1/dogs_vs_cats/train/dogs

--- Preparing local directory structure for ImageDataGenerator ---
Found 10000 cat images and 10000 dog images in source.
Dataset successfully structured into local train/validation directories.
  Training Cats: 8000
  Training Dogs: 8000
  Validation Cats: 2000
  Validation Dogs: 2000
