# Dogs vs Cats — ready-to-run notebook

This notebook is a ready-to-run solution template for the Kaggle **Dogs vs Cats Redux** contest.

It uses transfer learning (VGG16) + data augmentation and includes training, prediction and submission cells.

**Note:** to reach LogLoss < 0.3 you'll likely need to train longer and fine-tune on a full GPU instance (Kaggle/GPU or Colab Pro).

In [None]:
# Environment check & imports
import os
import re
from glob import glob

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import ImageDataGenerator
print('TensorFlow version:', tf.__version__)


In [None]:
# Paths and hyperparameters - edit these to match your environment
IMG_SIZE = (224, 224)  # passed to Keras as target_size and used for the VGG16 input_shape
BATCH_SIZE = 32
EPOCHS = 20  # increase on Kaggle/GPU
TRAIN_DIR = 'data/train_sample'   # change to '/kaggle/input/dogs-vs-cats-redux-kernels-edition/train' on Kaggle if needed
TEST_DIR = 'data/test_sample'     # change accordingly
OUTPUT_MODEL = 'cats_dogs_vgg16.h5'
SEED = 42

# Seed the global NumPy and TensorFlow generators as well; SEED is otherwise only
# handed to the data generators, which leaves weight initialisation and dropout
# unseeded and makes runs hard to compare.
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [None]:
# List the sample files and show a few training images as a visual sanity check.
def _list_train_jpgs(root):
    """Return the sorted .jpg paths under `root`.

    Files directly inside `root` are preferred (flat 'cat.1.jpg' layout). When
    there are none, the immediate sub-folders are searched instead so the
    preview also works with a 'cat/' + 'dog/' directory layout.
    """
    flat = sorted(glob(os.path.join(root, '*.jpg')))
    if flat:
        return flat
    return sorted(glob(os.path.join(root, '*', '*.jpg')))


train_files = _list_train_jpgs(TRAIN_DIR)
test_files = sorted(glob(os.path.join(TEST_DIR, '*.jpg')))
print('Train sample files:', len(train_files))
print('Test sample files:', len(test_files))

# show up to 6 examples
preview_files = train_files[:6]
if preview_files:
    fig, axes = plt.subplots(1, len(preview_files), figsize=(12, 4), squeeze=False)
    for ax, path in zip(axes[0], preview_files):
        ax.axis('off')
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals an unreadable/corrupt file by returning None.
            ax.set_title('unreadable', fontsize=8)
            continue
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads images as BGR
        # cv2.resize expects (width, height); IMG_SIZE is used as (height, width) elsewhere.
        img = cv2.resize(img, (IMG_SIZE[1], IMG_SIZE[0]))
        ax.imshow(img.astype('uint8'))
        ax.set_title(os.path.basename(path), fontsize=8)
    plt.show()
else:
    print('No training images found under', TRAIN_DIR)


In [None]:
# Data generators. Class indices are pinned to the Kaggle label convention: cat -> 0, dog -> 1.
CLASS_NAMES = ['cat', 'dog']
VAL_SPLIT = 0.05  # small val split; for final eval use separate holdout
_vgg_preprocess = tf.keras.applications.vgg16.preprocess_input

train_datagen = ImageDataGenerator(
    preprocessing_function=_vgg_preprocess,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
    validation_split=VAL_SPLIT,
)

# Validation images get the same preprocessing and the same split fraction but
# NO augmentation, so val_loss is measured on unmodified images.
val_datagen = ImageDataGenerator(
    preprocessing_function=_vgg_preprocess,
    validation_split=VAL_SPLIT,
)

test_datagen = ImageDataGenerator(preprocessing_function=_vgg_preprocess)


def _flat_training_frame(train_dir):
    """Build a filename/class table from a flat folder of 'cat.N.jpg' / 'dog.N.jpg' files.

    The rows are shuffled with SEED because the validation subset is taken as a
    contiguous slice of the table; an alphabetically sorted table would put a
    single class into that slice.
    """
    records = []
    for path in sorted(glob(os.path.join(train_dir, '*.jpg'))):
        label = os.path.basename(path).split('.')[0]
        if label in CLASS_NAMES:
            records.append({'filename': os.path.abspath(path), 'class': label})
    frame = pd.DataFrame(records, columns=['filename', 'class'])
    if frame.empty:
        return frame
    return frame.sample(frac=1.0, random_state=SEED).reset_index(drop=True)


# Two layouts are supported:
#   TRAIN_DIR/cat/*.jpg + TRAIN_DIR/dog/*.jpg   -> flow_from_directory(TRAIN_DIR)
#   TRAIN_DIR/cat.N.jpg, TRAIN_DIR/dog.N.jpg    -> flow_from_dataframe (Kaggle's raw layout)
# TRAIN_DIR itself is passed as the root (not its parent); otherwise sibling
# folders such as the test folder would be picked up as classes.
_has_class_dirs = all(os.path.isdir(os.path.join(TRAIN_DIR, name)) for name in CLASS_NAMES)
_train_frame = None if _has_class_dirs else _flat_training_frame(TRAIN_DIR)
if _train_frame is not None and _train_frame.empty:
    raise FileNotFoundError(
        "No training images found: expected either 'cat/' and 'dog/' sub-folders "
        "or files named 'cat.N.jpg' / 'dog.N.jpg' in " + repr(TRAIN_DIR)
    )


def _make_flow(datagen, subset, shuffle):
    """Create the 'training' or 'validation' iterator for whichever layout TRAIN_DIR has."""
    options = dict(
        target_size=IMG_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='binary',
        classes=CLASS_NAMES,
        subset=subset,
        shuffle=shuffle,
        seed=SEED,
    )
    if _has_class_dirs:
        return datagen.flow_from_directory(TRAIN_DIR, **options)
    return datagen.flow_from_dataframe(
        _train_frame, x_col='filename', y_col='class', **options
    )


train_generator = _make_flow(train_datagen, subset='training', shuffle=True)
val_generator = _make_flow(val_datagen, subset='validation', shuffle=False)


In [None]:
# Alternative input pipeline for a flat TRAIN_DIR with files like 'dog.123.jpg' and 'cat.456.jpg'
# (no subfolders). This cell only defines the class; nothing uses it unless you create instances
# yourself and pass them to model.fit (see the example at the bottom).
class FileSequence(tf.keras.utils.Sequence):
    """Keras Sequence that loads images from a list of paths and labels them by filename.

    Args:
        files: iterable of image paths. A private copy is kept, so the caller's
            list is not reordered when the sequence shuffles.
        batch_size: number of images per batch.
        img_size: (height, width) of the returned images.
        shuffle: when True, the file order is reshuffled at construction and
            after every epoch.

    Each item is a tuple (X, y): X is a float32 array of shape
    (batch, height, width, 3) already passed through the VGG16
    preprocess_input, y is a float32 vector with 1.0 for files whose basename
    starts with 'dog' and 0.0 otherwise.
    """

    def __init__(self, files, batch_size, img_size, shuffle=True):
        super().__init__()
        self.files = list(files)  # copy: on_epoch_end shuffles in place
        self.batch_size = batch_size
        self.img_size = img_size
        self.shuffle = shuffle
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last batch may be smaller)."""
        return int(np.ceil(len(self.files) / self.batch_size))

    def on_epoch_end(self):
        """Reshuffle the file order when shuffling is enabled."""
        if self.shuffle:
            np.random.shuffle(self.files)

    def __getitem__(self, idx):
        """Load, resize and preprocess batch number `idx`.

        Raises:
            ValueError: if an image cannot be read (training labels cannot be
                paired with a missing image).
        """
        batch_files = self.files[idx * self.batch_size:(idx + 1) * self.batch_size]
        height, width = self.img_size
        X = np.zeros((len(batch_files), height, width, 3), dtype=np.float32)
        y = np.zeros((len(batch_files),), dtype=np.float32)
        for i, p in enumerate(batch_files):
            img = cv2.imread(p)
            if img is None:
                # cv2.imread returns None instead of raising on unreadable files.
                raise ValueError(f'Could not read image: {p}')
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
            img = cv2.resize(img, (width, height))      # cv2 wants (width, height)
            X[i] = tf.keras.applications.vgg16.preprocess_input(img.astype(np.float32))
            y[i] = 1.0 if os.path.basename(p).startswith('dog') else 0.0
        return X, y

# Example usage (shuffle before splitting: a sorted file list holds all cats first):
# shuffled_files = list(np.random.RandomState(SEED).permutation(train_files))
# val_count = max(1, int(0.05 * len(shuffled_files)))
# val_seq = FileSequence(shuffled_files[:val_count], BATCH_SIZE, IMG_SIZE, shuffle=False)
# train_seq = FileSequence(shuffled_files[val_count:], BATCH_SIZE, IMG_SIZE)


In [None]:
# Build model: frozen ImageNet VGG16 convolutional base + small binary classifier head
base_model = VGG16(
    include_top=False,
    weights='imagenet',
    input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3),
)
# Freeze every layer of the base so that only the new head is trained at first.
for layer in base_model.layers:
    layer.trainable = False

# Head: global average pooling -> Dense(256, relu) -> Dropout(0.5) -> sigmoid output.
head_layers = [
    layers.GlobalAveragePooling2D(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.5),
]
x = base_model.output
for head_layer in head_layers:
    x = head_layer(x)
out = layers.Dense(1, activation='sigmoid')(x)

model = models.Model(name='vgg16_transfer', inputs=base_model.input, outputs=out)
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizers.Adam(learning_rate=1e-4),
)
model.summary()


In [None]:
# Callbacks: keep the best checkpoint, stop when val_loss stalls, and halve the LR on plateaus.
cb = [
    callbacks.ModelCheckpoint(OUTPUT_MODEL, save_best_only=True, monitor='val_loss'),
    callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True),
    callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-7)
]

# Fail early with a clear message instead of starting a fit on an empty subset.
if train_generator.samples == 0 or val_generator.samples == 0:
    raise ValueError(
        'Empty data subset: training has %d images and validation has %d. '
        'Check TRAIN_DIR and the validation split.'
        % (train_generator.samples, val_generator.samples)
    )

# Train the classifier head. Round the step counts UP so the final partial batch
# is included; with floor division the tail of each subset would be skipped, and
# val_loss (which drives all three callbacks) would ignore part of the validation set.
steps_per_epoch = int(np.ceil(train_generator.samples / BATCH_SIZE))
validation_steps = int(np.ceil(val_generator.samples / BATCH_SIZE))
history = model.fit(
    train_generator,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_generator,
    validation_steps=validation_steps,
    epochs=EPOCHS,
    callbacks=cb
)


In [None]:
# Fine-tuning: make the last 6 layers of the VGG16 base trainable and continue
# training with a 10x smaller learning rate than the head-only phase.
# WARNING: only do this on a GPU and after initial training converged
for top_layer in base_model.layers[-6:]:
    top_layer.trainable = True

# Re-compile after changing the trainable flags, now with the lower learning rate.
model.compile(
    loss='binary_crossentropy',
    metrics=['accuracy'],
    optimizer=optimizers.Adam(learning_rate=1e-5),
)

fine_tune_options = dict(
    steps_per_epoch=steps_per_epoch,
    validation_data=val_generator,
    validation_steps=validation_steps,
    epochs=10,
    callbacks=cb,
)
ft_history = model.fit(train_generator, **fine_tune_options)


In [None]:
# Predict on test files and create submission (adapt depending on Kaggle test set naming)
test_files = sorted(glob(os.path.join(TEST_DIR, '*.jpg')))


def predict_single(model, files, batch_size=BATCH_SIZE):
    """Predict P(dog) for every image path in `files`.

    Images are loaded with OpenCV, converted to RGB, resized to IMG_SIZE and
    passed through the VGG16 preprocess_input. They are sent to the model in
    chunks of `batch_size` rather than one `model.predict` call per image.

    Args:
        model: compiled Keras model with a single sigmoid output.
        files: sequence of image paths.
        batch_size: number of images per `model.predict` call.

    Returns:
        (ids, preds): two lists of equal length. `ids` holds the digits in front
        of '.jpg' (as a string), or the file's basename when the name does not
        match; `preds` holds the predicted probabilities as Python floats.
        Unreadable images are reported with a warning and given the neutral
        probability 0.5 so the submission still contains one row per file.
    """
    ids, preds = [], []
    height, width = IMG_SIZE
    for start in range(0, len(files), batch_size):
        chunk = files[start:start + batch_size]
        batch = np.zeros((len(chunk), height, width, 3), dtype=np.float32)
        unreadable_rows = set()
        for row, path in enumerate(chunk):
            img = cv2.imread(path)
            if img is None:
                # cv2.imread returns None instead of raising on unreadable files.
                print('Warning: could not read', path, '- using 0.5 as its prediction')
                unreadable_rows.add(row)
                continue
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)      # OpenCV loads BGR
            batch[row] = cv2.resize(img, (width, height))   # cv2 wants (width, height)
        batch = tf.keras.applications.vgg16.preprocess_input(batch)
        probs = model.predict(batch, batch_size=len(chunk), verbose=0).reshape(-1)
        for row, path in enumerate(chunk):
            preds.append(0.5 if row in unreadable_rows else float(probs[row]))
            m = re.search(r'(\d+)\.jpg$', path)
            ids.append(m.group(1) if m else os.path.basename(path))
    return ids, preds


ids, preds = predict_single(model, test_files)

sub = pd.DataFrame({'id': ids, 'label': preds})
sub.to_csv('submission.csv', index=False)
print('Saved submission.csv with', len(sub), 'rows.')
sub.head()


## How to get a low LogLoss (practical tips)

- Train longer with more data augmentation and strong regularization.
- Use class weight balancing if your dataset is imbalanced.
- Fine-tune the top VGG blocks on a GPU.
- Use Test Time Augmentation (TTA) and ensembling of models.

---

**After you submit to Kaggle**, paste your Leaderboard LogLoss value into the `kaggle_logloss` variable in the next cell. The task is considered passed when LogLoss < 0.3.

In [None]:
# -------------------------------
# Once Kaggle has scored submission.csv, replace None with the Leaderboard LogLoss.
kaggle_logloss = None  # e.g. 0.28902
score_label = 'Kaggle LogLoss:'
print(score_label, kaggle_logloss)


### Final notes

This notebook is a starting point. To actually reach the required LogLoss < 0.3 you will likely need:

- Full dataset (not tiny sample).
- More epochs (50+), progressive resizing, and learning-rate schedules.
- Ensembling (e.g., VGG16 + EfficientNet variants) and TTA.

Good luck — run this on Kaggle with GPU and paste the final LogLoss in the cell above.