**Description**

I used a pretrained VGG16 as the base and added new layers on top to better learn the patterns that distinguish cats from dogs. Originally, I tried a plain CNN, which yielded an accuracy of about 85%. Then I tried VGG16 and got around 90%. I also tried ResNet50 and EfficientNetB0, but they produced very low accuracy scores, so I decided to use VGG16 as my base. I then used binary cross-entropy as my loss function and Adam as my optimizer, which helped with efficient learning. I trained for only 20 epochs with early-stopping precautions; in hindsight I could have trained for more. Finally, I evaluated the model on accuracy and log loss and then output my findings.

In [None]:
import os
import shutil
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, accuracy_score

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dense, Activation,
    Dropout, GlobalAveragePooling2D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import (
    ImageDataGenerator, load_img, img_to_array
)
from tensorflow.keras.applications import VGG16
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Competition archives. Each variable now points at the archive its name
# describes (the two paths were previously swapped).
train_path = "/kaggle/input/dogs-vs-cats-redux-kernels-edition/train.zip"
test_path = "/kaggle/input/dogs-vs-cats-redux-kernels-edition/test.zip"

# Working directory that receives the contents of both archives.
files = "/kaggle/working/HW4_2"

import zipfile

# Unpack both archives into the same working directory. The rest of the
# script reads from the "train" and "test" sub-folders underneath it.
for archive_path in (train_path, test_path):
    with zipfile.ZipFile(archive_path, 'r') as zipp:
        zipp.extractall(files)

base_dir = "/kaggle/working/HW4_2/train"
cat_dir = os.path.join(base_dir, "cat")
dog_dir = os.path.join(base_dir, "dog")

# Make sure one sub-folder per class exists (no error if it already does).
for class_dir in (cat_dir, dog_dir):
    os.makedirs(class_dir, exist_ok=True)

# Route every regular file in base_dir into its class folder, chosen by the
# filename prefix. Files with any other prefix are left where they are.
for filename in os.listdir(base_dir):
    src_path = os.path.join(base_dir, filename)

    # Directories (including the class folders just created) are skipped.
    if not os.path.isfile(src_path):
        continue

    if filename.startswith("cat"):
        target_dir = cat_dir
    elif filename.startswith("dog"):
        target_dir = dog_dir
    else:
        continue

    shutil.move(src_path, os.path.join(target_dir, filename))

# === CONFIG ===
# Image geometry: height, width, and the 3-channel input shape built from them.
img_rows = 150
img_cols = 150
input_shape = (img_rows, img_cols, 3)

# Training schedule.
batch_size = 32
epochs = 20

# Locations of the extracted training and test images.
train_dir = '/kaggle/working/HW4_2/train'
test_dir = '/kaggle/working/HW4_2/test'

# === ADVANCED DATA AUGMENTATION ===
# Fraction of the training directory held out for validation. Both data
# generators below must use the same value so that their 'training' and
# 'validation' subsets partition the same file list.
val_fraction = 0.2

# Training pipeline: rescale pixels to [0, 1] and apply random geometric
# augmentation to every batch.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=val_fraction,
    rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    zoom_range=0.2,
    shear_range=0.2,
    horizontal_flip=True
)

# Validation pipeline: rescale only. Drawing the validation subset from the
# augmenting generator would randomly distort the validation images, making
# val_loss (and everything that monitors it) noisy and non-reproducible.
val_datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=val_fraction
)

train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=(img_rows, img_cols),
    batch_size=batch_size,
    class_mode='binary',
    subset='training'
)

# shuffle=False keeps the batch order fixed, so predictions made from this
# generator line up with val_generator.classes.
val_generator = val_datagen.flow_from_directory(
    train_dir,
    target_size=(img_rows, img_cols),
    batch_size=batch_size,
    class_mode='binary',
    subset='validation',
    shuffle=False
)

# === LOAD VGG16 AND UNFREEZE LAST 4 LAYERS ===
# ImageNet-initialised convolutional base without the original classifier.
base_model = VGG16(include_top=False, weights='imagenet', input_shape=input_shape)

# Only the final four layers of the base stay trainable; every earlier layer
# is frozen.
first_trainable = len(base_model.layers) - 4
for position, layer in enumerate(base_model.layers):
    layer.trainable = position >= first_trainable

# === TUNED CLASSIFIER HEAD ===
# Global average pooling, then two Dense+Dropout stages (256 and 128 units),
# then a single sigmoid unit for the binary output.
x = GlobalAveragePooling2D()(base_model.output)
for units in (256, 128):
    x = Dense(units, activation='relu')(x)
    x = Dropout(0.5)(x)
predictions = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# === COMPILE ===
# Binary cross-entropy matches the single sigmoid output; Adam is given a
# learning rate of 1e-5.
model.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# === CALLBACKS ===
# Both callbacks watch the validation loss:
#   * early_stopping halts after 3 epochs without improvement and restores
#     the best weights seen so far;
#   * reduce_lr multiplies the learning rate by 0.2 after 2 such epochs.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=2,
    verbose=1,
)
callbacks = [early_stopping, reduce_lr]

# === TRAIN ===
# No explicit steps_per_epoch / validation_steps: the previous
# `samples // batch_size` values rounded down and so skipped the final
# partial batch of each generator every epoch. Leaving them unset lets Keras
# take the batch count from the generators themselves.
# The returned History object is kept so the per-epoch loss and accuracy can
# be inspected or plotted afterwards.
history = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=epochs,
    callbacks=callbacks,
    verbose=1
)

# === EVALUATE ===
# Rewind the generator so prediction starts from its first batch.
val_generator.reset()
val_preds = model.predict(val_generator, verbose=0)
y_true = val_generator.classes

# Flatten the model output to a 1-D probability vector and use that same
# vector for both metrics (previously accuracy used the raw 2-D output while
# log loss used a truncated slice of it).
val_probs = val_preds.ravel()[:len(y_true)]

val_acc = accuracy_score(y_true, (val_probs > 0.5).astype(int))
# labels is given explicitly so the metric is defined even if y_true happens
# to contain a single class.
val_logloss = log_loss(y_true, val_probs, labels=[0, 1])

print(f"\n✅ Tuned VGG16 Validation Accuracy: {val_acc:.4f}")
print(f"✅ Tuned VGG16 Validation Log Loss: {val_logloss:.4f}")


# === LOAD TEST IMAGES ===
test_images = sorted([f for f in os.listdir(test_dir) if f.endswith(".jpg")])

# The numeric part of each filename (text before the first '.') is the id
# written to the submission.
ids = [int(fname.split('.')[0]) for fname in test_images]

# Preallocate the full batch once instead of collecting per-image arrays in
# a list and copying them with np.array afterwards, which briefly held two
# copies of the whole test set in memory.
X_test = np.empty((len(test_images), *input_shape), dtype="float32")

for idx, fname in enumerate(test_images):
    img_path = os.path.join(test_dir, fname)
    img = load_img(img_path, target_size=(img_rows, img_cols))
    # Same [0, 1] rescaling as the training generators apply.
    X_test[idx] = img_to_array(img) / 255.0

# === PREDICT ===
# One sigmoid output per test image.
preds = model.predict(X_test, verbose=1)

# === CREATE SUBMISSION CSV ===
# Pair each image id with its predicted value, order the rows by id, and
# write the table without the DataFrame index.
submission_columns = {
    "id": ids,
    "label": preds.flatten(),
}
submission = pd.DataFrame(submission_columns).sort_values("id")
submission.to_csv("submission.csv", index=False)

print("✅ submission.csv created and ready to upload to Kaggle.")