#### Download the dataset from https://www.kaggle.com/competitions/dogs-vs-cats/data

#### Extract the dogs-vs-cats.zip in the same folder as this notebook

In [1]:
import os
import re
import shutil
import random



# The root directory of the project.
root_dir = 'E:\\Projects\\Dogs vs Cats Classifier'


source_dir = os.path.join(root_dir, 'dogs-vs-cats\\train\\train')

In [2]:
# Creating the structured directory path for imagedatagenerator
data_dir = os.path.join(root_dir,'data')
os.makedirs(data_dir, exist_ok = True)

training_dir = os.path.join(data_dir, 'training')
os.makedirs(training_dir, exist_ok = True)

validation_dir = os.path.join(data_dir, 'validation')
os.makedirs(validation_dir, exist_ok = True)

training_cats_dir = os.path.join(training_dir, "cats")
os.makedirs(training_cats_dir, exist_ok = True)

training_dogs_dir = os.path.join(training_dir, "dogs")
os.makedirs(training_dogs_dir, exist_ok = True)

validation_cats_dir = os.path.join(validation_dir, "cats")
os.makedirs(validation_cats_dir, exist_ok = True)

validation_dogs_dir = os.path.join(validation_dir, "dogs")
os.makedirs(validation_dogs_dir, exist_ok = True)

In [3]:
# Checking the directories created
for rootdir, dirs, files in os.walk(data_dir):
    for subdir in dirs:
        print(os.path.join(rootdir, subdir))

E:\Projects\Dogs vs Cats Classifier\data\training
E:\Projects\Dogs vs Cats Classifier\data\validation
E:\Projects\Dogs vs Cats Classifier\data\training\cats
E:\Projects\Dogs vs Cats Classifier\data\training\dogs
E:\Projects\Dogs vs Cats Classifier\data\validation\cats
E:\Projects\Dogs vs Cats Classifier\data\validation\dogs


In [4]:
source_files = os.listdir(source_dir)

In [5]:
# Make cats and dogs folder inside the source folder
source_cats_dir = os.path.join(source_dir, "cats")
os.makedirs(source_cats_dir, exist_ok = True)

source_dogs_dir = os.path.join(source_dir, "dogs")
os.makedirs(source_dogs_dir, exist_ok = True)

In [6]:
# Function to check if image is of cat or dog using its name
def is_cat_image(input_string):
    pattern = r'^cat'
    if re.match(pattern, input_string):
        return True
    else:
        return False

In [7]:
# Move the images into cats and dogs folder inside the source folder
for file in source_files:
    if is_cat_image(file) == True:
        source = os.path.join(source_dir, file)
        destination = os.path.join(source_cats_dir, file)
        shutil.move(source, destination)
    else:
        source = os.path.join(source_dir, file)
        destination = os.path.join(source_dogs_dir, file)

        shutil.move(source, destination)

In [8]:
# Function to copy images to training and validation directories
def split_data(SOURCE_DIR, TRAINING_DIR, VALIDATION_DIR, SPLIT_SIZE):
    files = os.listdir(SOURCE_DIR)

    for file in files:
        if os.path.getsize(os.path.join(SOURCE_DIR, file)) > 0:
            continue
        else:
            files.remove(file)
            print(f"{file} is zero length, so ignoring.")

    random.sample(files, len(files))

    split_index = int(SPLIT_SIZE * len(files))

    # Split the files into training and validation sets
    training_files = files[:split_index]
    validation_files = files[split_index:]

    # Copy training files
    for file in training_files:
        source = os.path.join(SOURCE_DIR, file)
        destination = os.path.join(TRAINING_DIR, file)
        shutil.copyfile(source, destination)

    # Copy validation files
    for file in validation_files:
        source = os.path.join(SOURCE_DIR, file)
        destination = os.path.join(VALIDATION_DIR, file)
        shutil.copyfile(source, destination)

In [9]:
source_cats_dir = os.path.join(source_dir, "cats")
source_dogs_dir = os.path.join(source_dir, "dogs")

# Defining proportion of images used for training
split_size = 0.8

# Running the function
split_data(source_cats_dir, training_cats_dir, validation_cats_dir, split_size)
split_data(source_dogs_dir, training_dogs_dir, validation_dogs_dir, split_size)

In [10]:
# Checking that the number of images matches the expected output
print(f"\n\nOriginal cat's directory has {len(os.listdir(source_cats_dir))} images")
print(f"Original dog's directory has {len(os.listdir(source_dogs_dir))} images\n")

# Training and validation splits
print(f"There are {len(os.listdir(training_cats_dir))} images of cats for training")
print(f"There are {len(os.listdir(training_dogs_dir))} images of dogs for training")
print(f"There are {len(os.listdir(validation_cats_dir))} images of cats for validation")
print(f"There are {len(os.listdir(validation_dogs_dir))} images of dogs for validation")



Original cat's directory has 12500 images
Original dog's directory has 12500 images

There are 10000 images of cats for training
There are 10000 images of dogs for training
There are 2500 images of cats for validation
There are 2500 images of dogs for validation


In [12]:
# Using the ImageDataGenerator to load the images
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Adding data-augmentation parameters to ImageDataGenerator
train_datagen = ImageDataGenerator(rescale = 1./255.,
                                   rotation_range = 45,
                                   width_shift_range = 0.2,
                                   height_shift_range = 0.2,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = True)

# Augmentation not used in validation
validation_datagen = ImageDataGenerator( rescale = 1.0/255. )

# Flow training images in batches of 100 using train_datagen generator
train_generator = train_datagen.flow_from_directory(training_dir,
                                                    batch_size = 100,
                                                    class_mode = 'binary', 
                                                    target_size = (150, 150))     

# Flow validation images in batches of 25 using validation_datagen generator
validation_generator =  validation_datagen.flow_from_directory( validation_dir,
                                                          batch_size  = 25,
                                                          class_mode  = 'binary', 
                                                          target_size = (150, 150))

Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.


In [17]:
# Download the pre-trained weights of InceptionV3 Model.
import urllib.request

url = "https://storage.googleapis.com/mledu-datasets/inception_v3_weights_tf_dim_ordering_tf_kernels_notop.h5"
file_name = "inception_v3_weights.h5"
download_path = root_dir

file_path = os.path.join(download_path, file_name)

urllib.request.urlretrieve(url, file_path)


('E:\\Projects\\Dogs vs Cats Classifier\\inception_v3_weights.h5',
 <http.client.HTTPMessage at 0x1b86604dd10>)