In [1]:
import joblib
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import numpy as np
import os
from PIL import Image, UnidentifiedImageError




## Cleaning data
This function removes corrupted files and files that are not valid images (for example, files with a non-image extension). It walks the folder, tries to open and verify every file as an image, and deletes any file that fails the check.

In [2]:
def check_folder(folder):
    """Delete every file under ``folder`` that cannot be verified as an image.

    The directory tree is walked recursively. Each file is opened with
    ``Image.open`` and checked with ``verify()``; a file that fails either
    step is reported on stdout and removed from disk.

    Args:
        folder: Root directory to scan.

    Returns:
        int: Number of files that were deleted.
    """
    deleted = 0
    for subdir, _, files in os.walk(folder):
        for file in files:
            path = os.path.join(subdir, file)
            try:
                # The context manager closes the file handle before any
                # removal is attempted in the except branch.
                with Image.open(path) as img:
                    img.verify()
            # SyntaxError is included because some Pillow plugins (e.g. the
            # PNG checksum check) signal a broken file with it.
            except (UnidentifiedImageError, OSError, SyntaxError):
                print(f"Deleting corrupted image: {path}")
                os.remove(path)
                deleted += 1
    return deleted

## Get the data
First, we resize the images to 64x64 so they are easier and faster to process and analyze. We provide the paths to the datasets, check each folder for corrupted data, and print how many files were deleted. Then we rescale the pixel values, apply data augmentation to the training images to reduce overfitting, and finally create the image generators.

In [3]:
# Configuration for the input pipeline.
img_size = (64, 64)  # every image is resized to this size when loaded
BATCH_SIZE = 32
SEED = 42  # fixes shuffling and random augmentation so runs are repeatable
path_train = "dataset/train"
path_validation = "dataset/validation"

# Remove unreadable files first so the generators never hit them mid-epoch.
deleted_train = check_folder(path_train)
deleted_val = check_folder(path_validation)
print(f"Total of image deleted in train: {deleted_train}")
print(f"Total of image deleted in validation: {deleted_val}")

# Augmentation is applied to the training images only; validation images are
# just rescaled to [0, 1] so the metric reflects unmodified data.
train = ImageDataGenerator(rescale=1./255, rotation_range=20, zoom_range=0.2, horizontal_flip=True)
validation = ImageDataGenerator(rescale=1./255)

train_gen = train.flow_from_directory(
    directory=path_train,
    target_size=img_size,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=True,
    seed=SEED,
)
# Validation order is kept fixed (shuffle=False) so that predictions line up
# with validation_gen.classes / validation_gen.filenames.
validation_gen = validation.flow_from_directory(
    directory=path_validation,
    target_size=img_size,
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False,
)



Total of image deleted in train: 0
Total of image deleted in validation: 0
Found 17797 images belonging to 2 classes.
Found 5905 images belonging to 2 classes.


## Creation of the model
I create a sequential model with three Conv2D blocks to filter the images, and define a callback for early stopping. Each convolution is followed by max pooling to reduce the size of the data a little; the result is then flattened and passed through a dense layer with dropout to reduce overfitting. The output layer uses a sigmoid activation.

In [4]:
# Stop when the validation loss has not improved for 3 epochs and roll the
# model back to the best epoch, so the weights saved afterwards are the best
# ones seen rather than those of the last (worse) epoch.
callback = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model = Sequential([
    # Explicit Input layer instead of `input_shape=` on the first Conv2D;
    # the shape follows img_size so it always matches what the generators yield.
    Input(shape=(*img_size, 3)),
    # Three convolution + max-pooling stages with a growing number of filters.
    Conv2D(64, 3, activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(128, 3, activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(256, 3, activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    # Classifier head: flatten, one hidden dense layer, dropout, sigmoid output.
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(rate=0.5),
    Dense(1, activation='sigmoid')
])





Lower the optimizer's learning rate so the model learns a little more slowly.

In [5]:
# Adam with a learning rate of 1e-4.
optimizer = Adam(learning_rate=0.0001)
# `metrics` must be a list (or tuple/dict); a bare string is rejected by
# newer Keras versions.
model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Keep the returned History object so the per-epoch loss/accuracy values
# remain available after training (e.g. for plotting).
history = model.fit(
    train_gen,
    validation_data=validation_gen,
    epochs=20,
    callbacks=[callback],
)

Epoch 1/20


Epoch 2/20
Epoch 3/20

In [None]:
# Write the model to disk under the file name below.
model_path = "dog_vs_cat_model.keras"
model.save(model_path)