# Kaggle Competition Dogs vs Cats Classification:

_Competition link: https://www.kaggle.com/c/dogs-vs-cats/overview_

## Libraries:

In [1]:
import pandas as pd
import numpy as np
import os, shutil
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Conv2D, Dropout, MaxPooling2D, Flatten
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

## Data Importation:

In [2]:
# Root folder that holds the raw images and the three dataset splits.
dir = '/Users/spavot/Documents/Perso/Kaggle-Cat-vs-Dog-Classification/Data'
# The untouched Kaggle images sit in a sub-folder of the root.
original_dir = dir + '/Original_Data'
# One folder per split. The folders must already exist on disk
# (create them once with os.mkdir before the first run).
train_dir, validation_dir, test_dir = (
    os.path.join(dir, split_name)
    for split_name in ('Training', 'Validation', 'Test')
)

In [3]:
# Every split contains one sub-folder per class ('Cats' and 'Dogs').
# The folders are expected to exist already (create them once with os.mkdir).
class_folders = ('Cats', 'Dogs')

#Training set directories
train_cats_dir, train_dogs_dir = (
    os.path.join(train_dir, class_name) for class_name in class_folders
)
#Validation set directories
validation_cats_dir, validation_dogs_dir = (
    os.path.join(validation_dir, class_name) for class_name in class_folders
)
#Test set directories
test_cats_dir, test_dogs_dir = (
    os.path.join(test_dir, class_name) for class_name in class_folders
)

In [4]:
# #Copy the image to the training directory:
# #Cats images
# fnames = ['cat.{}.jpg'.format(i) for i in range(20)]
# for fname in fnames:
#     src = os.path.join(original_dir, fname)
#     dst = os.path.join(train_cats_dir, fname)
#     shutil.copyfile(src,dst)
# #Dogs images:
# fnames = ['dog.{}.jpg'.format(i) for i in range(20)]
# for fname in fnames:
#     src = os.path.join(original_dir, fname)
#     dst = os.path.join(train_dogs_dir, fname)
#     shutil.copyfile(src,dst)

In [5]:
# #Copy the image to the validation directory:
# #Cats images
# fnames = ['cat.{}.jpg'.format(i) for i in range(20,40)]
# for fname in fnames:
#     src = os.path.join(original_dir, fname)
#     dst = os.path.join(validation_cats_dir, fname)
#     shutil.copyfile(src,dst)
# #Dogs images:
# fnames = ['dog.{}.jpg'.format(i) for i in range(20,40)]
# for fname in fnames:
#     src = os.path.join(original_dir, fname)
#     dst = os.path.join(validation_dogs_dir, fname)
#     shutil.copyfile(src,dst)

In [6]:
# #Copy the image to the test directory:
# #Cats images
# fnames = ['cat.{}.jpg'.format(i) for i in range(40,50)]
# for fname in fnames:
#     src = os.path.join(original_dir, fname)
#     dst = os.path.join(test_cats_dir, fname)
#     shutil.copyfile(src,dst)
# #Dogs images:
# fnames = ['dog.{}.jpg'.format(i) for i in range(40,50)]
# for fname in fnames:
#     src = os.path.join(original_dir, fname)
#     dst = os.path.join(test_dogs_dir, fname)
#     shutil.copyfile(src,dst)

In [7]:
#Check if we have the correct number of pictures per set:
print('Total training set cats:', len(os.listdir(train_cats_dir)))
print('Total training set dogs:', len(os.listdir(train_dogs_dir)))
print('Total validation set cats:', len(os.listdir(validation_cats_dir)))
print('Total validation set dogs:', len(os.listdir(validation_dogs_dir)))
print('Total test set cats:', len(os.listdir(test_cats_dir)))
print('Total test set dogs:', len(os.listdir(test_dogs_dir)))

Total training set cats: 20
Total training set dogs: 20
Total validation set cats: 20
Total validation set dogs: 20
Total test set cats: 10
Total test set dogs: 10


As expected, we have 20 training samples of each class, 20 of each class for the validation set, and finally 10 of each class for the test set, which is kept aside to confirm the results and avoid overfitting on the validation set.

## Data preprocessing:

In [8]:
# Stream the images from disk batch by batch instead of loading them all at once.
batch_size = 2

# Arguments shared by the training and validation iterators.
flow_options = dict(
    color_mode = 'rgb',
    target_size = (150, 150),
    batch_size = batch_size,
    class_mode = 'binary',
)

# The only preprocessing step is multiplying every pixel value by 1/255.
train_datagen = ImageDataGenerator(rescale = 1./255)
val_datagen = ImageDataGenerator(rescale = 1./255)

train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)
validation_generator = val_datagen.flow_from_directory(validation_dir, **flow_options)

Found 40 images belonging to 2 classes.
Found 40 images belonging to 2 classes.


In [9]:
#Sanity check: print the shapes of the first batch produced by each generator
for generator in (train_generator, validation_generator):
    # Pull a single (images, labels) batch and stop there.
    data, labels = next(iter(generator))
    print(data.shape)
    print(labels.shape)

(2, 150, 150, 3)
(2,)
(2, 150, 150, 3)
(2,)


The shapes are what we wanted: our generators are ready!

## Modeling using Keras and CNN layers:

First, we create callbacks to save the best model and to stop training when it starts overfitting. Note that we use accuracy as the metric because our classes are equally distributed.

In [10]:
# File name of the saved checkpoint. It must be set BEFORE the callbacks are
# built: the checkpoint path is concatenated once, right here, so a later
# reassignment of Model_Name does not change where the model is written.
# (Previously this was '' and the checkpoint pointed at the bare Models/ folder.)
Model_Name = 'Own_1st_Model_CNN.h5'
checkpoint_path = '/Users/spavot/Documents/Perso/Kaggle-Cat-vs-Dog-Classification/Models/' + Model_Name
callback = [
    # Stop training once val_accuracy has gone 3 epochs without improving.
    EarlyStopping(monitor='val_accuracy', patience=3),
    # Only overwrite the saved file when val_accuracy improves.
    ModelCheckpoint(filepath = checkpoint_path, monitor = 'val_accuracy', save_best_only = True),
]

## Own-built model:

In [11]:
#Assign the variable to a name for saving the best model
Model_Name = 'Own_1st_Model_CNN.h5'
#Build the model: three Conv2D + MaxPooling2D stages followed by a dense classifier
own_model = Sequential()
# Input: 150x150 RGB images, matching the generators' target_size and color_mode.
own_model.add(Conv2D(32,(3,3), activation = 'relu', input_shape = (150,150,3)))
own_model.add(MaxPooling2D((2,2)))
own_model.add(Conv2D(32, (3,3), activation = 'relu'))
own_model.add(MaxPooling2D((2,2)))
own_model.add(Conv2D(32, (3,3), activation = 'relu'))
own_model.add(MaxPooling2D((2,2)))
own_model.add(Flatten())
own_model.add(Dense(128, activation = 'relu'))
# Single sigmoid unit: one probability per image, paired with class_mode='binary'.
own_model.add(Dense(1, activation = 'sigmoid'))
#Compile the model
# The keyword is `optimizer`; the former `opti` is not a compile() parameter,
# so Adam was never actually selected.
own_model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
#Print the summary of the model
own_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 72, 72, 32)        9248      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 36, 36, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 34, 34, 32)        9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 17, 17, 32)        0         
_________________________________________________________________
flatten (Flatten)            (None, 9248)              0

In [12]:
# Number of full validation batches available (integer division drops a partial batch).
validation_generator.samples // validation_generator.batch_size

20

### Need a cloud GPU to run it

In [13]:
# Model.fit accepts generators directly; fit_generator is deprecated in TF 2.x
# and removed in recent Keras releases.
history = own_model.fit(
    train_generator,
    epochs = 3,
    validation_data = validation_generator,
    # Whole batches only: integer division drops any trailing partial batch.
    steps_per_epoch = train_generator.samples//train_generator.batch_size,
    validation_steps = validation_generator.samples//validation_generator.batch_size,
    # Hand over the early-stopping / checkpoint callbacks built earlier;
    # without this argument they are never invoked.
    callbacks = callback,
    verbose = 1)