<a href="https://colab.research.google.com/github/SimeonHristov99/CodeEveryDay/blob/main/Dogs_vs_Cats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dogs vs. Cats Kaggle Challenge

In [1]:
import os
import zipfile
from glob import glob

from functools import partial

from sklearn.model_selection import train_test_split

import tensorflow as tf
print(tf.__version__)

import numpy as np
import matplotlib.pyplot as plt

2.7.0


In [2]:
# Notebook-wide settings:
#   kaggle_dir - folder that is checked for the Kaggle API token (kaggle.json).
#   data_path  - folder whose files are split into train/validation/test sets.
config = dict(
    kaggle_dir='../gdrive/MyDrive/kaggle',
    data_path='/content/data/train/train',
)

In [3]:
# One-time dataset bootstrap: when /content/data does not exist yet, download
# the "dogs-vs-cats" competition archives with the Kaggle CLI and unpack them;
# otherwise only report that the data is already there.
os.chdir('/content')
if not os.path.isdir('data'):
  kaggle_dir = config.get('kaggle_dir', None)

  # Mount Google Drive so the (relative) kaggle_dir can be resolved from the
  # working directory set above.
  from google.colab import drive
  drive.mount('/gdrive')
  
  # Fail early if the token folder or the token file itself is missing.
  assert kaggle_dir is not None and os.path.isdir(kaggle_dir), 'Kaggle directory not found!'
  assert os.path.isfile(f'{kaggle_dir}/kaggle.json'), 'Kaggle API token not found!'

  !pip install -q kaggle
  # Point the Kaggle CLI at the folder holding kaggle.json
  # (presumably this is where the CLI looks for credentials - confirm against the Kaggle API docs).
  os.environ['KAGGLE_CONFIG_DIR'] = kaggle_dir

  # You must have joined the competition on Kaggle to be allowed to download its data.
  !kaggle competitions download -c dogs-vs-cats -p /content/data/
  assert os.path.isdir('data'), 'ERROR: Could not download the dataset!'

  # Unpack each archive into its own folder and delete the archive afterwards.
  # NOTE(review): the zip file is removed while it is still open inside the
  # `with` block; consider moving os.remove after the block.
  with zipfile.ZipFile("/content/data/test1.zip","r") as zip_ref:
      zip_ref.extractall("/content/data/test1")
      os.remove("/content/data/test1.zip")

  with zipfile.ZipFile("/content/data/train.zip","r") as zip_ref:
      zip_ref.extractall("/content/data/train")
      os.remove('/content/data/train.zip')

  print('Successfully downloaded the dataset!')
else:
  print('Dataset already downloaded.')

Dataset already downloaded.


## EDA

In [4]:
# Number of dog images. The folder is read from `config` so the count always
# refers to the same directory that is later split into train/val/test.
len(glob(config['data_path'] + '/dog*'))

12500

In [5]:
# Number of cat images. The folder is read from `config` so the count always
# refers to the same directory that is later split into train/val/test.
len(glob(config['data_path'] + '/cat*'))

12500

In [6]:
# Total number of files. The folder is read from `config` so the count always
# refers to the same directory that is later split into train/val/test.
len(glob(config['data_path'] + '/*'))

25000

The classes are balanced: 12,500 dog images and 12,500 cat images, 25,000 files in total.

## Preprocessing

In [7]:
def train_val_test_split(data_path):
  """
  Splits the files found in `data_path` into train (70%), validation (15%)
  and test (15%) sets.

  The label of a file is derived from its file name: the last three
  characters of the part before the first dot, lower-cased
  (e.g. `dog.42.jpg` -> `dog`).

  Args:
    data_path: Directory that holds the image files.

  Returns:
    The tuple `(X_train, X_val, X_test, y_train, y_val, y_test)`, where the
    `X_*` entries hold file paths and the `y_*` entries the matching labels.

  Raises:
    ValueError: If no files are found in `data_path`.
  """
  # glob gives no ordering guarantee; sorting makes the fixed random_state
  # below produce the same split on every run and every file system.
  X = np.array(sorted(glob(data_path + '/*')))
  if X.size == 0:
    raise ValueError(f'No files found in {data_path!r}.')

  # Parse the label from the file name only, so that dots in the directory
  # part of the path (e.g. '../data') cannot corrupt it.
  y = np.array([ os.path.basename(x).split('.')[0][-3:].lower() for x in X ])

  # 70% train; the remaining 30% is halved into validation and test.
  X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
  X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, train_size=0.5, random_state=42)

  return X_train, X_val, X_test, y_train, y_val, y_test

In [8]:
@tf.function
def parse(filenames, resize_to, will_augment):
  """
  Reads one JPEG file into a float image and maps its label to a class id.

  Args:
    filenames: Mapping with an 'image' entry (path of the file to read) and
      a 'label' entry (class name).
    resize_to: Target size passed on to `tf.image.resize`.
    will_augment: When truthy the result is returned as a dict so it can be
      fed to a further mapping step; otherwise as an `(image, label)` pair.

  Returns:
    `{'image': image, 'label': label}` if `will_augment`, else `(image, label)`.
    The label is 1 for 'dog' and 0 for anything else.
  """
  raw_bytes = tf.io.read_file(filenames.get('image', None))
  pixels = tf.io.decode_jpeg(raw_bytes, channels=3)
  # Converting to float32 also normalizes the pixel values.
  pixels = tf.image.convert_image_dtype(pixels, tf.float32)
  pixels = tf.image.resize(pixels, resize_to)

  class_name = filenames.get('label', None)
  class_id = 1 if class_name == 'dog' else 0

  if not will_augment:
    return pixels, class_id

  return {
      'image': pixels,
      'label': class_id,
  }

In [9]:
@tf.function
def augment(filenames):
  """
  Applies random flips and colour jitter to one parsed sample.

  Args:
    filenames: Mapping with an 'image' entry (the image to distort) and a
      'label' entry, which is passed through untouched.

  Returns:
    The pair `(augmented_image, label)`.
  """
  sample_label = filenames.get('label', None)
  picture = filenames.get('image', None)

  # Random mirroring along both axes.
  picture = tf.image.random_flip_up_down(tf.image.random_flip_left_right(picture))

  # Random colour changes: brightness, contrast, saturation and hue.
  picture = tf.image.random_brightness(picture, 0.2)
  picture = tf.image.random_contrast(picture, 0.5, 2.0)
  picture = tf.image.random_saturation(picture, 0.75, 1.25)
  picture = tf.image.random_hue(picture, 0.1)

  # The colour changes can push values outside [0, 1]; clamp them back.
  picture = tf.clip_by_value(picture, 0.0, 1.0)

  return picture, sample_label

In [10]:
def generate_dataset(image_files, label_files, resize_to, shuffle, batch_size, do_augment):
  """
  Builds a batched and prefetched `tf.data.Dataset` of (image, label) pairs.

  Pipeline: parse files -> cache -> augment (optional) -> shuffle (optional)
  -> batch -> prefetch. Caching happens before augmentation and shuffling so
  that both are re-drawn on every pass over the data.

  Args:
    image_files: Paths of the image files.
    label_files: Labels matching `image_files` element by element.
    resize_to: Target size passed on to `parse`.
    shuffle: Whether to shuffle the samples (buffer covers the whole set).
    batch_size: Number of samples per batch.
    do_augment: Whether to run `augment` on every parsed sample.

  Returns:
    The resulting `tf.data.Dataset`.
  """
  autotune = tf.data.AUTOTUNE

  # Element order may only be relaxed when the caller asked for shuffling
  # anyway. Without shuffling (validation/test) the order is kept identical
  # to `image_files`, so predictions can be aligned with the label arrays.
  keep_order = not shuffle

  data_dict = {
      'image': tf.constant(image_files),
      'label': tf.constant(label_files),
  }

  dataset = tf.data.Dataset.from_tensor_slices(data_dict)

  # Parse the files.
  parse_partial_fn = partial(parse, resize_to=resize_to, will_augment=do_augment)
  dataset = dataset.map(parse_partial_fn, num_parallel_calls=autotune, deterministic=keep_order)

  # Cache the parsed files.
  dataset = dataset.cache()

  # Augment the images (if requested).
  if do_augment:
    dataset = dataset.map(augment, num_parallel_calls=autotune, deterministic=keep_order)

  # Shuffle the data.
  if shuffle:
    dataset = dataset.shuffle(buffer_size=len(image_files), reshuffle_each_iteration=True)

  # Batch the data.
  dataset = dataset.batch(batch_size, num_parallel_calls=autotune, deterministic=keep_order)

  # Include prefetching.
  dataset = dataset.prefetch(buffer_size=autotune)

  return dataset

In [11]:
def get_datasets(resize_to, batch_size, shuffle_X_train, augment_X_train):
  """
  Creates the train, validation and test datasets.

  The files are taken from the directory stored under 'data_path' in `config`.

  Args:
    resize_to: Target image size, forwarded to `generate_dataset`.
    batch_size: Batch size used for all three datasets.
    shuffle_X_train: Whether the training dataset is shuffled.
    augment_X_train: Whether the training dataset is augmented.

  Returns:
    The tuple `(train_dataset, validation_dataset, test_dataset)`.
  """
  # File paths and labels of every split.
  splits = train_val_test_split(config.get('data_path', None))
  train_files, val_files, test_files, train_labels, val_labels, test_labels = splits

  # Only the training split may be shuffled and augmented.
  train_ds = generate_dataset(
      train_files, train_labels, resize_to,
      shuffle=shuffle_X_train, batch_size=batch_size, do_augment=augment_X_train,
  )

  # The evaluation splits are never shuffled or augmented.
  val_ds, test_ds = [
      generate_dataset(files, labels, resize_to, shuffle=False, batch_size=batch_size, do_augment=False)
      for files, labels in ((val_files, val_labels), (test_files, test_labels))
  ]

  return train_ds, val_ds, test_ds

In [12]:
# Build the three input pipelines: images resized to 224x224, batches of 4,
# and a shuffled + augmented training split.
X_train_dataset, X_val_dataset, X_test_dataset = get_datasets(
    resize_to=[224, 224],
    batch_size=4,
    shuffle_X_train=True,
    augment_X_train=True,
)

## Training

In [None]:
# Pull the first batch (images and labels) out of the training pipeline.
it = iter(X_train_dataset)
image_show1, label1 = next(it)