# Dog breed classification

In [113]:
# !unzip "drive/MyDrive/Dog Vision/dog-breed-identification.zip" -d "drive/MyDrive/Dog Vision"

In [114]:
import tensorflow as tf
import tensorflow_hub as hub
import tf_keras as tfk
import pandas as pd
print("Tensorflow version:", tf.__version__)
print("Using GPU:", "yes" if tf.config.list_physical_devices("GPU") else "NO")

from pathlib import Path
path_to_dir = Path("drive/MyDrive/Dog Vision")
path_to_train_data = path_to_dir / "train"
path_to_test_data = path_to_dir / "test"
path_to_logs = path_to_dir / "logs"

train_data_filenames = [i for i in path_to_train_data.iterdir()]
test_data_filenames = [i for i in path_to_test_data.iterdir()]

Tensorflow version: 2.18.0
Using GPU: yes


In [115]:
labels_csv = pd.read_csv(path_to_dir / "labels.csv")
labels_csv.head()

Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [116]:
labels_csv.describe()

Unnamed: 0,id,breed
count,10222,10222
unique,10222,120
top,fff43b07992508bc822f33d8ffd902ae,scottish_deerhound
freq,1,126


In [117]:
sample_submission_csv = pd.read_csv(path_to_dir / "sample_submission.csv")
sample_submission_csv.head()

Unnamed: 0,id,affenpinscher,afghan_hound,african_hunting_dog,airedale,american_staffordshire_terrier,appenzeller,australian_terrier,basenji,basset,...,toy_poodle,toy_terrier,vizsla,walker_hound,weimaraner,welsh_springer_spaniel,west_highland_white_terrier,whippet,wire-haired_fox_terrier,yorkshire_terrier
0,000621fb3cbb32d8935728e48679680e,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,...,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333
1,00102ee9d8eb90812350685311fe5890,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,...,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333
2,0012a730dfa437f5f3613fb75efcd4ce,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,...,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333
3,001510bc8570bbeee98c8d80c8a95ec1,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,...,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333
4,001a5f3114548acdefa3d4da05474c2e,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,...,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333,0.008333


In [118]:
num_of_test_files = path_to_test_data.iterdir()
num_of_train_files = path_to_train_data.iterdir()
(len(labels_csv), len([i for i in num_of_train_files])), (len(sample_submission_csv), len([i for i in num_of_test_files]))

((10222, 10222), (10357, 10357))

In [119]:
train_labels = labels_csv["breed"].to_numpy()
train_labels[:2]

array(['boston_bull', 'dingo'], dtype=object)

In [120]:
import numpy as np

breeds = np.unique(train_labels)
len(breeds)

120

In [121]:
boolean_labels = [label == breeds for label in labels_csv["breed"]]
boolean_labels[:2]

[array([False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False,  True, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False,
        False, False, False]),
 array([False, False, False, False, False, Fal

In [122]:
NUM_IMAGES = 1000  #@param {type: "slider", min:1000, max: 10000, step:1000}

In [123]:
X = [str(i) for i in path_to_train_data.iterdir()]
y = boolean_labels

In [124]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X[:NUM_IMAGES],
                                                  y[:NUM_IMAGES],
                                                  test_size=0.2)
len(X_train), len(y_train), len(X_val), len(y_val)

(800, 800, 200, 200)

In [125]:
X_train[:2]

['drive/MyDrive/Dog Vision/train/f2d5c11258efb9e8de45a81a6982f8f2.jpg',
 'drive/MyDrive/Dog Vision/train/e4c7a29165989410af12b0e3fc6bef0d.jpg']

In [126]:
from matplotlib.pyplot import imread
image = tf.io.read_file(str(train_data_filenames[0]))
# image

In [127]:
import os

IMG_SIZE = 224

def process_image(img_path: str, img_size = IMG_SIZE):
  """ Read and convert the image into a tf.Tensor """
  if not os.path.exists(img_path):
    raise FileNotFoundError(f"File not found: {img_path}")

  image = tf.io.read_file(img_path)

  # Turn into numerical Tensor with RGB
  image = tf.image.decode_jpeg(image, channels=3)

  # Normalization
  image = tf.image.convert_image_dtype(image, tf.float32)

  # Resize the image
  image = tf.image.resize(image, size=[img_size, img_size])

  return image

In [128]:
def get_image_label(image_path: str | Path, label: str):
  if not isinstance(image_path, str) or not not isinstance(image_path, Path):
    raise ValueError("Wrong image_path type:", type(image_path))

  image = process_image(str(image_path))
  return image, label

In [136]:
# get_image_label(X[0], y[0])

In [130]:
# import os

# for i, img_path in enumerate(X):
#   if not os.path.exists(img_path):
#     raise FileNotFoundError(f"{i} File not found: {img_path}")

# print(f"Checked {len(X)} images like {X[0]}")

Checked 10222 images like drive/MyDrive/Dog Vision/train/e56ce32aeccac2bfbbcd6eb80ab337da.jpg


In [131]:
BATCH_SIZE = 32

def create_data_batches(X, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  if test_data:
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X)))  # Only filepaths (no labels)
    data_batch = data.map(process_image).batch(batch_size)
    return data_batch

  elif valid_data:
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X),
                                               tf.constant(y)))
    data_batch = data.map(get_image_label).batch(batch_size)
    return data_batch

  else:
    print("Creating trainitd data batches...")
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X),
                                               tf.constant(y)))

    data = data.shuffle(buffer_size=len(X))
    data = data.map(get_image_label)

    data_batch = data.batch(batch_size)

    return data_batch

In [132]:
# Create training and validation data
train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val, y_val, valid_data=True)

Creating trainitd data batches...


ValueError: in user code:

    File "<ipython-input-111-77a4a7f9b7ad>", line 3, in get_image_label  *
        raise ValueError("Wrong image_path type:", type(image_path))

    ValueError: ('Wrong image_path type:', <class 'tensorflow.python.framework.ops.SymbolicTensor'>)


In [None]:
train_data.element_spec, val_data.element_spec

### Building the model
To define:
* The input shape (our images shape, in the form of Tensor) to our model
* The ouppus shape (image labels, in the form of Tensor) of our model
* The URL of the model we want to use

In [None]:
INPUT_SHAPE = (None, IMG_SIZE, IMG_SIZE, 3)  # (224, 224, 3)
OUTPUT_SHAPE = len(breeds)  # 120

# Setup model URL from TensorFlow Hub
MODEL_URL = "https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/feature_vector/5"


In [None]:
def create_model(input_shape=INPUT_SHAPE, output_shape=OUTPUT_SHAPE, model_url=MODEL_URL):
    print("Building model with:", MODEL_URL)
    # Setup the model layers
    model = tfk.Sequential([
        hub.KerasLayer(MODEL_URL), # Layer 1 (input layer)
        tfk.layers.Dense(units=OUTPUT_SHAPE,
                              activation="softmax") # Layer 2 (output layer)
    ])
    # Compile the model
    model.compile(
        loss=tfk.losses.CategoricalCrossentropy(),
        optimizer=tfk.optimizers.Adam(),
        metrics=["accuracy"]
    )
    # Build the model
    model.build(INPUT_SHAPE)
    return model

In [None]:
model = create_model()
model.summary()

## Callbacks

### TensorBoard

In [None]:
%load_ext tensorboard

In [None]:
import datetime

# Create a func to build a TensorBoard callback

def create_tensorboard_callback():
  current_time = datetime.datetime.now().strftime("%Y_%m_%d-%H:%M:%S")
  log_dir = path_to_logs / current_time
  return tfk.callbacks.TensorBoard(log_dir)

### Early stopping Callback

In [None]:
# Create early stopping callback
early_stopping = tfk.callbacks.EarlyStopping(monitor="val_accuracy",
                                                 patience=3)

## Training the model

In [None]:
NUM_EPOCHS = 100 #@param {type:"slider", min:10, max:100}

In [None]:
# Function that trains the model

def train_model():
  model = create_model()

  print("Model is created")

  # Create new TensorBoard session
  tensor_board = create_tensorboard_callback()

  print("create_tensorboard_callback")

  model.fit(x=train_data,
            epochs=NUM_EPOCHS,
            validation_data=val_data,
            validation_freq=1,
            callbacks=[tensor_board, early_stopping],
            )

  return model

In [None]:
model = train_model()