# Project 3: Multi-class Dog Breed Classification

# Get workspace ready

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
print("TF version:", tf.__version__)
print("TF Hub version:", hub.__version__)

TF version: 2.18.0
TF Hub version: 0.16.1


In [6]:
print("GPU", "available" if tf.config.list_physical_devices("GPU") else "not available")

GPU not available


In [7]:
import os
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

# 1.Data pre-processing

## Getting data ready

In [4]:
!unzip dog-breed-identification.zip

Archive:  dog-breed-identification.zip
  End-of-central-directory signature not found.  Either this file is not
  a zipfile, or it constitutes one disk of a multi-part archive.  In the
  latter case the central directory and zipfile comment will be found on
  the last disk(s) of this archive.
unzip:  cannot find zipfile directory in one of dog-breed-identification.zip or
        dog-breed-identification.zip.zip, and cannot find dog-breed-identification.zip.ZIP, period.


In [None]:
!ls

In [None]:
labels_csv = pd.read_csv("/content/dog-breed-identification/labels.csv")
# labels_csv = pd.read_csv("drive/My Drive/Dog Vision/labels.csv")

FileNotFoundError: [Errno 2] No such file or directory: '/content/DogVision/labels.csv'

In [None]:
labels_csv.head()

In [None]:
labels_csv.describe()

In [None]:
labels_csv.info()

In [None]:
labels_csv["breed"].value_counts()

In [None]:
labels_csv["breed"].value_counts().plot.bar(figsize=(35, 35))

In [None]:
labels_csv["breed"].value_counts().median()

In [None]:
from IPython.display import Image
Image("/content/dog-breed-identification/train/00a338a92e4e7bf543340dc849230e75.jpg")

## Getting images and their labels

In [None]:
filenames = [fname for fname in labels_csv["id"]]
filenames

In [None]:
filenames = ["/content/dog-breed-identification/train/" + fname + ".jpg" for fname in labels_csv["id"]]
filenames[:10]

In [None]:
os.listdir("/content/dog-breed-identification/train/")[:10]

In [None]:
if len(os.listdir("/content/dog-breed-identification/train/")) == len(filenames):
  print("Filenames match actual amount of files!!! Proceed.")
else:
  print("Filenames do no match actual amount of files, check the target directory.")

In [None]:
Image(filenames[5])

In [None]:
labels_csv["breed"][5]

In [None]:
labels = labels_csv["breed"].to_numpy()
labels

In [None]:
len(labels)

In [None]:
labels[:10]

In [None]:
if len(labels) == len(filenames):
  print("Number of labels matches number of filenames!")
else:
  print("Number of labels does not match number of filenames, check data directories!")

In [None]:
unique_breeds = np.unique(labels)
len(unique_breeds)

In [None]:
unique_breeds

In [None]:
print(labels[0])
labels[0] == unique_breeds

In [None]:
boolean_labels = [label == unique_breeds for label in labels]
boolean_labels[:3]

In [None]:
len(boolean_labels)

In [None]:
labels[0]

In [None]:
np.where(unique_breeds == labels[0])

In [None]:
boolean_labels[0].argmax()

In [None]:
boolean_labels[0].astype(int)

In [None]:
labels[1]
boolean_labels[1].astype(int)

In [None]:
filenames[:10]

## Creating validation set

In [None]:
X = filenames
y = boolean_labels

In [None]:
len(filenames)

In [None]:
NUM_IMAGES = 5000

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X[:NUM_IMAGES],
                                                  y[:NUM_IMAGES],
                                                  test_size=0.2,
                                                  random_state=42)

len(X_train), len(y_train), len(X_val), len(y_val)

In [None]:
X_train[:3], y_train[:3]

## Preprocessing Images

In [None]:
from matplotlib.pyplot import imread

image = imread(filenames[5])
image.shape

In [None]:
image.max(), image.min()

In [None]:
IMG_SIZE = 224

In [None]:
def process_image(image_path, img_size=IMG_SIZE):
  """Read a JPEG file and return it as a resized float32 image tensor.

  Args:
    image_path: Path of the JPEG file to read.
    img_size: Accepted for interface compatibility. NOTE(review): the resize
      below uses the module-level IMG_SIZE, so this argument currently has
      no effect on the result.

  Returns:
    The decoded 3-channel image, converted to tf.float32 and resized to
    IMG_SIZE x IMG_SIZE.
  """
  raw_bytes = tf.io.read_file(image_path)
  decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
  as_float = tf.image.convert_image_dtype(decoded, tf.float32)
  return tf.image.resize(as_float, size=[IMG_SIZE, IMG_SIZE])

In [None]:
tensor = tf.io.read_file(filenames[15])
tensor

In [None]:
tensor = tf.image.decode_jpeg(tensor, channels=3)[:2]
tensor

In [None]:
tf.image.convert_image_dtype(tensor, tf.float32)

## Turning data into batches

In [None]:
def get_image_label(image_path, label):
  """Pair the preprocessed image at `image_path` with its `label`.

  Args:
    image_path: Path handed to `process_image`.
    label: Label associated with the image; passed through unchanged.

  Returns:
    A tuple `(image, label)` where `image` is `process_image(image_path)`.
  """
  return process_image(image_path), label

In [None]:
tf.constant(y[5])

In [None]:
# process_image's second parameter is img_size, not a label, so the
# (path, label) pair has to go through get_image_label.
get_image_label(X[5], tf.constant(y[5]))

In [None]:
BATCH_SIZE = 32

In [None]:
def create_data_batches(X, y=None, batch_size=BATCH_SIZE, valid_data=False, test_data=False):
  """Build a batched tf.data.Dataset from image paths (and labels).

  Args:
    X: Sequence of image file paths.
    y: Labels aligned with `X`; required unless `test_data` is True.
    batch_size: Number of samples per batch.
    valid_data: If True, build a labelled dataset without shuffling.
    test_data: If True, build an unlabelled dataset. Checked first, so it
      takes precedence over `valid_data`.

  Returns:
    A dataset yielding image batches (test data) or `(image, label)`
    batches (training and validation data).

  Raises:
    ValueError: If labels are needed but `y` is None.
  """
  if test_data:
    print("Creating test data batches...")
    data = tf.data.Dataset.from_tensor_slices(tf.constant(X))
    return data.map(process_image).batch(batch_size)

  if y is None:
    raise ValueError("y (labels) is required unless test_data=True")

  if valid_data:
    print("Creating validation data batches...")
    data = tf.data.Dataset.from_tensor_slices((tf.constant(X), tf.constant(y)))
    return data.map(get_image_label).batch(batch_size)

  print("Creating training data batches...")
  data = tf.data.Dataset.from_tensor_slices((tf.constant(X), tf.constant(y)))
  # Shuffle the (path, label) pairs before mapping so the shuffle buffer
  # holds short strings rather than decoded images.
  data = data.shuffle(buffer_size=len(X))
  return data.map(get_image_label).batch(batch_size)

In [None]:
X_train[:3]

In [None]:
y_train[:3]

In [None]:
train_data = create_data_batches(X_train, y_train)
val_data = create_data_batches(X_val, y_val, valid_data=True)

In [None]:
train_data, val_data

In [None]:
train_data.element_spec, val_data.element_spec

## Visualize data batches

In [None]:
import matplotlib.pyplot as plt

def show_25_images(images, labels):
  """Plot up to 25 images from a batch in a 5x5 grid, titled by breed.

  Args:
    images: Indexable batch of images accepted by `plt.imshow`.
    labels: Indexable batch of label arrays; `labels[i].argmax()` is used
      as the index into `unique_breeds` for the title.
  """
  plt.figure(figsize=(10, 10))
  # A batch may hold fewer than 25 samples; never index past its end.
  for i in range(min(25, len(images))):
    plt.subplot(5, 5, i + 1)
    plt.imshow(images[i])
    plt.title(unique_breeds[labels[i].argmax()])
    plt.axis("off")

In [None]:
unique_breeds

In [None]:
len(unique_breeds)

In [None]:
unique_breeds[y[0].argmax()]

In [None]:
train_images, train_labels = next(train_data.as_numpy_iterator())
show_25_images(train_images, train_labels)

In [None]:
len(train_images), len(train_labels)

In [None]:
val_images, val_labels = next(val_data.as_numpy_iterator())
show_25_images(val_images, val_labels)

# 2.Building a model

In [None]:
IMG_SIZE

In [None]:
INPUT_SHAPE = [None, IMG_SIZE, IMG_SIZE, 3]
OUTPUT_SHAPE = len(unique_breeds)
MODEL_URL = "https://tfhub.dev/google/imagenet/mobilenet_v2_130_224/classification/4"

In [None]:
!pip install tensorflow==2.15.0                     ### restart session

In [None]:
from tensorflow.keras import layers

def create_model(input_shape=INPUT_SHAPE, output_shape=OUTPUT_SHAPE, model_url=MODEL_URL):
  """Build and compile a classifier on top of a TensorFlow Hub layer.

  Args:
    input_shape: Shape passed to `model.build`.
    output_shape: Number of units in the final softmax Dense layer.
    model_url: TensorFlow Hub URL loaded through `hub.KerasLayer`.

  Returns:
    A compiled and built `tf.keras.Sequential` model using categorical
    cross-entropy loss, the Adam optimizer and an accuracy metric.
  """
  print("Building model with:", model_url)
  model = tf.keras.Sequential([
    hub.KerasLayer(model_url),
    tf.keras.layers.Dense(units=output_shape, activation="softmax"),
  ])

  model.compile(
      loss=tf.keras.losses.CategoricalCrossentropy(),
      optimizer=tf.keras.optimizers.Adam(),
      metrics=["accuracy"],
  )

  model.build(input_shape)
  return model

In [None]:
model = create_model()
model.summary()

## Creating callbacks

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=3)

# 3.Training a model

In [None]:
NUM_EPOCHS = 10     # @param {type:"slider", min:10, max:50, step:5}

In [None]:
def train_model():
  """Create a fresh model and fit it on `train_data`, validating on `val_data`.

  Reads the module-level `NUM_EPOCHS`, `train_data`, `val_data` and
  `early_stopping`, and writes TensorBoard logs to a timestamped directory.

  Returns:
    The fitted model.
  """
  model = create_model()
  # Build the TensorBoard callback here instead of relying on a separate
  # helper; each run gets its own timestamped log directory.
  log_dir = os.path.join("/content/dog-breed-identification/logs",
                         datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
  tensorboard = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
  model.fit(x=train_data,
            epochs=NUM_EPOCHS,
            validation_data=val_data,
            validation_freq=1,
            callbacks=[tensorboard, early_stopping])
  return model

In [None]:
model = train_model()

In [None]:
def save_model(model, suffix=None, model_dir="/content/dog-breed-identification/models"):
  """Save `model` as a timestamped .h5 file and return the file path.

  Args:
    model: Object exposing `save(path)`.
    suffix: Optional text appended to the timestamp in the file name.
    model_dir: Directory the file is written into; created if missing.

  Returns:
    The path the model was saved to.
  """
  # %S (zero-padded seconds) is portable; lowercase %s is a platform-specific
  # extension that expands to epoch seconds only where it is supported.
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  filename = f"{timestamp}.h5" if suffix is None else f"{timestamp}-{suffix}.h5"
  os.makedirs(model_dir, exist_ok=True)
  model_path = os.path.join(model_dir, filename)
  print(f"Saving model to: {model_path}")
  model.save(model_path)
  return model_path

In [None]:
save_model(model, suffix="1000-images-mobilenetv2-Adam")

In [None]:
def load_model(model_path):
  """Load a saved Keras model from `model_path` and return it.

  `hub.KerasLayer` is passed as a custom object under the name
  "KerasLayer" so that models containing that layer can be restored.
  """
  print(f"Loading saved model from: {model_path}")
  custom_layers = {"KerasLayer": hub.KerasLayer}
  return tf.keras.models.load_model(model_path, custom_objects=custom_layers)

In [None]:
loaded_full_model = load_model('/content/dog-breed-identification/models/20250316-08491742114970-1000-images-mobilenetv2-Adam.h5')

# 4.Evaluating a model

In [None]:
predictions = model.predict(val_data, verbose=1)
predictions

In [None]:
predictions.shape

In [None]:
predictions[0]

In [None]:
np.sum(predictions[0])

In [None]:
len(predictions)

In [None]:
len(predictions[0])

In [None]:
len(y_val)

In [None]:
len(unique_breeds)

In [None]:
index = 42
print(predictions[index])
print(f"Max value (probability of prediction): {np.max(predictions[index])}")
print(f"Sum: {np.sum(predictions[index])}")
print(f"Max index: {np.argmax(predictions[index])}")
print(f"Predicted label: {unique_breeds[np.argmax(predictions[index])]}")

In [None]:
unique_breeds[5]

In [None]:
def get_pred_label(prediction_probabilities):
  """Return the entry of `unique_breeds` at the position of the largest value."""
  best_index = np.argmax(prediction_probabilities)
  return unique_breeds[best_index]

In [None]:
pred_label = get_pred_label(predictions[5])
pred_label

In [None]:
val_data

In [None]:
len(val_data)

In [None]:
image_ = []
label_ = []

for image, label in val_data.unbatch().as_numpy_iterator():
  image_.append(image)
  label_.append(label)

image_[0], label_[0]

In [None]:
get_pred_label(label_[0])

In [None]:
get_pred_label(predictions[0])

In [None]:
get_pred_label(val_labels[0])

In [None]:
def unbatchify(data):
  """Split a batched `(image, label)` dataset into two per-sample lists.

  Args:
    data: Batched dataset whose elements are `(image, label)` pairs.

  Returns:
    A tuple `(images, labels)`: the individual images, and for each one
    the `unique_breeds` entry at the argmax of its label array.
  """
  samples = list(data.unbatch().as_numpy_iterator())
  images = [image for image, _ in samples]
  labels = [unique_breeds[np.argmax(label)] for _, label in samples]
  return images, labels

In [None]:
val_images, val_labels = unbatchify(val_data)
val_images[0], val_labels[0]

In [None]:
def plot_pred(prediction_probabilities, labels, images, n=1):
  """Show sample `n` with its predicted label, confidence and true label.

  The title is drawn in green when the predicted label equals the true
  label and in red otherwise.

  Args:
    prediction_probabilities: Indexable collection of prediction arrays.
    labels: Indexable collection of true labels.
    images: Indexable collection of images accepted by `plt.imshow`.
    n: Index of the sample to plot.
  """
  pred_prob = prediction_probabilities[n]
  true_label = labels[n]
  image = images[n]
  pred_label = get_pred_label(pred_prob)

  plt.imshow(image)
  plt.xticks([])
  plt.yticks([])

  title_color = "green" if pred_label == true_label else "red"
  title_text = "{} {:2.0f}% {}".format(pred_label, np.max(pred_prob)*100, true_label)
  plt.title(title_text, color=title_color)

In [None]:
plot_pred(prediction_probabilities=predictions, labels=val_labels, images=val_images, n=5)

In [None]:
predictions[0]

In [None]:
predictions[0].argsort()

In [None]:
predictions[0].argsort()[-10:][::-1]

In [None]:
predictions[0][predictions[0].argsort()[-10:][::-1]]

In [None]:
predictions[0].max()

In [None]:
unique_breeds[predictions[0].argsort()[-10:][::-1]]

In [None]:
def plot_pred_conf(prediction_probabilities, labels, n=1):
  """Bar-plot the ten highest prediction values for sample `n`.

  The bar belonging to the true label is coloured green when that label is
  among the ten plotted; otherwise every bar stays grey.

  Args:
    prediction_probabilities: Indexable collection of prediction arrays.
    labels: Indexable collection of true labels.
    n: Index of the sample to plot.
  """
  pred_prob, true_label = prediction_probabilities[n], labels[n]

  # argsort is ascending: take the last ten and reverse for highest-first.
  top_10_pred_indexes = pred_prob.argsort()[-10:][::-1]
  top_10_pred_values = pred_prob[top_10_pred_indexes]
  top_10_pred_labels = unique_breeds[top_10_pred_indexes]

  positions = np.arange(len(top_10_pred_labels))
  top_plot = plt.bar(positions, top_10_pred_values, color="grey")
  plt.xticks(positions, labels=top_10_pred_labels, rotation="vertical")

  if np.isin(true_label, top_10_pred_labels):
    top_plot[np.argmax(top_10_pred_labels == true_label)].set_color("green")

In [None]:
plot_pred_conf(prediction_probabilities=predictions, labels=val_labels, n=5)

In [None]:
# Plot several validation samples, each next to its top-10 prediction chart.
i_multiplier = 20  # offset added to i to pick the samples to display
num_rows = 3
num_cols = 2
num_images = num_rows*num_cols
plt.figure(figsize=(10*num_cols, 5*num_rows))
for i in range(num_images):
  # Each sample uses two adjacent grid cells (the grid is 2*num_cols wide):
  # the odd cell holds the image, the following even cell the bar chart.
  plt.subplot(num_rows, 2*num_cols, 2*i+1)
  plot_pred(prediction_probabilities=predictions,
            labels=val_labels,
            images=val_images,
            n=i+i_multiplier)
  plt.subplot(num_rows, 2*num_cols, 2*i+2)
  plot_pred_conf(prediction_probabilities=predictions,
                 labels=val_labels,
                 n=i+i_multiplier)
plt.tight_layout(h_pad=1.0)
plt.show()

In [None]:
unique_breeds

In [None]:
y_val[:3]

In [None]:
# Class index of each validation sample: the position of the first True
# entry in its boolean label array.
true_indices = [np.flatnonzero(one_hot)[0] for one_hot in y_val]
y_test = np.array(true_indices)
y_test

In [None]:
len(y_test)

In [None]:
len(y_val)

In [None]:
val_data = create_data_batches(X_val, y_val, valid_data=True)

In [None]:

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score
import seaborn as sns

In [None]:
y_preds = predictions.argmax(axis=1)
y_preds

In [None]:
len(y_preds)

In [None]:
sns.set(font_scale=1.5)

def plot_conf_mat(y_test, y_preds):
    """Plot the confusion matrix of `y_test` vs `y_preds` as an annotated heatmap.

    Args:
        y_test: True class labels.
        y_preds: Predicted class labels.
    """
    _, ax = plt.subplots(figsize=(30, 30))
    # fmt="g" keeps the cell counts out of scientific notation. The old
    # set_ylim(+0.5 / -0.5) adjustment is dropped: it only pads the heatmap
    # with half a row at the top and bottom on current matplotlib.
    sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt="g", cbar=False, ax=ax)
    ax.set_xlabel("Predicted  label")
    ax.set_ylabel("True label")

In [None]:
plot_conf_mat(y_test, y_preds)

In [None]:
y_target = unique_breeds

In [None]:
len(y_target)

In [None]:
# Figure-level rcParams are read when the figure is created, so they must be
# set before plt.subplot() builds the figure and axes.
plt.rcParams['figure.figsize'] = [30, 30]
plt.rcParams['figure.dpi'] = 100
plt.rcParams['figure.facecolor'] = 'white'
plt.rcParams['font.size'] = 18
ax = plt.subplot()

# annot=True annotates each cell; fmt='g' disables scientific notation.
sns.heatmap(confusion_matrix(y_test, y_preds), annot=True, fmt='g', ax=ax, cmap="coolwarm")

ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')
ax.set_title('Confusion Matrix', fontsize=20)
plt.show()


## Saving and reloading a trained model

In [None]:
def save_model(model, suffix=None, model_dir="/content/dog-breed-identification/models"):
  """Save `model` as a timestamped .h5 file and return the file path.

  Args:
    model: Object exposing `save(path)`.
    suffix: Optional text appended to the timestamp in the file name.
    model_dir: Directory the file is written into; created if missing.

  Returns:
    The path the model was saved to.
  """
  # %S (zero-padded seconds) is portable; lowercase %s is a platform-specific
  # extension that expands to epoch seconds only where it is supported.
  timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
  filename = f"{timestamp}.h5" if suffix is None else f"{timestamp}-{suffix}.h5"
  os.makedirs(model_dir, exist_ok=True)
  model_path = os.path.join(model_dir, filename)
  print(f"Saving model to: {model_path}")
  model.save(model_path)
  return model_path

In [None]:
def load_model(model_path):
  """Load a saved Keras model from `model_path` and return it.

  `hub.KerasLayer` is passed as a custom object under the name
  "KerasLayer" so that models containing that layer can be restored.
  """
  print(f"Loading saved model from: {model_path}")
  custom_layers = {"KerasLayer": hub.KerasLayer}
  return tf.keras.models.load_model(model_path, custom_objects=custom_layers)

In [None]:
save_model(model, suffix="1000-images-mobilenetv2-Adam")

In [None]:
loaded_1000_image_model = load_model('/content/dog-breed-identification/models/20250316-08491742114970-1000-images-mobilenetv2-Adam.h5')

In [None]:
model.evaluate(val_data)

In [None]:
loaded_1000_image_model.evaluate(val_data)

# 5.Training a model with full data

In [None]:
full_data = create_data_batches(X, y)
full_data

In [None]:
full_model = create_model()

In [None]:
full_model.summary()

In [None]:
full_model_early_stopping = tf.keras.callbacks.EarlyStopping(monitor="accuracy", patience=3)

In [None]:
full_model.fit(x=full_data, epochs=NUM_EPOCHS, callbacks=[full_model_early_stopping])

In [None]:
save_model(full_model, suffix="full-image-set-mobilenetv2-Adam")

In [None]:
loaded_full_model = load_model('/content/dog-breed-identification/models/20250316-23421742168569-full-image-set-mobilenetv2-Adam.h5')

# 6.Deployment

## Making predictions on test dataset

In [None]:
test_path = "/content/dog-breed-identification/test/"
test_filenames = [test_path + fname for fname in os.listdir(test_path)]
test_filenames[:10]

In [None]:
len(test_filenames)

In [None]:
test_data = create_data_batches(test_filenames, test_data=True)
test_data

In [None]:
test_predictions = loaded_full_model.predict(test_data, verbose=1)

In [None]:
# Write to the same path the predictions are read back from with np.loadtxt.
np.savetxt("/content/dog-breed-identification/preds_array.csv", test_predictions, delimiter=",")

In [None]:
test_predictions = np.loadtxt("/content/dog-breed-identification/preds_array.csv", delimiter=",")

In [None]:
test_predictions[:10]

In [None]:
test_predictions.shape

## Making predictions on custom images

In [None]:
custom_path = "/content/dog-breed-identification/custom_image/"

In [None]:
custom_image_paths = [custom_path + fname for fname in os.listdir(custom_path)]

In [None]:
os.listdir(custom_path)

In [None]:
custom_data = create_data_batches(custom_image_paths, test_data=True)
custom_data

In [None]:
# Predict with the model trained on the full dataset, matching the model
# used for the test-set predictions in this section.
custom_preds = loaded_full_model.predict(custom_data)

In [None]:
custom_pred_labels = [get_pred_label(custom_preds[i]) for i in range(len(custom_preds))]
custom_pred_labels

In [None]:
# Pull the individual preprocessed images back out of the batched dataset.
custom_images = list(custom_data.unbatch().as_numpy_iterator())

In [None]:
plt.figure(figsize=(10, 10))
# Size the single-row grid from the number of images rather than assuming
# exactly three, so any number of custom images can be shown.
num_custom_images = len(custom_images)
for i, image in enumerate(custom_images):
  plt.subplot(1, num_custom_images, i+1)
  plt.xticks([])
  plt.yticks([])
  plt.title(custom_pred_labels[i])
  plt.imshow(image)