# Breed Classification 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, Dropout, Input
from keras.layers import MaxPooling2D, ZeroPadding2D, Conv2D, Flatten
from keras.optimizers import Adam, SGD
from keras.preprocessing.image import img_to_array, load_img, ImageDataGenerator
from keras.utils import to_categorical

# Target
- A model that classifies an image of a dog into one of 120 breeds.

# Approach 
- A CNN with 120 classes 

# EDA


In [None]:
# Load the labels CSV (its `id` and `breed` columns are used further down)
labels_csv = '../input/dog-breed-identification/labels.csv'
df1 = pd.read_csv(labels_csv)
df1.head()

In [None]:
# Directory that holds the training images
img_file = '../input/dog-breed-identification/train/'

# Build every image path as <img_file><id>.jpg and attach it as a new column
img_paths = img_file + df1['id'] + '.jpg'
df = df1.assign(img_path=img_paths)
df.head()

In [None]:
# Row count of the labels frame = number of labelled training images
n_train_images = len(df)
print(f"Total {n_train_images} images available for training")


There appears to be some class imbalance. We can address it with synthetic oversampling (for the minority classes) and undersampling (for the majority classes) in the form of data augmentation.


In [None]:
# Number of images available for each breed
df['breed'].value_counts()

# Loading dataset

Assuming that we have cleaned the data and the data is ready for modeling, there are multiple ways of loading the dataset. The size of the available RAM dictates our choices here. 


The full dataset does not fit in RAM in our case, so to keep things simple we'll load only half of the available data (`parts = 2` in the cell below).


In [None]:
# tqdm.auto selects the notebook widget when available; tqdm_notebook is deprecated.
from tqdm.auto import tqdm

# Loading the data directly into RAM:
#   - pre-allocate one uint8 numpy array for the images we keep,
#   - fill it image by image.

parts = 2                      # keep only the first 1/parts of the dataset
image_shape = (324, 324, 3)    # (height, width, channels)


images = df.img_path.values.tolist()
total_images = len(images)
n_loaded = total_images // parts

img_pixels = np.zeros(shape=(n_loaded, *image_shape), dtype=np.uint8)

# Wrap the list itself (not the enumerate object) so tqdm knows the total and
# can render a real progress bar.
for i, img in enumerate(tqdm(images[:n_loaded], desc="loading images")):
    # target_size expects (height, width) only, without the channel axis
    pixels = load_img(img, target_size=image_shape[:2])
    img_pixels[i] = np.asarray(pixels)

print(img_pixels.shape)

In [None]:
# label encoding

# One-hot encode against the complete list of breeds found in `df`, so the
# number and order of columns do not depend on which breeds happen to appear
# in the loaded subset.
all_breeds = sorted(df.breed.unique())

labels = df.breed[:len(df.breed)//parts]
# Explicit dtype: the default dummy dtype differs between pandas versions.
img_label = pd.get_dummies(
    labels.astype(pd.CategoricalDtype(categories=all_breeds)),
    dtype=np.uint8,
)
img_label.head()

In [None]:
# Features are the pixel array, targets the one-hot label matrix; show both shapes
X, y = img_pixels, img_label.values
for array in (X, y):
    print(array.shape)

In [None]:
# train_test_split
from sklearn.model_selection import train_test_split

SPLIT_SEED = 42  # fixed so the split is identical on every run

# Stratify on the class index (argmax of each one-hot row) so every breed keeps
# the same proportion in the train and test subsets despite the class imbalance.
# NOTE: stratification needs at least two samples of every class present in y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=y.argmax(axis=1),
)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
# Data Augmentation
# Training images get random shifts, shear, zoom and horizontal flips on top of
# the 1/255 rescaling; test images are only rescaled.
augmentation_options = dict(
    # rotation_range=30,
    width_shift_range=0.2,
    height_shift_range=0.2,
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Batch iterators over the in-memory arrays: augmented batches for training,
# rescale-only batches for the held-out split.
batch_size = 64
training_set = train_datagen.flow(
    x=X_train,
    y=y_train,
    batch_size=batch_size,
)
testing_set = test_datagen.flow(
    x=X_test,
    y=y_test,
    batch_size=batch_size,
)

In [None]:
# image preview
def show_image(generator, n_images=9, n_cols=3):
    """Draw a grid with the first images of the next batch from `generator`.

    Parameters
    ----------
    generator : iterator
        Yields either `(images, labels)` batches or bare image batches.
        Exactly one batch is consumed from it.
    n_images : int, optional
        Maximum number of images to draw (fewer if the batch is smaller).
    n_cols : int, optional
        Number of columns in the grid.

    Returns
    -------
    None
        The figure is displayed with `plt.show()`.
    """
    batch = next(generator)
    if isinstance(batch, (tuple, list)):
        images = np.asarray(batch[0])
        labels = np.asarray(batch[1]) if len(batch) > 1 else None
    else:
        images, labels = np.asarray(batch), None

    images = images[:n_images]
    if len(images) == 0:
        print("Nothing to preview: the generator returned an empty batch")
        return

    n_cols = max(1, min(n_cols, len(images)))
    n_rows = -(-len(images) // n_cols)  # ceiling division
    fig, axes = plt.subplots(n_rows, n_cols,
                             figsize=(3 * n_cols, 3 * n_rows),
                             squeeze=False)

    # Hide every axis first so unused grid cells stay blank
    for ax in axes.ravel():
        ax.axis('off')

    for idx, (ax, image) in enumerate(zip(axes.ravel(), images)):
        # imshow expects floats in [0, 1] or integers in [0, 255]
        if image.dtype.kind == 'f' and image.max() > 1:
            image = image / 255.0
        ax.imshow(image)
        # For one-hot label rows, show the index of the active class
        if labels is not None and labels.ndim == 2:
            ax.set_title(f"class {int(np.argmax(labels[idx]))}")

    fig.tight_layout()
    plt.show()

show_image(training_set)

# Model definition [Keras]

In [None]:
# CNN model definition
from keras.layers import Conv2D, MaxPooling2D, GlobalAveragePooling2D
from keras.layers import Dropout, Activation, Dense, Flatten
# BatchNormalization is imported from the public `keras.layers` namespace:
# the `keras.layers.normalization` path is not importable in newer Keras releases.
from keras.layers import BatchNormalization
from keras.models import Sequential


def add_conv_stage(net, filters, **conv_kwargs):
    """Append Conv2D (3x3, 'same' padding, no bias) -> BatchNorm -> ReLU to `net`.

    Extra keyword arguments are forwarded to `Conv2D` (used for `input_shape`
    on the very first layer).
    """
    net.add(Conv2D(filters, (3, 3), padding='same', use_bias=False, **conv_kwargs))
    net.add(BatchNormalization(axis=3, scale=False))
    net.add(Activation("relu"))


model = Sequential()

# Three conv stages, each followed by 4x4 max-pooling and an increasing dropout rate
pooled_stages = [(16, 0.2), (32, 0.4), (64, 0.6)]
for stage_idx, (filters, drop_rate) in enumerate(pooled_stages):
    # Only the first layer of a Sequential model needs the input shape
    first_layer_kwargs = {'input_shape': image_shape} if stage_idx == 0 else {}
    add_conv_stage(model, filters, **first_layer_kwargs)
    model.add(MaxPooling2D(pool_size=(4, 4), strides=(4, 4), padding='same'))
    model.add(Dropout(drop_rate))

# Last conv stage is flattened instead of pooled
add_conv_stage(model, 128)
model.add(Flatten())
model.add(Dropout(0.5))

# Classifier head: one hidden layer, then a softmax over the 120 breeds
model.add(Dense(1024, activation='relu'))
model.add(Dense(120, activation='softmax'))
model.summary()

In [None]:
# compile the model with loss
from keras.losses import categorical_crossentropy

# Categorical cross-entropy loss, the 'adam' optimizer, and accuracy as the
# only reported metric.
compile_options = {
    'loss': categorical_crossentropy,
    'optimizer': 'adam',
    'metrics': ['accuracy'],
}
model.compile(**compile_options)

In [None]:
# scheduling learning rate
import tensorflow as tf

def scheduler(epoch, lr):
    """Learning-rate schedule: constant for the first 5 epochs, then exponential decay.

    Parameters
    ----------
    epoch : int
        Zero-based index of the epoch about to start.
    lr : float
        Learning rate currently in use.

    Returns
    -------
    float
        `lr` unchanged while `epoch < 5`; afterwards `lr * exp(-0.1)`.
    """
    import math  # local import keeps the function self-contained

    if epoch < 5:
        return lr
    # Return a plain Python float rather than a TF tensor: some Keras versions
    # reject a schedule output that is not a float.
    return float(lr * math.exp(-0.1))

from keras import callbacks

# Wrap the schedule function in a Keras callback that is passed to training
callback = callbacks.LearningRateScheduler(schedule=scheduler)

In [None]:
# Lets train it out
# `fit` accepts the batch iterators directly; `fit_generator` is deprecated in
# newer Keras releases. The step counts are taken from the iterators' lengths
# (batches per full pass) so they stay in sync with the data size and
# batch_size instead of being hard-coded.
history = model.fit(training_set,
                    steps_per_epoch=len(training_set),
                    validation_data=testing_set,
                    validation_steps=len(testing_set),
                    callbacks=[callback],
                    epochs=50,
                    verbose=1)

In [None]:
# Training curves: accuracy and loss per epoch for the training and validation data.
# One figure with two stacked panels (the original 211 / 212 arguments were
# figure numbers, not subplot positions).
fig, (ax_acc, ax_loss) = plt.subplots(2, 1, figsize=(8, 8))

ax_acc.plot(history.history['accuracy'])
ax_acc.plot(history.history['val_accuracy'])
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['train', 'val'], loc='upper left')

ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')  # was mislabelled 'accuracy'
ax_loss.set_xlabel('epoch')
ax_loss.legend(['train', 'val'], loc='upper left')

fig.tight_layout()
plt.show()