# Introduction

This is project where I explore a new technique to classify images. In this case I will transfer learning to extract features.

This project is base on: https://www.kaggle.com/phylake1337/0-18-loss-simple-feature-extractors

# Load data

In [None]:
!pip install kaggle --upgrade --force

In [1]:
from google.colab import files
uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json
User uploaded file "kaggle.json" with length 68 bytes


In [2]:
!kaggle competitions download -c dog-breed-identification

dog-breed-identification.zip: Skipping, found more recently modified local copy (use --force to force download)


# Data understanding

Libraries

In [3]:
import pandas as pd
import os
import tensorflow as tf
import numpy as np

Hyperparameters

In [4]:
pathData = '/content/'
seed = 1
imgHeight = 224
imgWidth = 224
batchSize = 32
numberChannels = 3

In [5]:
!unzip /content/dog-breed-identification.zip

## Training data

### Structure data

In [6]:
dsLabel = pd.read_csv(os.path.join(pathData, 'labels.csv'))
print('Shape:', dsLabel.shape)
dsLabel.head()

Shape: (10222, 2)


Unnamed: 0,id,breed
0,000bec180eb18c7604dcecc8fe0dba07,boston_bull
1,001513dfcb2ffafc82cccf4d8bbaba97,dingo
2,001cdf01b096e06d78e9e5112d419397,pekinese
3,00214f311d5d2247d5dfe4fe24b2303d,bluetick
4,0021f9ceb3235effd7fcde7f7538ed62,golden_retriever


In [7]:
print('Number of breeds:', len(dsLabel['breed'].unique()))

Number of breeds: 120


Calculating number of observations per breed.

In [8]:
dsLabelGroup = dsLabel.groupby('breed').count().reset_index()
dsLabelGroup = dsLabelGroup.rename(columns={'id': 'count'})
print('Shape:', dsLabelGroup.shape)
dsLabelGroup.head()

Shape: (120, 2)


Unnamed: 0,breed,count
0,affenpinscher,80
1,afghan_hound,116
2,african_hunting_dog,86
3,airedale,107
4,american_staffordshire_terrier,74


In [9]:
dsLabelGroup[dsLabelGroup['count'] == dsLabelGroup['count'].max()]

Unnamed: 0,breed,count
97,scottish_deerhound,126


In [10]:
dsLabelGroup[dsLabelGroup['count'] == dsLabelGroup['count'].min()]

Unnamed: 0,breed,count
23,briard,66
43,eskimo_dog,66


### Images

In [11]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [12]:
Xpath = dsLabel['id'].values
y = dsLabel['breed'].values

In [13]:
uniqueBreed = set(y)
print('Number of breeds:', len(uniqueBreed))

Number of breeds: 120


Encoder target variable

In [14]:
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
print('Shape:', y.shape)

Shape: (10222,)


In [15]:
y = tf.keras.utils.to_categorical(y)

In [16]:
print('Shape Xpath:', Xpath.shape)
print('Shape y:', y.shape)

Shape Xpath: (10222,)
Shape y: (10222, 120)


Split data

In [17]:
Xtrain, Xval, ytrain, yval = train_test_split(Xpath, y, test_size=0.2, random_state=42)

print('Xtrain:', Xtrain.shape)
print('Xval:', Xval.shape)
print('ytrain:', ytrain.shape)
print('yval:', yval.shape)

Xtrain: (8177,)
Xval: (2045,)
ytrain: (8177, 120)
yval: (2045, 120)


Read images

In [18]:
# Create function to read images
def getImage(filePath):
    img = tf.io.read_file(filePath)
    img = tf.image.decode_jpeg(img, channels = numberChannels)
    img = tf.image.resize(img, [imgHeight, imgWidth])
    return img

In [19]:
Xtrain = tf.data.Dataset.from_tensor_slices(Xtrain)

In [20]:
Xtrain = Xtrain.map(lambda x: getImage(tf.strings.join([pathData, 'train/', x, '.jpg'])), num_parallel_calls=tf.data.experimental.AUTOTUNE)
print('Number of observations:', tf.data.experimental.cardinality(Xtrain).numpy())

Number of observations: 8177


In [21]:
Xval = tf.data.Dataset.from_tensor_slices(Xval)

In [22]:
Xval = Xval.map(lambda x: getImage(tf.strings.join([pathData, 'train/', x, '.jpg'])), num_parallel_calls=tf.data.experimental.AUTOTUNE)
print('Number of observations:', tf.data.experimental.cardinality(Xval).numpy())

Number of observations: 2045


## Testing data

### Images

In [23]:
# Load path files
listFileTest = tf.data.Dataset.list_files(os.path.join(pathData, 'test', '*'), shuffle = False, seed = seed)

In [24]:
print('Number of observations:', tf.data.experimental.cardinality(listFileTest).numpy())

Number of observations: 10357


In [25]:
# Create function to read images
def getImage(filePath):
    img = tf.io.read_file(filePath)
    img = tf.image.decode_jpeg(img, channels = numberChannels)
    img = tf.image.resize(img, [imgHeight, imgWidth])
    return img

In [26]:
Xtest = listFileTest.map(getImage, num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [27]:
Xtest = Xtest.batch(batchSize)

# Modeling

## Extract features
In this step, I will extract features using transfer learning from three different model. These new features will be input features of classify model.


In [28]:
Xtrain = Xtrain.batch(batchSize)
Xval = Xval.batch(batchSize)

### InceptionV3

In [29]:
inceptionV3Model = tf.keras.applications.inception_v3.InceptionV3(weights='imagenet', include_top=False, input_shape=(imgHeight, imgWidth, numberChannels))

In [30]:
extractor1 = tf.keras.Sequential([
  tf.keras.Input(shape=(imgHeight, imgWidth, numberChannels)),
  tf.keras.layers.Lambda(tf.keras.applications.inception_v3.preprocess_input),
  #augmentationLayer,
  inceptionV3Model,
  tf.keras.layers.GlobalAveragePooling2D()
])

In [31]:
extractor1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lambda (Lambda)              (None, 224, 224, 3)       0         
_________________________________________________________________
inception_v3 (Functional)    (None, 5, 5, 2048)        21802784  
_________________________________________________________________
global_average_pooling2d (Gl (None, 2048)              0         
Total params: 21,802,784
Trainable params: 21,768,352
Non-trainable params: 34,432
_________________________________________________________________


In [32]:
featuresTrain1 = extractor1.predict(Xtrain)
featuresTrain1.shape

(8177, 2048)

In [33]:
featuresVal1 = extractor1.predict(Xval)
featuresVal1.shape

(2045, 2048)

### Xception

In [34]:
xceptionModel = tf.keras.applications.Xception(weights='imagenet', include_top=False, input_shape=(imgHeight, imgWidth, numberChannels))

In [35]:
extractor2 = tf.keras.Sequential([
  tf.keras.Input(shape=(imgHeight, imgWidth, numberChannels)),
  tf.keras.layers.Lambda(tf.keras.applications.xception.preprocess_input),
  xceptionModel,
  tf.keras.layers.GlobalAveragePooling2D()
])

In [36]:
featuresTrain2 = extractor2.predict(Xtrain)
featuresTrain2.shape

(8177, 2048)

In [37]:
featuresVal2 = extractor2.predict(Xval)
featuresVal2.shape

(2045, 2048)

### InceptionResNetV2

In [38]:
inceptionResNetV2Model = tf.keras.applications.InceptionResNetV2(weights='imagenet', include_top=False, input_shape=(imgHeight, imgWidth, numberChannels))

In [39]:
extractor3 = tf.keras.Sequential([
  tf.keras.Input(shape=(imgHeight, imgWidth, numberChannels)),
  tf.keras.layers.Lambda(tf.keras.applications.inception_resnet_v2.preprocess_input),
  #augmentationLayer,
  inceptionResNetV2Model,
  tf.keras.layers.GlobalAveragePooling2D()
])

In [40]:
featuresTrain3 = extractor3.predict(Xtrain)
featuresTrain3.shape

(8177, 1536)

In [41]:
featuresVal3 = extractor3.predict(Xval)
featuresVal3.shape

(2045, 1536)

## Integration
To create a input for classification model, I have to concat previous features.

In [42]:
XtrainConcat = tf.concat([featuresTrain1, featuresTrain2, featuresTrain3], -1)
print('XtrainConcat:', XtrainConcat.shape)

XtrainConcat: (8177, 5632)


In [43]:
XvalConcat = tf.concat([featuresVal1, featuresVal2, featuresVal3], -1)
print('XvalConcat:', XvalConcat.shape)

XvalConcat: (2045, 5632)


In [44]:
y.shape

(10222, 120)

In [45]:
model = tf.keras.Sequential([
  tf.keras.Input(shape=(XtrainConcat.shape[1],)),
  tf.keras.layers.Dropout(0.7),

  tf.keras.layers.Dense(1024, activation='relu'),

  tf.keras.layers.Dense(ytrain.shape[1], activation='softmax')
])

In [46]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dropout (Dropout)            (None, 5632)              0         
_________________________________________________________________
dense (Dense)                (None, 1024)              5768192   
_________________________________________________________________
dense_1 (Dense)              (None, 120)               123000    
Total params: 5,891,192
Trainable params: 5,891,192
Non-trainable params: 0
_________________________________________________________________


In [47]:
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [48]:
history = model.fit(XtrainConcat, ytrain, validation_data=(XvalConcat, yval), epochs=30, callbacks=tf.keras.callbacks.EarlyStopping('val_loss'))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30


## Testing models

Extract features

In [49]:
featuresTest1 = extractor1.predict(Xtest)
print('featuresTest1:', featuresTest1.shape)

featuresTest2 = extractor2.predict(Xtest)
print('featuresTest2:', featuresTest2.shape)

featuresTest3 = extractor3.predict(Xtest)
print('featuresTest3:', featuresTest3.shape)

featuresTest1: (10357, 2048)
featuresTest2: (10357, 2048)
featuresTest3: (10357, 1536)


Concat extrated features

In [50]:
XtestConcat = tf.data.Dataset.from_tensor_slices(tf.concat([featuresTest1, featuresTest2, featuresTest3], -1))
XtestConcat

<TensorSliceDataset shapes: (5632,), types: tf.float32>

In [51]:
XtestConcat = XtestConcat.batch(batchSize)

In [52]:
predicted = model.predict(XtestConcat)
print('Predicted shape:', predicted.shape)

Predicted shape: (10357, 120)


Create submission file

In [53]:
imgNameTest = []
for file in os.listdir(os.path.join(pathData, 'test')):
  imgNameTest.append(file.split('.')[0])

In [54]:
dsSubmission = pd.DataFrame(np.round(predicted, 6), columns=uniqueBreed)
dsSubmission.head()

Unnamed: 0,cardigan,doberman,brittany_spaniel,norfolk_terrier,french_bulldog,yorkshire_terrier,clumber,saint_bernard,eskimo_dog,irish_terrier,sussex_spaniel,cairn,staffordshire_bullterrier,border_terrier,black-and-tan_coonhound,silky_terrier,curly-coated_retriever,papillon,great_dane,gordon_setter,pekinese,boxer,dandie_dinmont,miniature_schnauzer,redbone,affenpinscher,old_english_sheepdog,bernese_mountain_dog,saluki,american_staffordshire_terrier,entlebucher,english_foxhound,otterhound,malinois,mexican_hairless,samoyed,west_highland_white_terrier,malamute,lhasa,bull_mastiff,...,tibetan_mastiff,schipperke,kuvasz,basenji,shih-tzu,boston_bull,maltese_dog,pug,newfoundland,blenheim_spaniel,standard_schnauzer,walker_hound,german_short-haired_pointer,pomeranian,lakeland_terrier,sealyham_terrier,irish_wolfhound,borzoi,bluetick,bouvier_des_flandres,vizsla,beagle,bedlington_terrier,chihuahua,norwich_terrier,miniature_poodle,dingo,scotch_terrier,irish_setter,appenzeller,groenendael,shetland_sheepdog,giant_schnauzer,toy_poodle,brabancon_griffon,miniature_pinscher,chow,african_hunting_dog,japanese_spaniel,wire-haired_fox_terrier
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,0.0,0.0,0.0,3.5e-05,0.0,0.0,0.0,0.0,0.0,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1e-06,0.000518,0.008769,0.0,1e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,9.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-06,1e-06,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,1e-06,1e-06,1e-06,0.0,1e-06,0.0,0.0,0.0,1e-06,0.0,0.0,2e-06,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,...,0.0,1e-06,0.0,0.0,5e-06,0.0,0.0,4e-05,3e-06,0.0,0.0,0.0,0.0,0.0,0.999642,7e-06,1e-06,0.0,8e-06,0.0,0.0,4e-06,0.0,0.0,1e-06,0.0,1e-06,0.0,4e-06,0.0,0.0,0.0,0.0,0.0,2e-06,0.0,4e-05,0.0,0.0,0.0
2,0.0,1e-05,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2e-05,0.0,0.0,2.7e-05,0.0,0.0,0.0,0.0,3e-06,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,9e-06,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2e-06,1e-06,1.6e-05,0.0,0.0,1e-06,0.0,0.0,0.0,1e-06,0.000125,0.0,0.0,0.0,0.0,0.0,0.0,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000365,0.000427,0.00035,7e-06,0.000145,6e-05,0.0,4e-05,2.8e-05,4e-05,1.2e-05,4e-06,2.2e-05,0.0,6e-06,0.002972,0.000161,0.000409,6.8e-05,0.004504,9.1e-05,6.2e-05,3.8e-05,0.000109,2e-06,0.001552,5e-06,3.7e-05,4.8e-05,0.000406,0.00047,2.5e-05,1.6e-05,1.8e-05,0.001735,3e-06,3e-06,3.2e-05,2e-06,1.7e-05,...,2.7e-05,3e-06,0.000177,5e-06,2.3e-05,7.1e-05,0.0,5.1e-05,0.719629,2e-06,2e-06,6.1e-05,4.7e-05,2.8e-05,2.4e-05,0.000136,8.5e-05,0.000473,0.000187,2.3e-05,0.000393,0.000233,3e-06,2.4e-05,0.233199,0.000836,8.3e-05,1.2e-05,0.001883,0.000445,5.2e-05,8e-06,2e-06,0.000414,6.7e-05,2e-06,2.4e-05,0.002219,3.3e-05,3.4e-05
4,1.9e-05,0.000117,2.1e-05,0.0,1e-06,1e-06,3e-06,1e-06,0.0,0.0,0.0,0.0,1e-06,0.0,1e-06,7.9e-05,0.0,0.0,0.0,1.4e-05,0.000188,5e-06,1e-06,0.006772,7e-06,0.000136,0.0,1e-06,0.0,1e-06,1.4e-05,4e-06,1e-05,2e-06,0.0,1e-06,2.9e-05,1e-06,0.0,3e-06,...,1e-06,0.0,0.000169,4e-06,1.2e-05,3e-06,0.0,7e-06,1.4e-05,0.0,1e-06,3.2e-05,2.4e-05,0.0,0.0,2e-06,0.0,1e-06,1.3e-05,9e-06,0.017707,2e-06,4.4e-05,1.7e-05,1.2e-05,1e-06,0.0,1e-06,1.9e-05,0.358507,1e-06,0.0,0.0,1.9e-05,1e-06,0.0,0.0,1.5e-05,1.6e-05,5e-06


In [55]:
dsSubmission.insert(0, 'id', imgNameTest)

In [56]:
dsSubmission.head()

Unnamed: 0,id,cardigan,doberman,brittany_spaniel,norfolk_terrier,french_bulldog,yorkshire_terrier,clumber,saint_bernard,eskimo_dog,irish_terrier,sussex_spaniel,cairn,staffordshire_bullterrier,border_terrier,black-and-tan_coonhound,silky_terrier,curly-coated_retriever,papillon,great_dane,gordon_setter,pekinese,boxer,dandie_dinmont,miniature_schnauzer,redbone,affenpinscher,old_english_sheepdog,bernese_mountain_dog,saluki,american_staffordshire_terrier,entlebucher,english_foxhound,otterhound,malinois,mexican_hairless,samoyed,west_highland_white_terrier,malamute,lhasa,...,tibetan_mastiff,schipperke,kuvasz,basenji,shih-tzu,boston_bull,maltese_dog,pug,newfoundland,blenheim_spaniel,standard_schnauzer,walker_hound,german_short-haired_pointer,pomeranian,lakeland_terrier,sealyham_terrier,irish_wolfhound,borzoi,bluetick,bouvier_des_flandres,vizsla,beagle,bedlington_terrier,chihuahua,norwich_terrier,miniature_poodle,dingo,scotch_terrier,irish_setter,appenzeller,groenendael,shetland_sheepdog,giant_schnauzer,toy_poodle,brabancon_griffon,miniature_pinscher,chow,african_hunting_dog,japanese_spaniel,wire-haired_fox_terrier
0,d2a440580e48f9004697bd6c13026a25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,0.0,0.0,0.0,3.5e-05,0.0,0.0,0.0,0.0,0.0,5e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1e-06,0.000518,0.008769,0.0,1e-06,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,9.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2e-06,1e-06,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,0.0
1,73032fe95faf9bad6bc81663dba0d9b8,0.0,0.0,0.0,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,4e-06,1e-06,1e-06,1e-06,0.0,1e-06,0.0,0.0,0.0,1e-06,0.0,0.0,2e-06,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,...,0.0,1e-06,0.0,0.0,5e-06,0.0,0.0,4e-05,3e-06,0.0,0.0,0.0,0.0,0.0,0.999642,7e-06,1e-06,0.0,8e-06,0.0,0.0,4e-06,0.0,0.0,1e-06,0.0,1e-06,0.0,4e-06,0.0,0.0,0.0,0.0,0.0,2e-06,0.0,4e-05,0.0,0.0,0.0
2,250aecbc5d19d365b32804f19e16c8a4,0.0,1e-05,2e-06,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.2e-05,0.0,0.0,2.7e-05,0.0,0.0,0.0,0.0,3e-06,6e-06,0.0,0.0,0.0,0.0,0.0,0.0,9e-06,1e-06,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2e-06,1e-06,1.6e-05,0.0,0.0,1e-06,0.0,0.0,0.0,1e-06,0.000125,0.0,0.0,0.0,0.0,0.0,0.0,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3e-06,0.0,0.0,0.0,0.0,0.0,0.0
3,9ae64eb724d302df9b59eae70864962a,0.000365,0.000427,0.00035,7e-06,0.000145,6e-05,0.0,4e-05,2.8e-05,4e-05,1.2e-05,4e-06,2.2e-05,0.0,6e-06,0.002972,0.000161,0.000409,6.8e-05,0.004504,9.1e-05,6.2e-05,3.8e-05,0.000109,2e-06,0.001552,5e-06,3.7e-05,4.8e-05,0.000406,0.00047,2.5e-05,1.6e-05,1.8e-05,0.001735,3e-06,3e-06,3.2e-05,2e-06,...,2.7e-05,3e-06,0.000177,5e-06,2.3e-05,7.1e-05,0.0,5.1e-05,0.719629,2e-06,2e-06,6.1e-05,4.7e-05,2.8e-05,2.4e-05,0.000136,8.5e-05,0.000473,0.000187,2.3e-05,0.000393,0.000233,3e-06,2.4e-05,0.233199,0.000836,8.3e-05,1.2e-05,0.001883,0.000445,5.2e-05,8e-06,2e-06,0.000414,6.7e-05,2e-06,2.4e-05,0.002219,3.3e-05,3.4e-05
4,a2f259b5440aa56ab3977e23d02f82ed,1.9e-05,0.000117,2.1e-05,0.0,1e-06,1e-06,3e-06,1e-06,0.0,0.0,0.0,0.0,1e-06,0.0,1e-06,7.9e-05,0.0,0.0,0.0,1.4e-05,0.000188,5e-06,1e-06,0.006772,7e-06,0.000136,0.0,1e-06,0.0,1e-06,1.4e-05,4e-06,1e-05,2e-06,0.0,1e-06,2.9e-05,1e-06,0.0,...,1e-06,0.0,0.000169,4e-06,1.2e-05,3e-06,0.0,7e-06,1.4e-05,0.0,1e-06,3.2e-05,2.4e-05,0.0,0.0,2e-06,0.0,1e-06,1.3e-05,9e-06,0.017707,2e-06,4.4e-05,1.7e-05,1.2e-05,1e-06,0.0,1e-06,1.9e-05,0.358507,1e-06,0.0,0.0,1.9e-05,1e-06,0.0,0.0,1.5e-05,1.6e-05,5e-06


In [81]:
dsSubmission.to_csv('submission.csv', index=False)

# References
https://www.kaggle.com/c/dog-breed-identification/data

https://www.kaggle.com/phylake1337/0-18-loss-simple-feature-extractors