# Importing Libraries

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.image import load_img
import os
import shutil
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model

# Data Preparation

In [13]:
path = 'C:/Users/nikhil/Desktop/Machine Learning/ML Datatalks ZoomCamp/Week - 9/train'
train_folder = path + '/train'
val_folder = path + '/val'

In [16]:
fnames = ['cat.{}.jpg'.format(i) for i in range(10000)]
for fname in fnames:
    src = path+'/'+fname
    dst = train_folder+'/cats/'+fname
    shutil.copyfile(src, dst)

In [20]:
fnames = ['dog.{}.jpg'.format(i) for i in range(10000)]
for fname in fnames:
    src = path+'/'+fname
    dst = train_folder+'/dogs/'+fname
    shutil.copyfile(src, dst)

In [18]:
fnames = ['cat.{}.jpg'.format(i) for i in range(10000, 12500)]
for fname in fnames:
    src = path+'/'+fname
    dst = val_folder+'/cats/'+fname
    shutil.copyfile(src, dst)

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/nikhil/Desktop/Machine Learning/ML Datatalks ZoomCamp/Week - 9/train/cat.12500.jpg'

In [19]:
fnames = ['dog.{}.jpg'.format(i) for i in range(10000, 12500)]
for fname in fnames:
    src = path+'/'+fname
    dst = val_folder+'/dogs/'+fname
    shutil.copyfile(src, dst)

In [22]:
for data in ['train', 'val']:
    for c in ['cats', 'dogs']:
        total_images = len(os.listdir(path+'/'+data+'/'+c))
        print("There are : ", total_images, " for Category -> ", c, " in folder : ", data)

There are :  10000  for Category ->  cats  in folder :  train
There are :  10000  for Category ->  dogs  in folder :  train
There are :  2500  for Category ->  cats  in folder :  val
There are :  2500  for Category ->  dogs  in folder :  val


In [25]:
Layers= [
    # The shape for input should be (150, 150, 3)
    keras.Input(shape=(150,150,3), name ='input'),
    # Next, create a covolutional layer (Conv2D)
    # Use 32 filters
    # Kernel size should be (3, 3) (that's the size of the filter)
    # Use 'relu' as activation
    keras.layers.Conv2D(
        filters = 32, 
        kernel_size = (3,3),
        name = 'Conv-Layer',
        activation = 'relu'
    ),
    # Reduce the size of the feature map with max pooling (MaxPooling2D)
    # Set the pooling size to (2, 2)
    keras.layers.MaxPool2D(pool_size=(2,2), name = 'MaxPooling'),
    # Turn the multi-dimensional result into vectors using a Flatten layer
    keras.layers.Flatten(name = 'Flatten'),
    # Next, add a Dense layer with 64 neurons and 'relu' activation
    keras.layers.Dense(units = 64, activation='relu', name='inner_dense'),
    # Finally, create the Dense layer with 1 neuron - this will be the output
    # The output layer should have an activation - use the appropriate activation for the binary classification case
    keras.layers.Dense(units = 1, activation='sigmoid', name='output')
    
]

# compiling the layers in model
model = Sequential(Layers)

# As optimizer use SGD with the following parameters:
# SGD(lr=0.002, momentum=0.8)

optimizer = keras.optimizers.SGD(learning_rate = 0.002, momentum = 0.8)



# Question 1:

Since we have a binary classification problem, what is the best loss function for us?

Note: since we specify an activation for the output layer, we don't need to set from_logits=True

In [28]:
loss = keras.losses.BinaryCrossentropy()

model.compile(loss = 'binary_crossentropy',
             optimizer = optimizer,
             metrics = ['accuracy']
             )

# Answer 1 : Best Loss Function : binary cross-entropy

# Question 2

What's the total number of parameters of the model? You can use the summary method for that.

# Generators and Training

For the next two questions, use the following data generator for both train and validation:

ImageDataGenerator(rescale=1./255)

We don't need to do any additional pre-processing for the images.
When reading the data from train/val directories, check the class_mode parameter. 

Which value should it be for a binary classification problem?
Use batch_size=20

For training use .fit() with the following params:


model.fit(

    train_generator,
    
    steps_per_epoch=100,
    
    epochs=10,
    
    validation_data=validation_generator,
    
    validation_steps=50
)


In [29]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Conv-Layer (Conv2D)          (None, 148, 148, 32)      896       
_________________________________________________________________
MaxPooling (MaxPooling2D)    (None, 74, 74, 32)        0         
_________________________________________________________________
Flatten (Flatten)            (None, 175232)            0         
_________________________________________________________________
inner_dense (Dense)          (None, 64)                11214912  
_________________________________________________________________
output (Dense)               (None, 1)                 65        
Total params: 11,215,873
Trainable params: 11,215,873
Non-trainable params: 0
_________________________________________________________________


# Answer 2 : 11,215,873

# Question 3 

What is the median of training accuracy for this model?

In [30]:
train_gen = ImageDataGenerator(rescale=1./255)
train_ds = train_gen.flow_from_directory(train_folder,
                                         target_size=(150, 150),
                                         class_mode='binary',
                                         batch_size=20) 
val_gen = ImageDataGenerator(rescale=1./255)
val_ds = val_gen.flow_from_directory(val_folder,
                                    target_size = (150,150),
                                    class_mode = 'binary',
                                    batch_size = 20)

Found 20000 images belonging to 2 classes.
Found 5000 images belonging to 2 classes.


In [31]:
# training the model
t_model = model.fit(train_ds,
                    steps_per_epoch=100,
                    epochs=10,
                    validation_data=val_ds,
                    validation_steps=50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [32]:
t_model.history

{'loss': [0.699600100517273,
  0.6839891672134399,
  0.6797141432762146,
  0.6698445081710815,
  0.6603745818138123,
  0.6559429168701172,
  0.6484664082527161,
  0.6359925270080566,
  0.6388604640960693,
  0.6424087882041931],
 'accuracy': [0.5184999704360962,
  0.546500027179718,
  0.5615000128746033,
  0.578000009059906,
  0.5979999899864197,
  0.6079999804496765,
  0.6085000038146973,
  0.6420000195503235,
  0.6305000185966492,
  0.6200000047683716],
 'val_loss': [0.6909972429275513,
  0.6847776174545288,
  0.673385500907898,
  0.6728634834289551,
  0.6897282004356384,
  0.6609770655632019,
  0.6547914147377014,
  0.6428967714309692,
  0.6355530619621277,
  0.6339273452758789],
 'val_accuracy': [0.5090000033378601,
  0.5569999814033508,
  0.5820000171661377,
  0.5799999833106995,
  0.5320000052452087,
  0.5960000157356262,
  0.621999979019165,
  0.6140000224113464,
  0.6230000257492065,
  0.6330000162124634]}

In [33]:
median_training_accuracy = np.mean(t_model.history['accuracy'])
print("median_training_accuracy : ", median_training_accuracy.round(2))

median_training_accuracy :  0.59


# Answer 3 : 0.56

# Question 4
What is the standard deviation of training loss for this model?

In [37]:
std_dev_training_loss = np.std(t_model.history['loss'])
print("std_dev_training_loss : ", std_dev_training_loss.round(4))

std_dev_training_loss :  0.0202


# Answer 4 : 0.01 (closest approximation)

 # Data Augmentation
For the next two questions, we'll generate more data using data augmentations.

Add the following augmentations to your training data generator:

    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'

In [38]:
train_gen = ImageDataGenerator(rescale=1./255,
                               rotation_range=40,
                               width_shift_range=0.2,
                               height_shift_range=0.2,
                               shear_range=0.2,
                               zoom_range=0.2,
                               horizontal_flip=True,
                               fill_mode='nearest')
train_ds = train_gen.flow_from_directory(train_folder,
                                        target_size = (150,150),
                                        class_mode = 'binary',
                                        batch_size = 20)



Found 20000 images belonging to 2 classes.


In [41]:
# Let's train our model for 10 more epochs using the same code as previously. 
# Make sure you don't re-create the model - we want to continue training the model we already started training.


t_model = model.fit(train_ds,
                    steps_per_epoch=100,
                    epochs=10,
                    validation_data=val_ds,
                    validation_steps=50)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
t_model.history

{'loss': [0.6391681432723999,
  0.636695921421051,
  0.6315542459487915,
  0.6334671974182129,
  0.632651150226593,
  0.6263101100921631,
  0.6301519870758057,
  0.6313118934631348,
  0.6376796364784241,
  0.6250604391098022],
 'accuracy': [0.6269999742507935,
  0.6294999718666077,
  0.6449999809265137,
  0.6424999833106995,
  0.6514999866485596,
  0.6524999737739563,
  0.6309999823570251,
  0.6424999833106995,
  0.6324999928474426,
  0.6359999775886536],
 'val_loss': [0.6062123775482178,
  0.7034369111061096,
  0.6027674674987793,
  0.5945980548858643,
  0.5938239097595215,
  0.5974228978157043,
  0.585679829120636,
  0.6033152937889099,
  0.603245198726654,
  0.612169623374939],
 'val_accuracy': [0.6650000214576721,
  0.5839999914169312,
  0.671999990940094,
  0.7020000219345093,
  0.6959999799728394,
  0.6840000152587891,
  0.7089999914169312,
  0.6729999780654907,
  0.6729999780654907,
  0.6579999923706055]}

# Question 5

What is the mean of validation loss for the model trained with augmentations?

In [43]:
mean_validation_loss = np.mean(t_model.history['val_loss'])
print("mean_validation_loss : ", mean_validation_loss)

mean_validation_loss :  0.6102671563625336


In [44]:
print(mean_validation_loss.round(3))

0.61


# Answer 5 : 0.67 

# Question 6
What's the average of validation accuracy for the last 5 epochs (from 6 to 10) for the model trained with augmentations?

In [45]:
t_model.history['val_accuracy'][5:10]

[0.6840000152587891,
 0.7089999914169312,
 0.6729999780654907,
 0.6729999780654907,
 0.6579999923706055]

In [46]:
mean_validation_accuracy = np.mean(t_model.history['val_accuracy'][5:10])
print("mean_validation_accuracy : ", mean_validation_accuracy)

mean_validation_accuracy :  0.6793999910354614


In [47]:
print(mean_validation_accuracy.round(3))

0.679


# Answer 6 :  0.65