In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively print the full path of every file found below the Kaggle input directory
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#extracting the data set 
#os.path.join (path1, path2, ... )
#    os.path.join(path, *paths) Join one or more path components intelligently. 
#    The return value is the concatenation of path and any members of *paths 
#    with exactly one directory separator (os.sep) following each non-empty part except the last, 
#    meaning that the result will only end in a separator if the last part is empty. 
#    If a component is an absolute path, all previous components are thrown away and 
#    joining continues from the absolute path component.
#    Example:
#    os.path.join('/path', 'path1', 'path2')    # '/path/path1/path2'
#    os.path.join('path', '/path1', 'path2')    # '/path1/path2'
#    os.path.join('path', '/path1', 'path2','') # '/path1/path2/'


input_dir  = '/kaggle/input/dogs-vs-cats'
work_dir   = '/kaggle/working'

import zipfile

# Unpack both archives into the working directory.
# The `with` statement already closes each archive on exit, so no explicit close() is needed.
for archive_name in ('train.zip', 'test1.zip'):
    with zipfile.ZipFile(os.path.join(input_dir, archive_name), 'r') as zip_ref:
        zip_ref.extractall(work_dir)

In [None]:
#preparing the training set 

train_dir = os.path.join(work_dir,'train')
test_dir  = os.path.join(work_dir,'test1')

train_dog = os.path.join(train_dir,'dog')
train_cat = os.path.join(train_dir,'cat')

# Create one sub-directory per class. A directory that already exists (cell re-run)
# is fine; any other OS error (e.g. missing train_dir, permissions) is a real problem
# and is allowed to propagate instead of being printed and ignored.
for class_dir in (train_dog, train_cat):
    try:
        os.mkdir(class_dir)
    except FileExistsError:
        pass


#regex tools to identify filenames starting with 'dog.' / 'cat.'
import re
from re import match,compile

# Compile the patterns once, outside the loop; .match() anchors at the start of the name
dog_pattern = re.compile(r'dog\.')
cat_pattern = re.compile(r'cat\.')

#os.rename is used here as the equivalent of the bash command mv
for file in os.listdir(train_dir):
    old_position = os.path.join(train_dir,file)

    # On a re-run the class sub-directories are listed too: they are not images, skip them
    if os.path.isdir(old_position):
        continue

    if dog_pattern.match(file):
        new_position = os.path.join(train_dog,file)
        os.rename(old_position,new_position)

    elif cat_pattern.match(file):
        new_position = os.path.join(train_cat,file)
        os.rename(old_position,new_position)

    else:
        print(file, 'this file was neither a dog. , nor a cat. .')

In [None]:
#to later check that ImageDataGenerator finds the expected number of images per class
# (the directory variables are named train_dog / train_cat where they are created)
print('Number of dogs in training set :' , len (os.listdir(train_dog)))
print('Number of cats in training set :' , len (os.listdir(train_cat)))

In [None]:
#let's take a look at 1 random cat and 1 random dog from the training set 
%matplotlib inline

import matplotlib.image as mpimg #to read 3-channels images 
import matplotlib.pyplot as plt  
import matplotlib.gridspec as gridspec
import random

# Pick one file per class from what is actually on disk rather than guessing an id:
# random.randint(1, 12500) includes both ends, so it could build a filename that
# does not exist (NOTE(review): ids presumably run 0..12499 - confirm against the data).
dog_file = random.choice(os.listdir(train_dog))
cat_file = random.choice(os.listdir(train_cat))

# File names have the form '<class>.<id>.jpg'; keep the id part (as text) for the plot titles
random_dog = dog_file.split('.')[1]
random_cat = cat_file.split('.')[1]

img_dog = mpimg.imread( os.path.join(train_dog, dog_file) )
img_cat = mpimg.imread( os.path.join(train_cat, cat_file) )


#for eventual use in papers: single column 8.8 cm, double column 18 cm.
def cm2inch(cm):
    """Convert a length from centimeters to inches.

    Uses the exact definition 1 inch = 2.54 cm rather than a rounded
    multiplication factor.

    Parameters
    ----------
    cm : number
        Length in centimeters.

    Returns
    -------
    float
        The same length expressed in inches.
    """
    return cm / 2.54

# Figure size: `width` is in centimeters, height is width*ratio; converted to inches for matplotlib
width   = 18.
ratio   = 2/3
fig_size_inches = (cm2inch(width), cm2inch(width*ratio))
fig     = plt.figure(figsize=fig_size_inches)
gs      = gridspec.GridSpec(nrows=1,ncols=2)

# Left panel: the dog sample
dog_fig = fig.add_subplot(gs[0,0])
dog_fig.set_title('dog id:'+str(random_dog))
dog_fig.imshow(img_dog)

# Right panel: the cat sample
cat_fig = fig.add_subplot(gs[0,1])
cat_fig.set_title('cat id:'+str(random_cat))
cat_fig.imshow(img_cat)


In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Both generators scale pixel values by 1/255
pixel_scale = 1.0/255.
train_datagen = ImageDataGenerator(rescale=pixel_scale)
test_datagen  = ImageDataGenerator(rescale=pixel_scale)

# Stream the training images from train_dir, resized to 300x300,
# in batches of 250 with binary labels
train = train_datagen.flow_from_directory(
    directory=train_dir,
    target_size=(300, 300),
    batch_size=250,
    class_mode='binary',
)

In [None]:
# Take plot_model from the Keras bundled with TensorFlow, consistent with the
# tf.keras objects used for the model itself (avoids mixing two Keras packages).
from tensorflow.keras.utils import plot_model

model = tf.keras.models.Sequential([

            # Three Conv2D + MaxPooling2D stages on a 300x300x3 input;
            # the number of filters doubles at each stage (16 -> 32 -> 64)
            tf.keras.layers.Conv2D(16, (3,3), activation='relu', input_shape=(300, 300, 3)),
            tf.keras.layers.MaxPooling2D(2,2),
            tf.keras.layers.Conv2D(32, (3,3), activation='relu'),
            tf.keras.layers.MaxPooling2D(2,2),
            tf.keras.layers.Conv2D(64, (3,3), activation='relu'),
            tf.keras.layers.MaxPooling2D(2,2),

            # Flatten the feature maps into a vector for the dense layers
            tf.keras.layers.Flatten(),

            # 512 neuron hidden layer
            tf.keras.layers.Dense(512, activation='relu'),
            # Only 1 output neuron with a sigmoid activation
            tf.keras.layers.Dense(1, activation='sigmoid')

    ])

model.summary()

plot_model(model,  show_shapes=True, show_layer_names=True)

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training once the 'accuracy' metric reported for an epoch exceeds 0.9."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the epoch's logs and request a stop when accuracy is above 0.9.

    epoch : index of the epoch that just finished (unused here).
    logs  : dict of metric values for the epoch; may be None or lack 'accuracy',
            in which case nothing happens (instead of comparing None with a float).
    """
    accuracy = (logs or {}).get('accuracy')
    if accuracy is not None and accuracy > 0.9:
      print("\nReached 90% accuracy so cancelling training!")
      self.model.stop_training = True
        
# Early-stopping callback and optimizer (Adam with its default settings)
callbacks = myCallback()
optimizer = tf.keras.optimizers.Adam()

# Binary cross-entropy loss, tracking accuracy
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train from the `train` generator: 100 steps per epoch, at most 15 epochs
fit_options = dict(
    steps_per_epoch=100,
    epochs=15,
    verbose=2,
    callbacks=[callbacks],
)
history = model.fit(train, **fit_options)

In [None]:
# Training accuracy per epoch, as recorded by model.fit
plt.plot(history.history['accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
# Only one curve is drawn (fit was called without validation data),
# so the legend names only the training curve
plt.legend(['train'], loc='upper left')
plt.show()

In [None]:
#now I will make prediction on the test set 
import re
# Use the Keras bundled with TensorFlow, consistent with the tf.keras model
from tensorflow.keras.preprocessing import image
#I will use
#image.load_img   :   it has an option specifying the desired size of the input
#                     note that the test images have different sizes. Our net has (300,300)


predicted_val = []
id_line = []

#regex to get the id of the image: a sequence of digits \d+ at the start of the name, followed by .
id_pattern = re.compile(r'^\d+(?=\.)')

# Keep only files whose name starts with a numeric id; anything else cannot be
# given an id and would otherwise crash the id extraction below
test_files = [name for name in os.listdir(test_dir) if id_pattern.search(name)]
n_test = len(test_files)

# Progress checkpoints derived from the real number of files: {file count: percent done}
progress_marks = {int(n_test*fraction): percent
                  for fraction, percent in ((0.25, 25), (0.5, 50), (0.75, 75), (1, 100))}

i = 0
for im in test_files:

    #Our net accepts input with shape of (300,300)
    right_img = image.load_img(os.path.join(test_dir,im) , color_mode = 'rgb', target_size=(300,300))

    #need to have the shape (batch_size, 300,300, n° channels). batch_size = 1 as single example
    #I am also normalising the features : image/255.
    image_to_be_tested = np.expand_dims(image.img_to_array(right_img)/255.,axis=0)
    # verbose=0: no per-call progress output, this loop reports its own progress
    pred = model.predict(image_to_be_tested, verbose=0)
    pred = int(round(float(pred[0][0])))
    predicted_val.append(pred)

    # The matched text is all digits, so int() parses it safely (no eval needed)
    id_line.append( int(id_pattern.search(im).group()) )
    i+=1

    #lengthy process: I am keeping track of it in this way
    if i in progress_marks:
        print(f'{progress_marks[i]}% test set analysed')

In [None]:
# Assemble the (id, label) table, save it as CSV without the index column, then display it
submission_columns = {'id': id_line, 'label': predicted_val}
submission_df = pd.DataFrame(submission_columns)
submission_df.to_csv("submission.csv", index=False)
submission_df