# This notebook was prepared in a Kaggle kernel for submission to the DPhi practice competition at the following link: https://dphi.tech/challenges/data-sprint-87-hymenoptera-species-recognition/255/overview/about

# These are the steps followed before arriving at this final submission:
1. First, the image data loaders were created from a dataframe, because all the image files were mixed together in a single folder.
2. Next, a shallow hand-made CNN was built to check that all the code was working correctly.
3. Then, a pretrained VGG16 model was fine-tuned for our ants and bees classes (an approach known as transfer learning), achieving above 80% accuracy.
4. Then, a ResNet50 model was fine-tuned, giving a slight improvement in accuracy.
5. Then, a VGG19 model was fine-tuned, giving a slight improvement in accuracy, though still only around 86%.
6. Finally, the Xception model gave a high accuracy of 96%; after fine-tuning, it reached 100% accuracy on the revealed 50% of the dataset.

### Import the required packages.

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, Flatten, BatchNormalization, Activation
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd

### Plot a few images of train data.

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import os
# Preview up to eight training images in a 2 x 4 grid.
src_path = "../input/hymenoptera/hymenoptera_0/train"
# os.listdir returns entries in arbitrary order; sort them so the same images
# are shown on every run.
sub_class = sorted(os.listdir(src_path))

fig = plt.figure(figsize=(10,5))
for e, image_name in enumerate(sub_class[:8]):
    plt.subplot(2,4,e+1)
    img = plt.imread(os.path.join(src_path, image_name))
    plt.imshow(img, cmap=plt.get_cmap('gray'))

### Load the Pandas DataFrame

In [3]:
# Read the training and testing CSV files from the dataset folder.
data_root = '../input/hymenoptera/hymenoptera_0'
train_df = pd.read_csv(f'{data_root}/Training_set.csv')
test_df = pd.read_csv(f'{data_root}/Testing_set.csv')

In [4]:
# Peek at the first five training rows.
train_df.head(n=5)

In [5]:
# Encode the text labels as the string class ids '0' (ants) and '1' (bees);
# the 'target' column is what the data generators read as y_col.
label_to_id = {'ants': 0, 'bees': 1}
unknown_labels = set(train_df['label'].unique()) - set(label_to_id)
if unknown_labels:
    # An unmapped label would become NaN and then the string 'nan', i.e. a
    # silent extra class, so fail loudly instead.
    raise ValueError(
        f"Unexpected labels in the training set: {sorted(map(str, unknown_labels))}"
    )
train_df['target'] = train_df['label'].map(label_to_id).astype(str)
train_df.head()

In [6]:
# Peek at the first five test rows.
test_df.head(n=5)

### Initialize Keras’ ImageDataGenerator class

In [7]:
# Folders that hold the training and test images.
src_path_train = "../input/hymenoptera/hymenoptera_0/train"
src_path_test = "../input/hymenoptera/hymenoptera_0/test"

# Light geometric augmentation applied to images drawn from train_datagen.
augmentation_options = dict(
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    horizontal_flip=True,
    fill_mode="nearest",
)

# Pixel values are multiplied by 1/255. Only 10% of the rows are reserved for
# validation because the number of images is small.
train_datagen = ImageDataGenerator(
    rescale=1 / 255.0,
    validation_split=0.10,
    **augmentation_options,
)

# Test images are only rescaled, with no augmentation options.
test_datagen = ImageDataGenerator(rescale=1 / 255.0)

### Initialize our training, validation and testing generators

In [8]:
batch_size = 16  # We have increased batch size to improve learning

# Validation images must not be augmented, otherwise the validation metrics are
# measured on randomly distorted images. This generator only rescales. Its
# validation_split must equal the one configured on train_datagen so that both
# generators cut the dataframe at the same row and the two subsets do not overlap.
valid_datagen = ImageDataGenerator(rescale=1 / 255.0, validation_split=0.10)

# Arguments shared by the training and validation iterators.
labelled_flow_options = dict(
    dataframe=train_df,
    directory=src_path_train,
    x_col="filename",
    y_col="target",
    target_size=(400, 400),
    batch_size=batch_size,
    class_mode="categorical",
    shuffle=True,
    seed=42,
)

train_generator = train_datagen.flow_from_dataframe(
    subset='training',
    **labelled_flow_options,
)

valid_generator = valid_datagen.flow_from_dataframe(
    subset='validation',
    **labelled_flow_options,
)

# The test iterator yields unlabelled images one at a time, in dataframe order,
# so that predictions line up with the rows of test_df.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=src_path_test,
    x_col="filename",
    target_size=(400, 400),
    batch_size=1,
    class_mode=None,
    shuffle=False,
)

## Now we will use a pretrained model Xception from Keras library

In [9]:
# Xception convolutional base with ImageNet weights and no classification head.
# `classes` and `classifier_activation` only take effect when include_top=True,
# so they are not passed here; the 2-class output layer is added to the
# Sequential model further down.
pretrained_model = keras.applications.Xception(
    include_top=False,
    weights="imagenet",
    input_shape=(400,400,3),
    pooling=None,
)

### Initialize a Sequential model, freeze the pretrained model layers

In [10]:
# Mark every layer of the pretrained base as non-trainable.
for base_layer in pretrained_model.layers:
    base_layer.trainable = False

# Empty container that the base and the new layers are added to.
model = Sequential()

### Now add the pretrained model, flatten its output, add one hidden fully connected layer, and finish with an output layer that classifies the 2 categories, ants and bees

In [11]:
# Stack, in order: the pretrained base, a flatten step, a 2048-unit ReLU hidden
# layer and a 2-unit sigmoid output layer.
model_stages = [
    pretrained_model,
    Flatten(),
    Dense(2048, activation='relu'),
    Dense(2, activation='sigmoid'),
]
for stage in model_stages:
    model.add(stage)

### Check the summary of the model

In [12]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

### Compile the new model

In [13]:
# Adam with a learning rate of 0.0001, binary cross-entropy loss, accuracy metric.
optimizer = Adam(learning_rate=0.0001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

### Fit the model on the training dataset and check with the validation set

In [14]:
# Train for 3 epochs and validate after each one.
# Step counts are deliberately not passed: Keras takes them from the length of
# each generator, which includes the final partial batch. Using
# n // batch_size instead would skip that batch, and would give 0 validation
# steps whenever the validation subset is smaller than one batch.
history = model.fit(
    train_generator,
    validation_data=valid_generator,
    epochs=3,
)

### Plot the training and validation losses

In [15]:
# Per-epoch metrics recorded by model.fit.
data = history.history
train_loss = data['loss']
val_loss = data['val_loss']
# Number the epochs from 1, matching the "Epoch 1/3" style of the Keras log.
epoch_numbers = range(1, len(train_loss) + 1)
plt.plot(epoch_numbers, train_loss, 'g-', label='Train Loss')
plt.plot(epoch_numbers, val_loss, 'r--', label='Val Loss')
plt.title('Training Loss vs Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Plot the training and validation accuracy

In [18]:
# `data` is the history dictionary captured in the loss-plot cell.
train_accuracy = data['accuracy']
val_accuracy = data['val_accuracy']
# Number the epochs from 1, matching the "Epoch 1/3" style of the Keras log.
epoch_numbers = range(1, len(train_accuracy) + 1)
plt.plot(epoch_numbers, train_accuracy, 'g-', label='Train Accuracy')
plt.plot(epoch_numbers, val_accuracy, 'r--', label='Val Accuracy')
plt.title('Training Accuracy vs Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

### Evaluate our model performance on the validation set

In [19]:
# The metrics come from the validation generator (the test generator has no
# labels), so they are reported as validation figures.
score = model.evaluate(valid_generator)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])

### Make predictions on test data using Keras’ predict

In [20]:
# One prediction step per test file (the test generator yields batches of one).
num_test_images = len(test_generator.filenames)
preds = model.predict(test_generator, steps=num_test_images)

### Keras’ predict returns a score for each class. Let’s print the predictions for the first 5 test images

In [21]:
# Show the first five entries of the array returned by model.predict.
preds[:5]

### Predict the class label

In [22]:
# Turn the per-class scores into class indices by taking the highest-scoring
# column. The reduction only runs while `preds` is still the 2-D score array,
# so re-running this cell does not collapse the labels into a single number.
if preds.ndim > 1:
    preds = preds.argmax(axis=-1)
preds

### Add the predictions to the test dataframe

In [23]:
# Attach the predicted class index to each test row.
test_df = test_df.assign(label=preds)
test_df.head(n=5)

### Add the label names to the test dataframe

In [24]:
# Translate the numeric class ids back into label names.
id_to_name = {0: 'ants', 1: 'bees'}
test_df['target'] = test_df['label'].map(id_to_name)
test_df.head(n=5)

### Clean the test dataframe before creating submission file

In [26]:
# Replace the numeric 'label' column with the label names from 'target'.
# The 'target' check makes this cell safe to re-run: once the rename has
# happened, dropping 'label' again would throw away the predicted names.
if 'target' in test_df.columns:
    test_df = test_df.drop(columns='label').rename(columns={'target': 'label'})
test_df.head(2)

### Create the final submission file

In [27]:
# Write the submission file without the dataframe index.
submission_path = 'submission.csv'
test_df.to_csv(submission_path, index=False)

### Check the submission file before uploading to competition

In [28]:
!head submission.csv