In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
from matplotlib import pyplot
from matplotlib.image import imread
import gc

# Load the training annotations and preview the first rows.
train_df = pd.read_csv(
    '../input/planets-dataset/planet/planet/train_classes.csv'
)
train_df.head()

In [3]:
# Load the submission template and preview the first rows.
sample_submission_df = pd.read_csv(
    '../input/planets-dataset/planet/planet/sample_submission.csv'
)
sample_submission_df.head()

In [4]:
testA = !ls ../input/planets-dataset/planet/planet/test-jpg | wc -l
testB = !ls ../input/planets-dataset/test-jpg-additional/test-jpg-additional | wc -l
assert sample_submission_df.shape[0] == float(testA[0])+float(testB[0])

In [5]:
from skimage import io
import matplotlib.pyplot as plt
%matplotlib inline

# Preview one training image; change image_number to look at a different one.
image_number = 10
img = io.imread(
    '../input/planets-dataset/planet/planet/train-jpg/train_{}.jpg'.format(image_number)
)
print(img.shape)
plt.imshow(img)

In [6]:
# Show the annotation row for image 'train_10', then the head of the table.
# Only the last expression of a cell is rendered automatically, so the
# filtered row needs an explicit display() call to appear in the output.
display(train_df[train_df['image_name'] == 'train_10'])
train_df.head()

In [7]:
# Accumulator filled in as a side effect of getting_unique_labels().
unique_labels = set()
def getting_unique_labels(tags):
    """Add every whitespace-separated tag in `tags` to the `unique_labels` set."""
    unique_labels.update(tags.split())

# Work on a copy so the raw annotations in train_df stay untouched.
train_classes = train_df.copy()
# apply() is used only for its side effect of filling the unique_labels set.
train_classes['tags'].apply(getting_unique_labels)
# Sort the labels: set iteration order for strings can change between kernel
# sessions, and this list later fixes the order of the one-hot columns and
# therefore of the model outputs. A stable order keeps saved weights and
# decoded predictions consistent across restarts.
unique_labels = sorted(unique_labels)
print(unique_labels)

In [8]:
# Every row must describe a different image (no duplicated image names).
assert train_classes['image_name'].nunique(dropna=False) == len(train_classes)

In [9]:
# One-hot encoding: add one 0/1 indicator column per unique label.
for tag in unique_labels:
    train_classes[tag] = train_classes['tags'].str.split().apply(
        lambda tokens: int(tag in tokens)
    )

# Append the '.jpg' extension to 'image_name'; the column is later passed to
# the image generators as the file-name column.
train_classes['image_name'] = train_classes['image_name'].map('{}.jpg'.format)
train_classes.head()

In [12]:
import tensorflow as tf

# Names of the label columns: everything after the first two columns
# (assumes those are 'image_name' and 'tags' -- TODO confirm against the CSV).
y_column = train_classes.columns[2:].tolist()

# Image generator that only rescales pixel values by 1/255 (no augmentation).
image_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1 / 255)

# Stream images and their label vectors straight from the dataframe.
X = image_gen.flow_from_dataframe(
    dataframe=train_classes,
    directory='../input/planets-dataset/planet/planet/train-jpg',
    x_col='image_name',
    y_col=y_column,
    target_size=(128, 128),
    class_mode='raw',
    seed=1,
    batch_size=128,
)

In [13]:
# X is an indexable sequence of batches. With 40479 images and batch_size=128
# there are 317 batches: 316 full ones plus a final batch of 31 images
# (40479 = 316 * 128 + 31). Each image is resized to (128, 128, 3) and each
# label vector has one entry per tag column.

x109 = X[0][0][109] # first batch, images, 110th image (index 109)
y109 = X[0][1][109] # first batch, labels, 110th label (index 109)
print("each image's shape is {}".format(x109.shape))
print("each label's shape is {}".format(y109.shape))
print('we have {} batches'.format(len(X)))
print('each batch has {} images/labels'.format(X[0][0].shape[0]))
# The last batch is addressed as len(X) - 1 (a fixed index such as 109 is just
# another full batch), and the printed ratio uses the generator's own sample
# count and batch size so the message always matches the computation.
print('{}/{} is {:.2F}, so the last batch will have {} images/labels'.format(
    X.samples, X.batch_size, X.samples / X.batch_size, X[len(X) - 1][0].shape[0]))

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization, Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# F-beta score for multi-label classification
def fbeta(y_true, y_pred, beta=2):
    """Return the mean F-beta score of a batch of multi-label predictions.

    Args:
        y_true: tensor of ground-truth 0/1 labels
            (assumed shape (batch, n_labels) -- TODO confirm).
        y_pred: tensor of predicted scores with the same shape as `y_true`;
            values are clipped to [0, 1] and rounded to 0/1 before counting.
        beta: weight of recall relative to precision (default 2).

    Returns:
        Scalar tensor: F-beta is computed from counts summed along axis 1
        and then averaged over the remaining axis.
    """
    # Imported here so the metric does not depend on a later cell having
    # brought `backend` into the notebook namespace.
    from keras import backend

    # clipping predictions
    y_pred = backend.clip(y_pred, 0, 1)
    # true positives, false positives and false negatives, summed along axis 1
    tp = backend.sum(backend.round(backend.clip(y_true * y_pred, 0, 1)), axis=1)
    fp = backend.sum(backend.round(backend.clip(y_pred - y_true, 0, 1)), axis=1)
    fn = backend.sum(backend.round(backend.clip(y_true - y_pred, 0, 1)), axis=1)
    # precision (epsilon guards against division by zero)
    p = tp / (tp + fp + backend.epsilon())
    # recall
    r = tp / (tp + fn + backend.epsilon())
    # F-beta per row, then the mean over all rows
    bb = beta ** 2
    fbeta_score = backend.mean((1 + bb) * (p * r) / (bb * p + r + backend.epsilon()))
    return fbeta_score

In [19]:
import sys
from numpy import load
from keras import backend
from keras.layers import Dense
from keras.layers import Flatten
from keras.models import Sequential
from keras.layers import Conv2D
from keras.layers import MaxPooling2D
from keras import backend
from tensorflow.keras.optimizers import SGD
from keras.applications.vgg16 import VGG16
from keras.models import Model
from keras.preprocessing.image import ImageDataGenerator
import gc


def defining_model(in_shape=(128, 128, 3), out_shape=17):
    """Build and compile a VGG16-based multi-label classifier.

    The VGG16 convolutional base (without its top classifier) is frozen except
    for its last block, and a small dense head with sigmoid outputs is added.

    Args:
        in_shape: input image shape as (height, width, channels).
        out_shape: number of output units, one per label.

    Returns:
        A compiled Keras model using SGD, binary cross-entropy loss and the
        `fbeta` metric.
    """
    base = VGG16(include_top=False, input_shape=in_shape)
    # mark loaded layers as not trainable
    for layer in base.layers:
        layer.trainable = False
    # allow last vgg block to be trainable
    for layer_name in ('block5_conv1', 'block5_conv2', 'block5_conv3', 'block5_pool'):
        base.get_layer(layer_name).trainable = True
    # add new classifier layers
    flat1 = Flatten()(base.layers[-1].output)
    class1 = Dense(128, activation='relu', kernel_initializer='he_uniform')(flat1)
    # sigmoid gives an independent score per label (multi-label output)
    output = Dense(out_shape, activation='sigmoid')(class1)
    # define new model
    model = Model(inputs=base.inputs, outputs=output)
    # compile model; `learning_rate` replaces the deprecated `lr` argument
    opt = SGD(learning_rate=0.01, momentum=0.9)
    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=[fbeta])

    return model

In [20]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization, Conv2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Keep only the weights of the epoch with the highest validation fbeta.
save_best_check_point = ModelCheckpoint(
    filepath='best_model.hdf5',
    monitor='val_fbeta',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

In [21]:
import tensorflow as tf
# Training generator: rescaling plus random flips/rotations for augmentation.
train_image_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255, validation_split=0.3,
    horizontal_flip=True, vertical_flip=True, rotation_range=90)

# Validation generator: rescaling only, so validation metrics are measured on
# unaugmented images. It uses the same validation_split so both generators
# cut the dataframe at the same row.
val_image_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255, validation_split=0.3)

# generating 70% training image data
train_gen = train_image_gen.flow_from_dataframe(
    dataframe=train_classes,
    directory='../input/planets-dataset/planet/planet/train-jpg/',
    x_col='image_name', y_col=y_column,
    target_size=(128, 128), class_mode='raw', seed=0, batch_size=128,
    subset='training')

# generating 30% validation image data
val_gen = val_image_gen.flow_from_dataframe(
    dataframe=train_classes,
    directory='../input/planets-dataset/planet/planet/train-jpg/',
    x_col='image_name', y_col=y_column,
    target_size=(128, 128), class_mode='raw', seed=0, batch_size=128,
    subset='validation')

In [22]:
# Number of batches needed to cover each subset once (the last batch may be partial).
step_train_size = int(np.ceil(train_gen.samples / train_gen.batch_size))
# Use the validation generator's own batch size rather than the training one.
step_val_size = int(np.ceil(val_gen.samples / val_gen.batch_size))

# The computed step counts should match the generators' reported lengths.
print(step_train_size)
print(step_val_size)
print(len(train_gen))
print(len(val_gen))

In [None]:
modelA = defining_model() # building the VGG16-based model for training

# fitting the model; `fit` accepts generators directly and replaces the
# deprecated `fit_generator`. The checkpoint callback saves the best weights.
modelA.fit(train_gen, steps_per_epoch=len(train_gen), validation_data=val_gen,
           validation_steps=step_val_size, epochs=50, callbacks=[save_best_check_point])

In [None]:
# Rebuild the same architecture (the builder is named `defining_model`).
modelB = defining_model()

#loading in the weights of the trained model
modelB.load_weights('best_model.hdf5')

In [None]:
# Copy of the submission template whose image names carry the '.jpg' extension.
sample_submission = sample_submission_df.assign(
    image_name=sample_submission_df['image_name'].map('{}.jpg'.format)
)
sample_submission.head()

In [None]:
# First 40669 image names of the submission template, as a one-column frame
# with a fresh 0-based index.
testA_df = sample_submission.iloc[:40669][['image_name']].reset_index(drop=True)
testA_df.head()

In [None]:
# Shape of the first test split (the frame is named testA_df).
testA_df.shape

In [None]:
# Test-time generator: rescaling only.
test_image_genA = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)

# generating the image data for the first 40669 images in the sample submission dataframe
# (uses test_image_genA / testA_df, the names actually defined in this notebook);
# shuffle=False keeps the batches in dataframe order.
test_genA = test_image_genA.flow_from_dataframe(
    dataframe=testA_df,
    directory='../input/planets-dataset/planet/planet/test-jpg/',
    x_col='image_name', y_col=None,
    batch_size=128, shuffle=False, class_mode=None, target_size=(128, 128))

# setting the step size for the testing set for the first 40669 images in the sample submission dataframe
step_test_sizeA = int(np.ceil(test_genA.samples / test_genA.batch_size))

In [None]:
# Reset the generator before predicting (shuffling is already disabled on it).
test_genA.reset()
predA = modelB.predict(
    test_genA,
    steps=step_test_sizeA,
    verbose=1,
)
# File names as reported by the generator, used to label the predictions.
test_file_namesA = test_genA.filenames

In [None]:
# Turn each row of scores into a space-separated tag string, keeping the
# labels whose score is above 0.5.
pred_tagsA = pd.DataFrame(predA).apply(
    lambda scores: ' '.join(np.array(unique_labels)[scores > 0.5]),
    axis=1,
)

# Pair the predicted tags of the first 40669 images with their file names.
resultA = pd.DataFrame({'image_name': test_file_namesA, 'tags': pred_tagsA})
resultA.head()

In [None]:
# Remaining image names (row 40669 onwards) of the submission template, as a
# one-column frame with a fresh 0-based index.
testB_df = sample_submission.iloc[40669:][['image_name']].reset_index(drop=True)
testB_df.head()


In [None]:
# Test-time generator for the additional test folder: rescaling only.
test_image_genB = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1 / 255)

# Image data for the remaining images in the sample submission dataframe;
# shuffle=False keeps the batches in dataframe order.
test_genB = test_image_genB.flow_from_dataframe(
    dataframe=testB_df,
    directory='../input/planets-dataset/test-jpg-additional/test-jpg-additional/',
    x_col='image_name',
    y_col=None,
    batch_size=128,
    shuffle=False,
    class_mode=None,
    target_size=(128, 128),
)

# Number of batches needed to cover the remaining test images once.
step_test_sizeB = int(np.ceil(test_genB.samples / test_genB.batch_size))

In [None]:
# Reset the generator before predicting (shuffling is already disabled on it).
test_genB.reset()
predB = modelB.predict(
    test_genB,
    steps=step_test_sizeB,
    verbose=1,
)
# File names as reported by the generator, used to label the predictions.
test_file_namesB = test_genB.filenames

In [None]:
# Turn each row of scores into a space-separated tag string, keeping the
# labels whose score is above 0.5.
pred_tagsB = pd.DataFrame(predB).apply(
    lambda scores: ' '.join(np.array(unique_labels)[scores > 0.5]),
    axis=1,
)

# Pair the predicted tags of the remaining images with their file names.
resultB = pd.DataFrame({'image_name': test_file_namesB, 'tags': pred_tagsB})
resultB.head()

In [None]:
# Stack the two prediction frames row-wise (each keeps its own index here).
final_df = pd.concat(objs=[resultA, resultB], axis=0)

In [None]:
# Replace the index with a fresh 0-based one, discarding the old index values.
final_df = final_df.reset_index(drop=True)

In [None]:
# Inspect the combined predictions (the frame is named final_df).
print(final_df.shape)
final_df.head()


In [None]:
# All 61191 predictions must be in the same order as the submission template
# (compares against final_df, the frame actually built above).
assert (sample_submission['image_name'] == final_df['image_name']).sum() == 61191

In [None]:
# removing the .jpg extension
# Operates on final_df (the frame actually built above). Only a trailing
# '.jpg' is stripped, so re-running this cell does not eat further characters.
final_df['image_name'] = final_df['image_name'].str.replace(r'\.jpg$', '', regex=True)
final_df.head()

In [None]:
# Write the submission file without the dataframe index.
final_df.to_csv(path_or_buf='second_submission.csv', index=False)