In [1]:
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np 
import pandas as pd

import gc
import glob
from tqdm import tqdm
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from keras import backend as K
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization, Flatten, InputLayer
from keras.applications.vgg19 import VGG19
from tensorflow.keras.applications import ResNet50, VGG16
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import fbeta_score
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.backend import clear_session
import tensorflow as tf

import cv2
import os
from tqdm import tqdm

import time

from os import listdir
import csv

%matplotlib inline

In [2]:
# ---- Image geometry --------------------------------------------------------
input_size = 128                          # images are resized to input_size x input_size
shape = (input_size, input_size, 3)       # (height, width, channels) fed to the model

# ---- Optimisation ----------------------------------------------------------
epochs = 10
batch_size = 128
lr = 1e-4                                 # Adam learning rate
dropout_rate = 0.5
reg_str = 0.01                            # Regularization strength

# ---- Dataset locations (all derived from the dataset root) -----------------
path = '/kaggle/input/planets-dataset/planet/planet/'
train_path = path + 'train-jpg'
test_path = path + 'test-jpg'

# ---- Cross-validation / data-loading ---------------------------------------
nfolds = 2                                # number of cross-validation folds
workers = 4                               # worker count passed to Keras fit/predict
maxq = 10                                 # max queue size passed to Keras fit/predict

# Per-label cut-off applied to the sigmoid outputs (one entry per label, 17 labels).
tresh = [0.2 for _ in range(17)]

In [3]:
# Load the training labels and the sample submission (the latter doubles as
# the list of test images to predict).
df_train = pd.read_csv(f'{path}train_classes.csv')
df_test = pd.read_csv(f'{path}sample_submission.csv')

# The CSVs hold bare image ids; append the extension so the names can be
# joined with the image directories.
df_train['image_name'] = df_train['image_name'].astype(str) + '.jpg'
df_test['image_name'] = df_test['image_name'].astype(str) + '.jpg'

# Tags are stored as one space-separated string; the test frame is passed to
# flow_from_dataframe later, which is given list-valued tags there.
df_test['tags'] = df_test['tags'].str.split(' ')

print(df_train.head())
print(df_test.head())

# 1-D arrays (file names / raw tag strings) that the K-fold loop indexes into.
# The original also called `X_train_files.reshape(...)` and discarded the
# result, which had no effect; that dead statement has been removed.
X_train_files = np.array(df_train['image_name'].tolist())
y_train = np.array(df_train['tags'].tolist())

In [4]:
# Vocabulary of distinct labels found in the training tags, alphabetically sorted.
labels = sorted({
    label
    for tag in df_train['tags'].values
    for label in tag.split(' ')
})
print(labels)

In [5]:
# Show nine hand-picked training images in a 3x3 grid, each titled with its tags.

# Glob under the configured `train_path` rather than a second, hard-coded
# relative path, so this preview always follows the config cell.
all_image_paths_jpg = sorted(glob.glob(os.path.join(train_path, '*.jpg')))

# Positions (within the sorted file listing) of the images to display.
preview_positions = [11006, 15000, 4005, 12007, 8002, 14001, 16004, 13003, 9006]
nine_imgs_paths_jpg = [all_image_paths_jpg[pos] for pos in preview_positions]

# Bare image ids (no directory, no extension) to look up in the label CSV.
nine_image_names = [os.path.splitext(os.path.basename(p))[0] for p in nine_imgs_paths_jpg]
# Re-read the CSV: df_train['image_name'] has had '.jpg' appended by now.
labels_df = pd.read_csv(f'{path}train_classes.csv')

# Plot them in a 3 by 3 grid
fig = plt.figure(figsize=(12, 8))
# Spacing is a property of the figure, so set it once instead of per subplot.
fig.subplots_adjust(top=0.99, bottom=0.01, hspace=0.5, wspace=0.4)
for i, (img_path, img_name) in enumerate(zip(nine_imgs_paths_jpg, nine_image_names)):
    plt.subplot(3, 3, i + 1)
    plt.imshow(plt.imread(img_path))
    plt.title(str(labels_df[labels_df.image_name == img_name].tags.values))

In [7]:
# Preview a few training images at several resolutions, loaded with OpenCV.
plt.figure(figsize=(12, 12))
res = [32, 64, 128, 256]   # square side lengths to preview
NIMGS = 5                  # images per resolution row

for i, side in enumerate(res):
    for j in range(NIMGS):
        # Rows 1..NIMGS of df_train are shown (row 0 is skipped, as before).
        img = cv2.imread(os.path.join(train_path, df_train['image_name'][j + 1]))
        # OpenCV decodes images in BGR channel order while matplotlib expects
        # RGB; without this conversion red and blue are swapped on screen.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        img = cv2.resize(img, (side, side))
        plt.subplot(len(res), NIMGS, i * NIMGS + j + 1)
        plt.imshow(img)
        plt.title(df_train['tags'][j + 1] + "\n" + str(side) + "x" + str(side), rotation=18)
        plt.axis('off')

plt.show()

In [8]:
# Bar chart of how many training images carry each label.
plt.figure(figsize=(8, 8))

labels_count = {}

for tag in df_train['tags'].values:
    for label in tag.split(' '):
        # Count the first occurrence as 1. The original initialised new labels
        # to 0, which under-reported every label by exactly one.
        labels_count[label] = labels_count.get(label, 0) + 1

# Rarest and most frequent label (kept for inspection; not used by the plot).
min_label = min(labels_count, key=labels_count.get)
max_label = max(labels_count, key=labels_count.get)

plt.bar(range(len(labels_count)), list(labels_count.values()), align='center')
plt.xticks(range(len(labels_count)), list(labels_count.keys()), rotation=90)
plt.title('Data Point for each Labels')

plt.show()

In [9]:
# defining model used

def create_model():
    """Build the (uncompiled) multi-label image classifier.

    The network is an ImageNet-initialised VGG16 convolutional base (no top)
    followed by three Dense(4096, relu) + Dropout stages and a 17-unit sigmoid
    output layer. Input shape and dropout rate come from the module-level
    ``shape`` and ``dropout_rate``.

    Returns:
        The assembled ``Sequential`` model.
    """
    model = Sequential()

    stack = [
        InputLayer(shape),
        VGG16(weights='imagenet', include_top=False),
        Flatten(),
    ]
    # Three identical fully connected stages, each followed by dropout.
    for _ in range(3):
        stack.append(Dense(4096, activation='relu'))
        stack.append(Dropout(dropout_rate))
    # One independent sigmoid per label.
    stack.append(Dense(17, activation='sigmoid'))

    for layer in stack:
        model.add(layer)
    return model

# Reset Keras' global state before building, so a re-run of this cell starts clean.
clear_session()

# Build one model instance here only to print its architecture; the training
# loop below creates a fresh model for every fold.
model = create_model()
model.summary()

In [10]:
# defining metrics used

def f2_score(y_true, y_pred):
    """Mean per-sample F2 score for multi-label predictions.

    Predictions are binarised with ``tf.round`` (i.e. an implicit 0.5 cut-off),
    precision and recall are computed per sample along axis 1, combined as
    ``5 * P * R / (4 * P + R)``, and averaged over the batch. Samples whose
    score is NaN (a zero denominator somewhere) contribute 0.

    Args:
        y_true: ground-truth label indicators, one row per sample.
        y_pred: predicted scores with the same layout as ``y_true``.

    Returns:
        A scalar tensor holding the batch-mean F2 score.
    """
    truth = tf.cast(y_true, "int32")
    predicted = tf.cast(tf.round(y_pred), "int32")  # implicit 0.5 threshold via tf.round

    # Per-sample counts: true positives, predicted positives, actual positives.
    hits = tf.reduce_sum(truth * predicted, axis=1)
    n_actual = tf.reduce_sum(truth, axis=1)
    n_predicted = tf.reduce_sum(predicted, axis=1)

    precision = hits / n_predicted
    recall = hits / n_actual

    raw_score = 5 * precision * recall / (4 * precision + recall)
    # Replace NaNs produced by zero denominators with 0.
    safe_score = tf.where(tf.math.is_nan(raw_score), tf.zeros_like(raw_score), raw_score)
    return tf.reduce_mean(safe_score)

In [11]:
# Train one model per cross-validation fold and collect each fold's
# predictions for the two test image directories.


def _fold_frame(files, tags):
    """Return an (image_name, tags) frame with each tag string split into a label list."""
    return pd.DataFrame({
        'image_name': list(files),
        'tags': [tag.split(' ') for tag in tags],
    })


def _flow(datagen, frame, directory, shuffle=True):
    """Create a dataframe iterator with the settings shared by every generator in this notebook."""
    return datagen.flow_from_dataframe(
        frame,
        directory=directory,
        x_col='image_name',
        y_col='tags',
        target_size=(shape[0], shape[1]),
        class_mode='categorical',
        batch_size=batch_size,
        classes=labels,
        shuffle=shuffle,
    )


# Augmentation is applied to training batches only; validation and test
# images are just rescaled to [0, 1].
train_datagen = ImageDataGenerator(
    rescale=1./255,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    vertical_flip=True
)
eval_datagen = ImageDataGenerator(rescale=1./255)

# The test generators do not depend on the fold, so build them once.
# shuffle=False keeps predictions aligned with the row order of df_test.
test_generator = _flow(eval_datagen, df_test, test_path, shuffle=False)
test_generator2 = _flow(
    eval_datagen,
    df_test,
    "../input/planets-dataset/test-jpg-additional/test-jpg-additional",
    shuffle=False,
)

y_test = []    # per-fold predictions for `test_path`
y_test2 = []   # per-fold predictions for the additional test directory

folds = KFold(n_splits=nfolds, shuffle=True, random_state=1).split(X_train_files, y_train)

# enumerate() supplies the fold number. Previously `num_fold` stayed at 0, so
# every fold wrote to (and overwrote) the same weights file.
for num_fold, (train_index, val_index) in enumerate(folds):
    train_df = _fold_frame(X_train_files[train_index], y_train[train_index])
    val_df = _fold_frame(X_train_files[val_index], y_train[val_index])

    train_generator = _flow(train_datagen, train_df, train_path)
    val_generator = _flow(eval_datagen, val_df, train_path)

    model_path_of_fold = os.path.join('', 'weights_of_fold_' + str(num_fold) + '.h5')

    # Start every fold from a clean Keras state and a freshly initialised model.
    clear_session()
    model = create_model()

    adam = Adam(learning_rate=lr)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[f2_score])

    callbacks = [
        # Keep only the weights of the epoch with the best validation F2.
        ModelCheckpoint(model_path_of_fold, monitor='val_f2_score', save_best_only=True, mode='max'),
        ReduceLROnPlateau(monitor='loss', factor=0.1, patience=3, mode='min', min_lr=0.000001)
    ]

    # Model.fit / Model.predict accept generators directly; the *_generator
    # variants are deprecated aliases.
    model.fit(train_generator, epochs=epochs, validation_data=val_generator, callbacks=callbacks,
              workers=workers, use_multiprocessing=True, max_queue_size=maxq)

    # Predict with the best checkpoint of this fold, not the last epoch.
    model.load_weights(model_path_of_fold)

    # Rewind the shared iterators so each fold predicts from the first batch.
    test_generator.reset()
    p_test = model.predict(test_generator, workers=workers, use_multiprocessing=True, max_queue_size=maxq)
    y_test.append(p_test)

    test_generator2.reset()
    p_test2 = model.predict(test_generator2, workers=workers, use_multiprocessing=True, max_queue_size=maxq)
    y_test2.append(p_test2)
    

# Prepare the fold-averaged predictions for submission

In [12]:
# Average the per-fold predictions for the main test directory.
fold_sum = np.array(y_test[0])          # copy, so y_test[0] itself is left untouched
for fold_idx in range(1, nfolds):
    fold_sum = fold_sum + np.array(y_test[fold_idx])
result1 = pd.DataFrame(fold_sum / nfolds, columns=labels)
result1.head()

In [13]:
# Average the per-fold predictions for the additional test directory.
fold_sum2 = np.array(y_test2[0])        # copy, so y_test2[0] itself is left untouched
for fold_idx in range(1, nfolds):
    fold_sum2 = fold_sum2 + np.array(y_test2[fold_idx])
result2 = pd.DataFrame(fold_sum2 / nfolds, columns=labels)
result2.head()

In [14]:
# Stack the two prediction tables (main test set first, additional set second)
# under a fresh 0..n-1 index. DataFrame.append was deprecated and then removed
# in pandas 2.0; pd.concat is the supported equivalent.
results = pd.concat([result1, result2], ignore_index=True)
results.shape

In [15]:
# Convert every row of averaged probabilities into a space-separated tag string.
# A label is kept when its probability is strictly greater than its entry in
# `tresh`; labels appear in column order.
# One vectorised comparison replaces the per-row iloc/apply/transpose round
# trip, and no longer relies on `results` having a 0..n-1 index.
label_names = np.asarray(results.columns)
above_threshold = results.to_numpy() > np.asarray(tresh)
preds = [' '.join(label_names[row_mask]) for row_mask in above_threshold]


In [16]:
# Sanity check: show the first ten predicted tag strings.
preds[:10]

In [17]:
# Drop the '.jpg' extension again so the names match the submission format.
# Removing the suffix by pattern (instead of slicing off the last four
# characters) makes this cell safe to re-run: a second run is a no-op rather
# than truncating the ids further.
df_test['image_name'] = df_test['image_name'].astype(str).str.replace(r'\.jpg$', '', regex=True)
df_test.head()

In [27]:
# Replace the placeholder tags from the sample submission with the predictions
# (preds is ordered like the rows of df_test).
df_test['tags'] = preds

In [28]:
# Start the submission table from the image ids alone.
sub = df_test["image_name"].to_frame()

In [29]:
# Attach the predicted tag strings as the second submission column.
sub = sub.assign(tags=preds)
sub.head()

In [30]:
# Write the submission file. The original referenced `fin`, which is not
# defined anywhere in the notebook (left over from a deleted cell) and raises
# NameError on Restart & Run All; `sub` is the frame holding image_name + tags.
sub.to_csv('submission.csv', index=False)