In [1]:
import numpy as np
import pandas as pd
import pickle
import logging
import os
import cv2, gc
from tqdm import tqdm
from tensorflow import keras
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers
from tensorflow.keras import optimizers
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.applications.vgg19 import VGG19
from keras.applications.resnet import ResNet50
from keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import fbeta_score
from sklearn.model_selection import KFold
from sklearn.utils import shuffle
from tensorflow import keras
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings('ignore')
print('done')

done


In [2]:
PROJECT_FOLDER = '/kaggle/input/planet-understanding-the-amazon-from-space'
TRAIN_CSV_PATH = os.path.join(PROJECT_FOLDER, "train_v2.csv/train_v2.csv")
TEST_CSV_PATH = os.path.join(PROJECT_FOLDER, 'sample_submission_v2.csv/sample_submission_v2.csv')

In [3]:
train_df = pd.read_csv(TRAIN_CSV_PATH, dtype=str)
test_df = pd.read_csv(TEST_CSV_PATH, dtype=str)

In [4]:
trad_sample_df = test_df[test_df.image_name.str.contains('file_')].copy()
sample_submission = test_df[test_df.image_name.str.contains('test_')]
trad_sample_df['image_name'] = trad_sample_df["image_name"].apply(lambda fn: fn+".jpg")
sample_submission['image_name'] = sample_submission["image_name"].apply(lambda fn: fn+".jpg")

In [5]:
s = train_df.tags.str.split(' ').explode()
lb = MultiLabelBinarizer()
encoded = lb.fit_transform(s.values[:, None])
one_hot_df = pd.DataFrame(encoded.tolist(), columns=np.ravel(lb.classes_), dtype='int') \
                .groupby(s.index) \
                .sum()
one_hot_df['image_name'] = train_df["image_name"].apply(lambda fn: fn+".jpg")
cols = ['image_name'] + list(np.ravel(lb.classes_))
train_class = one_hot_df[cols].copy()
del one_hot_df, s, encoded, lb

In [6]:
datagen = ImageDataGenerator(horizontal_flip=True, vertical_flip=True,
                             zoom_range=0.5, rotation_range=50,
                             rescale=1./255., validation_split = 0.25)

In [7]:
train_class.shape

(40479, 18)

In [8]:
train_generator = datagen.flow_from_dataframe(
    dataframe = train_class,
    directory = '../input/train-jpg/train-jpg',
    x_col = 'image_name',
    y_col = cols[1:],
    subset = 'training',
    batch_size = 340,
    seed = 42,
    shuffle = True,
    class_mode = 'raw',
    target_size = (128, 128))

valid_generator = datagen.flow_from_dataframe(
    dataframe = train_class,
    directory = '../input/train-jpg/train-jpg',
    x_col = 'image_name',
    y_col = cols[1:],
    subset = 'validation',
    batch_size = 340,
    seed = 42,
    shuffle = True,
    class_mode = 'raw',
    target_size = (128, 128))

Found 30360 validated image filenames.
Found 10119 validated image filenames.


In [9]:
input_shape = (128, 128, 3)
epoch = 20
batch_size = 340

def VGG19_Amazon_Model(input_shape=input_shape):
    gc.collect()
    base_model = VGG19(include_top=False, weights='imagenet',
                           input_shape=input_shape)
    model = Sequential()
    model.add(BatchNormalization(input_shape=input_shape))
    model.add(base_model)
    model.add(Flatten())
    model.add(Dense(17, activation='sigmoid'))
    
    return model

def return_model_name(k):
    return '/kaggle/working/model_'+str(k)+'.h5'

def generate_original_format(df):
    preds = []
    for i in tqdm(range(df.shape[0]), miniters=1000):
        a = df.iloc[[i]]
        pred_tag=[]
        for k in cols[1:]:
            if(a[k][i] == 1):
                pred_tag.append(k)
        preds.append(' '.join(pred_tag))

    df['tags'] = preds
    df['image_name'] = df['image_name'].apply(lambda x: x.split('.')[0])
    return df[['image_name', 'tags']]

In [10]:
gc.collect()
model = VGG19_Amazon_Model()
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VAL = valid_generator.n//valid_generator.batch_size
        
opt = Adam(lr=0.0001)
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

callback = [EarlyStopping(monitor='val_accuracy', patience=4, verbose=1),
                    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2,
                                   cooldown=2, verbose=1),
                    ModelCheckpoint(return_model_name(1), monitor='val_accuracy', 
                                    verbose=1, save_best_only=True, mode='max')]
history = model.fit_generator(train_generator, 
                                      validation_data=valid_generator,
                                      callbacks=callback, verbose=1, epochs=epoch) 
        

2022-09-16 05:19:41.497163: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-16 05:19:41.605499: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-16 05:19:41.606310: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-16 05:19:41.607530: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


2022-09-16 05:19:47.865454: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20


2022-09-16 05:19:52.879982: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005



Epoch 00001: val_accuracy improved from -inf to 0.06206, saving model to /kaggle/working/model_1.h5
Epoch 2/20

Epoch 00002: val_accuracy improved from 0.06206 to 0.06997, saving model to /kaggle/working/model_1.h5
Epoch 3/20

Epoch 00003: val_accuracy improved from 0.06997 to 0.10179, saving model to /kaggle/working/model_1.h5
Epoch 4/20

Epoch 00004: val_accuracy improved from 0.10179 to 0.11296, saving model to /kaggle/working/model_1.h5
Epoch 5/20

Epoch 00005: val_accuracy did not improve from 0.11296
Epoch 6/20

Epoch 00006: val_accuracy did not improve from 0.11296
Epoch 7/20

Epoch 00007: val_accuracy improved from 0.11296 to 0.13796, saving model to /kaggle/working/model_1.h5
Epoch 8/20

Epoch 00008: val_accuracy did not improve from 0.13796
Epoch 9/20

Epoch 00009: val_accuracy improved from 0.13796 to 0.14201, saving model to /kaggle/working/model_1.h5
Epoch 10/20

Epoch 00010: val_accuracy did not improve from 0.14201
Epoch 11/20

Epoch 00011: val_accuracy did not improve 

In [11]:
model = VGG19_Amazon_Model()
full_test = []

model.load_weights(filepath=return_model_name(1))
p_test = model.predict_generator(valid_generator, verbose=1)
full_test.append(p_test)
    
result = np.array(full_test[0])
    
result_bool = (result > 0.18).astype(int)
preds = np.argmax(result_bool, axis=1)
vals = np.argmax(valid_generator.labels, axis=1)



In [12]:
def predict_model(data):
    model = VGG19_Amazon_Model()
    full_test = []

    model.load_weights(filepath=return_model_name(1))
    p_test = model.predict_generator(data, verbose=1)
    full_test.append(p_test)
        
    result = np.array(full_test[0])
    
    result_bool = (result > 0.18).astype(int)
    return result_bool

In [13]:
print('F2 = {}'.format(fbeta_score(vals, preds, beta=2, average='micro')))

F2 = 0.32443917383140625


In [14]:
test_datagen=ImageDataGenerator(rescale=1./255.)
test_generator = test_datagen.flow_from_dataframe(
    dataframe = sample_submission,
    directory = '../input/testjpg/test-jpg',
    x_col = 'image_name',
    y_col = None,
    batch_size = 340,
    seed = 42,
    shuffle = False,
    class_mode = None,
    target_size = (128, 128))

result1 = predict_model(test_generator)
result1 = pd.DataFrame(result1, columns=cols[1:])
result1["image_name"]=test_generator.filenames
result1 = generate_original_format(result1.copy())

Found 40669 validated image filenames.


100%|██████████| 40669/40669 [00:33<00:00, 1227.90it/s]


In [15]:
test_generator2 = test_datagen.flow_from_dataframe(
    dataframe = trad_sample_df,
    directory = '../input/testjpg/-additional/test-jpg-additional',
    x_col = 'image_name',
    y_col = None,
    batch_size = 340,
    seed = 42,
    shuffle = False,
    class_mode = None,
    target_size = (128, 128))
result2 = predict_model(test_generator2)
result2 = pd.DataFrame(result2, columns=cols[1:])
result2["image_name"]=test_generator2.filenames
result2 = generate_original_format(result2.copy())

Found 20522 validated image filenames.


100%|██████████| 20522/20522 [00:16<00:00, 1252.99it/s]


In [16]:
final_results = result1.append(result2, ignore_index=True)
final_results.to_csv("submission.csv", index=False)