In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Summarise the input directory instead of printing every single path: the
# image folders of this dataset hold tens of thousands of files, and one
# printed line per file floods the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    if not filenames:
        continue
    # Show the file count plus one (alphabetically first) example path.
    example = os.path.join(dirname, min(filenames))
    print(f"{dirname}: {len(filenames)} files (e.g. {example})")

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Base directory of the dataset, relative to the notebook's working directory.
path = "../input/planets-dataset/planet/planet/"

# The two image folders and the two CSV files all sit directly under `path`.
train_path, test_path, class_path, submission_path = (
    os.path.join(path, entry)
    for entry in ("train-jpg", "test-jpg", "train_classes.csv", "sample_submission.csv")
)

In [None]:
# Load the CSV at class_path into a DataFrame and preview its first 10 rows.
train = pd.read_csv(class_path)
train.head(10)

In [None]:
# Print column dtypes and non-null counts to check for missing values.
train.info()

The training CSV file has no missing values.

In [None]:
# Count how often each distinct (whole) tag string occurs in the tags column.
train.tags.value_counts()

In [None]:
# Collect every distinct label in order of first appearance.
# - split() (any whitespace) matches the tokenisation used when the one-hot
#   columns are built, and never yields an empty-string label when a tag
#   string contains repeated spaces.
# - dict.fromkeys de-duplicates while preserving insertion order, which
#   avoids a linear list-membership scan for every token.
label_list = list(dict.fromkeys(
    label
    for tag_string in train.tags.to_numpy()
    for label in tag_string.split()
))

In [None]:
# Map each class index to its label name. enumerate() follows the actual
# length of label_list instead of assuming there are exactly 17 labels.
labels_dict = dict(enumerate(label_list))
labels_dict

In [None]:
# One-hot encode the tags: add one 0/1 column per label, set to 1 when the
# label is among the whitespace-separated tokens of the row's tag string.
tag_tokens = train['tags'].apply(lambda tags: tags.split())
for label in label_list:
    train[label] = tag_tokens.apply(lambda tokens: int(label in tokens))

train.head()

In [None]:
# Append the .jpg extension to every image name (the generators below look
# the names up as files inside the image directory).
# The endswith guard keeps this cell idempotent: running it a second time
# no longer produces names such as "x.jpg.jpg".
train['image_name'] = train['image_name'].apply(
    lambda x: x if x.endswith('.jpg') else f'{x}.jpg')
train.head()

In [None]:
# Names of the one-hot label columns, in the order they were added to `train`.
# Taken from label_list rather than a positional slice of train.columns so
# the result does not depend on how many non-label columns the frame has.
train_columns = list(label_list)

In [None]:
import tensorflow_hub as hub
import tensorflow_datasets as tfds

from tensorflow.keras import layers

In [None]:
import logging
import tensorflow as tf
# Only let TensorFlow log messages of level ERROR and above.
logger = tf.get_logger()
logger.setLevel(logging.ERROR)

In [None]:
# Side length (pixels) of the square RGB input fed to the feature extractor.
IMAGE_RES = 224
# TF Hub handle of the MobileNet V2 feature-vector module.
URL = "https://tfhub.dev/google/tf2-preview/mobilenet_v2/feature_vector/2"
# Wrap the hub module as a Keras layer taking IMAGE_RES x IMAGE_RES x 3 inputs.
feature_extractor = hub.KerasLayer(URL,
                                   input_shape=(IMAGE_RES, IMAGE_RES,3))

In [None]:
from PIL import Image

# NOTE(review): Image.open is lazy and the `with` block closes the file on
# exit, so the objects kept here never have their pixel data loaded. They are
# only good for counting/splitting; call img.load() or img.copy() inside the
# `with` block if pixel access is ever needed (at a large memory cost).
train_images = []
# os.listdir returns entries in arbitrary order; sorting makes the list, and
# any split computed from it, identical on every run.
for file in sorted(os.listdir(train_path)):
    with Image.open(os.path.join(train_path, file)) as img:
        train_images.append(img)

In [None]:
# Positional 80/20 split of train_images: the leading part becomes the
# training examples, the remaining tail the validation examples.
percentage = 0.8
split_index = int(percentage * len(train_images))
train_examples = train_images[:split_index]
validation_examples = train_images[split_index:]

In [None]:
from keras.preprocessing.image import ImageDataGenerator
## Split training data into training and validation sets:
## pixel values are rescaled by 1/255 and 20% of the rows are reserved for
## the "validation" subset of any flow created from this generator.
train_datagen = ImageDataGenerator(rescale = 1./255, validation_split = 0.2)


In [None]:
# Arguments shared by the training and validation generators; only `subset`
# differs between the two. Labels are taken unchanged (class_mode="raw") from
# the one-hot columns, and images are resized to IMAGE_RES x IMAGE_RES so the
# size stays in sync with the feature extractor's input_shape and with the
# test generators.
flow_kwargs = dict(dataframe=train,
                   directory=train_path,
                   x_col="image_name",
                   y_col=train_columns,
                   batch_size=128,
                   seed=42,
                   shuffle=True,
                   class_mode="raw",
                   target_size=(IMAGE_RES, IMAGE_RES))

train_batches = train_datagen.flow_from_dataframe(subset="training", **flow_kwargs)

# val_set contains 20 percent of images
validation_batches = train_datagen.flow_from_dataframe(subset="validation", **flow_kwargs)

In [None]:
# Freeze the hub feature extractor so its weights are not updated by fit().
feature_extractor.trainable = False

In [None]:
# Classification head on top of the hub feature extractor.
# The task is multi-label: every image has an independent 0/1 target per
# label column and the model is trained with binary cross-entropy, so each
# output unit needs its own sigmoid. A softmax would force the scores of one
# image to sum to 1 and make the labels compete with each other.
# The number of units follows the number of label columns instead of a
# hard-coded 17.
model = tf.keras.Sequential([
  feature_extractor,
  layers.Dense(len(train_columns), activation='sigmoid')
])

model.summary()

In [None]:
# Binary cross-entropy treats every label column as its own yes/no problem.
# The metric is spelled out as BinaryAccuracy (per-label correctness at a 0.5
# threshold): the bare string 'accuracy' can resolve to categorical accuracy
# for a multi-unit output, which only compares arg-max positions and is
# meaningless for multi-label targets. name='accuracy' keeps the history keys
# ('accuracy' / 'val_accuracy') unchanged.
model.compile(
  optimizer='adam',
  loss='binary_crossentropy',
  metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy')])

# Number of full passes over the training generator.
EPOCHS = 2
history = model.fit(train_batches,
                    epochs=EPOCHS,
                    validation_data=validation_batches)

In [None]:
## Plot the training and validation loss recorded for each epoch
for history_key in ('loss', 'val_loss'):
    plt.plot(history.history[history_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Read the sample submission and keep the image_name column of its first
# 40669 rows as a one-column frame with a fresh 0..n-1 index.
submission_df = pd.read_csv(submission_path)
test_df = submission_df.iloc[:40669][['image_name']].reset_index(drop=True)

In [None]:
# Add the .jpg extension. The right-hand side is built from the full
# submission_df column; the assignment aligns on the index, so only the rows
# whose index exists in test_df are used.
test_df['image_name'] = submission_df['image_name'].map(lambda name: f'{name}.jpg')
test_df.head()

In [None]:
# Test images only need the same 1/255 rescaling as the training images.
test_datagen = ImageDataGenerator(rescale = 1./255)

# Unlabelled, unshuffled flow so predictions come back in the same order as
# test_gen.filenames.
test_gen = test_datagen.flow_from_dataframe(dataframe=test_df,
                                            directory = test_path,
                                            x_col="image_name",
                                            y_col=None,
                                            batch_size=67,
                                            shuffle=False,
                                            class_mode=None,
                                            target_size=(IMAGE_RES,IMAGE_RES))

# Number of batches needed to cover every image. Ceiling division gives the
# same value as floor division when batch_size divides n exactly, but still
# includes the final partial batch when it does not (otherwise the prediction
# array would be shorter than test_gen.filenames).
test_size = (test_gen.n + test_gen.batch_size - 1) // test_gen.batch_size

In [None]:
# Run the model over test_size batches of test_gen; one row of scores per image.
pred = model.predict(test_gen, steps=test_size, verbose=1)

In [None]:
test_names = test_gen.filenames

# Build the label lookup array once instead of once per prediction row.
label_array = np.array(label_list)
# For every image, join the names of all labels whose score is at least 0.2
# into a single space-separated tag string.
test_result = pd.Series([' '.join(label_array[scores >= 0.2]) for scores in pred])

test_result_df = pd.DataFrame({'image_name': test_names, 'tags': test_result})
test_result_df.head()

In [None]:
# Additional batch of the test dataset: the image_name column of every row
# from position 40669 onwards, as a one-column frame with a fresh index.
additional_df = submission_df.iloc[40669:][['image_name']].reset_index(drop=True)

In [None]:
# Append the .jpg extension to every image name. The endswith guard keeps the
# cell idempotent: because it reads and writes the same column, re-running it
# would otherwise produce names ending in ".jpg.jpg".
additional_df['image_name'] = additional_df['image_name'].apply(
    lambda x: x if x.endswith('.jpg') else '{}.jpg'.format(x))
additional_df.head()

In [None]:
# Unlabelled, unshuffled flow over the additional test images so predictions
# come back in the same order as additional_gen.filenames.
additional_gen = test_datagen.flow_from_dataframe(dataframe=additional_df,
                                                  directory="../input/planets-dataset/test-jpg-additional/test-jpg-additional",
                                                  x_col='image_name',
                                                  y_col = None,
                                                  batch_size=62,
                                                  shuffle=False,
                                                  class_mode=None,
                                                  target_size=(IMAGE_RES, IMAGE_RES))


# Number of batches needed to cover every image. Ceiling division gives the
# same value as floor division when batch_size divides n exactly, but still
# includes the final partial batch when it does not (otherwise the prediction
# array would be shorter than additional_gen.filenames).
additional_step = (additional_gen.n + additional_gen.batch_size - 1) // additional_gen.batch_size

In [None]:
# Predicted score for each class, one row per image, over additional_step
# batches of additional_gen.
pred_additional = model.predict(additional_gen, steps=additional_step, verbose=1)

In [None]:
additional_names = additional_gen.filenames

# Build the label lookup array once instead of once per prediction row.
label_array = np.array(label_list)
# For every image, join the names of all labels whose score is at least 0.2
# into a single space-separated tag string.
additional_result = pd.Series(
    [' '.join(label_array[scores >= 0.2]) for scores in pred_additional])

additional_result_df = pd.DataFrame({'image_name': additional_names, 'tags': additional_result})
additional_result_df.head()

In [None]:
# Stack the two prediction frames on top of each other and renumber the rows
# 0..n-1 in one step.
final_result = pd.concat([test_result_df, additional_result_df], ignore_index=True)

final_result

In [None]:
# Remove the .jpg extension from the image_name of the final_result.
# Only names that actually end in ".jpg" are shortened, so re-running this
# cell does not chop four more characters off names that are already clean.
final_result['image_name'] = final_result['image_name'].apply(
    lambda x: x[:-4] if x.endswith('.jpg') else x)
final_result

In [None]:
# Save the results to submission.csv; index=False leaves the DataFrame's
# row index out of the file.
final_result.to_csv('submission.csv', index=False)