In [None]:
import numpy as np # Linear algebra
import pandas as pd # Data Processing
import plotly.graph_objects as go # For graphs and plots
import plotly.offline as po # To generate graphs as images
po.init_notebook_mode() # show images in jupyter notebook
import matplotlib.pyplot as plt # Module pyplot

In [None]:
# Read the training labels CSV into a DataFrame
train = pd.read_csv(
    '../input/planets-dataset/planet/planet/train_classes.csv'
)

# Display the first five rows (head() defaults to five) as the cell output
train.head()

### Exploratory Data Analysis

In [None]:
# Bar plot of how often each tag occurs in the training labels
from collections import Counter, defaultdict

# Each entry of train['tags'] is a space-separated tag string; split it into a list
labels = train['tags'].apply(lambda x: x.split(' '))

# Tally every tag over all rows; Counter keeps the tags in first-seen order
counts = Counter(tag for row_tags in labels for tag in row_tags)

data = [go.Bar(x=list(counts.keys()), y=list(counts.values()))]
layout = dict(height=800, width=800, title='Distribution of training labels')
fig = dict(data=data, layout=layout)

# Plot the whole figure (not only `data`) so the title and size in `layout` are applied
po.iplot(fig, filename='train-label-dist')

In [None]:
import cv2 # To integrate OpenCV arrays with other libraries which use NumPy

In [None]:
# Show the first nine training images in a 3x3 grid, each titled "<file> - <tags>"
plt.rc('axes', grid=False)  # turn grid lines off for all axes
_, ax = plt.subplots(nrows=3, ncols=3, sharex='col', sharey='row', figsize=(20, 20))
for idx, (name, tag_string) in enumerate(train[:9].values):
    row, col = divmod(idx, 3)  # position of this image inside the 3x3 grid
    bgr = cv2.imread('../input/planets-dataset/planet/planet/train-jpg/{}.jpg'.format(name))
    # Convert from OpenCV's BGR channel order to RGB before handing it to matplotlib
    ax[row, col].imshow(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB))
    ax[row, col].set_title('{} - {}'.format(name, tag_string))

plt.show()

### Training the Model

In [None]:
from keras.models import Sequential # Importing sequential model from Keras for multi-label classification
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D # List of layers that will be passed to the sequential constructor
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler # Set of functions to be applied at given stages of the training procedure
from tqdm import tqdm # For iterations
from sklearn.metrics import fbeta_score # Weighted harmonic mean of precision and recall between 0 and 1

In [None]:
# Empty containers: X_train will collect the images, y_train the label vectors
X_train, y_train = [], []

In [None]:
# Build the label <-> index mappings from the tags found in the training data
def train_flatten(l):
    """Flatten a list of lists into a single flat list."""
    return [item for sublist in l for item in sublist]

# Sort the unique tags so every run assigns the same index to the same label.
# Iterating a set of strings directly can yield a different order in each
# Python process (string hashing is randomized), which would silently change
# which output position belongs to which label.
train_labels = sorted(set(train_flatten([l.split(' ') for l in train['tags'].values])))
train_label_map = {l: i for i, l in enumerate(train_labels)}  # label -> index
inv_label_map = {i: l for l, i in train_label_map.items()}    # index -> label

In [None]:
# Load every training image, shrink it to 32x32 and build its multi-hot label vector.
# The lists are (re)created here so that re-running this cell does not append duplicates.
X_train = []
y_train = []
for f, tags in tqdm(train.values, miniters=1000):
    path = '../input/planets-dataset/planet/planet/train-jpg/{}.jpg'.format(f)
    img = cv2.imread(path)  # Loading the image from the specified file
    if img is None:
        # cv2.imread returns None (it does not raise) when the file cannot be read
        raise FileNotFoundError('Could not read training image: {}'.format(path))
    # One slot per known label, set to 1 for every tag attached to this image
    targets = np.zeros(len(train_label_map))
    for t in tags.split(' '):
        targets[train_label_map[t]] = 1
    # The interpolation flag must be passed by keyword: the third positional
    # parameter of cv2.resize is `dst`, so a positional cv2.INTER_AREA is not
    # used as the interpolation mode.
    # NOTE: any other cell that loads images for this model must resize them
    # the same way, otherwise train and test inputs are preprocessed differently.
    X_train.append(cv2.resize(img, (32, 32), interpolation=cv2.INTER_AREA))
    y_train.append(targets)

In [None]:
# Stack the Python lists into arrays: pixels as float16 divided by 255, labels as uint8
X_train = np.asarray(X_train, dtype=np.float16) / 255
y_train = np.asarray(y_train, dtype=np.uint8)

# Print both array shapes, one per line
print(X_train.shape, y_train.shape, sep='\n')

In [None]:
# Keep the first `cut` samples for training; everything after them becomes the validation set
cut = 35000
# Take the validation tails first, because the training names are rebound right after
X_valid, y_valid = X_train[cut:], y_train[cut:]
X_train, y_train = X_train[:cut], y_train[:cut]

In [None]:
# Small convolutional network, assembled by handing the ordered layer list to Sequential
model = Sequential([
    Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(32, 32, 3)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # 17 independent sigmoid outputs, matching the length-17 target vectors
    Dense(17, activation='sigmoid'),
])

# Epoch count and learning-rate values
epoch = 5
learn_rate = 0.01

In [None]:
def learning_rate(epoch, learn_rate):
    """Schedule function for LearningRateScheduler: return the given rate unchanged."""
    return learn_rate

# Wrap the schedule function in a Keras callback
lrs = LearningRateScheduler(learning_rate)

# NOTE(review): the notebook-level `epoch` and `learn_rate` values are not used by
# the calls below (fit is given epochs=50 and the schedule returns the rate it
# receives) -- confirm this is intended.

# Configure the loss, optimizer and reported metric
model.compile(
    loss='binary_crossentropy',
    optimizer='Adam',
    metrics=['accuracy'],
)

# Train on the training split, validating on the held-out split after each epoch
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=50,
    verbose=1,
    validation_data=(X_valid, y_valid),
    callbacks=[lrs],
)

# Predict on the validation split, then print the targets, the raw predictions and
# the sample-averaged F-beta score (beta=2) for predictions cut at 0.2
predict_valid = model.predict(X_valid, batch_size=128)
print(y_valid)
print(predict_valid)
print(fbeta_score(y_valid, np.asarray(predict_valid) > 0.2, beta=2, average='samples'))

### Testing

In [None]:
# Empty containers for the two sets of test images (original and additional)
test_org, test_add = [], []

In [None]:
# Read the sample submission CSV into a DataFrame
ss = pd.read_csv(
    '../input/planets-dataset/planet/planet/sample_submission.csv'
)

# Display the first five rows (head() defaults to five) as the cell output
ss.head()

In [None]:
# Split the sample submission by position: the first 40669 rows form the
# "original" test frame, all remaining rows the "additional" test frame
df_test_org, df_test_add = ss.iloc[:40669], ss.iloc[40669:]

In [None]:
# Build label <-> index mappings from the 'tags' column of each test frame
def test_org_flatten(l):
    """Flatten a list of lists into a single flat list."""
    return [item for sublist in l for item in sublist]

# The additional frame needs the same flattening; the second name is kept so
# existing references to it continue to work
test_add_flatten = test_org_flatten

# Test original
# (sorted so the label -> index assignment is identical on every run; iterating
# a set of strings directly can give a different order in each Python process)
test_org_labels = sorted(set(test_org_flatten([l.split(' ') for l in df_test_org['tags'].values])))
test_org_label_map = {l: i for i, l in enumerate(test_org_labels)}  # label -> index
inv1_label_map = {i: l for l, i in test_org_label_map.items()}      # index -> label

# Test additional
test_add_labels = sorted(set(test_add_flatten([l.split(' ') for l in df_test_add['tags'].values])))
test_add_label_map = {l: i for i, l in enumerate(test_add_labels)}  # label -> index
inv2_label_map = {i: l for l, i in test_add_label_map.items()}      # index -> label

In [None]:
def _load_resized_images(frame, path_template):
    """Read every image listed in `frame` and return the 32x32 resized images as a list.

    frame: two-column DataFrame; the first value of each row is substituted
        into `path_template`, the second value is not used.
    path_template: path string with one '{}' placeholder for the first value.
    Raises FileNotFoundError if an image cannot be read.
    """
    images = []
    for f, _tags in tqdm(frame.values, miniters=1000):
        path = path_template.format(f)
        img = cv2.imread(path)
        if img is None:
            # cv2.imread returns None (it does not raise) when the file cannot be read
            raise FileNotFoundError('Could not read test image: {}'.format(path))
        # The interpolation flag must be passed by keyword: the third positional
        # parameter of cv2.resize is `dst`, so a positional cv2.INTER_AREA is not
        # used as the interpolation mode.
        # NOTE: any other cell that loads images for this model must resize them
        # the same way, otherwise train and test inputs are preprocessed differently.
        images.append(cv2.resize(img, (32, 32), interpolation=cv2.INTER_AREA))
    return images

# Load both test sets. Fresh lists are assigned (rather than appended to), so
# re-running this cell does not duplicate images.
# Test original
test_org = _load_resized_images(
    df_test_org, '../input/planets-dataset/planet/planet/test-jpg/{}.jpg')

# Test additional
test_add = _load_resized_images(
    df_test_add, '../input/planets-dataset/test-jpg-additional/test-jpg-additional/{}.jpg')

In [None]:
# Stack both image lists into float16 arrays and divide the pixel values by 255
test_org = np.asarray(test_org, dtype=np.float16) / 255.
test_add = np.asarray(test_add, dtype=np.float16) / 255.

# Print both array shapes, one per line
print(test_org.shape, test_add.shape, sep='\n')

In [None]:
# Run the model on both test arrays (original first, then additional)
pred_test_org, pred_test_add = [
    model.predict(images, batch_size=128) for images in (test_org, test_add)
]

In [None]:
# Wrap each prediction array in a DataFrame whose columns are named after train_labels
df_org = pd.DataFrame(pred_test_org, columns=train_labels)
df_add = pd.DataFrame(pred_test_add, columns=train_labels)

In [None]:
def _tags_above_threshold(df, thresh):
    """Return one space-joined tag string per row of `df`.

    A column's label is included for a row when that row's value in the column
    is strictly greater than the column's threshold.

    df: DataFrame of scores with one column per label.
    thresh: sequence of thresholds, matched to the columns of `df` by position.
    Raises ValueError if the number of thresholds differs from the number of columns.
    """
    limits = np.asarray(thresh)
    if limits.shape != (df.shape[1],):
        raise ValueError('Expected {} thresholds, got {}'.format(df.shape[1], limits.size))
    names = np.asarray(df.columns)
    # One vectorised comparison for the whole frame; `limits` broadcasts across rows
    above = df.values > limits
    # A boolean row mask picks that row's labels, in column order
    return [' '.join(names[row]) for row in above]

# Set a threshold for each tag for a multi-label classification.
# The thresholds are matched to the DataFrame columns by position.
# NOTE(review): the values are only meaningful if the column (label) order is
# the one they were chosen for -- confirm against how the label list is built.
thresh = [0.07, 0.17, 0.2, 0.04, 0.23, 0.33, 0.24, 0.22, 0.1, 0.19, 0.23, 0.24, 0.12, 0.14, 0.25, 0.26, 0.16]

# Dataframe original
prediction_org = _tags_above_threshold(df_org, thresh)

# Dataframe additional
prediction_add = _tags_above_threshold(df_add, thresh)

In [None]:
# Combine both prediction lists (original first, then additional) into the final tags.
# A new list is built instead of extending prediction_org in place, so re-running
# this cell does not keep lengthening prediction_org.
# Replace sample submission tags with predictions
ss['tags'] = list(prediction_org) + list(prediction_add)

# Save to csv file for submission
ss.to_csv('Baseline.csv', index=False)