In [1]:
import os
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
import cv2
import ntpath
import random
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from imgaug import augmenters as iaa

In [2]:
# Make the dataset root the working directory; the CSV files are read from it
# by bare filename in the next cell.
os.chdir('/kaggle/input/planets-dataset/planet/planet/')
# IPython shell escape: list the contents of the current directory.
!ls

In [3]:
# Read the training labels and the submission template from the current
# working directory.
train_df, sample_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_classes.csv', 'sample_submission.csv')
)

In [4]:
# Preview the first rows of the training label table.
train_df.head()

In [5]:
# Absolute path instead of the relative 'train-jpg/': the relative form only
# resolves while the working directory is still the dataset root, so running
# this cell a second time would fail.
os.chdir('/kaggle/input/planets-dataset/planet/planet/train-jpg/')

In [6]:
# Display the image whose name is stored in the first column of the second row.
second_image_file = train_df.iloc[1, 0] + '.jpg'
img = mpimg.imread(second_image_file)

plt.imshow(img)

In [7]:
# Number of distinct tag strings versus the total number of rows.
tags_column = train_df['tags']
print(len(tags_column.unique()))
print(len(tags_column))

In [8]:
# Row count for every distinct tag string (value_counts sorts by count,
# largest first, by default).
tag_vals = train_df['tags'].value_counts()
tag_vals

In [9]:
# Count the rows per tag string in a single pass. sort=False keeps the tags in
# order of first appearance, i.e. the same order train_df['tags'].unique()
# would give.
tag_counts = train_df.groupby('tags', sort=False).size()

num_of_samples = tag_counts.to_dict()   # tag string -> number of rows
sample_count = tag_counts.tolist()      # the same counts, in tag order

# Derived from the data instead of being hard-coded, so it always equals
# len(sample_count) when both are used for plotting.
num_classes = len(sample_count)

In [10]:
# Bar chart of the rows per tag string. The x positions are sized from
# sample_count itself: plt.bar needs as many positions as heights, and a
# separately maintained class-count constant can drift from the real number.
plt.figure(figsize=(30, 10))
plt.bar(range(len(sample_count)), sample_count)
plt.title("Distribution of the train dataset")
plt.xlabel("Class number")
plt.ylabel("Number of images")
plt.show()

In [11]:
def remove_extra_row(df, limit):
    """Return `limit` rows of `df`, picked at random without replacement.

    The row labels are drawn with `random.sample`, so the rows come back in
    the sampled order and the outcome depends on the state of the `random`
    module. `random.sample` raises ValueError when `limit` exceeds the number
    of rows.
    """
    kept_labels = random.sample(list(df.index), limit)
    return df.loc[kept_labels]

In [12]:
upper_limit = 2500

# Cap every tag string at upper_limit rows. The per-tag pieces are collected in
# a list and concatenated once, rather than re-copying a growing DataFrame on
# every iteration.
balanced_parts = []
for tag in train_df['tags'].unique():
    temp = train_df[train_df['tags'] == tag]
    if temp.shape[0] > upper_limit:
        temp = remove_extra_row(temp, upper_limit)
    balanced_parts.append(temp)

# pd.concat raises on an empty list, so fall back to an empty frame.
df = pd.concat(balanced_parts, ignore_index=True) if balanced_parts else pd.DataFrame()

In [13]:
# Shape of the capped frame followed by the shape of the original frame.
for frame in (df, train_df):
    print(frame.shape)

In [14]:
# Count the rows per tag string of the capped frame in a single pass.
# sort=False keeps the tags in order of first appearance, i.e. the same order
# df['tags'].unique() would give.
tag_counts = df.groupby('tags', sort=False).size()

num_of_samples = tag_counts.to_dict()   # tag string -> number of rows
sample_count = tag_counts.tolist()      # the same counts, in tag order

# Derived from the data instead of being hard-coded, so it always equals
# len(sample_count) when both are used for plotting.
num_classes = len(sample_count)

In [15]:
# Bar chart of the rows per tag string. The x positions are sized from
# sample_count itself: plt.bar needs as many positions as heights, and a
# separately maintained class-count constant can drift from the real number.
plt.figure(figsize=(30, 10))
plt.bar(range(len(sample_count)), sample_count)
plt.title("Distribution of the train dataset")
plt.xlabel("Class number")
plt.ylabel("Number of images")
plt.show()

In [16]:
def zoom(image):
    """Return `image` after an imgaug affine scaling with scale range (1, 1.3)."""
    # A fresh augmenter per call; imgaug picks the scale factor from the range.
    scaler = iaa.Affine(scale=(1, 1.3))
    return scaler.augment_image(image)

In [17]:
def pan(image):
    """Return `image` shifted by an imgaug affine translation.

    The x and y shifts each use the range (-0.1, 0.1), expressed as a
    fraction of the image size (`translate_percent`).
    """
    shift_ranges = {"x": (-0.1, 0.1), "y": (-0.1, 0.1)}
    translator = iaa.Affine(translate_percent=shift_ranges)
    return translator.augment_image(image)

In [18]:
def img_random_brightness(image):
    """Return `image` with its pixel values multiplied via imgaug `Multiply`.

    The multiplier range is (0.2, 1.2), so the result can be darker or
    slightly brighter than the input.
    """
    multiplier = iaa.Multiply((0.2, 1.2))
    return multiplier.augment_image(image)

In [19]:
def img_random_flip(image):
    """Return `image` mirrored horizontally.

    The flip itself is unconditional; any randomness comes from the caller
    deciding whether to invoke this function.
    """
    # cv2.flip codes: positive = horizontal flip, 0 = vertical flip,
    # negative = flip around both axes.
    return cv2.flip(image, 1)

In [20]:
def random_augment(image):
    """Read the image file at path `image` and randomly augment it.

    Pan, zoom, brightness change and horizontal flip are considered in that
    order; each one is applied only when a fresh `np.random.rand()` draw is
    below 0.5. Returns the resulting image array.
    """
    augmented = mpimg.imread(image)

    for transform in (pan, zoom, img_random_brightness, img_random_flip):
        if np.random.rand() < 0.5:
            augmented = transform(augmented)

    return augmented

In [21]:
def img_preprocessing(img):
    """Prepare an image for the network.

    Steps: convert from RGB to YUV, smooth with a 3x3 Gaussian blur, then
    divide every value by 255. Returns the processed array.
    """
    yuv = cv2.cvtColor(img, cv2.COLOR_RGB2YUV)
    blurred = cv2.GaussianBlur(yuv, (3, 3), 0)
    return blurred / 255

In [22]:
def batch_generator(image_paths, tags, batch_size, istraining):
    """Endlessly yield `(images, tags)` batches of `batch_size` random samples.

    Each sample index is drawn with `random.randint`, so a sample can appear
    more than once in a batch. The file `<image_paths[i]>.jpg` is loaded
    through `random_augment` when `istraining` is truthy and read as-is
    otherwise; every image then goes through `img_preprocessing`. Both parts
    of the yielded tuple are numpy arrays.
    """
    while True:
        images, labels = [], []

        for _ in range(batch_size):
            pick = random.randint(0, len(image_paths) - 1)
            filename = image_paths[pick] + '.jpg'

            # Only training samples are augmented.
            loaded = random_augment(filename) if istraining else mpimg.imread(filename)

            images.append(img_preprocessing(loaded))
            labels.append(tags[pick])

        yield (np.asarray(images), np.asarray(labels))

In [23]:
# Inputs are the image names; targets are the tag strings turned into integer
# class ids by a LabelEncoder (kept around to map ids back to tag strings).
X = df['image_name'].tolist()

encoder = LabelEncoder()
Y = encoder.fit_transform(df['tags'])

In [24]:
# Hold out 30% of the samples for validation; random_state pins the split.
X_train, X_val, y_train, y_val = train_test_split(X, Y, test_size=0.3, random_state=6)

In [25]:
# Sample count and label-array shape for the training and validation splits.
for paths, labels in ((X_train, y_train), (X_val, y_val)):
    print(len(paths), labels.shape)

In [26]:
# Pull a single one-sample training batch to inspect what the generator yields.
preview_batches = batch_generator(X_train, y_train, 1, 1)
X_train_gen, y_train_gen = next(preview_batches)

In [27]:
# Show the generated image with its encoded label and the decoded tag string.
i = 0
sample_label = y_train_gen[i]
plt.imshow(X_train_gen[i])
print(sample_label)
print(encoder.inverse_transform([sample_label])[0])

In [28]:
def nvidia_model(num_classes=449, learning_rate=1e-2):
    """Build and compile the convolutional classifier for 256x256x3 images.

    Args:
        num_classes: Number of output classes. The default is the class count
            this notebook uses elsewhere; pass `len(encoder.classes_)` if the
            label set changes.
        learning_rate: Learning rate handed to the Adam optimizer.

    Returns:
        A compiled Keras `Sequential` model whose output is one probability
        per class. It trains on integer class ids (no one-hot encoding needed).
    """
    model = Sequential()
    model.add(Conv2D(24, kernel_size=(5, 5), strides=(2, 2), input_shape=(256, 256, 3), activation='elu'))
    model.add(Conv2D(36, kernel_size=(5, 5), strides=(2, 2), activation='elu'))
    model.add(Conv2D(48, kernel_size=(5, 5), strides=(2, 2), activation='elu'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='elu'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='elu'))
    model.add(Dropout(0.5))

    model.add(Flatten())
    model.add(Dense(100, activation='elu'))
    model.add(Dropout(0.5))

    model.add(Dense(50, activation='elu'))
    model.add(Dropout(0.5))

    model.add(Dense(25, activation='elu'))

    # The targets are LabelEncoder class ids, whose numeric order carries no
    # meaning. Regressing that id with a single MSE-trained unit treats
    # unrelated classes as "close", so emit a probability per class instead.
    model.add(Dense(num_classes, activation='softmax'))

    optimizer = Adam(learning_rate=learning_rate)
    # The sparse variant of cross-entropy consumes the integer ids directly.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

    return model

In [29]:
model = nvidia_model()

# Endless batch generators: augmented samples for training, unaugmented ones
# for validation. Because they never stop, the number of steps per epoch is
# given explicitly.
train_batches = batch_generator(X_train, y_train, 100, 1)
val_batches = batch_generator(X_val, y_val, 100, 0)

history = model.fit(
    train_batches,
    steps_per_epoch=300,
    epochs=10,
    validation_data=val_batches,
    validation_steps=200,
    verbose=1,
    shuffle=1,
)

In [30]:
# Training and validation loss per epoch, drawn on the same axes.
for history_key in ('loss', 'val_loss'):
    plt.plot(history.history[history_key])
plt.legend(['training', 'validation'])
plt.title('Loss')
plt.xlabel('Epoch')

In [31]:
# Shape and first rows of the sample submission table.
print(sample_df.shape)
sample_df.head()

In [32]:
# Change the working directory to the 'test-jpg-additional' folder; the test
# images are read from the working directory by bare filename further down.
os.chdir('/kaggle/input/planets-dataset/test-jpg-additional/test-jpg-additional/')

In [33]:
# Keep only the submission rows whose image name contains 'file_'
# (matched case-insensitively).
test_df = sample_df[sample_df.image_name.str.contains('file_', case=False)]

In [None]:
# Load and preprocess every image listed in test_df, in row order.
# test_df is already restricted to the 'file_' names, so no second check is
# made here: a case-sensitive re-test could skip rows that the case-insensitive
# filter kept, leaving sample_imgs misaligned with test_df.
sample_imgs = []

for sample_path in test_df.image_name:
    img = mpimg.imread(sample_path + '.jpg')
    # img_preprocessing divides by 255 and so returns float64; float32 (the
    # default Keras float type) halves the memory held by the list.
    sample_imgs.append(img_preprocessing(img).astype(np.float32))

print(test_df.shape)
print(len(sample_imgs))

In [None]:
# Keras interprets a Python list passed to predict() as one array per model
# input, so the list of images has to be stacked into (n, H, W, C) arrays.
# Stacking chunk by chunk avoids holding a second full copy of the test set.
PREDICT_CHUNK = 256
pred_model = np.concatenate([
    model.predict(np.asarray(sample_imgs[start:start + PREDICT_CHUNK]), verbose=0)
    for start in range(0, len(sample_imgs), PREDICT_CHUNK)
])

In [None]:
# NOTE(review): `model1` is not defined in any cell visible here -- confirm
# where it comes from, otherwise this cell fails on a fresh kernel.
# Keras interprets a Python list passed to predict() as one array per model
# input, so the list of images has to be stacked into (n, H, W, C) arrays.
# Stacking chunk by chunk avoids holding a second full copy of the test set.
PREDICT_CHUNK = 256
pred_model1 = np.concatenate([
    model1.predict(np.asarray(sample_imgs[start:start + PREDICT_CHUNK]), verbose=0)
    for start in range(0, len(sample_imgs), PREDICT_CHUNK)
])