In [0]:
! unzip -q test.zip
! unzip -q train.zip

In [0]:
import os

import cv2
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from tqdm import tqdm

%matplotlib inline

# Class names; a class id is the position of the name in this list.
species = ['Black-grass', 'Charlock', 'Cleavers', 'Common Chickweed', 'Common wheat', 'Fat Hen',
          'Loose Silky-bent', 'Maize','Scentless Mayweed', 'Shepherds Purse',
          'Small-flowered Cranesbill', 'Sugar beet']
data_dir = './'
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')

# One row per training image: path relative to data_dir, class id, class name.
# Directory listings are sorted because os.listdir returns entries in
# arbitrary order; without a fixed starting order the seeded shuffle below
# would not be reproducible across machines or runs.
train_data = []
for species_id, sp in enumerate(species):
    for file in sorted(os.listdir(os.path.join(train_dir, sp))):
        train_data.append(['train/{}/{}'.format(sp, file), species_id, sp])

train = pd.DataFrame(train_data, columns=['File', 'SpeciesId','Species'])

# Randomize the order of training set and renumber the rows 0..n-1
SEED = 42
train = train.sample(frac=1, random_state=SEED).reset_index(drop=True)
# Only the last expression of a cell is rendered automatically, so the
# shuffled frame is shown explicitly.
display(train.head())

# Organize test files into DataFrame: path relative to data_dir and bare file name
test_data = []
for file in sorted(os.listdir(test_dir)):
    test_data.append(['test/{}'.format(file), file])
test = pd.DataFrame(test_data, columns=['Filepath', 'File'])
test.head()

In [0]:
# Side length, in pixels, that images are resized to before being stored.
IMAGE_SIZE = 128

def read_image(filepath):
    """Load an image from disk with OpenCV.

    Args:
        filepath: Path of the image relative to ``data_dir``.

    Returns:
        The array returned by ``cv2.imread`` (OpenCV uses BGR channel order).

    Raises:
        FileNotFoundError: If OpenCV could not read the file. ``cv2.imread``
            reports failure by returning ``None`` instead of raising, which
            would otherwise surface later as an unrelated attribute error.
    """
    full_path = os.path.join(data_dir, filepath)
    image = cv2.imread(full_path)
    if image is None:
        raise FileNotFoundError('Could not read image: {}'.format(full_path))
    return image

def noise_image(image, sigma=0.5):
    """Return a copy of ``image`` corrupted with additive Gaussian noise.

    The noise is added in floating point and the result is rounded and
    clipped to the 8-bit range. Casting the signed noise to ``uint8`` before
    adding (as was done previously) cannot represent negative samples: they
    either truncate to 0 or wrap to large values, so pixels could only ever
    get brighter.

    Args:
        image: Array of pixel values on the 0..255 scale (any shape).
        sigma: Standard deviation of the noise, in pixel-intensity units.

    Returns:
        A new ``uint8`` array with the same shape as ``image``; the input is
        left untouched.

    Raises:
        ValueError: If ``image`` is ``None`` (e.g. a failed image read).
    """
    if image is None:
        raise ValueError('noise_image() received None instead of an image')

    noise = np.random.randn(*image.shape) * sigma
    noisy = image.astype(np.float64) + noise
    # Round to the nearest intensity and saturate instead of wrapping around.
    return np.clip(np.rint(noisy), 0, 255).astype(np.uint8)

def resize_image(image, image_size):
    """Resize a copy of ``image`` with area interpolation.

    Args:
        image: Image array; it is copied before being handed to OpenCV.
        image_size: Target size forwarded to ``cv2.resize`` as its size
            argument (callers in this notebook pass
            ``(IMAGE_SIZE, IMAGE_SIZE)``).

    Returns:
        The resized image produced by ``cv2.resize``.
    """
    duplicate = image.copy()
    resized = cv2.resize(duplicate, image_size, interpolation=cv2.INTER_AREA)
    return resized

# Image segmentation
def create_mask(image):
    """Build a mask selecting the green pixels of a BGR image.

    The image is converted to HSV and thresholded with ``cv2.inRange``
    between the fixed bounds (30, 100, 50) and (85, 255, 255).

    Args:
        image: Image in BGR channel order.

    Returns:
        The mask produced by ``cv2.inRange``.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    green_lo, green_hi = np.array([30, 100, 50]), np.array([85, 255, 255])
    return cv2.inRange(hsv, green_lo, green_hi)

def segment_image(image):
    """Apply the mask from ``create_mask`` to ``image``.

    Args:
        image: Image in BGR channel order.

    Returns:
        The result of ``cv2.bitwise_and(image, image, mask=...)``.
    """
    return cv2.bitwise_and(image, image, mask=create_mask(image))

In [0]:
# Clean and noisy versions of every training image, resized to
# IMAGE_SIZE x IMAGE_SIZE. float32 halves the memory of the default float64
# buffers; the resized 8-bit intensities are represented exactly either way.
X_total_train = np.zeros((train.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)
X_total_noise_train = np.zeros((train.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)

# enumerate(tqdm(...)) instead of tqdm(enumerate(...)): tqdm can read the
# length of the array but not of an enumerate object, so this form shows a
# real progress bar with total and ETA.
for i, file in enumerate(tqdm(train['File'].values)):
    image = read_image(file)
    # Noise is added at the original resolution, before resizing
    image_noised = noise_image(image)
    X_total_noise_train[i] = resize_image(image_noised, (IMAGE_SIZE, IMAGE_SIZE))
    X_total_train[i] = resize_image(image, (IMAGE_SIZE, IMAGE_SIZE))

# Normalize the data to [0, 1]; in place, to avoid a temporary copy of each array
X_total_train /= 255.
X_total_noise_train /= 255.
print('Train Shape: {}'.format(X_total_train.shape))
print('Train Shape: {}'.format(X_total_noise_train.shape))

In [0]:
# One-hot encode the class ids. The noisy images share the labels of the
# clean images they were derived from.
species_ids = train['SpeciesId'].values
Y_total_train = to_categorical(species_ids, num_classes=12)
Y_total_noise_train = Y_total_train
print(len(Y_total_noise_train))
print(len(Y_total_train))

BATCH_SIZE = 32
EPOCHS = 100

# Split the train and validation sets.
# Clean images, noisy images and labels go through a single call so that
# row k of every *_train / *_val array is guaranteed to refer to the same
# source image (previously this relied on two separate calls happening to
# use the same random_state).
(X_train, X_val,
 X_noise_train, X_noise_val,
 Y_train, Y_val) = train_test_split(X_total_train, X_total_noise_train, Y_total_train,
                                    test_size=0.1, random_state=SEED)
Y_noise_train, Y_noise_val = Y_train, Y_val

print(X_noise_val.shape)
print(X_val.shape)

In [0]:
# Clean and noisy versions of every test image, resized to
# IMAGE_SIZE x IMAGE_SIZE. float32 halves the memory of the default float64
# buffers; the resized 8-bit intensities are represented exactly either way.
X_test = np.zeros((test.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)
X_test_noise = np.zeros((test.shape[0], IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.float32)

# enumerate(tqdm(...)) so that tqdm knows the number of files and can show
# a progress bar with total and ETA.
for i, file in enumerate(tqdm(test['Filepath'].values)):
    image = read_image(file)
    image_noised = noise_image(image)
    X_test_noise[i] = resize_image(image_noised, (IMAGE_SIZE, IMAGE_SIZE))
    X_test[i] = resize_image(image, (IMAGE_SIZE, IMAGE_SIZE))

# Normalize to [0, 1]; in place, to avoid a temporary copy of each array
X_test /= 255.

X_test_noise /= 255.

In [0]:
from keras.layers import Lambda, Input, Dense
from keras.models import Model
from keras import backend as K
from keras import metrics
from keras.datasets import mnist
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Conv2DTranspose,Reshape
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization, Activation
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau

# Fully convolutional denoiser: every convolution is 3x3 with 'same'
# padding, so the output has the same height and width as the input.
image_size = X_train.shape[1]
input_img = Input(shape=(image_size, image_size, 3))

# Entry stage: convolution + ReLU, without batch normalization
x = Conv2D(64, (3, 3), padding='same')(input_img)
x = Activation('relu')(x)

# Fifteen identical stages: convolution -> batch normalization -> ReLU
for i in range(15):
    conv_out = Conv2D(64, (3, 3), padding='same')(x)
    normed = BatchNormalization()(conv_out)
    x = Activation('relu')(normed)

# Project back to three channels.
# NOTE(review): tanh emits values in [-1, 1] while the training targets are
# scaled to [0, 1] -- confirm this output activation is intended.
x = Conv2D(3, (3, 3), padding='same')(x)
output_img = Activation('tanh')(x)

model = Model(inputs=input_img, outputs=output_img)
model.compile(loss='mean_squared_error', optimizer='adam')

In [0]:
# Stop once the validation loss has failed to improve for two consecutive
# epochs. The denoiser is compiled with a loss only (no metrics), so
# 'val_accuracy' is never logged and early stopping on it could never
# trigger; 'val_loss' is the quantity that actually exists and it matches
# the checkpoint criterion below.
es_cb = EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')
# Keep only the weights of the epoch with the best validation loss
chkpt = "model.h5"
cp_cb = ModelCheckpoint(filepath = chkpt, monitor='val_loss', verbose=1, save_best_only=True, mode='auto')

In [0]:
# Train the denoiser: noisy images are the inputs, the matching clean
# images are the regression targets.
batch_size, epochs = 32, 100
history = model.fit(
    x=X_noise_train,
    y=X_train,
    validation_data=(X_noise_val, X_val),
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    callbacks=[es_cb, cp_cb],
    verbose=1,
)

In [0]:
from keras.models import Sequential, load_model

# Reload the checkpointed denoiser from disk and denoise the noisy test images
model = load_model(filepath='model.h5')
result = model.predict(x=X_test_noise)

In [0]:
# Display the first rows * cols test images as a grid:
# original images on top, corrupted inputs in the middle, denoised outputs below.
rows, cols = 1, 3
num = rows * cols
# Number of colour channels, taken from the data (this name was previously
# used without ever being defined, which raised a NameError).
image_channels = X_test.shape[-1]
imgs = np.concatenate([X_test[:num], X_test_noise[:num], result[:num]])
imgs = imgs.reshape((rows * 3, cols, image_size, image_size, image_channels))
imgs = np.vstack(np.split(imgs, rows, axis=1))
imgs = imgs.reshape((rows * 3, -1, image_size, image_size, image_channels))
imgs = np.vstack([np.hstack(i) for i in imgs])
# The network ends in tanh, so predictions can fall outside [0, 1]; clip
# before the uint8 cast so such values saturate instead of wrapping around.
imgs = (np.clip(imgs, 0., 1.) * 255).astype(np.uint8)
# Images were loaded with OpenCV (BGR); matplotlib expects RGB.
imgs = imgs[..., ::-1]
plt.figure()
plt.axis('off')
plt.imshow(imgs)
plt.show()

In [0]:
def cnn_model():
    """Build, compile and summarize the 12-class image classifier.

    Architecture: three stages with 32, 64 and 128 filters. Each stage has
    two 3x3 ReLU convolutions, each followed by batch normalization, and
    ends with 2x2 max pooling. The stages are followed by Flatten,
    Dense(256, relu) and Dense(12, softmax).

    Returns:
        The compiled ``Sequential`` model (categorical cross-entropy loss,
        Adam optimizer, accuracy metric). ``model.summary()`` is printed as
        a side effect.
    """
    conv_args = dict(kernel_size=(3, 3), strides=(1, 1), activation='relu')

    model = Sequential()
    for stage, n_filters in enumerate((32, 64, 128)):
        for position in range(2):
            if stage == 0 and position == 0:
                # Only the very first layer declares the input shape
                conv = Conv2D(filters=n_filters,
                              input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3),
                              **conv_args)
            else:
                conv = Conv2D(filters=n_filters, **conv_args)
            model.add(conv)
            model.add(BatchNormalization())
        model.add(MaxPool2D(pool_size=(2, 2), strides=(2, 2)))

    # Classification head
    model.add(Flatten())
    model.add(Dense(256, activation='relu'))
    model.add(Dense(12, activation='softmax'))

    # Configure the learning process
    optimizer = Adam(lr=0.002, beta_1=0.9, beta_2=0.999, epsilon=0.1, decay=0.0)
    model.compile(optimizer=optimizer,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    model.summary()
    return model

In [0]:
# Run the denoiser over the noisy train/validation images; its outputs are
# the inputs of the "denoised" classifier variant.
result_train = model.predict(X_noise_train)
result_val = model.predict(X_noise_val)

BATCH_SIZE = 32
EPOCHS = 100


def train(x_train=None, y_train=None, x_val=None, y_val=None,
          checkpoint_path='model_cnn_noise.h5'):
    """Train a fresh classifier with data augmentation and checkpoint the best epoch.

    Replaces three copy-pasted variants of this function that differed only
    in the arrays they trained on and the checkpoint file name.

    Args:
        x_train: Training images. Defaults to ``X_noise_train``.
        y_train: One-hot training labels. Defaults to ``Y_noise_train``.
        x_val: Validation images. Defaults to ``X_noise_val``.
        y_val: One-hot validation labels. Defaults to ``Y_noise_val``.
        checkpoint_path: File the best model is saved to. Called without
            arguments the function trains on the noisy images and writes
            'model_cnn_noise.h5', as before.

    Returns:
        The value returned by ``fit_generator``.
    """
    if x_train is None:
        x_train = X_noise_train
    if y_train is None:
        y_train = Y_noise_train
    if x_val is None:
        x_val = X_noise_val
    if y_val is None:
        y_val = Y_noise_val

    CNN_model = cnn_model()
    # NOTE(review): the logged name of the accuracy metric ('val_acc' vs
    # 'val_accuracy') depends on the installed Keras release -- confirm that
    # 'val_acc' appears in the training log, otherwise the annealer never fires.
    annealer = ReduceLROnPlateau(monitor='val_acc', factor=0.5, patience=5, verbose=1, min_lr=1e-5)
    checkpoint = ModelCheckpoint(checkpoint_path, verbose=1, save_best_only=True)

    # Generates batches of image data with data augmentation
    datagen = ImageDataGenerator(rotation_range=360, # Degree range for random rotations
                            width_shift_range=0.2, # Range for random horizontal shifts
                            height_shift_range=0.2, # Range for random vertical shifts
                            zoom_range=0.2, # Range for random zoom
                            horizontal_flip=True, # Randomly flip inputs horizontally
                            vertical_flip=True) # Randomly flip inputs vertically

    datagen.fit(x_train)
    # Fits the model on batches with real-time data augmentation
    hist = CNN_model.fit_generator(datagen.flow(x_train, y_train, batch_size=BATCH_SIZE),
                   steps_per_epoch=x_train.shape[0] // BATCH_SIZE,
                   epochs=EPOCHS,
                   verbose=2,
                   callbacks=[annealer, checkpoint],
                   validation_data=(x_val, y_val))
    return hist


# Train one classifier per input variant so that every checkpoint loaded by
# the evaluation cells further down is actually produced by this notebook
# run: denoised images, original images, and noisy images.
train(result_train, Y_noise_train, result_val, Y_noise_val, checkpoint_path='model_cnn.h5')
train(X_train, Y_train, X_val, Y_val, checkpoint_path='model_cnn_orig.h5')
train()

In [0]:
# Denoise the noisy test images, then evaluate and apply the classifier
# stored in 'model_cnn.h5' on denoised inputs.
result_test = model.predict(X_test_noise)

final_model = load_model('model_cnn.h5')
final_loss, final_accuracy = final_model.evaluate(result_val, Y_noise_val)
print('Final Loss: {}, Final Accuracy: {}'.format(final_loss, final_accuracy))

# Most probable class id for every test image
predictions = np.argmax(final_model.predict(result_test), axis=1)

# NOTE(review): every evaluation cell in this notebook writes
# 'submission.csv', so the file holds the output of whichever ran last.
df = pd.DataFrame({
    'file': test['File'].tolist(),
    'species': [species[class_id] for class_id in predictions],
})
df.to_csv('submission.csv', index=False)

**Original data without noise**

In [0]:
# Evaluate and apply the classifier stored in 'model_cnn_orig.h5' on the
# clean (noise-free) images.
final_orig_model = load_model('model_cnn_orig.h5')
final_orig_loss, final_orig_accuracy = final_orig_model.evaluate(X_val, Y_val)
print('Final Loss: {}, Final Accuracy: {}'.format(final_orig_loss, final_orig_accuracy))

# Most probable class id for every test image
predictions = np.argmax(final_orig_model.predict(X_test), axis=1)

# NOTE(review): every evaluation cell in this notebook writes
# 'submission.csv', so the file holds the output of whichever ran last.
df = pd.DataFrame({
    'file': test['File'].tolist(),
    'species': [species[class_id] for class_id in predictions],
})
df.to_csv('submission.csv', index=False)

**Data with noise**

In [0]:
# Evaluate and apply the classifier stored in 'model_cnn_noise.h5' on the
# noisy images.
final_noise_model = load_model('model_cnn_noise.h5')
final_noise_loss, final_noise_accuracy = final_noise_model.evaluate(X_noise_val, Y_noise_val)
print('Final Loss: {}, Final Accuracy: {}'.format(final_noise_loss, final_noise_accuracy))

# Most probable class id for every test image
predictions = np.argmax(final_noise_model.predict(X_test_noise), axis=1)

# NOTE(review): every evaluation cell in this notebook writes
# 'submission.csv', so the file holds the output of whichever ran last.
df = pd.DataFrame({
    'file': test['File'].tolist(),
    'species': [species[class_id] for class_id in predictions],
})
df.to_csv('submission.csv', index=False)