In [34]:
import pandas as pd
import os
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf
from tqdm import tqdm
from tensorflow.keras.preprocessing import image
from sklearn.model_selection import train_test_split
import numpy as np
import gc
from tensorflow.keras import Sequential, Model, Input
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, BatchNormalization, Dropout, Flatten
from keras.optimizers import Adam

In [2]:
import tensorflow as tf
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [3]:
gc.collect()

0

In [35]:
image_directory = 'C:\\Users\\wells\\Documents\\Hamoye Projects\\Understanding the Amazon from Space\\train-jpg\\'
test_image_directory = 'C:\\Users\\wells\\Documents\\Hamoye Projects\\Understanding the Amazon from Space\\test-jpg\\'
additional_test_image_directory = 'C:\\Users\\wells\\Documents\\Hamoye Projects\\Understanding the Amazon from Space\\ \\'

In [5]:
train_data = pd.read_csv('train_v2.csv')

In [6]:
train_data.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [7]:
train_data.shape

(40479, 2)

In [8]:
tags = train_data['tags'].apply(lambda x: x.split())

In [9]:
tags

0                                          [haze, primary]
1                     [agriculture, clear, primary, water]
2                                         [clear, primary]
3                                         [clear, primary]
4          [agriculture, clear, habitation, primary, road]
                               ...                        
40474                                     [clear, primary]
40475                                             [cloudy]
40476                        [agriculture, clear, primary]
40477                  [agriculture, clear, primary, road]
40478    [agriculture, cultivation, partly_cloudy, prim...
Name: tags, Length: 40479, dtype: object

In [10]:
# encoding labels

mlb = MultiLabelBinarizer(classes=[
    'agriculture', 'artisinal_mine', 'bare_ground', 'blooming', 'blow_down',
    'clear', 'cloudy', 'conventional_mine', 'cultivation', 'habitation', 'haze',
    'partly_cloudy', 'primary', 'road', 'selective_logging', 'slash_burn', 'water'
])
y = mlb.fit_transform(tags)

In [11]:
# Add encoded labels to the dataframe
for i, class_name in enumerate(mlb.classes_):
    train_data[class_name] = y[:, i]

In [12]:
train_data['image_name'] = train_data['image_name'].apply(lambda x: '{}.jpg'.format(x))
train_data.head()

Unnamed: 0,image_name,tags,agriculture,artisinal_mine,bare_ground,blooming,blow_down,clear,cloudy,conventional_mine,cultivation,habitation,haze,partly_cloudy,primary,road,selective_logging,slash_burn,water
0,train_0.jpg,haze primary,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0
1,train_1.jpg,agriculture clear primary water,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1
2,train_2.jpg,clear primary,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
3,train_3.jpg,clear primary,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,train_4.jpg,agriculture clear habitation primary road,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0


In [13]:
# Define the columns
columns = list(train_data.columns[2:])
columns

['agriculture',
 'artisinal_mine',
 'bare_ground',
 'blooming',
 'blow_down',
 'clear',
 'cloudy',
 'conventional_mine',
 'cultivation',
 'habitation',
 'haze',
 'partly_cloudy',
 'primary',
 'road',
 'selective_logging',
 'slash_burn',
 'water']

In [14]:
gc.collect() #Frequently used to avoid session crashing due to memory exhaustion 

0

In [15]:
# Generating train data generator 
train_generator = ImageDataGenerator(rescale = 1./255., validation_split = 0.2).flow_from_dataframe(dataframe=train_data,
                                                    directory =image_directory, 
                                                    x_col='image_name', y_col=columns, subset='training', 
                                                    batch_size=10,seed=42, shuffle=True, 
                                                    class_mode='raw', target_size=(224,224))

#generating validation data which is expected to be 20% of the train dataset since validation split is 0.2
val_generator = ImageDataGenerator(rescale = 1./255., validation_split = 0.2).flow_from_dataframe(dataframe=train_data,
                                                    directory =image_directory, 
                                                    x_col='image_name', y_col=columns, subset='validation', 
                                                    batch_size=10,seed=42, shuffle=False, 
                                                    class_mode='raw', target_size=(224,224))

Found 32384 validated image filenames.
Found 8095 validated image filenames.


In [24]:
# model architecture
model = Sequential()

model.add(Input(shape = (224, 224, 3)))

model.add(Conv2D(16, 5, activation = 'relu')) 
model.add(BatchNormalization())
model.add(MaxPool2D(pool_size = (2,2)))
model.add(Dropout(0.2))

model.add(Conv2D(32, 5, activation = 'relu')) 
model.add(MaxPool2D(pool_size = (2,2)))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Conv2D(64, 5, activation = 'relu')) 
model.add(MaxPool2D(pool_size = (2,2)))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Conv2D(64, 5, activation = 'relu')) 
model.add(MaxPool2D(pool_size = (2,2)))
model.add(BatchNormalization())
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(128, activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(64, activation = 'relu'))
model.add(Dropout(0.2))

model.add(Dense(17, activation = 'sigmoid'))

In [25]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_10 (Conv2D)          (None, 220, 220, 16)      1216      
                                                                 
 batch_normalization_10 (Bat  (None, 220, 220, 16)     64        
 chNormalization)                                                
                                                                 
 max_pooling2d_10 (MaxPoolin  (None, 110, 110, 16)     0         
 g2D)                                                            
                                                                 
 dropout_10 (Dropout)        (None, 110, 110, 16)      0         
                                                                 
 conv2d_11 (Conv2D)          (None, 106, 106, 32)      12832     
                                                                 
 max_pooling2d_11 (MaxPoolin  (None, 53, 53, 32)      

In [26]:
model.compile(optimizer = Adam(), loss = 'binary_crossentropy', metrics = ['accuracy'])

In [27]:
history = model.fit(x=train_generator, validation_data = val_generator, epochs = 10, verbose = 2)

Epoch 1/10
3239/3239 - 321s - loss: 0.2716 - accuracy: 0.0746 - val_loss: 0.2330 - val_accuracy: 0.0209 - 321s/epoch - 99ms/step
Epoch 2/10
3239/3239 - 432s - loss: 0.2056 - accuracy: 0.0500 - val_loss: 0.1941 - val_accuracy: 0.0112 - 432s/epoch - 133ms/step
Epoch 3/10
3239/3239 - 457s - loss: 0.1847 - accuracy: 0.0635 - val_loss: 0.1697 - val_accuracy: 0.0523 - 457s/epoch - 141ms/step
Epoch 4/10
3239/3239 - 436s - loss: 0.1731 - accuracy: 0.0700 - val_loss: 0.1580 - val_accuracy: 0.0574 - 436s/epoch - 135ms/step
Epoch 5/10
3239/3239 - 441s - loss: 0.1647 - accuracy: 0.0773 - val_loss: 0.1514 - val_accuracy: 0.0665 - 441s/epoch - 136ms/step
Epoch 6/10
3239/3239 - 492s - loss: 0.1586 - accuracy: 0.0831 - val_loss: 0.1555 - val_accuracy: 0.0939 - 492s/epoch - 152ms/step
Epoch 7/10
3239/3239 - 388s - loss: 0.1537 - accuracy: 0.0899 - val_loss: 0.1511 - val_accuracy: 0.0589 - 388s/epoch - 120ms/step
Epoch 8/10
3239/3239 - 357s - loss: 0.1490 - accuracy: 0.0907 - val_loss: 0.1452 - val_accu

In [30]:
os.listdir()

['.ipynb_checkpoints',
 'sample_submission_v2.csv',
 'test-jpg',
 'test_v2_file_mapping.csv',
 'train-jpg',
 'train_v2.csv',
 'Untitled.ipynb']

In [31]:
##adding .jpg extension to image name in the sample submission file
sample_submission = pd.read_csv('sample_submission_v2.csv')
sample_submission1 = sample_submission.copy()
sample_submission1['image_name'] = sample_submission1['image_name'].apply(lambda x: '{}.jpg'.format(x))
sample_submission1.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear agriculture road water
1,test_1.jpg,primary clear agriculture road water
2,test_2.jpg,primary clear agriculture road water
3,test_3.jpg,primary clear agriculture road water
4,test_4.jpg,primary clear agriculture road water


In [32]:
sample_submission1.shape

(61191, 2)

In [33]:
# Divide the sample submission file into two splits,
# first test1_df contains the first 40669 images 
test_df1 = sample_submission1.iloc[:40669]['image_name'].reset_index().drop('index', axis =1)
test_df1.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


In [40]:
#initialize imagedatagenerator for the test images and also rescaling
test_datagen = ImageDataGenerator(rescale = 1/255)

#creating a generator for the images found in the first test image files
test_gen = test_datagen.flow_from_dataframe(dataframe=test_df1,
                                                    directory =test_image_directory, 
                                                    x_col='image_name', y_col = None,
                                                    batch_size=10,seed=42, shuffle=False, 
                                                    class_mode=None, target_size=(224,224))

Found 40669 validated image filenames.


In [41]:
step_test_size1 = int(np.ceil(test_gen.samples/test_gen.batch_size))

In [42]:
# we reset the test generator to avoid shuffling of index 
test_gen.reset()
pred = model.predict(test_gen, steps=step_test_size1, verbose=1)



In [None]:
labels = ['artisinal_mine', 'water', 'conventional_mine', 'selective_logging', 'haze', 'clear', 'habitation', 'slash_burn', 'partly_cloudy', 'cultivation', 'primary', 'cloudy', 'blooming', 'bare_ground', 'agriculture', 'road', 'blow_down']

In [43]:
# Get the filenames in the generator using the attribute .filenames
file_names = test_gen.filenames

# Convert the predicted values to a dataframe and join two labels together if prob(occurrance of the label) > 0.5 
pred_tags = pd.DataFrame(pred)
pred_tags = pred_tags.apply(lambda x: ' '.join(np.array(labels)[x > 0.5]), axis = 1)

#then the result should look like this 
result1 = pd.DataFrame({'image_name': file_names, 'tags': pred_tags})
result1.head()

NameError: name 'labels' is not defined

In [None]:
#second batch of the test dataset
additional_df = sample_submission1.iloc[40669:]['image_name'].reset_index().drop('index', axis =1)
additional_df.head()

In [None]:
#creating a generator for the second batch of test image files
test_gen1 = test_datagen.flow_from_dataframe(dataframe=additional_df, 
                                                directory=additional_test_image_directory, 
                                                x_col='image_name', 
                                                y_col=None, 
                                                batch_size=10, 
                                                shuffle=False, 
                                                class_mode=None, 
                                                target_size=(224,224))

step_test_size2 = int(np.ceil(test_gen1.samples/test_gen1.batch_size))