<a href="https://colab.research.google.com/github/SergheiMihailov/ml-project-cassava/blob/main/create_training_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install keras-tuner, which provides the `kerastuner` module imported in the next cell
# (-q: quiet output, -U: upgrade to the newest release).
# TODO(review): the version is unpinned, so a fresh run may pull a different release than the one used here.
!pip install -q -U keras-tuner

[?25l[K     |█████▏                          | 10kB 20.6MB/s eta 0:00:01[K     |██████████▍                     | 20kB 24.1MB/s eta 0:00:01[K     |███████████████▋                | 30kB 26.5MB/s eta 0:00:01[K     |████████████████████▉           | 40kB 28.7MB/s eta 0:00:01[K     |██████████████████████████      | 51kB 27.6MB/s eta 0:00:01[K     |███████████████████████████████▎| 61kB 29.1MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 8.9MB/s 
[?25h  Building wheel for keras-tuner (setup.py) ... [?25l[?25hdone
  Building wheel for terminaltables (setup.py) ... [?25l[?25hdone


In [2]:
# Imports
import gdown
import os
import json
import csv   
import cv2
import numpy as np
import pandas as pd
from PIL import Image
import scipy.misc
from sklearn.model_selection import StratifiedKFold
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.preprocessing.image import ImageDataGenerator
from keras import backend as K
import tensorflow.keras.layers.experimental.preprocessing as keras_preproc
import kerastuner as kt
from pprint import pprint


In [3]:
# Download provided dataset
!wget files.brainfriz.com/train_images.zip # secondary link for images
!unzip -qq -o train_images.zip
!gdown --id "1xbEVK_NigW_5ngwKMHvuOTehYhT2v2WF" # labels
!gdown --id "1SvI9dN2_25c2OlevwK4TjmzBNysjE_PO" # label mapping

--2021-03-13 20:20:59--  http://files.brainfriz.com/train_images.zip
Resolving files.brainfriz.com (files.brainfriz.com)... 138.201.201.196
Connecting to files.brainfriz.com (files.brainfriz.com)|138.201.201.196|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://files.brainfriz.com/train_images.zip [following]
--2021-03-13 20:21:00--  https://files.brainfriz.com/train_images.zip
Connecting to files.brainfriz.com (files.brainfriz.com)|138.201.201.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2569658627 (2.4G) [application/zip]
Saving to: ‘train_images.zip’


2021-03-13 20:22:21 (30.6 MB/s) - ‘train_images.zip’ saved [2569658627/2569658627]

Downloading...
From: https://drive.google.com/uc?id=1xbEVK_NigW_5ngwKMHvuOTehYhT2v2WF
To: /content/train.csv
100% 358k/358k [00:00<00:00, 5.72MB/s]
Downloading...
From: https://drive.google.com/uc?id=1SvI9dN2_25c2OlevwK4TjmzBNysjE_PO
To: /content/label_num_to_disease_map.j

In [10]:
IMG_SIZE = 512

# Augmentation pipeline shared by every augment_image() call; built on first use.
# Re-running this cell resets it to None, so a changed IMG_SIZE takes effect.
_augmentation_pipeline = None

def _get_augmentation_pipeline():
  """Return the shared Keras augmentation pipeline, building it on first use."""
  global _augmentation_pipeline
  if _augmentation_pipeline is None:
    _augmentation_pipeline = tf.keras.Sequential([
      keras_preproc.Resizing(IMG_SIZE, IMG_SIZE),
      keras_preproc.RandomRotation(0.2),
      # Bounds are (lower, upper); negative factors zoom in, so this zooms in
      # by a random amount of up to 30%.
      keras_preproc.RandomZoom((-0.3, 0)),
    ])
  return _augmentation_pipeline

def augment_image(img):
  """Resize an image to IMG_SIZE x IMG_SIZE and apply a random rotation and zoom.

  Args:
    img: image array in BGR channel order, as returned by cv2.imread.

  Returns:
    A PIL.Image holding the augmented image as uint8 RGB.

  Raises:
    ValueError: if img is None (cv2.imread returns None for unreadable files).
  """
  if img is None:
    raise ValueError('augment_image received None instead of an image array '
                     '(cv2.imread returns None when a file cannot be read)')

  img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
  # The Keras layers work on batches, so add a leading batch dimension of 1.
  img4d = tf.expand_dims(img, 0)

  # training=True is passed explicitly: the random layers are only active in
  # training mode, and the default used when it is omitted is not something
  # to rely on across TensorFlow versions.
  aug_img_arr = _get_augmentation_pipeline()(img4d, training=True)

  # Drop the batch dimension and convert the float output back to 8-bit pixels.
  aug_img = Image.fromarray(aug_img_arr.numpy()[0].astype(np.uint8))

  return aug_img

def add_train_datapoint_cassava(image, image_id, label, train_images_dir_path, train_csv_path):
    """Save one image to disk and append its (image_id, label) row to a CSV file.

    Args:
        image: object with a ``save(path)`` method (e.g. a PIL image).
        image_id: file name to store the image under; also written to the CSV.
        label: class label written to the CSV next to ``image_id``.
        train_images_dir_path: directory that receives the image; created if
            missing. A trailing path separator is optional.
        train_csv_path: CSV file to append to. No header row is written here.
    """
    # exist_ok=True makes this a no-op when the directory is already there,
    # without a separate existence check.
    os.makedirs(train_images_dir_path, exist_ok=True)

    # os.path.join gives the same result with or without a trailing separator
    # on the directory; plain concatenation would write "dirname<image_id>"
    # next to the directory when the separator is missing.
    image.save(os.path.join(train_images_dir_path, str(image_id)))

    # newline='' is what the csv module expects for files it writes; it stops
    # the platform's newline translation from adding blank rows.
    with open(train_csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, ['image_id', 'label'])
        writer.writerow({'image_id': image_id, 'label': label})


In [11]:
def get_data_with_label(data, label):
  """Return the rows of `data` whose 'label' column equals `label`."""
  matches_label = data['label'] == label
  return data.loc[matches_label]

original_data = pd.read_csv('train.csv')

original_data.to_csv('orig_and_aug.csv')

train_path = 'train_images/'

unique_labels = set(original_data['label'])

n_aug_for_balance = {}

for label in unique_labels:
  n_aug_for_balance[label] = len(original_data) - len(get_data_with_label(original_data, label))

n_aug_for_balance_largest_class = min(n_aug_for_balance.values())

for label in unique_labels:
  n_aug_for_balance[label] -= n_aug_for_balance_largest_class

!rm -rf orig_and_aug.csv orig_and_aug_train_images/

for label in n_aug_for_balance.keys():
  data_filtered_by_label = get_data_with_label(original_data, label)
  for i in range(n_aug_for_balance[label]):
    print(label, i)
    datapoint_to_augment = data_filtered_by_label.iloc[i % len(data_filtered_by_label)]

    image = cv2.imread(train_path + datapoint_to_augment['image_id'])
    augmented_image = augment_image(image)
    
    add_train_datapoint_cassava(
        image=augmented_image, 
        image_id='aug_'+str(label)+'_'+str(i)+'_'+datapoint_to_augment['image_id'], 
        label=datapoint_to_augment['label'],
        train_images_dir_path='orig_and_aug_train_images/',
        train_csv_path='orig_and_aug.csv'
        )

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
4 5581
4 5582
4 5583
4 5584
4 5585
4 5586
4 5587
4 5588
4 5589
4 5590
4 5591
4 5592
4 5593
4 5594
4 5595
4 5596
4 5597
4 5598
4 5599
4 5600
4 5601
4 5602
4 5603
4 5604
4 5605
4 5606
4 5607
4 5608
4 5609
4 5610
4 5611
4 5612
4 5613
4 5614
4 5615
4 5616
4 5617
4 5618
4 5619
4 5620
4 5621
4 5622
4 5623
4 5624
4 5625
4 5626
4 5627
4 5628
4 5629
4 5630
4 5631
4 5632
4 5633
4 5634
4 5635
4 5636
4 5637
4 5638
4 5639
4 5640
4 5641
4 5642
4 5643
4 5644
4 5645
4 5646
4 5647
4 5648
4 5649
4 5650
4 5651
4 5652
4 5653
4 5654
4 5655
4 5656
4 5657
4 5658
4 5659
4 5660
4 5661
4 5662
4 5663
4 5664
4 5665
4 5666
4 5667
4 5668
4 5669
4 5670
4 5671
4 5672
4 5673
4 5674
4 5675
4 5676
4 5677
4 5678
4 5679
4 5680
4 5681
4 5682
4 5683
4 5684
4 5685
4 5686
4 5687
4 5688
4 5689
4 5690
4 5691
4 5692
4 5693
4 5694
4 5695
4 5696
4 5697
4 5698
4 5699
4 5700
4 5701
4 5702
4 5703
4 5704
4 5705
4 5706
4 5707
4 5708
4 5709
4 5710
4 5711
4 5712
4 5713
4 57

In [None]:
# Bundle the augmented image directory into a single archive for download.
# NOTE(review): orig_and_aug.csv, which holds the labels for these images, is not part of this archive.
!zip -r orig_and_aug_train_images.zip orig_and_aug_train_images

In [14]:
# google.colab is only importable inside a Colab runtime, so this cell is Colab-specific.
from google.colab import files
# NOTE(review): the CSV download is disabled; the zip below contains only images,
# so the labels in orig_and_aug.csv have to be fetched separately.
# files.download('orig_and_aug.csv')
files.download('orig_and_aug_train_images.zip') 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:

# def getEfficientNetB0():
#   return [
#       # architecture
#       tf.keras.applications.EfficientNetB0(
#         include_top=True, weights=None, input_tensor=None,
#         input_shape=INPUT_SHAPE, pooling=None, classes=N_CLASSES,
#         classifier_activation='softmax', drop_connect_rate=0.4
#       ),
#       # preprocess_input
#       tf.keras.applications.efficientnet.preprocess_input
#   ]

# def getResNet50V2(): 
#   return [
#       # architecture 
#       tf.keras.applications.ResNet50V2(
#         include_top=True, weights=None, input_tensor=None,
#         input_shape=INPUT_SHAPE, pooling=None, classes=N_CLASSES,
#         classifier_activation='softmax'
#       ),
#       # preprocess_input
#       tf.keras.applications.resnet_v2.preprocess_input
#   ]

# def getMobileNetV3Small(): 
#   return [
#       # architecture
#       tf.keras.applications.MobileNetV3Small(
#         input_shape=INPUT_SHAPE, alpha=1, minimalistic=True, include_top=True,
#         weights=None, input_tensor=None, classes=5, pooling='avg',
#         dropout_rate=0, classifier_activation='softmax'
#       ),
#       # preprocess_input:
#       tf.keras.applications.mobilenet_v3.preprocess_input
#   ]

# IMG_SIZE = 512
# SIZE = (IMG_SIZE,IMG_SIZE)
# INPUT_SHAPE = (IMG_SIZE, IMG_SIZE, 3)
# CLASSES = 5
# BATCH_SIZE = 16
# N_CV_SPLITS = 3

In [None]:
# # Train with cross-validation
# data = pd.read_csv('train.csv')
# f = open('label_num_to_disease_map.json')
# real_labels = json.load(f)
# real_labels = {int(k):v for k,v in real_labels.items()}
# data['class_name'] = data.label.map(real_labels)

# train_path = 'train_images/'

# imageDataGenerator = ImageDataGenerator()

# def model_builder(hp):
#   architecture, preprocess_input = getMobileNetV3Small()

#   input_layer = preprocess_input(tf.keras.layers.Input(shape=INPUT_SHAPE))

#   model = tf.keras.Model(input_layer, architecture(input_layer))
  
#   hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

#   hp_label_smoothing = hp.Choice('label_smoothing', values=[1e-2, 1e-3, 1e-4])

#   model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),
#                 loss=keras.losses.BinaryCrossentropy(label_smoothing=hp_label_smoothing),
#                 metrics=['accuracy'])

#   return model

# tuner = kt.Hyperband(model_builder,
#                      objective='val_accuracy',
#                      max_epochs=10,
#                      factor=3,
#                      directory='hyperparams',
#                      project='cassava'
#                      )

# stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# best_hps = None

# tune_set = imageDataGenerator.flow_from_dataframe(data,
#               directory = train_path,
#               x_col = 'image_id',
#               y_col = 'class_name',
#               target_size = SIZE,
#               color_mode="rgb",
#               class_mode = 'categorical',
#               batch_size = BATCH_SIZE)

# # Tune hyperparameters on first cross-validation (refactor later to use saved hps)
# tuner.search(tune_set, epochs=50, callbacks=[stop_early])
# # Get the optimal hyperparameters
# best_hps=tuner.get_best_hyperparameters(num_trials=1)[0]
# print(f"""
# The hyperparameter search is complete. The optimal learning rate for the optimizer
# is {best_hps.get('learning_rate')}.
# """)

# kfold = StratifiedKFold(n_splits = N_CV_SPLITS)
# cv_index = 0

# for train_indices, val_indices in kfold.split(data['image_id'], data['label']):
#   print('Training on cross-validation split '+str(cv_index))
#   train_ds = data.iloc[train_indices]
#   val_ds = data.iloc[val_indices]

#   train_set = imageDataGenerator.flow_from_dataframe(train_ds,
#                                   subset='training',
#                                   directory = train_path,
#                                   x_col = 'image_id',
#                                   y_col = 'class_name',
#                                   target_size = SIZE,
#                                   color_mode="rgb",
#                                   class_mode = 'categorical',
#                                   batch_size = BATCH_SIZE)

#   val_set = imageDataGenerator.flow_from_dataframe(val_ds,
#                                   directory = train_path,
#                                   x_col = 'image_id',
#                                   y_col = 'class_name',
#                                   target_size = SIZE,
#                                   color_mode="rgb",
#                                   class_mode = 'categorical',
#                                   batch_size = BATCH_SIZE)
  

#   model = tuner.hypermodel.build(best_hps)
#   history = model.fit(
#         train_set,
#         steps_per_epoch=train_set.n // 32,
#         epochs=30,
#         validation_data=val_set,
#         validation_steps=val_set.n // 32
#   )

#   val_acc_per_epoch = history.history['val_accuracy']
#   best_epoch = val_acc_per_epoch.index(max(val_acc_per_epoch)) + 1
#   print('Best epoch: %d' % (best_epoch,))


INFO:tensorflow:Reloading Oracle from existing project hyperparams/untitled_project/oracle.json
Found 21397 validated image filenames belonging to 5 classes.

Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
learning_rate     |0.0001            |?                 
tuner/epochs      |2                 |?                 
tuner/initial_e...|0                 |?                 
tuner/bracket     |2                 |?                 
tuner/round       |0                 |?                 

Epoch 1/2
Epoch 2/2


ValueError: ignored