# Cassava Leaf Disease challenge: Exploratory Data Analysis
## Start: November 29, 2020
## Deadline: February 18, 2021

In [None]:
# Report which device will be used: CUDA only when it is both requested and available.
import torch

USE_GPU = True

cuda_selected = USE_GPU and torch.cuda.is_available()
print('Using device: cuda' if cuda_selected else 'Using device: cpu')

In [None]:
# Enable IPython's autoreload extension. Mode 2 re-imports all modules before each cell
# executes, so edits to imported .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import matplotlib.image as mpimg
sns.set()

In [None]:
# BASE PATHS: ONLY THINGS TO CHANGE WHEN SHARING THIS NOTEBOOK
# Root of the data directory.
BASE_DIR = Path("../input/cassava-leaf-disease-classification")
# Training images live in a sub-directory of the data root.
IMAGE_DIR = BASE_DIR / "train_images"
# Directory for anything this notebook writes.
OUTPUT_DIR = Path("./")

# Seed value for steps that need reproducible randomness.
SEED = 117

In [None]:
# Load the label-number -> disease-name mapping.
# JSON object keys are always strings, so convert them to int while building the dict;
# the mapping is later applied to the `label` column of the training table.
with open(Path(BASE_DIR, "label_num_to_disease_map.json"), 'r') as infile:
    map_classes = {int(label): name for label, name in json.load(infile).items()}
map_classes

In [None]:
# Recursively collect every .jpg file found under the training image directory.
TRAIN_SOURCES = [*IMAGE_DIR.rglob("*.jpg")]
print("Number of train images:", len(TRAIN_SOURCES))

In [None]:
# Read the training table, then attach the readable disease name for each numeric label.
df_train = pd.read_csv(BASE_DIR / "train.csv")
class_names = df_train['label'].map(map_classes)
df_train['class_name'] = class_names
df_train.head()

In [None]:
# Horizontal bar chart of the number of training rows per class name.
fig, ax = plt.subplots(figsize=(8, 4))
sns.countplot(y="class_name", data=df_train, ax=ax);

**OBS: the training set is imbalanced, heavily skewed towards the CMD class (label 3).**

In [None]:
# Count how often each image shape occurs among the first few training images.
# Only a subset is decoded (presumably to keep this cell fast); raise the limit to check more.
N_SHAPE_SAMPLES = 300

img_shapes = {}
# Reuse the file list collected in TRAIN_SOURCES instead of walking the directory tree again.
for image_path in TRAIN_SOURCES[:N_SHAPE_SAMPLES]:
    shape = mpimg.imread(image_path).shape
    img_shapes[shape] = img_shapes.get(shape, 0) + 1

print(img_shapes)

## Batch visualization

In [None]:
from toolbox import visualize_batch

# Draw 9 random training rows; seeding with SEED makes the same batch appear on every run.
tmp_df = df_train.sample(9, random_state=SEED)
image_ids = tmp_df["image_id"].values
labels = tmp_df["class_name"].values

visualize_batch(IMAGE_DIR, image_ids, labels)

### Class 0: Cassava Bacterial Blight (CBB)

In [None]:
# Keep only the rows labelled 0 and report how many there are.
df = df_train[df_train["label"] == 0]
print(f"Train images for class 0: {df.shape[0]}")

# Seeded so that the same 9 examples are displayed on every run.
df = df.sample(9, random_state=SEED)
image_ids = df["image_id"].values
labels = df["label"].values

visualize_batch(IMAGE_DIR, image_ids, labels)

### Class 1: Cassava Brown Streak Disease (CBSD)

In [None]:
# Keep only the rows labelled 1 and report how many there are.
df = df_train[df_train["label"] == 1]
print(f"Train images for class 1: {df.shape[0]}")

# Seeded so that the same 9 examples are displayed on every run.
df = df.sample(9, random_state=SEED)
image_ids = df["image_id"].values
labels = df["label"].values

visualize_batch(IMAGE_DIR, image_ids, labels)

### Class 2: Cassava Green Mottle (CGM)

In [None]:
# Keep only the rows labelled 2 and report how many there are.
df = df_train[df_train["label"] == 2]
print(f"Train images for class 2: {df.shape[0]}")

# Seeded so that the same 9 examples are displayed on every run.
df = df.sample(9, random_state=SEED)
image_ids = df["image_id"].values
labels = df["label"].values

visualize_batch(IMAGE_DIR, image_ids, labels)

### Class 3: Cassava Mosaic Disease (CMD)

In [None]:
# Keep only the rows labelled 3 and report how many there are.
df = df_train[df_train["label"] == 3]
print(f"Train images for class 3: {df.shape[0]}")

# Seeded so that the same 9 examples are displayed on every run.
df = df.sample(9, random_state=SEED)
image_ids = df["image_id"].values
labels = df["label"].values

visualize_batch(IMAGE_DIR, image_ids, labels)

**Main class (most of training data): common disease or unbalanced data?**

### Class 4: healthy

In [None]:
# Keep only the rows labelled 4 and report how many there are.
df = df_train[df_train["label"] == 4]
print(f"Train images for class 4: {df.shape[0]}")

# Seeded so that the same 9 examples are displayed on every run.
df = df.sample(9, random_state=SEED)
image_ids = df["image_id"].values
labels = df["label"].values

visualize_batch(IMAGE_DIR, image_ids, labels)

**OBS:** 
- some images are not close-ups, hence details are hardly identifiable.  
- some plants labelled healthy show stains or spots which is weird.  
- some images are duplicated (according to Kaggle discussions)

Therefore it might be interesting to identify groups within classes and possibly distinguish outliers.
- clustering ? 

For more accurate observations about the dataset:
https://www.kaggle.com/tanulsingh077/how-to-become-leaf-doctor-with-deep-learning

## Clustering to discover outliers and unlikely image classes

Mostly based on: https://www.kaggle.com/tanulsingh077/how-to-become-leaf-doctor-with-deep-learning

In [None]:
#Using Keras here as it is way more convenient than PyTorch
from keras.preprocessing.image import load_img, img_to_array, ImageDataGenerator
from keras.applications.resnet50 import preprocess_input, ResNet50
from keras.models import Model


def extract_features(image_id, model, target_size=(224, 224)):
    """Run one training image through `model` and return its prediction.

    Parameters
    ----------
    image_id : str
        File name of the image, resolved relative to `IMAGE_DIR`.
    model : object
        Anything exposing `predict`; it is called on a batch holding this single image.
    target_size : tuple of int, optional
        Size passed to `load_img` when resizing the image. Defaults to (224, 224),
        the value that was previously hard-coded.

    Returns
    -------
    The output of `model.predict` for the one-image batch.
    """
    source = Path(IMAGE_DIR, image_id)
    # Add a leading batch axis of size 1 without hard-coding the image dimensions.
    image = np.array(load_img(source, target_size=target_size))[np.newaxis, ...]
    image = preprocess_input(image)

    # `use_multiprocessing` is not passed: Keras documents it as applying to generator /
    # Sequence inputs only, so it did nothing for an in-memory array, and newer Keras
    # versions no longer accept it in `predict`.
    features = model.predict(image)

    return features

In [None]:
# Input size used when images are loaded by the generator.
image_size = (224, 224)

# Generator that applies the ResNet50 preprocessing function to every image it yields.
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# Pretrained network, truncated so that the output of its second-to-last layer is returned
# instead of the final layer's output.
pretrained = ResNet50()
model = Model(inputs=pretrained.inputs, outputs=pretrained.layers[-2].output)

In [None]:
# Work on an explicit copy of the class-1 rows: assigning a column on a filtered slice of
# df_train triggers pandas' SettingWithCopyWarning.
selection = df_train[df_train['label'] == 1].copy()
# Labels are converted to strings before being used as `y_col`
# (presumably required by class_mode="categorical" — TODO confirm).
selection["label"] = selection["label"].astype(str)

# shuffle=False keeps the generator's output order fixed across runs.
flow = datagen.flow_from_dataframe(
    selection,
    IMAGE_DIR,
    x_col="image_id",
    y_col="label",
    class_mode="categorical",
    target_size=image_size,
    batch_size=32,
    shuffle=False
)

# One feature vector per image yielded by `flow`.
features = model.predict(
    flow,
    verbose=1,
    use_multiprocessing=True
)

In [None]:
from sklearn.cluster import KMeans

# Partition the extracted feature vectors into K clusters (fixed random_state).
K = 5
clt = KMeans(n_clusters=K, random_state=0).fit(features)

# Map each cluster index to the image ids assigned to it.
# Assumes row i of `features` corresponds to row i of `selection` — TODO confirm that the
# generator dropped no rows.
groups = {
    cluster: selection['image_id'].iloc[clt.labels_ == cluster].tolist()
    for cluster in range(K)
}

In [None]:
from toolbox import view_cluster

# Iterate over the clusters that were actually built rather than a hard-coded count,
# so changing K in the clustering cell cannot cause a KeyError or skip clusters here.
for k, cluster_image_ids in groups.items():
    view_cluster(IMAGE_DIR, cluster_image_ids)

In [None]:
from sklearn.decomposition import KernelPCA

# Project the feature vectors onto 2 components with an RBF-kernel PCA, for plotting.
reduc = KernelPCA(
    n_components=2,
    kernel='rbf',
)
features_red = reduc.fit_transform(features)

# Cluster centres in the original feature space (not used further in this cell).
centers = clt.cluster_centers_

In [None]:
# 2-D view of the reduced features, coloured by K-means cluster assignment.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments
# (0.11 only emitted a FutureWarning).
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=features_red[:, 0], y=features_red[:, 1], hue=clt.labels_, ax=ax)
ax.set(xlabel="Component 1", ylabel="Component 2");

## Over-sampling for handling imbalanced data distribution

In [None]:
from imbalanced import ImbalancedDatasetSampler

In [None]:
# from joblib import Parallel, delayed

# model = ResNet50() #Pretrained NN
# model = Model(inputs=model.inputs, outputs=model.layers[-2].output)


# N = 1000
# idxs = np.random.randint(df_train.shape[0], size=N)
# image_ids = df_train['image_id'].iloc[idxs].tolist()
# features = [extract_features(image_id, model)
#                        for image_id in image_ids]
# features = np.array(features).reshape(-1, 2048)
# # features = Parallel(n_jobs=-1)(delayed(extract_features)(image_id, model) for image_id in list(df_train['image_id']))
# labels = df_train['label'].iloc[idxs].tolist()

## Image augmentations

In [None]:
import albumentations as alb
from albumentations.pytorch import ToTensorV2

# Augmentations pipeline: a fixed resize followed by augmentations that are each applied
# with probability 0.5.
augmentation_steps = [
    alb.Resize(224, 224),
    alb.HorizontalFlip(p=0.5),
    alb.VerticalFlip(p=0.5),
    alb.CoarseDropout(max_holes=5, max_height=8, max_width=8, p=0.5),
    alb.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
    # Disabled step, kept for reference:
    # alb.JpegCompression(quality_lower=99, quality_upper=100, always_apply=True, p=1.0),
]
transforms = alb.Compose(augmentation_steps)

In [None]:
# Pick 9 random training rows for the augmentation preview; seeded with SEED so the same
# images are selected on every run.
df = df_train.sample(9, random_state=SEED)
image_ids = df["image_id"].values
labels = df["label"].values

In [None]:
# Original image (left column) next to its augmented version (right column) for the
# first three sampled images.
fig, ax = plt.subplots(3, 2, figsize=(12, 14))
for (ax_original, ax_augmented), image_id in zip(ax, image_ids[:3]):
    original = mpimg.imread(IMAGE_DIR / image_id)
    augmented = transforms(image=original)["image"]
    for axis, picture in ((ax_original, original), (ax_augmented, augmented)):
        axis.imshow(picture)
        axis.axis("off")