## 💥 Includes

* Creates an RGB image dataset of size 512x512 for quick prototyping. Each RGB image is created by stacking the red (microtubules), green (protein of interest) and blue (nucleus) stain images; the yellow (endoplasmic reticulum) stain is not included.
* Weights and Biases [Artifacts](https://docs.wandb.ai/artifacts) for Dataset versioning. I am splitting the `train.csv` file into train and validation splits. They are logged as Artifacts. 
    * Random Train-Validation split
    * Stratified Train-Validation split.
    
    
### Datasets

* [HPA: 256x256 dataset](https://www.kaggle.com/ayuraj/HPA256x256DATASET)
* [HPA: 512x512 dataset](https://www.kaggle.com/ayuraj/HPA512X512DATASET)

### To use Artifacts

* For Random Split

```Python
import wandb
run = wandb.init()
artifact = run.use_artifact('ayush-thakur/hpa/split:v0', type='dataset')
artifact_dir = artifact.download()
```

* For Stratified Split

```Python
import wandb
run = wandb.init()
artifact = run.use_artifact('ayush-thakur/hpa/stratified_split:v0', type='dataset')
artifact_dir = artifact.download()
```

![](https://i.imgur.com/xO31ZUL.png)

## ❄️ Imports and Setups

In [None]:
%%capture
!pip install wandb --upgrade

In [None]:
import os
import re
import cv2
import glob
import imageio
import numpy as np
import pandas as pd
import seaborn as sns
from PIL import Image
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from skimage.transform import resize
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Fetch the secret stored under the label "wandb_api" from the Kaggle secrets client.
user_secrets = UserSecretsClient()
wandb_api = user_secrets.get_secret("wandb_api")

# Authenticate this session with Weights & Biases using that key.
wandb.login(key=wandb_api)

In [None]:
# Root of the competition input data; prefix for the train/ and test/ globs and for train.csv.
WORKING_DIR_PATH = '../input/hpa-single-cell-image-classification/'
# Target size in pixels, passed to cv2.resize when the stacked images are built.
IMAGE_HEIGHT = 512
IMAGE_WIDTH = 512

In [None]:
# Ref: https://www.kaggle.com/divyanshuusingh/eda-image-segmentation
# Class names listed in class-index order: the position in this tuple is the
# integer label id (0..18).
_LABEL_NAMES_IN_ORDER = (
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Intermediate filaments",
    "Actin filaments",
    "Microtubules",
    "Mitotic spindle",
    "Centrosome",
    "Plasma membrane",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Vesicles and punctate cytosolic patterns",
    "Negative",
)

# Mapping from integer label id to its human-readable name.
label_names = dict(enumerate(_LABEL_NAMES_IN_ORDER))

Paths to the per-stain (channel) image files.

# 🚅 Train Dataset

In [None]:
# Gather the train-set files of each stain as a sorted list of paths
# (one list per stain, in the order red, green, blue, yellow).
red_images, green_images, blue_images, yellow_images = [
    sorted(glob.glob(WORKING_DIR_PATH + pattern))
    for pattern in (
        'train/*_red.png',
        'train/*_green.png',
        'train/*_blue.png',
        'train/*_yellow.png',
    )
]

# The four counts should be equal if every image has all four stains.
print(*(len(paths) for paths in (red_images, green_images, blue_images, yellow_images)))

In [None]:
# Sanity check: position i of every sorted list must refer to the same image.
# The image id is extracted from each path with the regex below; the red
# channel path is printed for every position where the four ids disagree.
for r, g, b, y in zip(red_images, green_images, blue_images, yellow_images):
    ids = [re.findall(r'[^\/]+(?=\_.)', path)[0] for path in (r, g, b, y)]
    if len(set(ids)) > 1:
        print(r)

In [None]:
# Directory under which the stacked train images are written.
TRAIN_SAVE_DIR = '/kaggle/tmp/hpa_512x512_dataset/train/'

# Create the 'rgb' sub-directory (with parents); no error if it already exists.
os.makedirs(TRAIN_SAVE_DIR+'rgb', exist_ok=True)

# List the directory to confirm it was created.
!ls /kaggle/tmp/hpa_512x512_dataset/train/

In [None]:
# Build one 3-channel PNG per train image from its red, green and blue stain
# files. The three path lists are sorted, so index i addresses the same image
# in each of them.
for i in tqdm(range(len(red_images))):
    # Image ID
    image_id = re.findall(r'[^\/]+(?=\_.)', red_images[i])[0]

    # Load each stain as an array; the context manager closes the file handle
    # as soon as the pixels have been copied.
    with Image.open(red_images[i]) as stain:
        red = np.array(stain)
    with Image.open(green_images[i]) as stain:
        green = np.array(stain)
    with Image.open(blue_images[i]) as stain:
        blue = np.array(stain)

    # cv2.imwrite interprets a 3-channel array as B, G, R. Stacking in that
    # order is what puts the red stain into the red channel of the saved PNG,
    # so the file on disk is a true RGB image for any reader.
    image_bgr = np.dstack((blue, green, red))
    # Resize; cv2.resize takes dsize as (width, height).
    image_bgr = cv2.resize(image_bgr, (IMAGE_WIDTH, IMAGE_HEIGHT), interpolation=cv2.INTER_AREA)
    # Save image. cv2.imwrite reports failure through its return value instead
    # of raising, so check it to avoid silently producing an incomplete dataset.
    out_path = TRAIN_SAVE_DIR+'rgb/'+image_id+'.png'
    if not cv2.imwrite(out_path, image_bgr):
        raise OSError(f'Could not write image to {out_path}')

In [None]:
# Number of entries now present in the train 'rgb' output directory.
print(len(os.listdir(TRAIN_SAVE_DIR+'rgb/')))

# ⛽ Test Dataset

In [None]:
# Gather the test-set files of each stain as a sorted list of paths
# (one list per stain, in the order red, green, blue, yellow).
# Note: this rebinds the same names that previously held the train paths.
red_images, green_images, blue_images, yellow_images = [
    sorted(glob.glob(WORKING_DIR_PATH + pattern))
    for pattern in (
        'test/*_red.png',
        'test/*_green.png',
        'test/*_blue.png',
        'test/*_yellow.png',
    )
]

# The four counts should be equal if every image has all four stains.
print(*(len(paths) for paths in (red_images, green_images, blue_images, yellow_images)))

In [None]:
# Sanity check: position i of every sorted list must refer to the same image.
# The image id is extracted from each path with the regex below; the red
# channel path is printed for every position where the four ids disagree.
for r, g, b, y in zip(red_images, green_images, blue_images, yellow_images):
    ids = [re.findall(r'[^\/]+(?=\_.)', path)[0] for path in (r, g, b, y)]
    if len(set(ids)) > 1:
        print(r)

In [None]:
# Directory under which the stacked test images are written.
TEST_SAVE_DIR = '/kaggle/tmp/hpa_512x512_dataset/test/'

# Create the 'rgb' sub-directory (with parents); no error if it already exists.
os.makedirs(TEST_SAVE_DIR+'rgb', exist_ok=True)

# List the directory to confirm it was created.
!ls /kaggle/tmp/hpa_512x512_dataset/test/

In [None]:
# Build one 3-channel PNG per test image from its red, green and blue stain
# files. The three path lists are sorted, so index i addresses the same image
# in each of them.
for i in tqdm(range(len(red_images))):
    # Image ID
    image_id = re.findall(r'[^\/]+(?=\_.)', red_images[i])[0]

    # Load each stain as an array; the context manager closes the file handle
    # as soon as the pixels have been copied.
    with Image.open(red_images[i]) as stain:
        red = np.array(stain)
    with Image.open(green_images[i]) as stain:
        green = np.array(stain)
    with Image.open(blue_images[i]) as stain:
        blue = np.array(stain)

    # cv2.imwrite interprets a 3-channel array as B, G, R. Stacking in that
    # order is what puts the red stain into the red channel of the saved PNG,
    # so the file on disk is a true RGB image for any reader.
    image_bgr = np.dstack((blue, green, red))
    # Resize; cv2.resize takes dsize as (width, height). INTER_AREA is used so
    # test images are preprocessed exactly like the train images.
    image_bgr = cv2.resize(image_bgr, (IMAGE_WIDTH, IMAGE_HEIGHT), interpolation=cv2.INTER_AREA)
    # Save image. cv2.imwrite reports failure through its return value instead
    # of raising, so check it to avoid silently producing an incomplete dataset.
    out_path = TEST_SAVE_DIR+'rgb/'+image_id+'.png'
    if not cv2.imwrite(out_path, image_bgr):
        raise OSError(f'Could not write image to {out_path}')

In [None]:
# Number of entries now present in the test 'rgb' output directory.
print(len(os.listdir(TEST_SAVE_DIR+'rgb/')))

# 🎪 Create Kaggle Dataset

In [None]:
# Copy the Kaggle API token from the attached 'apitoken' input to /root/.kaggle/
! mkdir -p /root/.kaggle/
! cp ../input/apitoken/kaggle.json /root/.kaggle/kaggle.json
# Initialize dataset creation in the folder that holds the generated images
! kaggle datasets init -p /kaggle/tmp/hpa_512x512_dataset

In [None]:
# Inspect the contents of the dataset folder.
!ls /kaggle/tmp/hpa_512x512_dataset/

In [None]:
%%bash
# Write dataset-metadata.json (title, id and license) into the dataset folder.
echo "{
  \"title\": \"HPA: 512x512 dataset\",
  \"id\": \"ayuraj/HPA512X512DATASET\",
  \"licenses\": [
    {
      \"name\": \"CC0-1.0\"
    }
  ]
}" > /kaggle/tmp/hpa_512x512_dataset/dataset-metadata.json

In [None]:
# First upload: create the dataset from the folder, packing sub-directories as tar archives.
!kaggle datasets create -p /kaggle/tmp/hpa_512x512_dataset/ -u --dir-mode tar
# Alternative for later uploads (kept for reference): publish a new version of the existing dataset.
# ! kaggle datasets version -p /kaggle/tmp/hpa_512x512_dataset -m "add rgb images"  --dir-mode tar

In [None]:
# Delete the copied Kaggle API token once the upload step is done.
!rm -rf /root/.kaggle/kaggle.json

# 🎳 Dataset Versioning with W&B

In this section we will create train and validation datasets from `train.csv`, and use Weights and Biases Artifacts to version them.

🐤 Quick introduction on Weights and Biases Artifacts

You can use W&B Artifacts to store and keep track of datasets, models, and evaluation results across machine learning pipelines. Think of an artifact as a versioned folder of data. You can store entire datasets directly in artifacts, or use artifact references to point to data in other systems.

Learn more about W&B artifacts [here](https://docs.wandb.ai/artifacts). Check out this [YouTube tutorial](https://www.youtube.com/watch?v=Hd94gatGMic&list=PLD80i8An1OEGajeVo15ohAQYF1Ttle0lk&index=3) as well.

In [None]:
# Load the training labels; later cells read the 'ID' and 'Label' columns of this frame.
df_train = pd.read_csv(WORKING_DIR_PATH+'train.csv')
df_train.head()

In [None]:
# Ref: https://www.kaggle.com/thedrcat/hpa-single-cell-classification-eda
def plot_data_distribution(df):
    """Plot and return per-label counts for the ``Label`` column of ``df``.

    Each row's ``Label`` is a '|'-separated string of label ids. For every
    label id '0'..'18' two numbers are computed:

    * unique count: rows whose ``Label`` is exactly that single id;
    * full count: rows whose ``Label`` contains that id among its parts.

    Both are drawn as overlaid horizontal bars, ordered by descending full
    count.

    Returns:
        A tuple ``(unique_counts, full_counts)`` of dicts keyed by the label
        id as a string.
    """
    labels = [str(i) for i in range(19)]

    # Rows labelled with this id and nothing else.
    unique_counts = {lbl: len(df[df.Label == lbl]) for lbl in labels}

    # Rows in which this id appears, alone or together with other ids.
    full_counts = {
        lbl: sum(1 for row_label in df['Label'] if lbl in row_label.split('|'))
        for lbl in labels
    }

    # One (label, full_count, unique_count) row per label, largest full count first.
    table = [(int(lbl), full_counts[lbl], unique_counts[lbl]) for lbl in labels]
    table.sort(key=lambda row: -row[1])
    counts = pd.DataFrame(np.array(table),
                          columns=['label', 'full_count', 'unique_count'])
    label_order = counts.label.values

    sns.set(style="whitegrid")
    _fig, ax = plt.subplots(figsize=(16, 12))

    # Full counts, drawn with the "pastel" blue.
    sns.set_color_codes("pastel")
    sns.barplot(x="full_count", y="label", data=counts, order=label_order,
                label="full count", color="b", orient='h')

    # Unique counts, drawn on the same axes with the "muted" blue.
    sns.set_color_codes("muted")
    sns.barplot(x="unique_count", y="label", data=counts, order=label_order,
                label="unique count", color="b", orient='h')

    # Legend, axis labels and frame clean-up.
    ax.legend(ncol=2, loc="lower right", frameon=True)
    ax.set(ylabel="", xlabel="Counts")
    sns.despine(left=True, bottom=True)

    return unique_counts, full_counts

In [None]:
# Plot the label distribution of the whole train.csv and keep the per-label full counts.
_, full_count = plot_data_distribution(df_train)

### Log as Artifact

* Log `train.csv` as an artifact, since it is the raw dataset. 
* Different splits of this raw dataset will follow. We want to train and validate our model on a meaningful split of the dataset. 

In [None]:
# Log the untouched train.csv as the 'raw' dataset artifact.
run = wandb.init(entity='ayush-thakur', project='hpa', job_type='dataset_creation')
artifact = wandb.Artifact('raw', type='dataset')
artifact.add_file(WORKING_DIR_PATH+'train.csv')
run.log_artifact(artifact)
# End the run. run.finish() is the supported call; run.join() is a deprecated alias for it.
run.finish()

## 🎱 Random Train-Validation Split

In [None]:
# Shuffle all rows. The fixed seed makes the shuffle repeatable across
# notebook runs, which matters because the result feeds a versioned artifact.
df_train_shuffled = df_train.sample(frac=1, random_state=42)
df_train_shuffled.head()

In [None]:
# 80/20 random split. Seeded (same seed as the stratified split) so that the
# same input frame always yields the same train/validation partition.
train_split, val_split = train_test_split(df_train_shuffled, test_size=0.2, random_state=42)

In [None]:
# Report the number of rows in each split.
print(f'Training split got {len(train_split.values)} and valdiation split got {len(val_split.values)}')

### Plot Train Split distribution

In [None]:
# Plot the label distribution of the random train split and keep its per-label full counts.
_, train_full_count = plot_data_distribution(train_split)

### Plot Validation Split distribution

In [None]:
# Plot the label distribution of the random validation split and keep its per-label full counts.
_, val_full_count = plot_data_distribution(val_split)

### Log the splits as Artifact

In [None]:
# Write both random splits to CSV files in the current directory (row index omitted).
train_split.to_csv('train_split.csv', index=False)
val_split.to_csv('val_split.csv', index=False)

run = wandb.init(entity='ayush-thakur', project='hpa', job_type='dataset_split')

# Declare the raw artifact as an input of this run.
artifact_raw = run.use_artifact('ayush-thakur/hpa/raw:v0', type='dataset')

# Bundle the two CSV files into one 'split' dataset artifact and log it.
artifact = wandb.Artifact('split', type='dataset')
artifact.add_file('train_split.csv')
artifact.add_file('val_split.csv')
run.log_artifact(artifact)
# End the run. run.finish() is the supported call; run.join() is a deprecated alias for it.
run.finish()

## ⚽ Stratified Train-Validation Split

Stratify based on the combination of labels of each image. Images whose label combination occurs only once cannot be stratified, so they are all put into the training split.
Another similar stratification idea can be found in this [Stack Overflow thread](https://stackoverflow.com/questions/54890899/not-able-to-use-stratified-k-fold-on-multi-label-classifier).

In [None]:
# Ref: https://www.kaggle.com/samusram/hpa-classifier-explainability-segmentation/comments#Plan

# Canonical key per row: the '|'-separated label ids sorted numerically, so the
# same set of labels always maps to the same key regardless of the order in
# train.csv. The string must be split on '|' first: sorting its characters
# would merge distinct combinations such as '10|4' and '14|0'.
label_combinations = df_train['Label'].map(
    lambda x: '|'.join(sorted(x.split('|'), key=int))
)

# Combinations that occur exactly once cannot be stratified.
label_combinations_counts = label_combinations.value_counts()
unique_label_combs = label_combinations_counts.index[(label_combinations_counts == 1).values]
print(f'There are {len(unique_label_combs)} images with unique label combinations out of {len(label_combinations)}.')

# IDs of the images with a one-off combination; they all go to the train split.
is_unique_combo = label_combinations.isin(unique_label_combs)
train_ids_unique_combs = df_train['ID'].loc[is_unique_combo]

# Stratified 80/20 split of the remaining images, keyed on the label combination.
non_unique_combo_bool_idx = ~is_unique_combo
train_ids, val_ids = train_test_split(df_train['ID'].loc[non_unique_combo_bool_idx].values,
                                      test_size=0.2,
                                      stratify=label_combinations.loc[non_unique_combo_bool_idx],
                                      random_state=42)

train_ids = np.concatenate((train_ids, train_ids_unique_combs))

print(f'Number of training samples: {len(train_ids)} and validation samples: {len(val_ids)}')

In [None]:
# Turn the two ID arrays back into DataFrames by selecting the matching rows of df_train.
in_train = df_train['ID'].isin(train_ids)
in_val = df_train['ID'].isin(val_ids)
stratified_train_split = df_train[in_train]
stratified_val_split = df_train[in_val]

In [None]:
# Plot the label distribution of the stratified train split and keep its per-label full counts.
_, stratified_train_full_count = plot_data_distribution(stratified_train_split)

In [None]:
# Plot the label distribution of the stratified validation split and keep its per-label full counts.
_, stratified_val_full_count = plot_data_distribution(stratified_val_split)

### Log the Stratified splits as Artifact

In [None]:
# Write both stratified splits to CSV files in the current directory (row index omitted).
stratified_train_split.to_csv('stratified_train_split.csv', index=False)
stratified_val_split.to_csv('stratified_val_split.csv', index=False)

run = wandb.init(entity='ayush-thakur', project='hpa', job_type='dataset_stratified_split')

# Declare the raw artifact as an input of this run.
artifact_raw = run.use_artifact('ayush-thakur/hpa/raw:v0', type='dataset')

# Bundle the two CSV files into one 'stratified_split' dataset artifact and log it.
artifact = wandb.Artifact('stratified_split', type='dataset')
artifact.add_file('stratified_train_split.csv')
artifact.add_file('stratified_val_split.csv')
run.log_artifact(artifact)
# End the run. run.finish() is the supported call; run.join() is a deprecated alias for it.
run.finish()