## A few important points about the problem
- This is a weakly supervised multi-label classification problem. (Weakly supervised learning is a machine learning framework where the model is trained using examples that are only partially annotated or labeled. Here, we are only provided with image level labels.)

- Each sample consists of four files. Each file represents a different filter on the subcellular protein patterns represented by the sample. 
  - The format should be [filename]_[filter color].png for the PNG files. Colors are <span style="background-color: #FF0000">red for microtubule channels</span>, <span style="background-color: #0000FF">blue for nuclei channels</span>, <span style="background-color: #FFFF00">yellow for Endoplasmic Reticulum (ER) channels</span>, and <span style="background-color: #00FF00">green for the protein of interest.</span>
  - The green filter should hence be used to predict the label, and the other filters are used as references.
- Since this is a multi-label problem, each image is given a set of labels. Following are the index to label mappings used for this problem.

|                     Labels                   |
|----------------------------------------------|
| 0. Nucleoplasm                               |
| 1. Nuclear membrane                          |
| 2. Nucleoli                                  |
| 3. Nucleoli fibrillar center                 |
| 4. Nuclear speckles                          |
| 5. Nuclear bodies                            |
| 6. Endoplasmic reticulum                     |
| 7. Golgi apparatus                           |
| 8. Intermediate filaments                    |
| 9. Actin filaments                           |
| 10. Microtubules                             |
| 11. Mitotic spindle                          |
| 12. Centrosome                               |
| 13. Plasma membrane                          |
| 14. Mitochondria                             |
| 15. Aggresome                                |
| 16. Cytosol                                  |
| 17. Vesicles and punctate cytosolic patterns |
| 18. Negative                                 |


### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import cv2

import matplotlib.pyplot as plt
import seaborn as sns
import plotly as py
import plotly.express as px
py.offline.init_notebook_mode (connected = True)

import tqdm
%matplotlib inline

In [None]:
# Root folder of the competition data. File names are appended with `+`,
# so the trailing slash is required.
path = '../input/hpa-single-cell-image-classification/'

In [None]:
# Image-level training table; later cells read its 'ID' and 'Label' columns.
train_df = pd.read_csv(path + 'train.csv')

### Converting multi labels in one hot encodings

In [None]:
# Maps a class index (as a string, the way it appears in the '|'-separated
# Label column) to its human readable name. The position in the list below
# is the class index.
label_to_name = {
    str(index): name
    for index, name in enumerate([
        "Nucleoplasm",
        "Nuclear membrane",
        "Nucleoli",
        "Nucleoli fibrillar center",
        "Nuclear speckles",
        "Nuclear bodies",
        "Endoplasmic reticulum",
        "Golgi apparatus",
        "Intermediate filaments",
        "Actin filaments",
        "Microtubules",
        "Mitotic spindle",
        "Centrosome",
        "Plasma membrane",
        "Mitochondria",
        "Aggresome",
        "Cytosol",
        "Vesicles and punctate cytosolic patterns",
        "Negative",
    ])
}

In [None]:
def view_df(df, idx=0, full=False):
    '''
    Display part of a dataframe without pandas truncating rows or columns.

    With ``full=True`` the first rows (``df.head()``) are shown; otherwise
    only the row at position ``idx`` is shown, laid out as a single wide row.
    '''
    shown = df.head() if full else pd.DataFrame(df.iloc[idx]).T
    # Lift the row/column display limits only for this one display call.
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):
        display(shown)

In [None]:
train = train_df.copy()
lab_idx = label_to_name.keys()

# Turn the raw "a|b|c" label string into the list ['a', 'b', 'c'].
train['Label'] = train['Label'].map(lambda raw: raw.split('|'))

# Add one 0/1 indicator column per class, named after the class.
for label in lab_idx:
    train[label_to_name[label]] = [
        int(label in labels) for labels in train['Label']
    ]

In [None]:
# Show the row at position 4 of the one-hot encoded table as one wide row.
view_df(train, 4)

In [None]:
# Show the first rows of the one-hot encoded table with every column visible.
view_df(train, full=True)

In [None]:
import plotly.graph_objects as go
import plotly.figure_factory as ff

def barplot(x, y, c, x_title, y_title, title):
    """Show a plotly bar chart of ``y`` against ``x``, colored by ``c``.

    ``x_title`` and ``y_title`` become the axis labels and ``title`` is
    centered above the chart.
    """
    axis_labels = {'x': x_title, 'y': y_title}
    fig = px.bar(x=x, y=y, color=c, opacity=0.90, labels=axis_labels)
    fig.update_layout(title_text=title, title_x=0.5)
    fig.show()

def piechart(x, y, c, title):
    """Show a plotly pie chart with slice names ``x``, slice sizes ``y`` and
    slice colors keyed by ``c``; ``title`` is centered above the chart."""
    pie = px.pie(names=x, values=y, color=c)
    pie.update_layout(title_text=title, title_x=0.5)
    pie.show()
    
def heatmap(x, df=None):
    """Show a heatmap of the pairwise correlation between label columns.

    Parameters
    ----------
    x : sequence of str
        Tick names used on both axes; expected to match the order of the
        label columns left after dropping 'ID' and 'Label'.
    df : pandas.DataFrame, optional
        One-hot encoded table to correlate. Defaults to the notebook-level
        ``train`` frame, which is what the original one-argument call used.
    """
    if df is None:
        df = train
    # Only the indicator columns take part in the correlation.
    z = df.drop(['ID', 'Label'], axis=1).corr()
    # A correlation always lies in [-1, 1], so pin the color range to it.
    fig = go.Figure(
        data=go.Heatmap(x=x, y=x, z=z, zmin=-1, zmax=1, colorscale='rainbow')
    )
    fig.show()

### Number of Images per Label

In [None]:
names = list(label_to_name.values())
# Summing a 0/1 indicator column counts its positives. Unlike
# value_counts()[1] this gives 0 (not a KeyError) for a class without any
# image, and it addresses columns by name instead of by position.
values = [train[name].sum() for name in names]

barplot(values, names, names, 'Number of images', 'Name of labels', 'Number of images per label')
piechart(names, values, names, 'Number of images per label')

#### Observation from above:
- Images with Nucleoplasm label are highest in number (8497)
- Negative labelled images are quite low (34)
- About 50% of the label occurrences belong to just 3 labels (Nucleoplasm, Cytosol, Plasma membrane)

### Number of labels per image

In [None]:
from collections import Counter

# key: number of labels on an image, value: number of images with that count
num_labels = Counter(map(len, train['Label']))
print(num_labels.keys())
print(num_labels.values())

barplot(
    x=num_labels.keys(),
    y=num_labels.values(),
    c=num_labels.keys(),
    x_title='Number of labels',
    y_title='Number of images',
    title='Number of labels per image',
)
piechart(
    x=num_labels.keys(),
    y=num_labels.values(),
    c=num_labels.keys(),
    title='Number of labels per image',
)

#### Observation from above
- About 10412 images have only one label.
- About 99% of images have at most 3 labels and 88.7% of images have at most 2 labels.

### Correlation between labels 

In [None]:
names = list(label_to_name.values())
# Positives per class, taken as the sum of each 0/1 indicator column so a
# class without any image yields 0 instead of a KeyError.
values = [train[name].sum() for name in names]

heatmap(names)

#### Observations from above
- Nucleoplasm and Nuclear speckles are highly correlated.
- Cytosol and mitochondria are also correlated to some extent.

### Train Image dataset


In [None]:
def read_image(img_name, base_dir='../input/hpa-single-cell-image-classification/train/'):
    """Load the four single-channel PNG files that make up one sample.

    Parameters
    ----------
    img_name : str
        Sample ID; each file is looked up as
        ``{base_dir}{img_name}_{color}.png``.
    base_dir : str, optional
        Folder prefix including the trailing slash. Defaults to the
        competition train folder.

    Returns
    -------
    tuple
        ``(green, red, blue, yellow)`` images, each read in grayscale mode.

    Raises
    ------
    FileNotFoundError
        If any of the four files cannot be read. ``cv2.imread`` signals
        this by returning ``None`` rather than raising, which would
        otherwise surface later as an obscure error.
    """
    channels = []
    for color in ('green', 'red', 'blue', 'yellow'):
        file_path = '{}{}_{}.png'.format(base_dir, img_name, color)
        channel = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)
        if channel is None:
            raise FileNotFoundError(f"Could not read image file: {file_path}")
        channels.append(channel)

    return tuple(channels)


def remove_ticks(ax):
    """Strip the tick labels and the left/bottom tick marks from ``ax``."""
    for clear_labels in (ax.set_xticklabels, ax.set_yticklabels):
        clear_labels([])
    ax.tick_params(left=False, bottom=False)

The images below show each of the four channels of a single sample.

In [None]:
from matplotlib.colors import LinearSegmentedColormap
img_name = '0060269e-bbbc-11e8-b2ba-ac1f6b6435d0'
print(f"Image name is: {img_name}")
green, red, blue, yellow = read_image(img_name)

#reset seaborn style
sns.reset_orig()

# creating custom color map
# Anchor rows for one color component of a colormap:
#   _RAMP_OFF keeps the component at 0 over the whole input range,
#   _RAMP_ON  raises it from 0 to 1 by 75% of the range and holds it at 1.
_RAMP_OFF = ((0.0, 0.0, 0.0),
             (1.0, 0.0, 0.0))
_RAMP_ON = ((0.0, 0.0, 0.0),
            (0.75, 1.0, 1.0),
            (1.0, 1.0, 1.0))

# black -> green
c1 = {'red': _RAMP_OFF, 'green': _RAMP_ON, 'blue': _RAMP_OFF}
# black -> red
c2 = {'red': _RAMP_ON, 'green': _RAMP_OFF, 'blue': _RAMP_OFF}
# black -> blue
c3 = {'red': _RAMP_OFF, 'green': _RAMP_OFF, 'blue': _RAMP_ON}
# black -> yellow (red and green together)
c4 = {'red': _RAMP_ON, 'green': _RAMP_ON, 'blue': _RAMP_OFF}

green_map = LinearSegmentedColormap('Green', c1)
red_map = LinearSegmentedColormap('Red', c2)
blue_map = LinearSegmentedColormap('Blue', c3)
yellow_map = LinearSegmentedColormap('Yellow', c4)

f, axarr = plt.subplots(nrows=2, ncols=2, figsize=(14, 14))

# One (axes, image, colormap, title) entry per channel. Every channel is
# drawn with the colormap of its own filter color; in particular the nuclei
# channel uses blue_map, matching its 'Blue: Nuclei' title.
_panels = [
    (axarr[0, 0], green, green_map, 'Green: Protein of interest'),
    (axarr[0, 1], red, red_map, 'Red: Microtubule'),
    (axarr[1, 0], blue, blue_map, 'Blue: Nuclei'),
    (axarr[1, 1], yellow, yellow_map, 'Yellow: Endoplasmic Reticulum (ER)'),
]
for _panel_ax, _channel, _channel_map, _panel_title in _panels:
    _panel_ax.imshow(_channel, cmap=_channel_map)
    _panel_ax.set_title(_panel_title)
    remove_ticks(_panel_ax)

# Render the figure (plt.plot() with no data only adds an empty line plot).
plt.show()

The reference for the above is taken from [this](https://www.kaggle.com/jschnab/exploring-the-human-protein-atlas-images) notebook

In [None]:
def threshold(img, img_name, cmap='Greys'):
    """Show four thresholded versions of ``img`` side by side.

    The panels are, left to right: fixed binary threshold at 40, truncation
    at 40, adaptive mean threshold and adaptive Gaussian threshold (both
    with block size 21 and constant 4). ``img_name`` is only used in the
    panel titles and ``cmap`` is the colormap used for every panel.
    """
    _, binary = cv2.threshold(img, 40, 255, cv2.THRESH_BINARY)
    _, truncated = cv2.threshold(img, 40, 255, cv2.THRESH_TRUNC)
    adaptive_mean = cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 21, 4
    )
    adaptive_gauss = cv2.adaptiveThreshold(
        img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 21, 4
    )

    panels = (
        (binary, f'Binary Threshold: {img_name}'),
        (truncated, f'Trunc Threshold: {img_name}'),
        (adaptive_mean, f'Adaptive Mean Thresholding: {img_name}'),
        (adaptive_gauss, f'Adaptive Gaussian Thresholding: {img_name}'),
    )

    f, ax = plt.subplots(nrows=1, ncols=4, figsize=(23, 23))
    for axis, (result, panel_title) in zip(ax, panels):
        axis.imshow(result, cmap=cmap)
        axis.set_title(panel_title)
        remove_ticks(axis)
    plt.show()

#### Thresholded images across all the 4 channels

In [None]:
print("Thresholding with channels")
# Each channel is thresholded and drawn with its own filter colormap.
for _channel, _title, _cmap in (
    (green, 'Protein', green_map),
    (red, 'Microtubules', red_map),
    (blue, 'Nuclei', blue_map),
    (yellow, 'Endoplasmic Reticulum', yellow_map),
):
    threshold(_channel, _title, _cmap)

#### Thresholding images across all 4 channels with Grey color map

In [None]:
print("Thresholding with B/W")
# Same four channels, drawn with threshold()'s default grey colormap.
for _channel, _title in (
    (green, 'Protein'),
    (red, 'Microtubules'),
    (blue, 'Nuclei'),
    (yellow, 'Endoplasmic Reticulum'),
):
    threshold(_channel, _title)

### Otsu’s Binarization

- In the thresholding method above, the threshold value was chosen arbitrarily. Otsu's binarization automatically calculates a threshold value from the image histogram of a bimodal image. 


In [None]:
# Load the four channels of the sample image used for the Otsu experiment.
img_name = '0060269e-bbbc-11e8-b2ba-ac1f6b6435d0'
print(f"Image name is: {img_name}")
green, red, blue, yellow = read_image(img_name)

In [None]:
# taking the picture for endoplasmic reticulum
img = yellow

# Gaussian-smoothed copy, used as input for the third variant
blur = cv2.GaussianBlur(img, (5, 5), 0)

# global thresholding with a fixed value
ret1, th1 = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)

# Otsu's thresholding: the 0 is a placeholder, the threshold is derived
# from the image histogram
ret2, th2 = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# Otsu's thresholding after Gaussian filtering
ret3, th3 = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

# One row per variant: source image, placeholder for its histogram, result.
images = [img, 0, th1,
          img, 0, th2,
          blur, 0, th3]

titles = ['Original Noisy Image','Histogram','Global Thresholding (v=127)',
          'Original Noisy Image','Histogram',"Otsu's Thresholding",
          'Gaussian filtered Image','Histogram',"Otsu's Thresholding"]

f, ax = plt.subplots(nrows=3, ncols=3, figsize=(20, 20))

for i, (source_ax, hist_ax, result_ax) in enumerate(ax):
    source, _, result = images[3 * i: 3 * i + 3]
    source_title, hist_title, result_title = titles[3 * i: 3 * i + 3]

    source_ax.imshow(source, cmap='Greys')
    source_ax.set_title(source_title)
    remove_ticks(source_ax)

    # Histogram of the source pixel intensities with 256 bins.
    hist_ax.hist(source.ravel(), 256)
    hist_ax.set_title(hist_title)
    remove_ticks(hist_ax)

    result_ax.imshow(result, cmap='Greys')
    result_ax.set_title(result_title)
    remove_ticks(result_ax)
plt.show()

To study more about Otsu's Binarization, go to [this link](https://opencv-python-tutroals.readthedocs.io/en/latest/py_tutorials/py_imgproc/py_thresholding/py_thresholding.html#otsus-binarization). 

### Visualize RGB images for each class of the label

In [None]:
for label in train.columns[2:]:
    # First training sample that carries this label.
    img_name = train[train[label] == 1].iloc[0]['ID']
    green, red, blue, yellow = read_image(img_name)

    # imshow reads the last axis of an (H, W, 3) array as (R, G, B), so the
    # channels must be stacked in that order for each filter to appear in
    # its own color. The yellow (ER) channel is not part of the composite.
    img = np.dstack((red, green, blue))

    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(5, 5))
    ax.imshow(img)
    ax.set_title(label+" : Image name> "+img_name)
    remove_ticks(ax)
    plt.show()

### U-Net (from scratch)

- U-Net is a convolutional network architecture for fast and precise segmentation of images. 
- U-net was used for segmenting biomedical images in the original paper.

Following are the steps that can be followed here:
* Apply a 3x3 convolution twice, each followed by a ReLU.
* do a max pooling with filter 2x2 stride of 2 for downsampling
* Repeat it for 5 steps
* Then perform up conv


In [None]:
import torch
import torch.nn as nn
# Red-channel PNG of one training sample, used as the sample input for the
# network's forward pass.
im = '../input/hpa-single-cell-image-classification/train/000a6c98-bb9b-11e8-b2b9-ac1f6b6435d0_red.png'


In [None]:
def double_conv(in_channel, out_channel):
    """Return two stacked 3x3 convolutions, each followed by an in-place ReLU.

    The first convolution maps ``in_channel`` to ``out_channel`` channels and
    the second keeps ``out_channel``. No padding is used, so each
    convolution trims 2 pixels from height and width.
    """
    layers = []
    for channels_in in (in_channel, out_channel):
        layers.append(nn.Conv2d(channels_in, out_channel, kernel_size=3))
        layers.append(nn.ReLU(inplace=True))

    return nn.Sequential(*layers)
    
    
def crop_tensor(original, target):
    """Center-crop ``original`` to the spatial size of ``target``.

    Both tensors are indexed as (batch, channel, height, width). Height and
    width are handled independently, and the crop window always has exactly
    the target's size, so the result can be concatenated with ``target``
    along the channel axis even when the size difference is odd.

    Parameters
    ----------
    original : torch.Tensor
        Tensor to crop; must be at least as large as ``target`` in both
        spatial dimensions.
    target : torch.Tensor
        Tensor whose height and width define the crop size.

    Returns
    -------
    torch.Tensor
        View of ``original`` with the target's height and width.

    Raises
    ------
    ValueError
        If ``target`` is spatially larger than ``original``.
    """
    target_h, target_w = target.size()[2], target.size()[3]
    original_h, original_w = original.size()[2], original.size()[3]

    if target_h > original_h or target_w > original_w:
        raise ValueError(
            f"cannot crop {original_h}x{original_w} to larger size "
            f"{target_h}x{target_w}"
        )

    # For an odd difference the extra pixel is dropped at the bottom/right.
    top = (original_h - target_h) // 2
    left = (original_w - target_w) // 2
    return original[:, :, top: top + target_h, left: left + target_w]



class UNet(nn.Module):
    """U-Net style encoder/decoder built from ``double_conv`` blocks.

    The encoder applies five double-convolution stages with 2x2 max pooling
    between them, going from ``in_channels`` to 1024 feature channels. The
    decoder upsamples four times with 2x2 transposed convolutions; after
    each upsampling the matching encoder feature map is center-cropped and
    concatenated along the channel axis before another double convolution.
    A final 1x1 convolution maps the 64 decoder channels to ``num_classes``.

    Because the double convolutions are unpadded, the output is spatially
    smaller than the input (e.g. 572x572 in gives 388x388 out).

    Parameters
    ----------
    in_channels : int, optional
        Number of channels of the input image. Defaults to 1.
    num_classes : int, optional
        Number of channels of the output map. Defaults to 2.
    """

    def __init__(self, in_channels=1, num_classes=2):
        super().__init__()

        # Shared by all encoder stages; halves height and width.
        self.max_pool_2x2 = nn.MaxPool2d(
            kernel_size=2,
            stride=2
        )

        # Encoder (contracting path).
        self.down_conv_1 = double_conv(in_channels, 64)
        self.down_conv_2 = double_conv(64, 128)
        self.down_conv_3 = double_conv(128, 256)
        self.down_conv_4 = double_conv(256, 512)
        self.down_conv_5 = double_conv(512, 1024)

        # Decoder (expanding path). Each transposed convolution doubles the
        # spatial size and halves the channels; the following double_conv
        # takes twice as many input channels because the cropped encoder
        # features are concatenated to the upsampled map.
        self.conv_trans_1 = nn.ConvTranspose2d(
            1024,
            512,
            kernel_size=2,
            stride=2
        )
        self.up_conv_1 = double_conv(1024, 512)

        self.conv_trans_2 = nn.ConvTranspose2d(
            512,
            256,
            kernel_size=2,
            stride=2
        )
        self.up_conv_2 = double_conv(512, 256)

        self.conv_trans_3 = nn.ConvTranspose2d(
            256,
            128,
            kernel_size=2,
            stride=2
        )
        self.up_conv_3 = double_conv(256, 128)

        self.conv_trans_4 = nn.ConvTranspose2d(
            128,
            64,
            kernel_size=2,
            stride=2
        )
        self.up_conv_4 = double_conv(128, 64)

        # 1x1 convolution producing the per-pixel class scores.
        self.out = nn.Conv2d(
            64,
            num_classes,
            kernel_size=1
        )

    def forward(self, image):
        """Run the network on ``image`` of shape (batch, in_channels, H, W)
        and return the raw score map of shape (batch, num_classes, H', W')."""
        # encoder: keep every stage's output for the skip connections
        x1 = self.down_conv_1(image)
        x2 = self.down_conv_2(self.max_pool_2x2(x1))
        x3 = self.down_conv_3(self.max_pool_2x2(x2))
        x4 = self.down_conv_4(self.max_pool_2x2(x3))
        x5 = self.down_conv_5(self.max_pool_2x2(x4))

        # decoder: upsample, crop the encoder features to the upsampled
        # size, concatenate on the channel axis, then convolve.
        # For a 572x572 input x4 is 64x64 while the upsampled x is 56x56,
        # hence the crop.
        x = self.conv_trans_1(x5)
        x = torch.cat([x, crop_tensor(x4, x)], dim=1)
        x = self.up_conv_1(x)

        x = self.conv_trans_2(x)
        x = torch.cat([x, crop_tensor(x3, x)], dim=1)
        x = self.up_conv_2(x)

        x = self.conv_trans_3(x)
        x = torch.cat([x, crop_tensor(x2, x)], dim=1)
        x = self.up_conv_3(x)

        x = self.conv_trans_4(x)
        x = torch.cat([x, crop_tensor(x1, x)], dim=1)
        x = self.up_conv_4(x)

        return self.out(x)

In [None]:
from torchvision import transforms

# 572x572 is the input size used in the original U-Net paper.
width = 572
height = 572

# Read the sample as a single-channel grayscale image.
image = cv2.imread(im, cv2.IMREAD_GRAYSCALE)
image = cv2.resize(image, (width, height))
plt.imshow(image, cmap='magma')
tran = transforms.ToTensor()

# unsqueeze(0) adds the batch axis expected by the network.
img_tensor = tran(image).unsqueeze(0)
print(img_tensor.shape)

model = UNet()
# This is only a shape/sanity check of the untrained network, so skip the
# autograd bookkeeping to keep memory use down.
with torch.no_grad():
    output = model(img_tensor)
output

[Cell Segmentation](https://github.com/CellProfiling/HPA-Cell-Segmentation) can be used to extract segment masks for each cell.

To be continued .   .  . .. 