Credits:
* https://www.kaggle.com/rdizzl3/hpa-segmentation-masks-no-internet
* https://www.kaggle.com/frlemarchand/generate-masks-from-weak-image-level-labels/


# Installation

In [None]:
# Install the segmentation dependencies from attached input datasets (local
# paths, so no network access is needed for these three installs).
!pip install -q "../input/pycocotools/pycocotools-2.0-cp37-cp37m-linux_x86_64.whl"
!pip install -q "../input/hpapytorchzoozip/pytorch_zoo-master"
!pip install -q "../input/hpacellsegmentatormaster/HPA-Cell-Segmentation-master"

In [None]:
# torchnet provides the mAPMeter used by the MeanAP metric below.
# NOTE(review): unlike the installs above this one names a package rather than
# a local path, so it presumably needs internet access - confirm.
! pip3 install torchnet

In [None]:
import os
# Making pretrained weights work without needing to find the default filename:
# create the torch hub checkpoint cache so weights can be copied into it.
# exist_ok=True makes this safe to re-run and avoids the check-then-create race.
os.makedirs('/root/.cache/torch/hub/checkpoints/', exist_ok=True)
# Copy the offline weights under the filename torchvision expects in its cache.
# !cp '../input/resnet50/resnet50.pth' '/root/.cache/torch/hub/checkpoints/resnet50-19c8e357.pth'
!cp '../input/resnet34/resnet34.pth' '/root/.cache/torch/hub/checkpoints/resnet34-333f7ec4.pth'

In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm.autonotebook import tqdm
import imageio

## Dataloader

In [None]:
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from torchvision import transforms
import torch
from PIL import Image

In [None]:
def get_one_hot(labels, num_classes=19):
    """Convert pipe-separated label strings into a multi-hot float tensor.

    Args:
        labels: iterable of strings such as ``"0|5|16"``, one per sample,
            where each ``|``-separated token is an integer class index.
        num_classes: width of the encoding. Defaults to 19 (classes 0-18),
            the value that was previously hard-coded.

    Returns:
        ``torch.float32`` tensor of shape ``(len(labels), num_classes)`` with
        1.0 at every listed class index and 0.0 elsewhere.

    Raises:
        ValueError: if a token is not an integer.
        IndexError: if a class index is outside ``num_classes``.
    """
    labels = list(labels)

    # Start from zeros so each row only needs its positive classes set.
    one_hotted_labels = np.zeros((len(labels), num_classes))

    for row, label in enumerate(labels):
        class_ids = [int(token) for token in label.split("|")]
        one_hotted_labels[row, class_ids] = 1

    return torch.tensor(one_hotted_labels).float()


class HPADataSet(Dataset):
    """Dataset yielding ``(image, label)`` pairs built from per-channel PNGs.

    Each sample is stored on disk as separate ``<id>_<colour>.png`` files
    inside ``image_dir``; the red, blue and green files are stacked into one
    3-channel tensor.
    """

    def __init__(self, image_dir, images, labels):
        """
        Args:
            image_dir: directory containing the ``<id>_<colour>.png`` files.
            images: indexable collection of image ids.
            labels: indexable collection of labels, aligned with ``images``.
        """
        self.image_dir = image_dir
        self.images = images
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def _load_channel(self, img_id, colour):
        """Read one colour channel of ``img_id`` into a numpy array."""
        # The context manager closes the file handle as soon as it is decoded.
        with Image.open(f"{self.image_dir}/{img_id}_{colour}.png") as channel:
            return np.array(channel)

    def build_image(self, img_id):
        """Return the (red, blue, green) stack for ``img_id`` scaled by 1/255.

        The yellow channel is intentionally not read: it was previously loaded
        and decoded for every sample but never used in the stacked image.
        NOTE(review): build_image_names in this file treats blue as nuclei and
        yellow as ER; the earlier inline comments here had those two swapped.
        """
        r = self._load_channel(img_id, "red")
        b = self._load_channel(img_id, "blue")
        g = self._load_channel(img_id, "green")  # protein of interest

        return torch.tensor(np.stack([r, b, g]) / 255, dtype=torch.float)

    def __getitem__(self, idx):
        image_id = self.images[idx]
        label = self.labels[idx]
        image = self.build_image(image_id)

        return image, label

    @classmethod
    def from_csv(cls, csv, image_dir):
        """Build a dataset from a CSV with ``ID`` and ``Label`` columns.

        ``Label`` values are pipe-separated class indices and are converted to
        multi-hot vectors with ``get_one_hot``.
        """
        df = pd.read_csv(csv)
        images_id = df["ID"].to_numpy()
        one_hot_labels = get_one_hot(df["Label"])

        return cls(image_dir, images_id, one_hot_labels)


In [None]:
# Directory holding the per-channel training PNGs consumed by HPADataSet.
# presumably a 512x512 resized copy of the competition images - TODO confirm
train_images_dir = "/kaggle/input/hpa-512512"
# Competition CSV with image ids and their labels.
train_csv_path = "/kaggle/input/hpa-single-cell-image-classification/train.csv"
# NOTE(review): the name `df` is reassigned later in the submission cells.
df = pd.read_csv(train_csv_path)

In [None]:
# import matplotlib.pyplot as plt
# x,y = dataSet[0]
# imm = transforms.ToPILImage()(x)
# plt.imshow(imm)

TensorBoard Stuff

In [None]:
# From Github Gist: https://gist.github.com/hantoine/4e7c5bc6748861968e61e60bab89e9b0
from urllib.request import urlopen
from io import BytesIO
from zipfile import ZipFile
from subprocess import Popen
from os import chmod
from os.path import isfile
import json
import time
import psutil

def download_and_unzip(url, extract_to='.'):
    """Download a zip archive from ``url`` and extract it into ``extract_to``.

    The whole archive is buffered in memory before extraction.

    Args:
        url: address of the zip archive, in any scheme ``urlopen`` accepts.
        extract_to: destination directory (defaults to the current directory).
    """
    # Close the connection as soon as the payload has been read.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())
    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)


def run_cmd_async_unsafe(cmd):
    """Start ``cmd`` through the shell and return immediately.

    "Unsafe" because the string is handed to the shell unescaped, so it must
    never be built from untrusted input.

    Returns:
        The ``Popen`` handle of the spawned process; the caller decides
        whether to wait on it.
    """
    child = Popen(cmd, shell=True)
    return child


def is_process_running(process_name):
    """Return True if any running process is named exactly ``process_name``.

    Names are pre-fetched through ``process_iter(['name'])`` rather than
    calling ``proc.name()`` on each process, which could raise if a process
    exited or was inaccessible while the list was being walked.
    """
    return any(
        proc.info['name'] == process_name
        for proc in psutil.process_iter(['name'])
    )

def launch_tensorboard():
    """Start TensorBoard and expose it through an ngrok tunnel.

    TensorBoard and ngrok are each started only if no process with that name
    is already running, and the ngrok binary is downloaded into the current
    directory if it is missing. The tunnel's public URL is printed.

    Returns:
        ``(tb_process, ngrok_process)``: the ``Popen`` handles started by this
        call; an entry is ``None`` when that process was already running.

    Raises:
        AssertionError: if the ngrok API reports no tunnels.
    """
    tb_process, ngrok_process = None, None

    # Launch TensorBoard
    if not is_process_running('tensorboard'):
        tb_command = 'tensorboard --logdir ./logs/ --host 0.0.0.0 --port 6006'
        tb_process = run_cmd_async_unsafe(tb_command)

    # Install ngrok
    if not isfile('./ngrok'):
        ngrok_url = 'https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.zip'
        download_and_unzip(ngrok_url)
        chmod('./ngrok', 0o755)

    # Create ngrok tunnel and print its public URL
    if not is_process_running('ngrok'):
        ngrok_process = run_cmd_async_unsafe('./ngrok http 6006')
        time.sleep(1)  # Waiting for ngrok to start the tunnel

    # Query ngrok's local API; the context manager closes the HTTP response.
    with urlopen('http://127.0.0.1:4040/api/tunnels', timeout=10) as ngrok_api_res:
        tunnels = json.load(ngrok_api_res)['tunnels']

    # Raised explicitly (same exception type as before) so the check is not
    # stripped when Python runs with -O.
    if len(tunnels) == 0:
        raise AssertionError('ngrok tunnel not found')

    tb_public_url = tunnels[0]['public_url']
    print(f'TensorBoard URL: {tb_public_url}')

    return tb_process, ngrok_process





# tb_process, ngrok_process = launch_tensorboard()

In [None]:
! ls /kaggle/working

Metrics

In [None]:
from sklearn.metrics import confusion_matrix
import torchnet.meter as meter

class Metric:
    """Base class for streaming metrics.

    Subclasses implement ``calculation``; calling the instance folds each new
    result into a running mean over all calls since the last ``reset``.
    """

    def __init__(self):
        self.running_total, self.call_count = 0, 0

    def __call__(self, predictions, labels):
        """Score one batch and return the running mean so far."""
        # The call is counted before calculation runs, so a failing
        # calculation still leaves call_count incremented.
        self.call_count += 1
        batch_value = self.calculation(predictions, labels)
        self.running_total += batch_value
        return self.running_total / self.call_count

    def calculation(self, predictions, labels):
        """Compute the metric for a single batch (subclass hook)."""
        raise NotImplementedError

    def reset(self):
        """Clear the accumulated total and call counter."""
        self.running_total, self.call_count = 0, 0

class MeanAP(Metric):
    """Mean Average Precision, averaged over batches by ``Metric``."""

    def __init__(self):
        super().__init__()

    def __str__(self):
        return "Mean Average Precision"

    def calculation(self, predictions, labels, logits=True):
        """Return torchnet's mAP value for one batch.

        Args:
            predictions: model outputs for the batch.
            labels: targets for the batch.
            logits: when true, ``predictions`` are passed through a sigmoid
                before being handed to the meter.
        """
        scores = torch.sigmoid(predictions) if logits else predictions
        # A fresh meter per batch, so the value returned is batch-local and
        # the streaming average is left to the Metric base class.
        batch_meter = meter.mAPMeter()
        batch_meter.add(scores, labels)
        return batch_meter.value()
        
class MetricManager:
    """Manages all metrics during training."""

    def __init__(self, metrics, writer=None):
        """
        Args:
            metrics (list(Metric)): metrics to update on every step.
            writer: optional object with an ``add_scalar(tag, value, step)``
                method (e.g. a TensorBoard SummaryWriter); results are logged
                to it when given.
        """
        self.metrics = metrics
        self.writer = writer

    def update(self, preds, labels, step):
        """Feed one batch to every metric and log the results at ``step``."""
        # Detach and move to CPU once instead of once per metric.
        preds = preds.detach().cpu()
        labels = labels.detach().cpu()
        for metric in self.metrics:
            self._update_metric(metric, preds, labels, step)

    def _update_metric(self, metric, preds, labels, step):
        """Update a single metric and log it under ``str(metric)``."""
        result = metric(preds, labels)
        if self.writer:
            self.writer.add_scalar(str(metric), result, step)

    def reset(self):
        """Call reset method on all metrics."""
        # Plain loop: this runs purely for its side effect.
        for metric in self.metrics:
            metric.reset()

In [None]:
from torch.utils.tensorboard import SummaryWriter

# default `log_dir` is "runs" - we'll be more specific here.
# This path sits under ./logs/, the --logdir that launch_tensorboard passes
# to TensorBoard.
writer = SummaryWriter('./logs/train-run2')

## loss

In [None]:
# Use torch.nn.BCEWithLogitsLoss as the multi-label loss for now; try a multi-label focal loss later.

## Model

In [None]:
import torchvision.models as models

# Build a ResNet-18 with no pretrained weights requested.
# NOTE(review): the weights copied into the torch hub cache earlier are for
# resnet34 and are not used by this model - confirm which is intended.
resnet18 = models.resnet18()# will prob use pretrained
# Replace the classifier head with one producing 19 outputs, matching the
# 19-wide multi-hot labels built by get_one_hot.
resnet18.fc = torch.nn.Linear(512,19) # need to create new custom backbone

# Move the model to the GPU.
resnet18.cuda()
# presumably a trailing literal so the cell shows `1` instead of the model repr
1

## Trainer

In [None]:
import time

class Training:
    def __init__(self, metrics, loss, optim, data, epochs, model, save_dir):
        """Training runner
        Args:
            metrics (MetricManager): updated every step; its ``writer``
                attribute (may be falsy) also receives the loss scalar.
            loss (torch.nn.modules.loss): called as ``loss(logits, labels)``.
            optim (torch.optim): optimizer over the model's parameters.
            data (DataLoader): iterable of ``(data, labels)`` batches.
            epochs (int): number of passes over ``data``.
            model (pytorch model): called as ``model(data)``.
            save_dir (str): directory to save model
        """

        self.metrics = metrics
        self.loss = loss
        self.optim = optim
        self.data = data
        self.model = model
        self.epochs = epochs
        self.save_dir = save_dir

        self.step = 0

    def _batch_device(self):
        """Return the device that batches must be moved to.

        This is the device of the model's first parameter, so training works
        on CPU or on a non-default GPU. Falls back to ``cuda`` (the previous
        hard-coded behaviour) when the model exposes no parameters.
        """
        try:
            return next(self.model.parameters()).device
        except (AttributeError, StopIteration):
            return torch.device("cuda")

    def train_step(self, batch):
        """Run one optimisation step on ``batch`` and log metrics and loss."""
        data, labels = batch
        device = self._batch_device()
        data = data.to(device)
        labels = labels.to(device)
        logits = self.model(data)
        loss = self.loss(logits, labels)
        self.metrics.update(logits, labels, self.step)
        if self.metrics.writer:
            self.metrics.writer.add_scalar("loss", loss.item(), self.step)

        self.optim.zero_grad()  # zero gradients
        loss.backward()  # calculate gradients
        self.optim.step()  # updated weights

    def save_checkpoint(self):
        """Save checkpoint with current step number"""
        torch.save(self.model.state_dict(), f'{self.save_dir}/model-{self.step}')

    def run_eval(self):
        """Placeholder: only prints a message, no evaluation is performed."""
        print("running evaluation.....")

    def train_loop(self):
        """Train for ``epochs`` epochs.

        After each epoch the streaming metrics are reset, a checkpoint is
        written and ``run_eval`` is called.
        """
        for i in range(self.epochs):
            start = time.time()
            print(f'Epoch {i}/{self.epochs}')
            for batch in self.data:
                self.train_step(batch)
                self.step += 1
            self.metrics.reset()
            self.save_checkpoint()
            self.run_eval()
            end = time.time()
            print(f"epoch took {(end-start)/60} min")


In [None]:
# Training start
# Wire together the metric, loss, optimizer and data for the trainer.
mAp = MeanAP()
metrics = MetricManager([mAp], writer=writer)
# Multi-label loss applied directly to the model's raw outputs (logits).
loss = torch.nn.BCEWithLogitsLoss()

optim = torch.optim.Adam(resnet18.parameters(), lr=0.001)

dataset =  HPADataSet.from_csv(train_csv_path,train_images_dir)
# NOTE(review): the "not ok" remark is unexplained in the source - confirm what
# is wrong with this loader configuration.
dl = DataLoader(dataset, batch_size=64, shuffle=True) #not ok

# 10 epochs; checkpoints are written to the current directory.
trainer = Training(metrics, loss,  optim, dl,  10, resnet18, "./")

In [None]:
# Run all configured epochs; a checkpoint is saved after each one.
trainer.train_loop()

# Inference

In [None]:
def build_image_names(
    image_id: str,
    image_dir: str = '/kaggle/input/hpa-single-cell-image-classification/test',
) -> list:
    """Return the red, yellow and blue channel paths for one image.

    Args:
        image_id: image identifier, i.e. the filename prefix before
            ``_<colour>.png``.
        image_dir: directory holding the channel PNGs. Defaults to the
            competition test directory that was previously hard-coded.

    Returns:
        ``[[red_path], [yellow_path], [blue_path]]``: three single-element
        lists, in that channel order.
    """
    prefix = f"{str(image_dir).rstrip('/')}/{image_id}"
    # NOTE(review): the original comment called the red channel
    # "mitochondria"; HPA documents it as the microtubule stain - confirm.
    mt = f'{prefix}_red.png'
    # er is the endoplasmic reticulum
    er = f'{prefix}_yellow.png'
    # nu is the nuclei
    nu = f'{prefix}_blue.png'
    return [[mt], [er], [nu]]

In [None]:
import hpacellseg.cellsegmentator as cellsegmentator
from hpacellseg.utils import label_cell, label_nuclei

# Offline weight files for the nuclei and cell segmentation models.
NUC_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_nuclei_v1.pth'
CELL_MODEL = '../input/hpacellsegmentatormodelweights/dpn_unet_cell_3ch_v1.pth'

# Shared segmentator instance used by the inference loop below; it runs on the
# GPU. NOTE(review): the meaning of scale_factor / padding /
# multi_channel_model is defined by hpacellseg, not visible here - see its docs.
segmentator = cellsegmentator.CellSegmentator(
    NUC_MODEL,
    CELL_MODEL,
    scale_factor=0.25,
    device='cuda',
    padding=True,
    multi_channel_model=True
)

In [None]:
import base64
import numpy as np
from pycocotools import _mask as coco_mask
import typing as t
import zlib


def encode_binary_mask(mask: np.ndarray) -> t.Text:
  """Converts a binary mask into OID challenge encoding ascii text.

  Args:
    mask: boolean array that is 2-d once singleton dimensions are squeezed.

  Returns:
    ASCII string: the COCO RLE counts of the mask, zlib-compressed and then
    base64-encoded.

  Raises:
    ValueError: if the mask is not boolean or is not 2-d after squeezing.
  """

  # check input mask --
  # np.bool_ replaces the np.bool alias, which was removed in NumPy 1.24.
  if mask.dtype != np.bool_:
    raise ValueError(
        "encode_binary_mask expects a binary mask, received dtype == %s" %
        mask.dtype)

  mask = np.squeeze(mask)
  if len(mask.shape) != 2:
    # Wrap the shape in a 1-tuple: passing the shape tuple straight to % made
    # the formatting itself fail with TypeError instead of this ValueError.
    raise ValueError(
        "encode_binary_mask expects a 2d mask, received shape == %s" %
        (mask.shape,))

  # convert input mask to expected COCO API input --
  mask_to_encode = mask.reshape(mask.shape[0], mask.shape[1], 1)
  mask_to_encode = mask_to_encode.astype(np.uint8)
  mask_to_encode = np.asfortranarray(mask_to_encode)

  # RLE encode mask --
  encoded_mask = coco_mask.encode(mask_to_encode)[0]["counts"]

  # compress and base64 encoding --
  binary_str = zlib.compress(encoded_mask, zlib.Z_BEST_COMPRESSION)
  base64_str = base64.b64encode(binary_str)
  return base64_str.decode('ascii')

# Inference

In [None]:
# Root of the competition input data.
tpath = Path('../input/hpa-single-cell-image-classification')

In [None]:
# Sample submission: supplies the image IDs, the ImageWidth column and the
# PredictionString column that the cells below fill in.
sub = pd.read_csv(tpath/'sample_submission.csv')

In [None]:
# NOTE(review): this keeps only a random 3% of the rows, so the submission
# written at the end covers only that subset - presumably a debugging
# shortcut; confirm before submitting.
sub = sub.sample(frac=0.03)
# Show how many images there are of each width.
sub.ImageWidth.value_counts()

In [None]:
# Split the submission into one frame per distinct image width. Each frame is
# re-indexed from 0, which the positional .loc lookups in the inference loop
# rely on.
sub_dfs = []
for dim in sub.ImageWidth.unique():
    # NOTE: this rebinds the module-level `df` that held the training CSV.
    df = sub[sub['ImageWidth'] == dim].copy().reset_index(drop=True)
    sub_dfs.append(df)

In [None]:
def get_class_im(image_dir, img_id):
    """Load one image as a single-item classifier batch on the GPU.

    The red, blue and green channel PNGs are each resized to 512x512, stacked
    in that order and scaled by 1/255. The yellow channel is not read: it was
    previously loaded and resized but never used.

    Args:
        image_dir: directory containing the ``<id>_<colour>.png`` files.
        img_id: image identifier (filename prefix).

    Returns:
        ``torch.float32`` CUDA tensor with a leading batch dimension of 1.
    """
    def load_channel(colour):
        # The context manager closes the file once the channel is decoded.
        with Image.open(f"{image_dir}/{img_id}_{colour}.png") as channel:
            return np.array(channel.resize((512, 512)))

    stacked = np.stack([load_channel(c) for c in ("red", "blue", "green")]) / 255

    # Add the batch axis in numpy; building a tensor from a Python list of
    # ndarrays takes torch's slow element-wise conversion path.
    return torch.tensor(stacked[np.newaxis], dtype=torch.float).cuda()

    

In [None]:
TEST_IMAGE_DIR = "../input/hpa-single-cell-image-classification/test/"
POSTIVE_THRESHOLD = 0.5
bs = 1

# Put the classifier in inference mode (fixed BatchNorm statistics).
resnet18.eval()

for sub in sub_dfs:
    print(f'Starting prediction for image size: {sub.ImageWidth.loc[0]}')
    for start in range(0, len(sub), bs):
        end = min(start + bs, len(sub))
        image_ids = sub['ID'].iloc[start:end].tolist()

        # build_image_names returns [[mt], [er], [nu]] per image; regroup into
        # three per-channel lists of paths. Unlike the previous
        # stack/squeeze/transpose, this keeps the list-of-lists shape when the
        # batch holds a single image.
        per_image_paths = [build_image_names(image_id=image_id) for image_id in image_ids]
        images = [[paths[channel][0] for paths in per_image_paths] for channel in range(3)]

        # Image-level class confidences. Every cell of an image inherits them
        # (weak labels). This code was previously commented out, which left
        # `pred_classes` and `confidences` undefined further down.
        with torch.no_grad():
            batch_confidences = [
                torch.sigmoid(resnet18(get_class_im(TEST_IMAGE_DIR, image_id)))[0].cpu().numpy()
                for image_id in image_ids
            ]

        nuc_segmentations = segmentator.pred_nuclei(images[2]) # input here list needs to be nuclei array only (blue)
        cell_segmentations = segmentator.pred_cells(images) # input here needs to be list on images in RYB

        predstrings = []
        for nuc_seg, cell_seg, confidences in tqdm(
            zip(nuc_segmentations, cell_segmentations, batch_confidences),
            total=len(cell_segmentations),
        ): # for each image
            _, cell_mask = label_cell(nuc_seg, cell_seg)
            pred_classes = np.flatnonzero(confidences > POSTIVE_THRESHOLD)
            parts = []
            # Cell labels run from 1 to max; 0 is background. The previous
            # range(max) encoded the background and skipped the last cell.
            for cell_label in range(1, int(np.max(cell_mask)) + 1): # for each cell
                bmask = (cell_mask == cell_label)
                if not bmask.any():
                    continue  # label not present in this mask
                enc = encode_binary_mask(bmask)
                # this is where we add our predictions
                for pred_class in pred_classes:
                    parts.append(f'{pred_class} {confidences[pred_class]} {enc}')
            predstrings.append(' '.join(parts))

        assert len(predstrings) == end - start
        # Single .loc call: the previous chained sub[...].loc[...] = ... may
        # write to a temporary copy instead of the frame.
        sub.loc[start:end - 1, 'PredictionString'] = predstrings

 

In [None]:
# Recombine the per-width frames into one submission table with a fresh index.
all_subs = pd.concat(sub_dfs, ignore_index=True, sort=False)

In [None]:
all_subs.tail()

In [None]:
# Write the submission file without the index column.
all_subs.to_csv('submission.csv', index=False)

In [None]:
all_subs.head()

In [None]:
# all_subs.tail()