# Expanding the Moscow dataset

## Prepare

In [1]:
!pip install torchinfo
!pip install -U segmentation-models-pytorch
!pip install lungmask
!pip install openpyxl

Collecting torchinfo
  Downloading torchinfo-1.7.1-py3-none-any.whl (22 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.7.1
Collecting segmentation-models-pytorch
  Downloading segmentation_models_pytorch-0.3.0-py3-none-any.whl (97 kB)
     |████████████████████████████████| 97 kB 2.6 MB/s             
[?25hCollecting efficientnet-pytorch==0.7.1
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l- done
Collecting timm==0.4.12
  Downloading timm-0.4.12-py3-none-any.whl (376 kB)
     |████████████████████████████████| 376 kB 19.8 MB/s            
[?25hCollecting pretrainedmodels==0.7.4
  Downloading pretrainedmodels-0.7.4.tar.gz (58 kB)
     |████████████████████████████████| 58 kB 4.8 MB/s             
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: efficientnet-pytorch, pretrainedmodels
  Building wheel for efficientnet-pytorch (setup.py)

In [2]:
import os
import json
import numpy as np
import pandas as pd
import nibabel as nib
# DL
import torch
from torch import nn
from torchinfo import summary
from torch.utils.data import Dataset, DataLoader
from torchvision import models
import SimpleITK as sitk
from lungmask import mask as unet_mask
# Visualization
import matplotlib.pyplot as plt
from PIL import Image
from mpl_toolkits.axes_grid1 import ImageGrid
from tqdm.notebook import tqdm
from IPython.display import clear_output, display
import seaborn as sns

In [3]:
# Root folders of the three CT datasets, relative to the notebook's working directory.
# Used as the base path by load_tgcovid_data.
TGCOVID_PATH = "../input/tgcovid"
# Used as the base path by load_kagglecovid_data.
KAGGLECOVID_PATH = "../input/covid19-ct-scans"
# Used as the base path by load_moscowcovid_data.
MOSCOWCOVID_PATH = "../input/covid19moscow/COVID19_1110"

In [4]:
def expand_path(*right_part):
    """Build a function that prepends a fixed directory prefix to a path.

    Args:
        *right_part: Path components that form the prefix (despite the name,
            they end up on the left-hand side of the joined path).

    Returns:
        A one-argument callable. Given a path fragment it returns
        ``os.path.join(*right_part, fragment)``; given a missing value
        (NaN or None) it returns ``np.nan`` so that missing entries stay
        missing when the callable is used with ``Series.apply``.
    """
    def _expand_path(left_part):
        # `pd.isna` instead of `is np.nan`: an identity check only matches the
        # np.nan singleton, so other NaN objects (float("nan"), np.float64 NaN)
        # or None would fall through to os.path.join and raise TypeError.
        if pd.isna(left_part):
            return np.nan
        return os.path.join(*right_part, left_part)
    return _expand_path

def load_tgcovid_data(json_file):
    """Load the TGCOVID file listing into a DataFrame with full paths.

    Args:
        json_file: Name of the JSON listing, relative to ``TGCOVID_PATH``.
            It must provide ``image`` and ``label`` entries.

    Returns:
        A DataFrame built from the JSON content whose ``image`` and ``label``
        columns hold paths under ``<TGCOVID_PATH>/data/data/images`` and
        ``<TGCOVID_PATH>/data/data/labels`` with a trailing ``.gz`` removed.
    """
    datapath = os.path.join(TGCOVID_PATH, "data", "data")
    path_images = os.path.join(datapath, 'images')
    path_labels = os.path.join(datapath, 'labels')
    with open(os.path.join(TGCOVID_PATH, json_file), 'r') as f:
        dict_data = json.load(f)

    data = pd.DataFrame(dict_data)
    for column, root in (("image", path_images), ("label", path_labels)):
        data[column] = (
            data[column]
            # expand path to full
            .apply(expand_path(root))
            # Remove only a real ".gz" suffix. Slicing off the last three
            # characters unconditionally would corrupt any path that is
            # already uncompressed (e.g. "scan.nii" -> "scan.").
            .str.replace(r"\.gz$", "", regex=True)
        )
    return data

def load_kagglecovid_data(csv_file):
    """Read the Kaggle COVID-19 CT metadata CSV and unify its column names.

    Args:
        csv_file: Name of the CSV file, relative to ``KAGGLECOVID_PATH``.

    Returns:
        A DataFrame in which ``ct_scan`` is renamed to ``image``,
        ``infection_mask`` is renamed to ``label`` and the
        ``lung_and_infection_mask`` column is removed.
    """
    csv_path = os.path.join(KAGGLECOVID_PATH, csv_file)
    column_names = {"ct_scan": "image", "infection_mask": "label"}
    return (
        pd.read_csv(csv_path)
        .rename(columns=column_names)
        .drop("lung_and_infection_mask", axis=1)
    )

def load_moscowcovid_data(xlsx_file):
    """Load the Moscow COVID-19 registry into a DataFrame with full paths.

    Args:
        xlsx_file: Name of the Excel registry, relative to
            ``MOSCOWCOVID_PATH``. It must contain the columns ``study_file``,
            ``mask_file``, ``category`` and ``study_id``.

    Returns:
        A DataFrame in which ``study_file``/``mask_file`` are renamed to
        ``image``/``label``, ``category`` and ``study_id`` are dropped, and
        both path columns are prefixed with ``MOSCOWCOVID_PATH`` with a
        trailing ``.gz`` removed. Missing entries stay missing.
    """
    datapath = os.path.join(MOSCOWCOVID_PATH, xlsx_file)
    data = (
        pd.read_excel(datapath)
        .rename(columns={"study_file": "image", "mask_file": "label"})
        .drop(["category", "study_id"], axis=1)
    )
    for column in ("image", "label"):
        data[column] = (
            data[column]
            # Registry paths start with "/"; os.path.join would treat them as
            # absolute and discard the prefix. Strip the slash explicitly
            # rather than blindly dropping the first character.
            .str.lstrip("/")
            .apply(expand_path(MOSCOWCOVID_PATH))
            # Remove only a real ".gz" suffix instead of cutting the last
            # three characters regardless of what they are.
            .str.replace(r"\.gz$", "", regex=True)
        )
    return data

In [5]:
# Load the Moscow registry; `image` / `label` hold the resolved file paths.
dataset = load_moscowcovid_data("dataset_registry.xlsx")
# Bare expression: rich display of the loaded frame.
dataset

Unnamed: 0,image,label
0,../input/covid19moscow/COVID19_1110/studies/CT...,
1,../input/covid19moscow/COVID19_1110/studies/CT...,
2,../input/covid19moscow/COVID19_1110/studies/CT...,
3,../input/covid19moscow/COVID19_1110/studies/CT...,
4,../input/covid19moscow/COVID19_1110/studies/CT...,
...,...,...
1105,../input/covid19moscow/COVID19_1110/studies/CT...,
1106,../input/covid19moscow/COVID19_1110/studies/CT...,
1107,../input/covid19moscow/COVID19_1110/studies/CT...,
1108,../input/covid19moscow/COVID19_1110/studies/CT...,


## Visualization 

In [6]:
def normalize(x):
    """Min-max scale ``x`` using its global minimum and maximum.

    A small epsilon (1e-8) is added to the value range so that a constant
    input does not cause a division by zero.
    """
    lowest = np.min(x)
    value_range = np.max(x) - lowest + 1e-8
    shifted = x - lowest
    return shifted / value_range

def slice2rgb(image, normalize_data=True):
    """Turn an array slice into a grey-valued RGB ``PIL.Image``.

    Args:
        image: Array holding the slice; it is cast to float32 first.
        normalize_data: When true, the values are rescaled with
            ``normalize`` before being multiplied by 255.

    Returns:
        A PIL image built from the uint8 array obtained by stacking the
        scaled slice three times along the depth axis.
    """
    pixels = image.astype(np.float32)
    if normalize_data:
        pixels = normalize(pixels)
    pixels = pixels * 255
    # The same intensities go into all three channels.
    rgb = np.dstack([pixels, pixels, pixels])
    return Image.fromarray(rgb.astype(np.uint8))

def mask2blue(mask):
    """Render a mask as an RGB ``PIL.Image`` that uses only the blue channel.

    The red and green channels are all zeros; the blue channel is
    ``mask * 255`` cast to uint8.
    """
    empty_channel = np.zeros_like(mask)
    blue_channel = mask * 255
    stacked = np.dstack([empty_channel, empty_channel, blue_channel])
    return Image.fromarray(stacked.astype(np.uint8))
    
def blend(image, mask, normalize_data=True):
    """Overlay a mask (drawn in blue) on top of an image slice.

    Args:
        image: Slice passed to ``slice2rgb``.
        mask: Mask passed to ``mask2blue``.
        normalize_data: Forwarded to ``slice2rgb``; controls whether the
            slice is min-max normalized before conversion.

    Returns:
        The ``PIL.Image`` produced by ``Image.blend`` with ``alpha=.2``
        (the mask layer contributes 20%).
    """
    return Image.blend(
        # Forward the caller's choice; it used to be hard-coded to True,
        # which silently ignored the `normalize_data` argument.
        slice2rgb(image, normalize_data=normalize_data),
        mask2blue(mask),
        alpha=.2
    )

In [7]:
def save_ndarray_as_nii(data, path):
    """Write an array to ``path`` as a SimpleITK image.

    Args:
        data: Array converted with ``sitk.GetImageFromArray``.
        path: Destination file passed to ``sitk.WriteImage``.
    """
    sitk.WriteImage(sitk.GetImageFromArray(data), path)

## Lung segmentation

In [8]:
# Passed to lung_segmentation as `high_accuracy_mode`:
# False -> unet_mask.apply with the model below, True -> unet_mask.apply_fuse.
HIGH_ACCURACY = False
# Arguments unpacked into unet_mask.get_model(*MODEL) inside lung_segmentation.
MODEL = ('unet','LTRCLobes')

In [9]:
# Output folder for the generated lung masks. Unlike `!mkdir`, this does not
# fail when the folder already exists, so the cell can be re-run safely.
os.makedirs("lung_mask", exist_ok=True)

In [10]:
def lung_segmentation(dataset, model, high_accuracy_mode=False):
    """Segment the lungs of every scan in ``dataset`` and save the masks.

    Args:
        dataset: DataFrame with an ``image`` column of scan file paths.
        model: Arguments unpacked into ``unet_mask.get_model`` (only used
            when ``high_accuracy_mode`` is false).
        high_accuracy_mode: When true, ``unet_mask.apply_fuse`` is used with
            its default models instead of ``unet_mask.apply`` with ``model``.

    Returns:
        A DataFrame indexed by ``image`` with the columns ``study_id`` and
        ``lung_mask`` (path of the saved mask). Note that ``study_id`` holds
        the index label of the row in ``dataset``. The same table is also
        written to ``lung_data.csv``.

    Side effects:
        Creates the ``lung_mask`` folder if needed and writes one mask file
        per scan into it, named after the scan's basename.
    """
    output_dir = "lung_mask"
    # Create the folder here so the function does not depend on another cell
    # having been executed first.
    os.makedirs(output_dir, exist_ok=True)

    # The explicitly selected model is only consumed by `apply`; skip loading
    # (and possibly downloading) it when `apply_fuse` is going to be used.
    base_model = None if high_accuracy_mode else unet_mask.get_model(*model)

    records = []
    for idx in tqdm(dataset.index):
        image_path = dataset.loc[idx, "image"]
        ct_image = sitk.ReadImage(image_path)
        if high_accuracy_mode:
            segmentation = unet_mask.apply_fuse(ct_image)
        else:
            segmentation = unet_mask.apply(ct_image, base_model)
        segmentation_image_path = os.path.join(output_dir, os.path.basename(image_path))
        # NOTE(review): save_ndarray_as_nii stores the bare array, so the
        # spacing/origin of the source scan are presumably not carried over;
        # confirm whether downstream steps need them.
        save_ndarray_as_nii(segmentation, segmentation_image_path)
        records.append([idx, image_path, segmentation_image_path])

    segmentation_image_paths = pd.DataFrame(
        records, columns=["study_id", "image", "lung_mask"]).set_index("image")
    segmentation_image_paths.to_csv('lung_data.csv')
    return segmentation_image_paths

In [11]:
# Run lung segmentation over every scan in `dataset`; masks are written to
# lung_mask/ and the path table to lung_data.csv (see lung_segmentation).
lung_data = lung_segmentation(dataset, MODEL, HIGH_ACCURACY)

Downloading: "https://github.com/JoHof/lungmask/releases/download/v0.0/unet_ltrclobes-3a07043d.pth" to /root/.cache/torch/hub/checkpoints/unet_ltrclobes-3a07043d.pth


  0%|          | 0.00/119M [00:00<?, ?B/s]

  0%|          | 0/1110 [00:00<?, ?it/s]


  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:05<00:10,  5.39s/it][A
100%|██████████| 3/3 [00:05<00:00,  1.90s/it]

100%|██████████| 21/21 [00:00<00:00, 479.26it/s]

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  3.82it/s][A
100%|██████████| 3/3 [00:00<00:00,  5.44it/s]

100%|██████████| 28/28 [00:00<00:00, 505.70it/s]

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  3.89it/s][A
100%|██████████| 3/3 [00:00<00:00,  5.59it/s]

  0%|          | 0/99 [00:00<?, ?it/s][A
100%|██████████| 99/99 [00:00<00:00, 406.94it/s]

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  3.91it/s][A
100%|██████████| 3/3 [00:00<00:00,  5.10it/s]

  0%|          | 0/56 [00:00<?, ?it/s][A
100%|██████████| 56/56 [00:00<00:00, 324.65it/s]

  0%|          | 0/3 [00:00<?, ?it/s][A
 33%|███▎      | 1/3 [00:00<00:00,  3.79it/s][A
100%|██████████| 3/3 [00:00<00:00,  5.24it/s]

  0%|          | 0/52 [00:00<?

In [12]:
# Show the table mapping each scan path to its saved lung mask.
lung_data

Unnamed: 0_level_0,study_id,lung_mask
image,Unnamed: 1_level_1,Unnamed: 2_level_1
../input/covid19moscow/COVID19_1110/studies/CT-0/study_0001.nii,0,lung_mask/study_0001.nii
../input/covid19moscow/COVID19_1110/studies/CT-0/study_0002.nii,1,lung_mask/study_0002.nii
../input/covid19moscow/COVID19_1110/studies/CT-0/study_0003.nii,2,lung_mask/study_0003.nii
../input/covid19moscow/COVID19_1110/studies/CT-0/study_0004.nii,3,lung_mask/study_0004.nii
../input/covid19moscow/COVID19_1110/studies/CT-0/study_0005.nii,4,lung_mask/study_0005.nii
...,...,...
../input/covid19moscow/COVID19_1110/studies/CT-3/study_1106.nii,1105,lung_mask/study_1106.nii
../input/covid19moscow/COVID19_1110/studies/CT-3/study_1107.nii,1106,lung_mask/study_1107.nii
../input/covid19moscow/COVID19_1110/studies/CT-3/study_1108.nii,1107,lung_mask/study_1108.nii
../input/covid19moscow/COVID19_1110/studies/CT-4/study_1109.nii,1108,lung_mask/study_1109.nii
