In [1]:
import os
import zarr
import random
import json
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
from pathlib import Path
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader, Dataset
import torch
import torchvision.transforms.functional as F
import random
import sys
from collections import defaultdict

warnings.filterwarnings("ignore")
sys.path.append("./src/")

from src.config import CFG
from src.dataloader import (
    read_zarr,
    read_info_json,
    scale_coordinates,
    create_dataset,
    create_segmentation_map,
    EziiDataset,
    drop_padding,
)
from src.network import UNet_2D, aug
from src.utils import save_images
from src.metric import score, create_cls_pos, create_cls_pos_sikii, create_df

sample_submission = pd.read_csv("../../inputs/sample_submission.csv")

In [2]:
train_dataset = EziiDataset(
    exp_names=CFG.train_exp_names,
    base_dir="../../inputs/train",
    particles_name=CFG.particles_name,
    resolution=CFG.resolution,
    zarr_type=CFG.train_zarr_types,
    train=True,
)

valid_dataset = EziiDataset(
    exp_names=CFG.valid_exp_names,
    # exp_names=CFG.train_exp_names,
    base_dir="../../inputs/train",
    particles_name=CFG.particles_name,
    resolution=CFG.resolution,
    zarr_type=CFG.valid_zarr_types,
    train=True,
)

test_dataset = EziiDataset(
    exp_names=["TS_6_4", "TS_5_4", "TS_69_2"],
    base_dir="../../inputs/test",
    particles_name=CFG.particles_name,
    resolution=CFG.resolution,
    zarr_type=CFG.valid_zarr_types,
    train=False,
)

from tqdm import tqdm

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
valid_loader = DataLoader(valid_dataset, batch_size=1, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)


for row in tqdm(valid_loader):
    normalized_tomogram = row["normalized_tomogram"]
    break

[('TS_5_4', 'denoised'), ('TS_73_6', 'denoised'), ('TS_99_9', 'denoised'), ('TS_6_4', 'denoised'), ('TS_69_2', 'denoised')]
[('TS_86_3', 'denoised'), ('TS_6_6', 'denoised')]
[('TS_6_4', 'denoised'), ('TS_5_4', 'denoised'), ('TS_69_2', 'denoised')]


  0%|          | 0/2 [00:00<?, ?it/s]


In [3]:
class PadToSize(nn.Module):
    def __init__(self, resolution):
        super().__init__()
        if resolution == "0":
            self.size = 640
        elif resolution == "1":
            self.size = 320
        elif resolution == "2":
            self.size = 160

    def forward(self, x):
        return F.pad(x, (0, 0, self.size - x.shape[-1], self.size - x.shape[-2]))

In [4]:
model = UNet_2D().to("cuda")
model.eval()
model.load_state_dict(torch.load("./best_model.pth"))


optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(
    weight=torch.tensor([0.5, 32, 32, 32, 32, 32, 32]).to("cuda")
)
# criterion = DiceLoss()

best_model = None
best_loss = np.inf
batch_size = 4

valid_loss = []
valid_pred_tomogram = defaultdict(list)
valid_gt_tomogram = defaultdict(list)
model.eval()
tq = tqdm(range(len(valid_loader) * normalized_tomogram.shape[0]))
for data in valid_loader:
    exp_name = data["exp_name"][0]
    tomogram = data["normalized_tomogram"].to("cuda")
    segmentation_map = data["segmentation_map"].to("cuda").long()

    for i in range(tomogram.shape[1]):
        input_ = tomogram[:, i].unsqueeze(0)
        gt = segmentation_map[:, i]

        input_ = PadToSize(CFG.resolution)(input_)
        gt = PadToSize(CFG.resolution)(gt)
        output = model(input_)
        output = nn.functional.softmax(output, dim=1)
        loss = criterion(output, gt)

        valid_loss.append(loss.item())
        tq.set_description(f"Loss: {np.mean(valid_loss)}")
        tq.update(1)

        output = drop_padding(output, CFG.resolution)

        valid_pred_tomogram[exp_name].append(output.cpu().detach().numpy())
        valid_gt_tomogram[exp_name].append(gt.cpu().detach().numpy())
tq.close()

Loss: 1.4369846264953199: : 184it [00:04, 45.10it/s]                   


In [5]:
def create_gt_df(base_dir, exp_names):
    result_df = None
    particle_names = CFG.particles_name

    for exp_name in exp_names:
        for particle in particle_names:
            np_corrds = read_info_json(
                base_dir=base_dir, exp_name=exp_name, particle_name=particle
            )  # (n, 3)
            # 各行にexp_nameとparticle_name追加
            particle_df = pd.DataFrame(np_corrds, columns=["z", "y", "x"])
            particle_df["experiment"] = exp_name
            particle_df["particle_type"] = particle

            if result_df is None:
                result_df = particle_df
            else:
                result_df = pd.concat([result_df, particle_df], axis=0).reset_index(
                    drop=True
                )

    result_df = result_df.reset_index()  # index	experiment	particle_type	x	y	z
    result_df = result_df[["index", "experiment", "particle_type", "x", "y", "z"]]

    return result_df

In [6]:
gt_df = create_gt_df("../../inputs/train/overlay/ExperimentRuns/", CFG.valid_exp_names)
gt_df = gt_df[gt_df["particle_type"] != "beta-amylase"].reset_index(drop=True)

# Validation

In [7]:
# pred_df = pd.read_csv("../../inputs/train_submission.csv")


def calc_score(initial_sikii):
    all_pred_df = None

    for exp_name in CFG.valid_exp_names:
        pred_tomogram = valid_pred_tomogram[exp_name]
        pred_tomogram = np.array(pred_tomogram)  # (92, 1, 7, 315, 315)
        pred_tomogram = pred_tomogram.squeeze(1)  # (92, 7, 315, 315)

        pred_cls_pos, pred_Ascale_pos = create_cls_pos_sikii(
            pred_tomogram, sikii_dict=initial_sikii
        )
        pred_df = create_df(pred_Ascale_pos, exp_name)
        # pred_df = create_df(pred_cls_pos, exp_name)

        if all_pred_df is None:
            all_pred_df = pred_df
        else:
            all_pred_df = pd.concat([all_pred_df, pred_df], axis=0).reset_index(
                drop=True
            )

    pred_df = all_pred_df[all_pred_df["particle_type"] != "beta-amylase"]
    pred_df = pred_df.drop_duplicates(subset=["x", "y", "z"], keep="first").reset_index(
        drop=True
    )

    pred_df = pred_df.reset_index()

    score_ = score(
        pred_df, gt_df, row_id_column_name="index", distance_multiplier=1, beta=4
    )

    return score_


def calc_score_by_exp(initial_sikii):
    exp_scores = {}

    for exp_name in CFG.valid_exp_names:
        gt_df = create_gt_df("../../inputs/train/overlay/ExperimentRuns/", [exp_name])

        pred_tomogram = valid_pred_tomogram[exp_name]
        pred_tomogram = np.array(pred_tomogram)  # (92, 1, 7, 315, 315)
        pred_tomogram = pred_tomogram.squeeze(1)  # (92, 7, 315, 315)

        pred_cls_pos, pred_Ascale_pos = create_cls_pos_sikii(
            pred_tomogram, sikii_dict=initial_sikii
        )
        pred_df = create_df(pred_Ascale_pos, exp_name)

        pred_df = pred_df[pred_df["particle_type"] != "beta-amylase"]
        pred_df = pred_df.drop_duplicates(
            subset=["x", "y", "z"], keep="first"
        ).reset_index(drop=True)

        pred_df = pred_df.reset_index()

        score_ = score(
            pred_df, gt_df, row_id_column_name="index", distance_multiplier=1, beta=4
        )

        exp_scores[exp_name] = score_

    return exp_scores

In [8]:
constant = 0.35

initial_sikii = {
    "apo-ferritin": constant,
    "beta-amylase": constant,
    "beta-galactosidase": constant,
    "ribosome": constant,
    "thyroglobulin": constant,
    "virus-like-particle": constant,
}

score_ = calc_score(initial_sikii)
score_

0.45866232874482044

In [9]:
constant = 0.5666666666666667

initial_sikii = {
    "apo-ferritin": constant,
    "beta-amylase": constant,
    "beta-galactosidase": constant,
    "ribosome": constant,
    "thyroglobulin": constant,
    "virus-like-particle": constant,
}

score_ = calc_score_by_exp(initial_sikii)
score_

{'TS_86_3': 0.47874661254915135, 'TS_6_6': 0.41500624782698015}

In [10]:
best_sikii = 0
best_score = -np.inf

for sikii in np.linspace(0.3, 0.7, 100):
    initial_sikii = {
        "apo-ferritin": sikii,
        "beta-amylase": sikii,
        "beta-galactosidase": sikii,
        "ribosome": sikii,
        "thyroglobulin": sikii,
        "virus-like-particle": sikii,
    }
    score_ = calc_score(initial_sikii)
    if score_ > best_score:
        best_score = score_
        best_sikii = sikii
    print(sikii, score_)

best_sikii, best_score

0.3 0.4488041526186131
0.30404040404040406 0.44949682741275393
0.30808080808080807 0.44949682741275393
0.31212121212121213 0.44949682741275393
0.31616161616161614 0.4503081060255468
0.3202020202020202 0.4503081060255468
0.3242424242424242 0.4540170051993387
0.3282828282828283 0.4540170051993387
0.3323232323232323 0.4540170051993387
0.33636363636363636 0.45559292245148547
0.3404040404040404 0.45559292245148547
0.34444444444444444 0.45866232874482044
0.34848484848484845 0.45866232874482044
0.3525252525252525 0.4578713956771759
0.35656565656565653 0.4578713956771759
0.3606060606060606 0.4569151550552006
0.36464646464646466 0.4569151550552006
0.3686868686868687 0.45610387644240774
0.3727272727272727 0.4584410431984484
0.37676767676767675 0.4591610168265637
0.3808080808080808 0.4592900905113679
0.38484848484848483 0.4592900905113679
0.3888888888888889 0.4533801504764954
0.3929292929292929 0.4519084144366827
0.396969696969697 0.45433779305429106
0.401010101010101 0.45433779305429106
0.405050

(0.6878787878787879, 0.466950781754246)

# Train-Dataset

In [11]:
train_loss = []
valid_pred_tomogram = defaultdict(list)
valid_gt_tomogram = defaultdict(list)
model.eval()
tq = tqdm(range(len(train_loader) * normalized_tomogram.shape[0]))
for data in train_loader:
    exp_name = data["exp_name"][0]
    tomogram = data["normalized_tomogram"].to("cuda")
    segmentation_map = data["segmentation_map"].to("cuda").long()

    for i in range(tomogram.shape[1]):
        input_ = tomogram[:, i].unsqueeze(0)
        gt = segmentation_map[:, i]

        input_ = PadToSize(CFG.resolution)(input_)
        gt = PadToSize(CFG.resolution)(gt)
        output = model(input_)
        output = nn.functional.softmax(output, dim=1)

        tq.update(1)

        output = drop_padding(output, CFG.resolution)

        valid_pred_tomogram[exp_name].append(output.cpu().detach().numpy())
        valid_gt_tomogram[exp_name].append(gt.cpu().detach().numpy())
tq.close()

460it [00:07, 59.79it/s]                     


In [12]:
def calc_score(initial_sikii):
    all_pred_df = None

    for exp_name in CFG.train_exp_names:
        pred_tomogram = valid_pred_tomogram[exp_name]
        pred_tomogram = np.array(pred_tomogram)  # (92, 1, 7, 315, 315)
        pred_tomogram = pred_tomogram.squeeze(1)  # (92, 7, 315, 315)

        pred_cls_pos, pred_Ascale_pos = create_cls_pos_sikii(
            pred_tomogram, sikii_dict=initial_sikii
        )
        pred_df = create_df(pred_Ascale_pos, exp_name)
        # pred_df = create_df(pred_cls_pos, exp_name)

        if all_pred_df is None:
            all_pred_df = pred_df
        else:
            all_pred_df = pd.concat([all_pred_df, pred_df], axis=0).reset_index(
                drop=True
            )

    pred_df = all_pred_df[all_pred_df["particle_type"] != "beta-amylase"]
    pred_df = pred_df.drop_duplicates(subset=["x", "y", "z"], keep="first").reset_index(
        drop=True
    )

    pred_df = pred_df.reset_index()

    score_ = score(
        pred_df, gt_df, row_id_column_name="index", distance_multiplier=1, beta=4
    )

    return score_

In [13]:
gt_df = create_gt_df("../../inputs/train/overlay/ExperimentRuns/", CFG.train_exp_names)
gt_df = gt_df[gt_df["particle_type"] != "beta-amylase"].reset_index(drop=True)

In [14]:
best_sikii = 0
best_score = -np.inf

for sikii in np.linspace(0.4, 0.7, 25):
    initial_sikii = {
        "apo-ferritin": sikii,
        "beta-amylase": sikii,
        "beta-galactosidase": sikii,
        "ribosome": sikii,
        "thyroglobulin": sikii,
        "virus-like-particle": sikii,
    }
    score_ = calc_score(initial_sikii)
    if score_ > best_score:
        best_score = score_
        best_sikii = sikii
    print(sikii, score_)

0.4 0.6543844246766255
0.41250000000000003 0.6549221422634022
0.42500000000000004 0.6564169239748399
0.4375 0.6574246775220118
0.45 0.6578671434486718
0.4625 0.659248976409452
0.475 0.6602885787538503
0.4875 0.6613929874548311
0.5 0.6661711385502267
0.5125 0.665976278072587
0.525 0.6698629193212403
0.5375 0.6674739827869989
0.55 0.6660099914068336
0.5625 0.6702943196608186
0.575 0.6692913830618004
0.5874999999999999 0.6730880460379656
0.6 0.6752936576923284
0.6125 0.6759415416209785
0.625 0.6761188654709452
0.6375 0.6757987258973024
0.6499999999999999 0.6760811465856547
0.6625 0.6782482133164439
0.6749999999999999 0.6801664007366349
0.6875 0.6774307163016765
0.7 0.6739177909252928


In [15]:
best_sikii, best_score

(0.6749999999999999, 0.6801664007366349)

## Test-Dataset

In [16]:
valid_pred_tomogram = defaultdict(list)
model.eval()
tq = tqdm(range(len(test_loader) * normalized_tomogram.shape[0]))
for data in test_loader:
    exp_name = data["exp_name"][0]
    tomogram = data["normalized_tomogram"].to("cuda")

    for i in range(tomogram.shape[1]):
        input_ = tomogram[:, i].unsqueeze(0)

        input_ = PadToSize(CFG.resolution)(input_)
        output = model(input_)
        output = nn.functional.softmax(output, dim=1)

        tq.update(1)

        output = drop_padding(output, CFG.resolution)

        valid_pred_tomogram[exp_name].append(output.cpu().detach().numpy())
tq.close()

all_pred_df = None

for exp_name in ["TS_6_4", "TS_5_4", "TS_69_2"]:
    pred_tomogram = valid_pred_tomogram[exp_name]
    pred_tomogram = np.array(pred_tomogram)  # (92, 1, 7, 315, 315)
    pred_tomogram = pred_tomogram.squeeze(1)  # (92, 7, 315, 315)

    pred_cls_pos, pred_Ascale_pos = create_cls_pos_sikii(
        pred_tomogram, sikii_dict=initial_sikii
    )
    pred_df = create_df(pred_Ascale_pos, exp_name)
    # pred_df = create_df(pred_cls_pos, exp_name)

    if all_pred_df is None:
        all_pred_df = pred_df
    else:
        all_pred_df = pd.concat([all_pred_df, pred_df], axis=0).reset_index(drop=True)

pred_df = all_pred_df[all_pred_df["particle_type"] != "beta-amylase"]
pred_df = pred_df.drop_duplicates(subset=["x", "y", "z"], keep="first").reset_index(
    drop=True
)

pred_df = pred_df.reset_index()

276it [00:03, 74.77it/s]                     


In [17]:
pred_df

Unnamed: 0,index,experiment,particle_type,x,y,z
0,0,TS_6_4,apo-ferritin,3140.013978,3139.945710,910.024552
1,1,TS_6_4,apo-ferritin,4093.333333,1930.000000,146.666667
2,2,TS_6_4,apo-ferritin,4543.809524,2137.142857,221.904762
3,3,TS_6_4,apo-ferritin,3884.444444,2446.666667,253.333333
4,4,TS_6_4,apo-ferritin,4029.142857,2884.571429,264.571429
...,...,...,...,...,...,...
435,435,TS_69_2,virus-like-particle,5110.666667,5665.958333,875.500000
436,436,TS_69_2,virus-like-particle,3583.475546,2367.762747,906.680541
437,437,TS_69_2,virus-like-particle,5912.488954,974.727541,910.986745
438,438,TS_69_2,virus-like-particle,5903.970588,3581.176471,919.044118
