In [1]:
import os
from glob import glob
from tqdm import tqdm

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2

import sys
# sys.path.append('/root/dacon-epitope/dacon-sem')
sys.path.append('/home/youngkim21/dacon/dacon-sem')
from src.datasets.sem_dataset import SEMDataset
from src.datasets.sem_datamodule import SEMDataModule

In [2]:
# Root directory of the dataset; every path in this cell is derived from it,
# so switching machines only requires changing this one line.
# data_path = os.path.abspath('/shared/Samsung/')
data_path = os.path.abspath('/home/youngkim21/dacon/sem-data')


def _sorted_glob(*pattern_parts):
    """Return the sorted matches of ``data_path/<pattern_parts...>`` as a NumPy array."""
    return np.array(sorted(glob(os.path.join(data_path, *pattern_parts))))


# Simulation data: SEM images and depth maps.
simulation_sem_paths = _sorted_glob('simulation_data', 'SEM', '*', '*', '*.png')
# Every depth path is listed twice; after sorting the two copies sit next to
# each other. The directory is globbed once and the result is reused.
# NOTE(review): presumably this pairs the array one-to-one with
# simulation_sem_paths (the counts printed below show twice as many SEM images
# as depth maps per case) -- confirm.
_depth_matches = glob(os.path.join(data_path, 'simulation_data', 'Depth', '*', '*', '*.png'))
simulation_depth_paths = np.array(sorted(_depth_matches + _depth_matches))
simulation_data_len = len(simulation_sem_paths)

# Train and test SEM images.
train_sem_paths = _sorted_glob('train', 'SEM', '*', '*', '*.png')
train_data_len = len(train_sem_paths)

test_sem_paths = _sorted_glob('test', 'SEM', '*.png')
test_data_len = len(test_sem_paths)

# Simulation paths grouped by case folder (these depth lists are NOT doubled).
case_list = ['Case_1', 'Case_2', 'Case_3', 'Case_4']

simulation_sem_by_case = [
    _sorted_glob('simulation_data', 'SEM', case, '*', '*.png') for case in case_list
]
simulation_depth_by_case = [
    _sorted_glob('simulation_data', 'Depth', case, '*', '*.png') for case in case_list
]

# Train SEM paths grouped by depth folder.
depth_list = ['Depth_110', 'Depth_120', 'Depth_130', 'Depth_140']

train_sem_by_depth = [
    _sorted_glob('train', 'SEM', depth, '*', '*.png') for depth in depth_list
]

# Read the CSV relative to data_path instead of repeating the absolute root.
train_average_depth = pd.read_csv(os.path.join(data_path, 'train', 'average_depth.csv'))

# Sanity check: number of files found per group.
print([len(t_path) for t_path in simulation_sem_by_case])
print([len(t_path) for t_path in simulation_depth_by_case])
print([len(t_path) for t_path in train_sem_by_depth])

[43326, 43326, 43326, 43326]
[21663, 21663, 21663, 21663]
[15166, 15166, 15166, 15166]


In [29]:
# Split every case's SEM paths into two lists according to the character that
# sits right before the 4-character ".png" extension (index -5 of the path):
# '0' goes to the itr0 list, '1' to the itr1 list, anything else is dropped.
simulation_sem_itr0, simulation_sem_itr1 = [], []
for case_paths in simulation_sem_by_case:
    buckets = {'0': [], '1': []}
    for sem_path in case_paths:
        marker = sem_path[-5]
        if marker in buckets:
            buckets[marker].append(sem_path)
    simulation_sem_itr0.append(buckets['0'])
    simulation_sem_itr1.append(buckets['1'])
    # One line per case: how many paths landed in each bucket.
    print(len(buckets['0']), len(buckets['1']))


21663 21663
21663 21663
21663 21663
21663 21663


## SEM

In [3]:
def get_avg_std(cases, data_paths):
    """Print and return per-group image statistics.

    For every group, each image is loaded with ``cv2.imread`` and its pixel
    mean and pixel standard deviation are computed; the group value is the
    plain average of those per-image numbers. Note that the group "std" is
    therefore the mean of per-image standard deviations, not the standard
    deviation over all pixels of the group.

    Args:
        cases: Labels, one per group; only used in the printed line.
        data_paths: Collections of image paths, paired with ``cases`` by
            position (``zip`` stops at the shorter of the two).

    Returns:
        ``(avg_by_case, std_by_case)``: two lists with one entry per group.

    Raises:
        OSError: If ``cv2.imread`` returns ``None`` for a path, i.e. the
            image could not be read.
    """
    avg_by_case = []
    std_by_case = []
    for case, paths in zip(cases, data_paths):
        avgs, stds = [], []
        for path in paths:
            image = cv2.imread(path)
            # imread signals failure by returning None instead of raising;
            # fail here with the offending path rather than deep inside numpy.
            if image is None:
                raise OSError(f"cv2.imread could not read image: {path}")
            avgs.append(np.average(image))
            stds.append(np.std(image))
        case_avg = np.average(avgs)
        case_std = np.average(stds)
        avg_by_case.append(case_avg)
        std_by_case.append(case_std)
        print(case, ":", case_avg, case_std)
    return avg_by_case, std_by_case

In [4]:
# Statistics over the test SEM images, treated as one group with an empty
# label (which is why the printed result line starts with " :").
print("Average SEM pixel values of test dataset")
avg_sem_test, std_sem_test = get_avg_std([''], [test_sem_paths])

Average SEM pixel values of test dataset
 : 115.58954166078786 65.71617850978603


In [35]:
# Statistics over the train SEM images, one group per entry of depth_list.
print("Average SEM pixel values of train dataset")
avg_sem_by_depth, std_sem_by_depth = get_avg_std(depth_list, train_sem_by_depth)

Average SEM pixel values of train dataset
Depth_110 : 118.22702867586366 63.23097702989158
Depth_120 : 116.38813855065669 65.11493673578214
Depth_130 : 114.66024697056281 66.63681021623475
Depth_140 : 113.03569715823893 67.85634442212138


In [36]:
# Statistics over the simulation SEM images, one group per entry of case_list.
print("Average SEM pixel values of simulation dataset")
avg_sem_by_case, std_sem_by_case = get_avg_std(case_list, simulation_sem_by_case)

Average SEM pixel values of simulation dataset
Case_1 : 102.34009511465402 55.621073505906395
Case_2 : 100.60741177379806 57.1245849694382
Case_3 : 99.10810123342453 58.415000122302736
Case_4 : 97.80896715720908 59.533876171705614


In [37]:
# Same statistics restricted to the itr0 subset of the simulation SEM images.
# The returned lists are discarded; only the printed lines are of interest.
print("Average SEM pixel values of simulation itr0 dataset")
_, _ = get_avg_std(case_list, simulation_sem_itr0)

Average SEM pixel values of simulation itr0 dataset
Case_1 : 102.34952178338727 55.626230581255626
Case_2 : 100.6010867651107 57.12143721511824
Case_3 : 99.11708963354482 58.42005907961548
Case_4 : 97.81057041330499 59.53315949478247


In [38]:
# Same statistics restricted to the itr1 subset of the simulation SEM images.
# The returned lists are discarded; only the printed lines are of interest.
print("Average SEM pixel values of simulation itr1 dataset")
_, _ = get_avg_std(case_list, simulation_sem_itr1)

Average SEM pixel values of simulation itr1 dataset
Case_1 : 102.33066844592078 55.61591643055716
Case_2 : 100.6137367824854 57.12773272375818
Case_3 : 99.09911283330426 58.40994116499001
Case_4 : 97.80736390111318 59.5345928486288


In [39]:
# Statistics over the simulation depth maps, one group per entry of case_list.
print("Average Depth pixel values of simulation dataset")
avg_depth_by_case, std_depth_by_case = get_avg_std(case_list, simulation_depth_by_case)

Average Depth pixel values of simulation dataset
Case_1 : 101.7664403489864 47.10571291605642
Case_2 : 108.28920790388032 51.39030375234918
Case_3 : 114.81366495409051 55.67178387136982
Case_4 : 121.34116433272469 59.953447258273386


In [18]:
# Depth value assigned to each train group; the numbers mirror the numeric
# suffixes of depth_list (Depth_110 ... Depth_140).
avg_depth_by_depth = np.array([110., 120., 130., 140.])

# Turn the per-group averages into arrays so the following cells can divide
# them element-wise.
avg_sem_by_depth = np.array(avg_sem_by_depth)
avg_sem_by_case = np.array(avg_sem_by_case)
avg_depth_by_case = np.array(avg_depth_by_case)

# Display all four arrays as the cell output.
(avg_sem_by_depth, avg_depth_by_depth, avg_sem_by_case, avg_depth_by_case)

(array([118.22702868, 116.38813855, 114.66024697, 113.03569716]),
 array([110., 120., 130., 140.]),
 array([102.34009511, 100.60741177,  99.10810123,  97.80896716]),
 array([101.76644035, 108.2892079 , 114.81366495, 121.34116433]))

In [19]:
# Element-wise ratio of the SEM averages to the depth values:
# first for the train depth groups, then for the simulation cases.
[avg_sem_by_depth / avg_depth_by_depth,
avg_sem_by_case / avg_depth_by_case]

[array([1.07479117, 0.96990115, 0.8820019 , 0.80739784]),
 array([1.00563697, 0.92906222, 0.86320824, 0.80606584])]

In [21]:
# Element-wise train / simulation ratios, pairing depth group i with case i:
# first the SEM averages, then the depth values.
[avg_sem_by_depth / avg_sem_by_case,
avg_depth_by_depth / avg_depth_by_case]

[array([1.15523665, 1.15685451, 1.15692103, 1.15567826]),
 array([1.08090643, 1.10814367, 1.1322694 , 1.15377169])]

In [40]:
# Element-wise ratio of the train SEM std values to the simulation SEM std
# values, pairing depth group i with case i.
np.array(std_sem_by_depth) / np.array(std_sem_by_case)

array([1.13681691, 1.13987588, 1.14074827, 1.13979382])

- fact 1. Train SEM data가 Simulation SEM data보다 평균 약 1.15배 더 밝다 (표준편차도 약 1.14배 크다).
- fact 2. 우리가 예측해야 하는 train/test의 depth가 simulation의 depth보다 더 밝다. 다만 밝은 정도는 case별로 차이가 있다 (약 1.08배 ~ 1.15배).

In [45]:
# Simulation SEM mean & std: averaged over the cases and divided by 255.
np.average(avg_sem_by_case) / 255., np.average(std_sem_by_case) / 255.

(0.3920240934108683, 0.2261711125189735)

In [46]:
# Train SEM mean & std: averaged over the depth groups and divided by 255.
np.average(avg_sem_by_depth) / 255., np.average(std_sem_by_depth) / 255.

(0.4532461876032569, 0.2576853611804214)

In [51]:
# Per-folder check of the simulation depth maps: for every case and each of
# the sub-folders 80-84, print the mean of the per-image pixel averages.
case_list = ['Case_1', 'Case_2', 'Case_3', 'Case_4']
num_list = ['80', '81', '82', '83', '84']

for case in case_list:
    for num in num_list:
        # Build the folder pattern from data_path rather than repeating the
        # absolute dataset root.
        paths = glob(os.path.join(data_path, 'simulation_data', 'Depth', case, num, '*.png'))
        avgs = []
        for path in paths:
            depth = cv2.imread(path)
            # imread returns None when the file cannot be read; fail with the
            # offending path instead of an obscure numpy error.
            if depth is None:
                raise OSError(f"cv2.imread could not read image: {path}")
            # Only the average is reported, so the per-image standard
            # deviation is no longer computed.
            avgs.append(np.average(depth))
        case_avg = np.average(avgs)

        print(case, num, case_avg)

Case_1 80 101.4511178530504
Case_1 81 101.69497633404718
Case_1 82 101.99451359731857
Case_1 83 101.90410260713206
Case_1 84 101.65717146638126
Case_2 80 107.94215072391471
Case_2 81 108.21910676415725
Case_2 82 108.5386858672062
Case_2 83 108.44413858066626
Case_2 84 108.16160835034184
Case_3 80 114.44260625757336
Case_3 81 114.7297829844828
Case_3 82 115.08149289075496
Case_3 83 114.97915133416363
Case_3 84 114.68217986691889
Case_4 80 120.9366334879287
Case_4 81 121.25754219945183
Case_4 82 121.63384282077807
Case_4 83 121.51796542514953
Case_4 84 121.19560470388623
