In [1]:
import os
from glob import glob

import cv2
import numpy as np
import matplotlib.pyplot as plt
import albumentations as A
from albumentations.pytorch import ToTensorV2

import sys
sys.path.append('/root/dacon-epitope/dacon-sem')
from src.datasets.sem_dataset import SEMDataset
from src.datasets.sem_datamodule import SEMDataModule

In [2]:
# Root directory of the dataset; every glob pattern below is built from it.
data_path = os.path.abspath('/shared/Samsung/')

# Simulation split: sorted SEM image paths (PNG files two folder levels below 'SEM').
simulation_sem_paths = np.array(
    sorted(glob(os.path.join(data_path, 'simulation_data', 'SEM', '*', '*', '*.png')))
)
# The depth matches are doubled before sorting, so every depth path appears
# twice in a row in the final array.
# NOTE(review): presumably each depth map is shared by two SEM images, which
# keeps this array index-aligned with simulation_sem_paths -- confirm against
# the dataset layout.
simulation_depth_paths = np.array(
    sorted(2 * glob(os.path.join(data_path, 'simulation_data', 'Depth', '*', '*', '*.png')))
)
simulation_data_len = len(simulation_sem_paths)

# Train split: sorted SEM image paths only.
train_sem_paths = np.array(
    sorted(glob(os.path.join(data_path, 'train', 'SEM', '*', '*', '*.png')))
)
train_data_len = len(train_sem_paths)

# Test split: PNG files sit directly under 'SEM' (no nested folders in the pattern).
test_sem_paths = np.array(
    sorted(glob(os.path.join(data_path, 'test', 'SEM', '*.png')))
)
test_data_len = len(test_sem_paths)

In [3]:
# Show how many SEM image paths were found for the simulation, train and test splits.
simulation_data_len, train_data_len, test_data_len

(173304, 60664, 25988)

In [4]:
depth_list = ['Depth_110', 'Depth_120', 'Depth_130', 'Depth_140']

# One sorted array of training SEM image paths per depth folder, in the same
# order as depth_list.
train_sem_by_depths = [
    np.array(sorted(glob(os.path.join(data_path, 'train', 'SEM', depth_name, '*', '*.png'))))
    for depth_name in depth_list
]

# Number of images found for each depth folder.
print([len(depth_paths) for depth_paths in train_sem_by_depths])

[15166, 15166, 15166, 15166]


In [5]:
# Peek at the first sorted path of every depth group to see the folder layout.
print([depth_paths[0] for depth_paths in train_sem_by_depths])

['/shared/Samsung/train/SEM/Depth_110/site_00000/SEM_043510.png', '/shared/Samsung/train/SEM/Depth_120/site_00000/SEM_021742.png', '/shared/Samsung/train/SEM/Depth_130/site_00000/SEM_065114.png', '/shared/Samsung/train/SEM/Depth_140/site_00000/SEM_000165.png']


In [6]:
# Count the entries directly under each depth folder of the training SEM data.
# The folder is derived from data_path so this cell follows the same dataset
# root as the glob patterns, instead of a second hard-coded copy of the path
# that would silently go stale if the root is ever changed.
for depth in depth_list:
    print(len(os.listdir(os.path.join(data_path, 'train', 'SEM', depth))))

516
515
514
514


## Normalization

In [7]:
# Build the simulation dataset from the SEM paths and the depth paths
# collected earlier, then display its length.
simul_ds = SEMDataset(simulation_sem_paths, simulation_depth_paths)
len(simul_ds)

173304

In [8]:
# Build the training dataset from SEM paths only (no depth paths are passed),
# then display its length.
train_ds = SEMDataset(train_sem_paths)
len(train_ds)

60664

In [10]:
# Per-image mean, reduced over axes 1 and 2, for every item of the simulation
# dataset. Only the second element of each item is used; the other two are ignored.
[np.mean(image.numpy(), axis=(1, 2)) for _, image, _ in simul_ds]

[array([0.39578912], dtype=float32),
 array([0.39042416], dtype=float32),
 array([0.41139483], dtype=float32),
 array([0.40735748], dtype=float32),
 array([0.41086033], dtype=float32),
 array([0.39057508], dtype=float32),
 array([0.38358414], dtype=float32),
 array([0.41487727], dtype=float32),
 array([0.37195334], dtype=float32),
 array([0.3924383], dtype=float32),
 array([0.39306808], dtype=float32),
 array([0.41387868], dtype=float32),
 array([0.4014139], dtype=float32),
 array([0.40581203], dtype=float32),
 array([0.4152914], dtype=float32),
 array([0.4002814], dtype=float32),
 array([0.40693763], dtype=float32),
 array([0.39911383], dtype=float32),
 array([0.40740627], dtype=float32),
 array([0.40141842], dtype=float32),
 array([0.40535024], dtype=float32),
 array([0.3961999], dtype=float32),
 array([0.40720883], dtype=float32),
 array([0.41180786], dtype=float32),
 array([0.3820023], dtype=float32),
 array([0.39103124], dtype=float32),
 array([0.37980554], dtype=float32),
 array(

In [13]:
def get_mean_std(dataset):
  """Print and return the average per-image mean and std of a dataset.

  Every item of ``dataset`` is unpacked as a 3-tuple whose second element
  exposes ``.numpy()``. For each image the mean and the std are reduced over
  axes 1 and 2 and only index 0 of the result is kept (assumes the images
  have a single leading channel -- TODO confirm against SEMDataset).

  The dataset is traversed exactly once and ``.numpy()`` is called once per
  image, so one-shot iterables work and a disk-backed dataset is not read
  twice.

  Note: the reported std is the mean of the per-image stds, not the std of
  all pixels pooled over the whole dataset.

  Args:
    dataset: iterable yielding ``(_, image, _)`` tuples.

  Returns:
    ``(mean, std)`` -- the same two values that are printed.
  """
  first_channel_means = []
  first_channel_stds = []
  for _, image, _ in dataset:
    pixels = image.numpy()  # convert once, reuse for both statistics
    first_channel_means.append(np.mean(pixels, axis=(1, 2))[0])
    first_channel_stds.append(np.std(pixels, axis=(1, 2))[0])

  mean = np.mean(first_channel_means)
  std = np.mean(first_channel_stds)

  # Keep printing so existing notebook cells show the same two lines.
  print(mean)
  print(std)
  return mean, std

In [14]:
# Print the mean / std statistics of the simulation dataset's images.
get_mean_std(simul_ds)

0.3920241
0.22617114
