# Data processing
Here we preprocess the data and save it into .pth files so that it can be loaded for the experiments. We also split the recordings into a training set and a test (evaluation) set, for both the images and their segmentations.

In [1]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
import pandas as pd
import cv2

import torch
import torch.nn.functional as F

from utils.plotting import flow2img, overlaySegment
from utils.encoding import dice_coeff
from utils.layers import warp, warpImage
from utils.load_models import load_flownet2
from utils.preprocessing import preprocessing_flownet

import os
# CUDA runtime settings for this kernel: blocking kernel launches, devices
# ordered by PCI bus id, and only device 0 visible to the process.
os.environ.update({
    'CUDA_LAUNCH_BLOCKING': '1',
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": '0',
})

In [2]:
# Read the table of available patients and discard the index column that was
# written into the CSV file.
available = pd.read_csv(
    "/home/nicke/MasterThesis/available_US_probands.csv"
).drop(columns=['Unnamed: 0'])
available

Unnamed: 0,Id,Patient,Device,Leg,Anatomy,Landmark,Datatype
0,3,1,guys,LT,CFV / GSV + SFA / PFA,LM3,healthy
1,9,2,guys,RT,SFV1,LM5,healthy
2,10,2,guys,RT,SFV2,LM6,healthy
3,20,3,guys,LT,SFV1,LM5,healthy
4,25,4,guys,RT,SFV1,LM5,healthy
...,...,...,...,...,...,...,...
470,4186,296,clarius-hd,RT,BACKGROUND,BACKGROUND,background
471,4242,300,clarius-hd,RT,BACKGROUND,BACKGROUND,background
472,4300,304,clarius-hd,RT,BACKGROUND,BACKGROUND,background
473,4333,306,clarius-hd,RT,BACKGROUND,BACKGROUND,background


In [3]:
# Reproducible 90/10 split of the recording IDs into a training set and a
# testing (evaluation) set.
# NOTE(review): the split is drawn per recording Id, not per Patient. The table
# above lists patients with several Ids (e.g. patient 2 owns Ids 9 and 10), so
# recordings of one patient may land on both sides - confirm this is intended.
np.random.seed(42)
id_list = available['Id'].values
n_train = int(len(id_list) * 0.9)
train_id = np.random.choice(id_list, n_train, replace=False)

# every Id that was not drawn for training is used for testing
test_id = id_list[~np.isin(id_list, train_id)]

for split_name, split_ids in (("training", train_id), ("testing", test_id)):
    print("There are {} IDs for {}".format(len(split_ids), split_name))

There are 427 IDs for training
There are 48 IDs for testing


In [4]:
# Count how many fixed/moving image pairs the training IDs can provide.
# This is an upper bound: get_image_seg_pairs later drops every pair whose
# fixed or moving segmentation is empty.
num_train_pairs = 0

for train in train_id:
    path = os.path.join("/share/data_ultraschall/compressions", str(train), "frames")
    n_frames = len(os.listdir(path))

    if n_frames <= 20:
        # 20 frames or fewer: only one pair can be taken
        num_train_pairs += 1
    elif n_frames <= 30:
        # between 21 and 30 frames: two fixed/moving pairs are selected
        num_train_pairs += 2
    else:
        # more than 30 frames: three fixed/moving pairs are taken
        num_train_pairs += 3

print("Over all we have {} number of training pairs".format(num_train_pairs))

Over all we have 1195 number of training pairs


In [5]:
# Count how many fixed/moving image pairs the test IDs can provide.
# This is an upper bound: get_image_seg_pairs later drops every pair whose
# fixed or moving segmentation is empty.
num_test_pairs = 0

for test in test_id:
    path = os.path.join("/share/data_ultraschall/compressions", str(test), "frames")
    n_frames = len(os.listdir(path))

    if n_frames <= 20:
        # 20 frames or fewer: only one pair can be taken
        num_test_pairs += 1
    elif n_frames <= 30:
        # between 21 and 30 frames: two fixed/moving pairs are selected
        num_test_pairs += 2
    else:
        # more than 30 frames: three fixed/moving pairs are taken
        num_test_pairs += 3

print("Over all we have {} number of testing/eval pairs".format(num_test_pairs))

Over all we have 133 number of testing/eval pairs


In [14]:
def load_image_and_seg(path, image):
    """Load one frame and its segmentation mask as scaled numpy arrays.

    Args:
        path: Directory of one recording. The frame is read from
            ``<path>/frames`` and the mask from ``<path>/segmentations/1``.
        image: File name of the frame; the mask is expected under the same
            file name.

    Returns:
        Tuple ``(img, seg)``: the frame divided by 255 and the mask divided
        by 200.
    """
    frame_file = os.path.join(path, "frames", image)
    seg_file = os.path.join(path, "segmentations", "1", image)

    # Context managers release the file handles deterministically instead of
    # leaving that to the garbage collector; this matters when thousands of
    # files are read in one loop.
    with Image.open(frame_file) as frame:
        img = np.array(frame) / 255

    # Normalize over the labels.
    # NOTE(review): presumably 200 is the grey value used for the label in the
    # mask files, so that the label becomes 1.0 - confirm against the data.
    with Image.open(seg_file) as mask:
        seg = np.array(mask) / 200

    return img, seg
    
def _moving_index(fixed_idx, n_frames, frame_offset):
    """Return the index of the moving frame for ``fixed_idx``, or None if the
    sequence holds no other frame."""
    moving_idx = fixed_idx + frame_offset
    if moving_idx > n_frames - 1:
        # not enough frames after the fixed one: look backwards instead,
        # clamped to the first frame
        moving_idx = max(fixed_idx - frame_offset, 0)
    if moving_idx == fixed_idx:
        # only reachable for fixed_idx == 0 in a sequence shorter than the
        # offset; take the last frame so a frame is never paired with itself
        moving_idx = n_frames - 1
    return moving_idx if moving_idx != fixed_idx else None


def get_image_seg_pairs(path, frame_offset=6):
    """Randomly pick fixed/moving frame pairs (and their segmentations) of one recording.

    The number of pairs depends on the number of frames in ``<path>/frames``:
    more than 30 frames give three pairs, more than 20 give two, otherwise one.
    Fixed frames are drawn without replacement from numpy's global random
    state. Pairs in which the fixed or the moving segmentation is empty
    (maximum of 0) are dropped.

    Args:
        path: Directory of one recording, as expected by ``load_image_and_seg``.
        frame_offset: Distance in frames between the fixed and the moving
            image. The moving frame is taken ``frame_offset`` frames after the
            fixed one, or before it if the sequence ends too early.

    Returns:
        Tuple ``(frame_pairs, seg_pairs)`` of numpy arrays; every entry holds
        ``[fixed, moving]``. Both arrays are empty if no pair is available.

    Raises:
        ValueError: If ``frame_offset`` is smaller than 1.
    """
    if frame_offset < 1:
        raise ValueError("frame_offset must be at least 1")

    all_files = sorted(os.listdir(os.path.join(path, "frames")))
    n_frames = len(all_files)

    # An empty folder cannot provide a pair (np.random.choice would raise)
    if n_frames == 0:
        return np.array([]), np.array([])

    # Select how many image pairs there are for the patient
    if n_frames > 30:
        size = 3
    elif n_frames > 20:
        size = 2
    else:
        size = 1

    # select random indices for the fixed images and make sure none is doubled
    rand_idx = np.random.choice(np.arange(0, n_frames), size=size, replace=False)

    frame_pairs = []
    seg_pairs = []

    for idx in rand_idx:
        # for every fixed image index, we need a moving image index
        moving_idx = _moving_index(idx, n_frames, frame_offset)
        if moving_idx is None:
            continue

        # load the seg and frame for fixed and moving
        fixed, fixed_seg = load_image_and_seg(path, all_files[idx])
        moving, moving_seg = load_image_and_seg(path, all_files[moving_idx])

        # pairs without a label in either segmentation are not used
        if fixed_seg.max() == 0 or moving_seg.max() == 0:
            continue

        # and store them together
        frame_pairs.append([fixed, moving])
        seg_pairs.append([fixed_seg, moving_seg])

    return np.array(frame_pairs), np.array(seg_pairs)

In [15]:
# Collect the fixed/moving frame pairs and their segmentations of all test IDs

test_frames = []
test_segs = []
for test in test_id:
    path = os.path.join("/share/data_ultraschall/compressions", str(test))
    imgs, segs = get_image_seg_pairs(path)

    # extend() adds the pairs one by one, in the order they were returned
    test_frames.extend(imgs)
    test_segs.extend(segs)

In [16]:
# Stack the collected pairs into one array each and wrap them as torch tensors;
# every frame pair needs a segmentation pair of the same shape.
test_frames, test_segs = [
    torch.from_numpy(np.array(pairs)) for pairs in (test_frames, test_segs)
]

assert test_frames.shape == test_segs.shape

In [17]:
# Collect the fixed/moving frame pairs and their segmentations of all training IDs

train_frames = []
train_segs = []
for train in train_id:
    path = os.path.join("/share/data_ultraschall/compressions", str(train))
    imgs, segs = get_image_seg_pairs(path)

    # extend() adds the pairs one by one, in the order they were returned
    train_frames.extend(imgs)
    train_segs.extend(segs)

In [18]:
# Stack the collected pairs into one array each and wrap them as torch tensors;
# every frame pair needs a segmentation pair of the same shape.

train_frames, train_segs = [
    torch.from_numpy(np.array(pairs)) for pairs in (train_frames, train_segs)
]

assert train_frames.shape == train_segs.shape

In [19]:
# Inspect the stacked training frames: (number of pairs, 2 = fixed/moving, image dimensions)
train_frames.shape

torch.Size([1090, 2, 150, 150])

In [20]:
# save train
# Both tensors are written below one absolute directory, so the target does
# not depend on the working directory the notebook was started in.
train_save_dir = "/share/data_ultraschall/nicke_ma/data"
torch.save(train_frames, os.path.join(train_save_dir, "train_frames_disp_6.pth"))
torch.save(train_segs, os.path.join(train_save_dir, "train_segs_disp_6.pth"))

In [21]:
# save test
# Both tensors are written below the absolute data directory that also holds
# the training frames, so the target does not depend on the working directory
# the notebook was started in.
test_save_dir = "/share/data_ultraschall/nicke_ma/data"
torch.save(test_frames, os.path.join(test_save_dir, "test_frames_disp_6.pth"))
torch.save(test_segs, os.path.join(test_save_dir, "test_segs_disp_6.pth"))