# Compile Data into HDF5 File
Loads the given data and compiles each dataset into a single HDF5 file.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from pathlib import Path

import h5py
import pickle

from IPython.display import clear_output
from sklearn.model_selection import train_test_split
from skimage.transform import resize
from tqdm.notebook import tqdm

from utils.get_file import GetTV

In [None]:
# Folder of per-file inversion pickles and folder of the compiled HDF5 files.
inversion_data_path, hdf5_path = (
    Path(folder) for folder in ('outputs/inversion_data', 'outputs/hdf5')
)

In [None]:
# Open the 'tv_images/l-mode' folder and collect per-file bookkeeping.
tv = GetTV('tv_images/l-mode')
files = tv.list_files()
file_lengths = [tv.file_len(path, False) for path in files]

# Running totals with a leading 0, so cumulative_lengths[k] is the index of
# file k's first frame in the concatenated datasets.
running_totals = np.cumsum(file_lengths)
cumulative_lengths = np.insert(running_totals, 0, 0)

# Array shapes are taken from the first file only.
first_file = files[0]
tv_dim = tv.load(first_file, 'vid').shape
inversion_dim = tv.load(first_file, 'inverted').shape

In [None]:
# Sanity check: frame offsets per file, then the shape of the first video.
for summary in (cumulative_lengths, tv_dim):
    print(summary)

### Raw and Points

In [None]:
# Compile every raw TV video and its labelled points into 'tv_raw.hdf5'.
hdf5_file_name = hdf5_path / 'tv_raw.hdf5'
total_frames = int(np.sum(file_lengths))

with h5py.File(hdf5_file_name, 'w') as hf:
    # NOTE(review): tv_dim is the full shape returned by
    # tv.load(files[0], 'vid'). If that shape has a leading frame axis, the
    # image dims should be tv_dim[1], tv_dim[2] (the manual-points cell indexes
    # the 'inverted' shape that way) -- confirm.
    tv_dataset = hf.create_dataset("tv_images", shape=(total_frames, tv_dim[0], tv_dim[1]), dtype='uint8')
    points_dataset = hf.create_dataset("points", shape=(total_frames, 4), dtype='float32')
    for idx, file in enumerate(files):
        # Materialise the video as an ndarray so it can be sliced in one go.
        tv_image_process = np.asarray(tv._load_data(file, 'vid'))

        # Labels live in a pickle named after the video file.
        # NOTE: pickle.load can execute arbitrary code; only load label files
        # produced by this project.
        pkl_path = (inversion_data_path / file.stem).with_suffix('.pkl')
        with open(pkl_path, 'rb') as pkl_file:
            label_info = pickle.load(pkl_file)
        # Left and right locations side by side: one row per frame.
        points = np.concatenate((label_info['l_location'], label_info['r_location']), 1)

        # One slice write per file instead of one HDF5 write per frame.
        n_frames = file_lengths[idx]
        start = cumulative_lengths[idx]
        stop = start + n_frames
        tv_dataset[start:stop] = tv_image_process[:n_frames]
        points_dataset[start:stop] = points[:n_frames]

### Inversion and Points

In [None]:
# With auto-labeled points

hdf5_file_name = hdf5_path / 'compiled_inversion_no_image.hdf5'
total_frames = int(np.sum(file_lengths))

# The context manager closes the file even when a pickle is missing or a
# write fails part-way through.
with h5py.File(hdf5_file_name, 'w') as hf:
    rz_dataset = hf.create_dataset("rz", shape=(total_frames, 4), dtype='float32')
    intensity_dataset = hf.create_dataset("intensity", shape=(total_frames, 2), dtype='float32')

    # Fill both datasets file by file.
    for idx, file in enumerate(files):
        # NOTE: pickle.load can execute arbitrary code; only load label files
        # produced by this project.
        pkl_path = (inversion_data_path / file.stem).with_suffix('.pkl')
        with open(pkl_path, 'rb') as pkl_file:
            label_info = pickle.load(pkl_file)
        points = np.concatenate((label_info['l_location'], label_info['r_location']), 1)
        # NOTE(review): unlike the locations above, the intensities are
        # concatenated along axis 0 (no axis argument). If there is one
        # intensity per frame, row i below only ever comes from 'l_intensity'
        # and is broadcast across both columns -- confirm whether a
        # (frames, 2) stack was intended.
        points_i = np.concatenate((label_info['l_intensity'], label_info['r_intensity']))

        start = cumulative_lengths[idx]
        for i in range(file_lengths[idx]):
            rz_dataset[start + i] = points[i]
            intensity_dataset[start + i] = points_i[i]

In [None]:
# With manual points
csv_location = Path('outputs/manual_labeled_points')
tv_location = Path('tv_images/l-mode-train')
hdf5_file_name = hdf5_path / 'inversion_manual.hdf5'

# NOTE: this cell rebinds the notebook-wide `tv` and `cumulative_lengths`;
# any cell executed afterwards sees the values assigned here.
tv = GetTV(tv_location)
csvs = GetTV(csv_location)
csv_files = csvs.list_files()
tv_files = tv.list_files()
print(tv_files)

# Number of data rows per CSV: line count minus the first (header) line,
# which pd.read_csv below also consumes as the header by default.
csv_lens = []
for file in csv_files:
    # Named csv_handle so the builtin `object` is not shadowed.
    with open(file) as csv_handle:
        csv_lens.append(sum(1 for _ in csv_handle) - 1)
csv_len = sum(csv_lens)
inverted_dim = tv.load(tv_files[0], 'inverted').shape
# First TV/CSV pair, referenced by the commented-out preview snippet that
# follows this cell.
tv_file = tv_files[0]
csv_file = csv_files[0]
cumulative_lengths = np.insert(np.cumsum(csv_lens), 0, 0)

with h5py.File(hdf5_file_name, 'w') as hf:
    tv_dataset = hf.create_dataset("inverted", shape=(csv_len, inverted_dim[1], inverted_dim[2]), dtype='float32')
    points_dataset = hf.create_dataset("points", shape=(csv_len, 2), dtype='float32')
    # tv_files[k] is paired with csv_files[k] purely by position.
    # NOTE(review): assumes both listings come back in matching order -- confirm.
    for idx, file in enumerate(tv_files):
        inverted = tv.load(file, 'inverted')
        process = np.asarray(inverted)
        f_open = pd.read_csv(csv_files[idx]).to_numpy()
        start = cumulative_lengths[idx]
        for i in range(csv_lens[idx]):
            tv_dataset[start + i] = process[i]
            # Columns 1 and 2 of the CSV row are divided by 4 before storing.
            # NOTE(review): presumably the points were labelled on a 4x
            # upscaled image -- confirm.
            l1, l2 = int(f_open[i][1]) / 4, int(f_open[i][2]) / 4
            points_dataset[start + i] = [l1, l2]
            
# with open(csv_file, mode ='r')as file:
#     csvFile = csv.reader(file)
#     for lines in csvFile:
#         plt.imshow(tv.load(tv_file, 'inverted')[0])
#         plt.scatter(int(lines[1])/4, int(lines[2])/4, c='r')

### Raw and Synthetic and Points

In [None]:
hdf5_file_name = hdf5_path / 'x_outer_radiation.hdf5'

# Frame offsets are rebuilt from file_lengths here because the manual-points
# cell rebinds the shared `cumulative_lengths` to CSV row counts.
total_frames = int(np.sum(file_lengths))
file_offsets = np.insert(np.cumsum(file_lengths), 0, 0)

# NOTE(review): the manual-points cell also rebinds `tv` to the
# 'tv_images/l-mode-train' folder while `files` still lists 'tv_images/l-mode';
# confirm the loader resolves these files correctly when run after that cell.
with h5py.File(hdf5_file_name, 'w') as hf:
    # The datasets have to be created in *this* file: the names left over from
    # earlier cells point into other HDF5 files that are already closed.
    # "tv_images" and "points" are the names the TV Synthetic cell reads back.
    tv_dataset = points_dataset = intensity_dataset = None

    for idx, file in enumerate(files):
        # NOTE(review): other cells treat 'vid_frames' as frame indices and
        # 'vid' as the images -- confirm this is the intended key here.
        tv_image = tv._load_data(file, 'vid_frames')
        tv_image_process = np.asarray(tv_image)

        # NOTE: pickle.load can execute arbitrary code; only load label files
        # produced by this project.
        pkl_path = (inversion_data_path / file.stem).with_suffix('.pkl')
        with open(pkl_path, 'rb') as pkl_file:
            label_info = pickle.load(pkl_file)
        points = np.concatenate((label_info['x_location'], label_info['r_location']), 1)
        # NOTE(review): concatenated along axis 0, unlike the locations above;
        # confirm whether a (frames, 2) stack was intended.
        points_i = np.concatenate((label_info['x_intensity'], label_info['r_intensity']))

        if tv_dataset is None:
            # Per-frame shapes and the image dtype come from the first file.
            tv_dataset = hf.create_dataset(
                "tv_images",
                shape=(total_frames, *tv_image_process.shape[1:]),
                dtype=tv_image_process.dtype,
            )
            points_dataset = hf.create_dataset(
                "points", shape=(total_frames, points.shape[1]), dtype='float32'
            )
            intensity_dataset = hf.create_dataset(
                "intensity", shape=(total_frames, 2), dtype='float32'
            )

        start = file_offsets[idx]
        for i in range(file_lengths[idx]):
            tv_dataset[start + i] = tv_image_process[i]
            points_dataset[start + i] = points[i]
            intensity_dataset[start + i] = points_i[i]

### Raw and Inversion and Points for Direct

In [None]:
# Manual Point Model
# Unpickle the model whose .predict() is applied to flattened inverted frames
# in the cells below.
# NOTE: pickle.load can execute arbitrary code; only load trusted model files.
modelpath = Path('outputs/models')
file_name = 'lr_inversion_manual.pkl'
model_file_path = modelpath / file_name
with model_file_path.open('rb') as f:
    inversion_model = pickle.load(f)

In [None]:
# Homogeneous data: pool the samples of every file into three parallel lists.
points_train_test, tv_train_test, inverted_train_test = [], [], []
for file in files:
    # Keep only the TV frames selected by the file's 'frames' indices.
    frames = tv.load(file, 'frames').astype('int')
    tv_image = tv.load(file, 'vid')[frames]
    inversion = tv.load(file, 'inverted')
    # One flattened row per inverted frame, as input for the point model.
    inversion_vid2 = inversion.reshape((len(inversion), -1))
    for frame_no in range(len(frames)):
        tv_train_test.append(tv_image[frame_no])
        inverted_train_test.append(inversion[frame_no])
        model_input = inversion_vid2[frame_no].reshape(1, -1)
        points_train_test.append(inversion_model.predict(model_input))

# Report the stacked shape of each list.
for collected in (points_train_test, tv_train_test, inverted_train_test):
    print(np.array(collected).shape)

In [None]:
# TV frames listed in 'vid_frames' but not in 'frames'.
tv_only = []
for file in files:
    all_indices = tv.load(file, 'vid_frames').astype(int)
    used_indices = tv.load(file, 'frames').astype(int)
    frames = np.setdiff1d(all_indices, used_indices)
    tv_image = tv.load(file, 'vid')[frames]
    tv_only.extend(tv_image[frame_no] for frame_no in range(len(frames)))

print(np.array(tv_only).shape)

In [None]:
# Store the pooled lists in one file, one dataset per list.
# NOTE(review): the split-by-file cell opens this same 'tv_inv_outer.h5' path
# in 'w' mode and therefore replaces what is written here -- confirm intent.
pooled_datasets = (
    ('vid', tv_train_test),
    ('inverted', inverted_train_test),
    ('points', points_train_test),
    ('vid_only', tv_only),
)
with h5py.File(hdf5_path / 'tv_inv_outer.h5', 'w') as f:
    for dataset_name, values in pooled_datasets:
        f.create_dataset(dataset_name, data=values)

In [None]:
# split by file
def _collect_samples(file_list):
    """Collect TV frames, inverted frames and predicted points for files.

    For each file the TV video is indexed with the file's 'frames' array, and
    the notebook-level ``inversion_model`` predicts points from each flattened
    inverted frame.

    Args:
        file_list: Files accepted by the notebook-level ``tv.load``.

    Returns:
        Tuple ``(tv_frames, inverted_frames, predicted_points)`` of three
        parallel lists with one entry per selected frame.
    """
    tv_frames, inverted_frames, predicted_points = [], [], []
    for file in file_list:
        frames = tv.load(file, 'frames').astype('int')
        tv_image = tv.load(file, 'vid')[frames]
        inversion = tv.load(file, 'inverted')
        # One flattened row per inverted frame, as input for the point model.
        inversion_vid2 = inversion.reshape((len(inversion), -1))
        for i in range(len(frames)):
            tv_frames.append(tv_image[i])
            inverted_frames.append(inversion[i])
            predicted_points.append(inversion_model.predict(inversion_vid2[i].reshape(1, -1)))
    return tv_frames, inverted_frames, predicted_points


# Split at file level (80/20, fixed seed) so that all frames of one file end
# up in the same partition.
train_files, test_files = train_test_split(files, test_size=0.2, random_state=30)
print(train_files)
print(test_files)

tv_train, inverted_train, points_train = _collect_samples(train_files)
tv_test, inverted_test, points_test = _collect_samples(test_files)

In [None]:
# Store the per-file train/test partitions, one dataset per list.
# NOTE(review): 'w' mode replaces any existing 'tv_inv_outer.h5', including
# the one written by the homogeneous-data cell -- confirm intent.
split_datasets = (
    ('vid_train', tv_train),
    ('inverted_train', inverted_train),
    ('points_train', points_train),
    ('vid_test', tv_test),
    ('inverted_test', inverted_test),
    ('points_test', points_test),
)
with h5py.File(hdf5_path / 'tv_inv_outer.h5', 'w') as f:
    for dataset_name, values in split_datasets:
        f.create_dataset(dataset_name, data=values)

### Raw and Inversion and Points for Cycle-GAN

Similar to the direct case, but normalizes the images and changes their resolution.

In [None]:
# Build paired TV / inverted images, normalised and resized to 256x256.
tv_train_test = []
inverted_train_test = []
# tqdm wraps the list itself rather than an enumerate() generator, so it
# knows the total number of files and can show real progress.
for file in tqdm(files):
    frames = tv.load(file, 'frames').astype('int')
    # (x - 127.5) / 127.5 maps 0..255 onto -1..1.
    tv_image = (tv.load(file, 'vid')[frames] - 127.5)/127.5
    # NOTE(review): the same mapping with 7.5 presumably assumes inverted
    # values in 0..15 -- confirm.
    inversion = (tv.load(file, 'inverted') - 7.5)/7.5

    # order=0 is nearest-neighbour; the TV frames are also flipped along both
    # image axes.
    tv_image = np.flip(resize(tv_image, (len(frames), 256, 256), order=0, preserve_range=True), axis=(2,1))
    # NOTE(review): the target frame count is len(frames); if 'inverted' holds
    # a different number of frames, resize resamples along the frame axis too.
    inversion = resize(inversion, (len(frames), 256, 256), order=0, preserve_range=True)

    for i in range(len(frames)):
        tv_train_test.append(tv_image[i])
        inverted_train_test.append(inversion[i])

print(np.array(tv_train_test).shape)
print(np.array(inverted_train_test).shape)

In [None]:
# tv_only = []
# for idx, file in tqdm(enumerate(files)):
#     frames = np.setdiff1d(tv.load(file, 'vid_frames').astype(int), tv.load(file, 'frames').astype(int))
#     tv_image = (tv.load(file, 'vid')[frames] - 127.5)/127.5
    
#     tv_image = np.flip(resize(tv_image, (len(frames), 256, 256), order=0, preserve_range=True), axis=(2,1))
    
#     for i in range(len(frames)):
#         tv_only.append(tv_image[i])
        
# print(np.array(tv_only).shape)

In [None]:
# Print value ranges and show one TV / inverted pair side by side.
idx = 2000
print(np.max(tv_train_test), np.min(tv_train_test))
sample_inverted = inverted_train_test[idx]
print(np.max(sample_inverted), np.min(sample_inverted))
for panel, image in enumerate((tv_train_test[idx], sample_inverted), start=1):
    plt.subplot(1, 2, panel)
    plt.imshow(image, origin='lower')
plt.show()

In [None]:
# Hold out 20% of the TV/inverted pairs; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    tv_train_test,
    inverted_train_test,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Split the pairs 80/20 (fixed seed) and store them as A (TV) / B (inverted).
paired_split = train_test_split(tv_train_test, inverted_train_test, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = paired_split
proof_datasets = (
    ('A_train', X_train),
    ('A_test', X_test),
    ('B_train', y_train),
    ('B_test', y_test),
)
with h5py.File(hdf5_path / 'img_2_img_proof.h5', 'w') as f:
    for dataset_name, values in proof_datasets:
        f.create_dataset(dataset_name, data=values)

Full Training Set and Full TV Dataset

In [None]:
# Store every pair, unsplit, as the full training set.
full_train_path = hdf5_path / 'img_2_img_full_train.h5'
with h5py.File(full_train_path, 'w') as f:
    for dataset_name, values in (('A_train', tv_train_test), ('B_train', inverted_train_test)):
        f.create_dataset(dataset_name, data=values)

In [None]:
# Normalise and resize every TV video to 256x256.
tv_only = []
# tqdm wraps the list itself rather than an enumerate() generator, so it
# knows the total number of files and can show real progress.
for file in tqdm(files):
    frames = tv.load(file, 'vid_frames').astype(int)
    # (x - 127.5) / 127.5 maps 0..255 onto -1..1.
    tv_image = (tv.load(file, 'vid') - 127.5)/127.5

    # order=0 is nearest-neighbour; frames are flipped along both image axes.
    # NOTE(review): the target frame count is len(frames) ('vid_frames') while
    # the input is the whole 'vid' array; if the two lengths differ, resize
    # resamples along the frame axis as well -- confirm they always match.
    tv_image = np.flip(resize(tv_image, (len(frames), 256, 256), order=0, preserve_range=True), axis=(2,1))

    for i in range(len(frames)):
        tv_only.append(tv_image[i])

print(np.array(tv_only).shape)

In [None]:
# Store the full set of processed TV frames as the 'A_test' dataset.
full_tv_path = hdf5_path / 'img_2_img_full_tv.h5'
with h5py.File(full_tv_path, 'w') as f:
    f.create_dataset('A_test', data=tv_only)

### TV Synthetic HDF5

In [None]:
randomization = False

file_name_1 = Path('outputs/hdf5/s_outs_v3_limited.h5')
file_name_2 = Path('outputs/hdf5/x_outer_radiation.hdf5')

out_path = Path('outputs')

# Synthetic images, rescaled with x * 2 - 1.
# NOTE(review): presumably maps 0..1 onto -1..1 -- confirm the stored range.
with h5py.File(file_name_1, 'r') as f:
    synthetic_images = f['image'][:] * 2 - 1

# TV images, rescaled with x / 127.5 - 1 (0..255 onto -1..1), plus points.
with h5py.File(file_name_2, 'r') as f:
    points = f['points'][:]
    tv_images = f['tv_images'][:] / 127.5 - 1

print(len(synthetic_images), len(tv_images))
# Use at most 1840 images, but never more than either source provides:
# resize() to a larger first dimension would otherwise resample across
# different images instead of failing.
file_len = min(1840, len(synthetic_images), len(tv_images))

# Both stacks are resized to 256x256; the TV stack is additionally flipped
# along axis 1.
crop_synthetic = resize(synthetic_images[:file_len], (file_len, 256, 256))
crop_tv = np.flip(resize(tv_images[:file_len], (file_len, 256, 256)), axis=1)


In [None]:
# Value range of the rescaled synthetic stack, then of the TV stack.
for image_stack in (synthetic_images, tv_images):
    print(image_stack.min(), image_stack.max())

In [None]:
# Show the resized synthetic and TV image at one index and print their ranges.
idx = 1799
for position, stack in enumerate((crop_synthetic, crop_tv), start=1):
    axis = plt.subplot(1, 2, position)
    axis.imshow(stack[idx], origin='lower')
    print(np.min(stack[idx]), np.max(stack[idx]))

plt.show()

In [None]:
# Shape of the synthetic training split. `synth_train` is only assigned by
# cells further down (the train_test_split cell and the cell that reloads
# 'outputs/tv_synth.h5'), so one of them must have been run first.
print(synth_train.shape)

In [None]:
# Fraction of the paired images that goes into the training split.
# (A separate validation split -- tts_percent = 0.99 with synth_val / tv_val --
# is currently disabled.)
tvs_percent = 0.8

synth_train, synth_test, tv_train, tv_test = train_test_split(
    crop_synthetic,
    crop_tv,
    train_size=tvs_percent,
    random_state=42,
)

print(*(len(part) for part in (synth_train, synth_test, tv_train, tv_test)))

# Store the four partitions, one dataset each.
synth_tv_datasets = (
    ('synth_train', synth_train),
    ('synth_test', synth_test),
    ('tv_train', tv_train),
    ('tv_test', tv_test),
)
with h5py.File(out_path / 'tv_synth.h5', 'w') as f:
    for dataset_name, values in synth_tv_datasets:
        f.create_dataset(dataset_name, data=values)

In [None]:
# Show TV image 211 next to synthetic image 10.
for position, image in enumerate((crop_tv[211], crop_synthetic[10]), start=1):
    panel = plt.subplot(1, 2, position)
    panel.imshow(image, origin='lower')
plt.show()

In [None]:
# Read the four stored partitions back into memory.
# (The synth_val / tv_val datasets are not read; they are not written either.)
file_name = 'outputs/tv_synth.h5'

with h5py.File(file_name, 'r') as f:
    synth_train, synth_test, tv_train, tv_test = (
        f[dataset_name][:]
        for dataset_name in ('synth_train', 'synth_test', 'tv_train', 'tv_test')
    )


In [None]:
# Show one reloaded synthetic / TV training pair side by side.
idx = 884
for position, stack in enumerate((synth_train, tv_train), start=1):
    panel = plt.subplot(1, 2, position)
    panel.imshow(stack[idx], origin='lower')

plt.show()
