# Load data 


The goal of this notebook is to move a collection of paired input and target images into the folder layout required for training and inference with the Pix2PixHD GAN model. The core function used within this notebook is `train_test_move_commands2`, which generates the slurm command to move each image accordingly.<br>

Please see the Pix2PixHD readme for the full requirements on how to set up the data loaders, but in short the file paths must be as follows:<br>

**Train:**

   **/path/to/files/train_A** </br>
   
   **/path/to/files/train_B**
   
**Val:**

   **/path/to/files/val_A**   
   
   **/path/to/files/val_B**

**Test:**

   **/path/to/files/test_A**   
   
   **/path/to/files/test_B** </br>
   
### <span style='color:red'>IMPORTANT!</span>
   
The files are saved as ***BARCODE_rXXcXXfXXp01.tiff***, for example 3CG0021772_r01c01f02p01.tiff. This ensures that files from different barcodes do not overwrite each other. Following this convention is very important for downstream implementation.<br>

I have chosen to combine the image paths along with all the metadata into a CSV and extract random samples based on my experiment design, but you can follow any approach you wish as long as you save the images in the format outlined above.
    



### Import parameters and helper functions

In [1]:
import os
import shutil
import sys
import numpy as np
import pandas as pd
root_dir = '../../../'
sys.path.append(root_dir)

from slurm.sbatch import submit_array
from slurm.commands import move_16bitimage
from slurm.commands import combine_AB
from utils.df_utils  import df_channel_create as channel_create
from utils.df_utils import df_image_folder_match as image_folder_match
from utils.util_utils import extract_samples, find_file, train_test_move_commands2,create_directory



### Define paths

**As a MINIMUM you will need to UPDATE**:
- conda_path
- repo_path
- model_name
- tissue: either Lung, Breast or Ovarian
- output_path: where to save images


In [2]:
## Mandatory to update
# Experiment identifiers; both become components of output_path below.
model_name = "bf_fitc_30k_nontoxic"
tissue = "Lung"

# Conda environment directory.
conda_path = os.path.join(
    '/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/conda_envs/',
    'pytorch_cyclegan',
)

# Directory of the checked-out repository that contains move_tesaro.py.
repo_path = '/hpc/scratch/nvme3/smt29021/phase1_repo_1122/Structure-Specific-Contrast-Enhancement/workflow/phase2_sampling_normalisation/'

# Where the images are saved: <data root>/<tissue>/Step1_Preprocessing/ACT1_Normalise/<model_name>.
# NOTE(review): the command-generation cell further down reassigns output_path
# to a hard-coded Breast directory, so this value is not the one used there.
output_path = os.path.join(
    "/hpc/scratch/hdd3/smt29021/Tesaro-DNA-Damage/Data/",
    tissue,
    "Step1_Preprocessing/ACT1_Normalise",
    model_name,
)

# toxicity_layout = os.path.join('/hpc/scratch/rdip1/smt29021/Tesaro-DNA-Damage/Data/APPROACH_Pix2PixHD/Lung/Step1_Preprocessing/ACT3_Evaluate/dataset_sizes_test/bf_dna_16bit_lung_val2_7000imgs/error_analysis/lung_val_plate_setting_toxicity.csv')


## Non-mandatory to update
# Script named in every generated command.
py_path = os.path.join(repo_path, 'move_tesaro.py')

# Directory passed to extract_samples together with the sample-index txt file name.
random_val_samples_path = "/hpc/scratch/rdip1/smt29021/Tesaro-DNA-Damage/Data/DATA_Processing/"

# NOTE(review): despite the variable name, this file is br_val_good_bf.csv.
good_lung_val_csv_path = "/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/error_analysis/br_val_good_bf.csv"


### Load Tesaro CSV

In [3]:
# Load the curated image metadata table from the CSV configured above and
# report how many rows it holds.
img_data = pd.read_csv(good_lung_val_csv_path)
print(f"Number of Images: {len(img_data)}")
img_data.head()

Number of Images: 159264


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,image_url,new_path,new_loc,URL_x,Row,Col,FieldID,...,ObjectiveMagnification,ExposureTime,image_folder1,final_cell_line,tissue_final,UID,plate_setting,key,key_final,toxic
0,1,1,1,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c01f01p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c01f01p01-ch2sk1fk1fl1.tiff,1,1,1,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,0f153a11-a6a5-4314-83ee-64717436b270,Val,3CG0019090_r01c01f01p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c01f01p01.tiff,1
1,5,5,5,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c01f02p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c01f02p01-ch2sk1fk1fl1.tiff,1,1,2,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,9789fe3b-dd96-4baf-b5eb-dfe077b5947d,Val,3CG0019090_r01c01f02p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c01f02p01.tiff,1
2,9,9,9,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c01f03p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c01f03p01-ch2sk1fk1fl1.tiff,1,1,3,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,b75e3e1d-8760-44c9-88a3-b9417e1da384,Val,3CG0019090_r01c01f03p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c01f03p01.tiff,1
3,13,13,13,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c01f04p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c01f04p01-ch2sk1fk1fl1.tiff,1,1,4,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,286f6190-b279-4855-99d2-630d89e1b9f2,Val,3CG0019090_r01c01f04p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c01f04p01.tiff,1
4,17,17,17,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c01f05p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c01f05p01-ch2sk1fk1fl1.tiff,1,1,5,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,a2afa1a1-60ef-425e-b97a-1fdecbdb001d,Val,3CG0019090_r01c01f05p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c01f05p01.tiff,1


### Load txt file containing row index of chosen Bright-field samples and then generate the corresponding target paths

In [6]:
# Plate barcodes held out for testing, one list per tissue.
# Only the breast list is used in this cell.
test_plates_ov = ['3CG0022742','3CG0022796','3CG0022700']
test_plates_lu  = ['3CG0021760','3CG0021816','3CG0021766']
test_plates_br  = ['3CG0019162','3CG0019164','3CG0019156']

# Keep the bright-field rows only.
is_brightfield = img_data["ChannelName"] == "Brightfield"
img_data = img_data.loc[is_brightfield]

# Drop the held-out breast plates, then split what is left by the `toxic` flag.
is_held_out = img_data['final_cell_line'].isin(test_plates_br)
breast_plates = img_data.loc[~is_held_out]
breast_plates_toxic = breast_plates.loc[breast_plates['toxic'] == 1]
breast_plates_nontoxic = breast_plates.loc[breast_plates['toxic'] == 0]

print(f"Total Toxic {breast_plates_toxic['toxic'].sum()}")
print(f"Total Non-Toxic {breast_plates_nontoxic['toxic'].count()}")

Total Toxic 49901
Total Non-Toxic 99000


In [7]:
# NOTE(review): the next line is a bare string expression and has no effect;
# the directory actually passed to extract_samples is random_val_samples_path.
'/hpc/scratch/rdip1/smt29021/Tesaro-DNA-Damage/Data/DATA_Processing/'
# Generate bf samples from the non-toxic breast bright-field rows.
# Presumably extract_samples reads the row indexes from the named txt file in
# random_val_samples_path and returns them with the selected rows -- TODO confirm.
# NOTE(review): the variables are named "lung" but the rows are breast plates.
indexes, lung_val_30000_bf = extract_samples(
    "br_val_nontoxicsamples_30000.txt",breast_plates_nontoxic,random_val_samples_path)

# Generate target channels and inputs for move.
# Presumably image_folder_match matches the sampled rows back to img_data on
# "image_url" for the "Brightfield" channel -- TODO confirm against the helper.
lung_val_sig = image_folder_match(img_data,lung_val_30000_bf,"image_url","Brightfield")
print(lung_val_sig.shape)
lung_val_sig.head()

(30000, 34)


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,image_url,new_path,new_loc,URL_x,Row,Col,FieldID,...,ObjectiveMagnification,ExposureTime,image_folder1,final_cell_line,tissue_final,UID,plate_setting,key,key_final,toxic
27,109,109,109,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c04f01p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c04f01p01-ch2sk1fk1fl1.tiff,1,4,1,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,7c68fdff-0f79-424a-8d48-531e9671998b,Val,3CG0019090_r01c04f01p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c04f01p01.tiff,0
31,125,125,125,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c04f05p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c04f05p01-ch2sk1fk1fl1.tiff,1,4,5,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,977d2f1f-ddfd-43cb-87d3-4721b9675289,Val,3CG0019090_r01c04f05p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c04f05p01.tiff,0
32,129,129,129,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c04f06p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c04f06p01-ch2sk1fk1fl1.tiff,1,4,6,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,6de85e67-30d0-4b03-94ef-09da183b2e70,Val,3CG0019090_r01c04f06p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c04f06p01.tiff,0
34,137,137,137,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c04f08p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c04f08p01-ch2sk1fk1fl1.tiff,1,4,8,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,91db6d72-6d48-4384-957a-4654815a757c,Val,3CG0019090_r01c04f08p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c04f08p01.tiff,0
36,145,145,145,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0019090_r01c05f01p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r01c05f01p01-ch2sk1fk1fl1.tiff,1,5,1,...,20,"[[0.987330,0,0,-11.9],[0,-0.987330,0,1.4],[0,0...",ELN15196_Br_DNA_Damage_Val_072220,3CG0019090,Breast,9b18a222-f8ff-4301-b37b-89ac2033d644,Val,3CG0019090_r01c05f01p01-ch2sk1fk1fl1.tiff,3CG0019090_r01c05f01p01.tiff,0


In [None]:
 # Scratch inspection of one value of the train split.
 # NOTE(review): `train` is only assigned in a later cell, so this fails on a fresh kernel.
 train.iloc[0,1]

### Hack to get the original paths for lung 30k samples


In [8]:
def strings(python,input_file,output_file):
    """Build the command line that runs a script on one image.

    Args:
        python: Path of the Python script to run.
        input_file: Value passed as ``--input_file``.
        output_file: Value passed as ``--output_file``.

    Returns:
        The complete command as a single string.
    """
    return f'python {python} --input_file {input_file} --output_file {output_file}'


def _with_channel(image_path, channel):
    """Return ``image_path`` with the channel digit of its file name replaced.

    The image file names handled here end in ``-ch<digit>sk1fk1fl1.tiff``, so
    the digit is the 15th character from the end of the path.

    Raises:
        ValueError: If the two characters before that digit are not ``ch``.
    """
    if image_path[-17:-15] != 'ch':
        raise ValueError(f'Unexpected image file name: {image_path}')
    return image_path[:-15] + str(channel) + image_path[-14:]


def _split_commands(frame, split, channel):
    """Return the A (input) and B (target) commands for every row of ``frame``.

    Args:
        frame: Two-column frame; the first column is the bright-field image
            path and the last column is the output file name.
        split: Split name used for the ``<split>_A`` / ``<split>_B`` folders.
        channel: Channel digit of the target image.

    Returns:
        A list with two commands per row: the A command, then the B command.
    """
    commands = []
    for input_path_A, file_name in zip(frame.iloc[:, 0], frame.iloc[:, -1]):
        target_path_A = os.path.join(output_path, f'{split}_A', file_name)
        commands.append(strings(py_path, input_path_A, target_path_A))

        input_path_B = _with_channel(input_path_A, channel)
        target_path_B = os.path.join(output_path, f'{split}_B', file_name)
        commands.append(strings(py_path, input_path_B, target_path_B))
    return commands


splits = ["train", "val", "test"]

# Channel digit of the target images. All three splits use the same channel so
# that test_B holds the same stain as train_B and val_B.
TARGET_CHANNEL = "3"

output_path = os.path.join(
    '/hpc/scratch/hdd3/smt29021/Tesaro-DNA-Damage/Data',
    'Breast/Step1_Preprocessing/ACT1_Normalise',
    'bf_fitc_30k_nontoxic/'
)
for split in splits:
    create_directory(os.path.join(output_path,split+"_A"))
    create_directory(os.path.join(output_path,split+"_B"))

# Positional columns 3 and -2 are image_url and key_final (see the printed test row).
train = lung_val_sig.iloc[:21000,[3,-2]]
print(f'Train Length {train.shape[0]}')
val = lung_val_sig.iloc[21000:27000,[3,-2]]
print(f'Val Length {val.shape[0]}')
test = lung_val_sig.iloc[27000:,[3,-2]]
print(f'Test Length {test.shape[0]}')
print(test.iloc[0,:])

# Commands are ordered train, val, test; within a split each image contributes
# its A command followed by its B command.
command_list = []
for split, frame in zip(splits, (train, val, test)):
    command_list.extend(_split_commands(frame, split, TARGET_CHANNEL))
print(f'Total Commands {len(command_list)}')
command_list[:2]

Train Length 21000
Val Length 6000
Test Length 3000
image_url    /hpc/projects/upt/bioimaging_analytics/Tesaro-...
key_final                         3CG0019172_r09c16f09p01.tiff
Name: 144262, dtype: object
Total Commands 60000


['python /hpc/scratch/nvme3/smt29021/phase1_repo_1122/Structure-Specific-Contrast-Enhancement/workflow/phase2_sampling_normalisation/move_tesaro.py --input_file /hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15196_Br_DNA_Damage_Val_072220/ELN15196_Br_DNA_Damage_Val_data/ELN01154_72h_Br_DNA_Val/3CG0019090__2020-07-24T03_47_13-Measurement2/Images/r01c04f01p01-ch2sk1fk1fl1.tiff --output_file /hpc/scratch/hdd3/smt29021/Tesaro-DNA-Damage/Data/Breast/Step1_Preprocessing/ACT1_Normalise/bf_fitc_30k_nontoxic/train_A/3CG0019090_r01c04f01p01.tiff',
 'python /hpc/scratch/nvme3/smt29021/phase1_repo_1122/Structure-Specific-Contrast-Enhancement/workflow/phase2_sampling_normalisation/move_tesaro.py --input_file /hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15196_Br_DNA_Damage_Val_072220/ELN15196_Br_DNA_Damage_Val_data/ELN01154_72h_Br_DNA_Val/3CG0019090__2020-07-24T03_47_13-Measurement2/Images/r01c04f01p01-ch3sk1fk1fl1.tiff --output_file /hpc/scratch/hdd3/smt2

In [15]:
# Look up the .tiff files in every split folder under output_path and print
# how many each folder holds.
split_folders = ('train_A', 'train_B', 'val_A', 'val_B', 'test_A', 'test_B')
found = {}
for folder in split_folders:
    found[folder] = find_file(output_path + '/' + folder + '/', ".tiff")

train_A, train_B = found['train_A'], found['train_B']
val_A, val_B = found['val_A'], found['val_B']
test_A, test_B = found['test_A'], found['test_B']

print(''.join(f'Total {folder}: {len(found[folder])}\n' for folder in split_folders))



Total train_A: 0
Total train_B: 0
Total val_A: 0
Total val_B: 0
Total test_A: 0
Total test_B: 0



### Generate folders, slurm command and run job

In [2]:
import os
import pandas as pd
from tifffile import imsave, imread

In [19]:
# Scratch check: rebuild one image path with the channel digit replaced by `t`.
# NOTE(review): `paths`, `row`, `col` and `t` are only assigned in a later cell.
paths.iloc[row,col][:-15]+str(t)+paths.iloc[row,col][-14:]

'/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15212_Lu_Ov__DNA_Damage_Val_091020/ELN15212_Lu_Ov_DNA_Damage_Val_Data/Lung/3CG0021748__2020-09-12T10_26_23-Measurement1/Images/r04c04f08p01-ch4sk1fk1fl1.tiff'

In [39]:
# Scratch inspection of row 21000 (the first row of the validation slice).
# NOTE(review): `_30000_bf` is only assigned in a later cell.
_30000_bf.iloc[21000,1]

'/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15212_Lu_Ov__DNA_Damage_Val_091020/ELN15212_Lu_Ov_DNA_Damage_Val_Data/Lung/3CG0021748__2020-09-12T10_26_23-Measurement1/Images/r12c20f09p01-ch2sk1fk1fl1.tiff'

In [16]:
# Scratch inspection of the first sampled row.
# NOTE(review): `_30000_bf` is only assigned in a later cell.
_30000_bf.head(1)

Unnamed: 0.1,Unnamed: 0,image_url,new_path,new_loc
12850,12850,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0021772_r15c21f09p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...


In [2]:
## Manual method
## Non-mandatory to update
# Repository directory that contains move_tesaro.py.
repo_path = '/hpc/projects/upt/samuel_tonks_experimental_space/repos/gskgithub/virtual_staining/workflow/phase2_sampling_normalisation/'
py_path = os.path.join(repo_path, 'move_tesaro.py')

# CSV read into full_dataset.
csv_path = '/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/images_and_metadata.csv'

# Directory passed to extract_samples together with the sample-index txt file name.
random_val_samples_path = "/hpc/scratch/rdip1/smt29021/Tesaro-DNA-Damage/Data/DATA_Processing/"

# NOTE(review): despite the variable name, this is the ovarian file ov_val_good_bf.csv.
good_lung_val_csv_path = '/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/error_analysis/ov_val_good_bf.csv'


### Retrospective move of bf_dna_16bit_lung_val2_30000imgs, which were cropped to 1056 x 1056 but need to be 1080.

# Load the full Tesaro image/metadata CSV and report its row count.
full_dataset = pd.read_csv(csv_path)
print(f"Number of Images: {full_dataset.shape[0]}")
full_dataset.head()  # not the last expression of the cell, so nothing is displayed

### Toxic/Non-toxic sampling
img_data = pd.read_csv(good_lung_val_csv_path)
img_data.head()  # likewise not displayed

### Load txt file containing row index of chosen Bright-field samples and then generate the corresponding target paths
# Bare string expression kept from the original cell; it has no effect.
'/hpc/scratch/rdip1/smt29021/Tesaro-DNA-Damage/Data/DATA_Processing/'
# Generate bf samples
indexes, _30000_bf = extract_samples(
    "ov_val_toxicsamples_30000.txt",
    img_data,
    random_val_samples_path,
)
# Tissue name interpolated into the output directories built in the next cell.
tissue = 'Ovarian'
_30000_bf.head()





Number of Images: 2562351


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,image_url,new_path,new_loc,URL_x,Row,Col,FieldID,...,ObjectiveMagnification,ExposureTime,image_folder1,final_cell_line,tissue_final,UID,plate_setting,key,key_final,toxic
40275,161101,161101,161101,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0022700_r12c10f09p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r12c10f09p01-ch2sk1fk1fl1.tiff,12,10,9,...,20,"[[0.987911,0,0,-21.6],[0,-0.987911,0,16.5],[0,...",ELN15212_Lu_Ov__DNA_Damage_Val_091020,3CG0022700,Ovarian,5fabb1bc-f883-4f40-a5c7-6b0dc35033d6,Val,3CG0022700_r12c10f09p01-ch2sk1fk1fl1.tiff,3CG0022700_r12c10f09p01.tiff,0
76362,305449,305449,305449,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0022724_r04c07f01p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r04c07f01p01-ch2sk1fk1fl1.tiff,4,7,1,...,20,"[[0.987911,0,0,-21.6],[0,-0.987911,0,16.5],[0,...",ELN15212_Lu_Ov__DNA_Damage_Val_091020,3CG0022724,Ovarian,87ed7e8b-e14d-4079-91a8-812356131f4f,Val,3CG0022724_r04c07f01p01-ch2sk1fk1fl1.tiff,3CG0022724_r04c07f01p01.tiff,1
69966,279865,279865,279865,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0022728_r06c13f08p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r06c13f08p01-ch2sk1fk1fl1.tiff,6,13,8,...,20,"[[0.987911,0,0,-21.6],[0,-0.987911,0,16.5],[0,...",ELN15212_Lu_Ov__DNA_Damage_Val_091020,3CG0022728,Ovarian,c7e75d79-4ca6-472b-b1f3-4ff27494ddf5,Val,3CG0022728_r06c13f08p01-ch2sk1fk1fl1.tiff,3CG0022728_r06c13f08p01.tiff,1
40556,162225,162225,162225,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0022700_r13c18f02p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r13c18f02p01-ch2sk1fk1fl1.tiff,13,18,2,...,20,"[[0.987911,0,0,-21.6],[0,-0.987911,0,16.5],[0,...",ELN15212_Lu_Ov__DNA_Damage_Val_091020,3CG0022700,Ovarian,873caf02-ef25-439c-86d8-dc87dcbdddaa,Val,3CG0022700_r13c18f02p01-ch2sk1fk1fl1.tiff,3CG0022700_r13c18f02p01.tiff,1
47655,190621,190621,190621,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,3CG0022708_r14c16f08p01.tiffTesaro-DNA-Damage-...,/hpc/projects/upt/bioimaging_analytics/Tesaro-...,r14c16f08p01-ch2sk1fk1fl1.tiff,14,16,8,...,20,"[[0.987911,0,0,-21.6],[0,-0.987911,0,16.5],[0,...",ELN15212_Lu_Ov__DNA_Damage_Val_091020,3CG0022708,Ovarian,7fd93bdd-7024-4b48-b2fd-1bd89e5c987a,Val,3CG0022708_r14c16f08p01-ch2sk1fk1fl1.tiff,3CG0022708_r14c16f08p01.tiff,1


In [18]:
img_data.shape

(2364600, 4)

In [3]:
# Output root of each target stain for the current tissue.
normalise_root = f'/hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/{tissue}/Step1_Preprocessing/ACT1_Normalise/'
dapi_new_path = os.path.join(normalise_root, 'bf_dna_30k_toxic/')
fitc_new_path = os.path.join(normalise_root, 'bf_fitc_30k_toxic/')
cy5_new_path = os.path.join(normalise_root, 'bf_cy5_30k_toxic/')
stain_roots = (dapi_new_path, fitc_new_path, cy5_new_path)

# Create the three roots first, then the <split>_A / <split>_B folders inside each.
for stain_root in stain_roots:
    create_directory(stain_root)
for split_name in ('train', 'val'):
    for stain_root in stain_roots:
        for side in ('_A', '_B'):
            create_directory(os.path.join(stain_root, split_name + side))

# First 21000 sampled rows are used for training, the next 6000 for validation.
train_30000_bf = _30000_bf.iloc[:21000]
val_30000_bf = _30000_bf.iloc[21000:27000]


Successfully create the directory /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_dna_30k_toxic/
Successfully create the directory /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_fitc_30k_toxic/
Successfully create the directory /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_cy5_30k_toxic/
Successfully create the directory /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_dna_30k_toxic/train_A
Successfully create the directory /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_dna_30k_toxic/train_B
Successfully create the directory /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-

'r15c24f09p01_input.tiff'

In [4]:

# Build, for every sampled bright-field image and every target stain, the
# original and new paths of the input (bright-field) and target (stain) image.

# Positional index of the column that holds the bright-field image path.
col = 3

# (channel digit, output root) per target stain, in the order entries are emitted.
STAIN_TARGETS = (
    (1, dapi_new_path),
    (3, fitc_new_path),
    (4, cy5_new_path),
)


def _swap_channel(image_path, channel):
    """Return ``image_path`` with the channel digit of its file name replaced.

    The image file names handled here end in ``-ch<digit>sk1fk1fl1.tiff``, so
    the digit is the 15th character from the end of the path.
    """
    return image_path[:-15] + str(channel) + image_path[-14:]


def _output_stem(image_path):
    """Return the new file-name stem for ``image_path``.

    The stem is the first 11 characters of the folder two levels above the
    file followed by the first 12 characters of the file name, for example
    ``3CG0022700_`` + ``r12c10f09p01``.
    """
    parts = image_path.split('/')
    return parts[-3][:11] + parts[-1][:12]


def _split_moves(frame, split):
    """Yield ``(source_old, source_new, target_old, target_new)`` per image and stain.

    Args:
        frame: Frame whose column ``col`` holds the bright-field image path.
        split: Split name used for the ``<split>_A`` / ``<split>_B`` folders.

    Inputs are named ``<stem>_input.tiff`` and targets ``<stem>_real.tiff`` for
    every stain and split, so each A file has a B file with the same stem.
    """
    for bf_path in frame.iloc[:, col]:
        stem = _output_stem(bf_path)
        for channel, stain_root in STAIN_TARGETS:
            yield (
                bf_path,
                os.path.join(stain_root, f'{split}_A', stem + '_input.tiff'),
                _swap_channel(bf_path, channel),
                os.path.join(stain_root, f'{split}_B', stem + '_real.tiff'),
            )


target_path_old = []
target_path_new = []
source_path_new = []
source_path_old = []

# A test split can be added here as a third entry once a test frame exists.
for paths, split, label in (
    (train_30000_bf, 'train', 'Train'),
    (val_30000_bf, 'val', 'Val'),
):
    print(f'Total {label} {paths.shape[0]}')
    for src_old, src_new, tgt_old, tgt_new in _split_moves(paths, split):
        source_path_old.append(src_old)
        source_path_new.append(src_new)
        target_path_old.append(tgt_old)
        target_path_new.append(tgt_new)
    # Running totals after each split.
    print(len(target_path_old))
    print(len(target_path_new))

# Spot-check two source/destination pairs.
print(source_path_old[0],source_path_new[0])

print(source_path_old[100],source_path_new[100])




Total Train 21000
63000
63000
Total Val 6000
81000
81000
/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15212_Lu_Ov__DNA_Damage_Val_091020/ELN15212_Lu_Ov_DNA_Damage_Val_Data/Ovarian/3CG0022700__2020-09-13T16_21_23-Measurement1/Images/r12c10f09p01-ch2sk1fk1fl1.tiff /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_dna_30k_toxic/train_A/3CG0022700_r12c10f09p01_input.tiff
/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15212_Lu_Ov__DNA_Damage_Val_091020/ELN15212_Lu_Ov_DNA_Damage_Val_Data/Ovarian/3CG0022744__2020-09-14T23_37_06-Measurement1/Images/r05c12f06p01-ch2sk1fk1fl1.tiff /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_fitc_30k_toxic/train_A/3CG0022744_r05c12f06p01_input.tiff


In [5]:
# Spot-check one target (stain) original/new path pair.
probe_index = 30000
print(target_path_old[probe_index], target_path_new[probe_index])

/hpc/projects/upt/bioimaging_analytics/Tesaro-DNA-Damage/2020-09/ELN15212_Lu_Ov__DNA_Damage_Val_091020/ELN15212_Lu_Ov_DNA_Damage_Val_Data/Ovarian/3CG0022752__2020-09-14T00_27_40-Measurement1/Images/r12c01f04p01-ch1sk1fk1fl1.tiff /hpc/projects/upt/samuel_tonks_experimental_space/datasets/Tesaro-DNA-Damage/Data/Ovarian/Step1_Preprocessing/ACT1_Normalise/bf_dna_30k_toxic/train_B/3CG0022752_r12c01f04p01_real.tiff


In [7]:
from tqdm import tqdm
from tifffile import imread, imsave


def _copy_as_float32(old_path, new_path):
    """Read the image at ``old_path`` and save it to ``new_path`` as float32 with ``imagej=True``."""
    imsave(new_path, imread(old_path).astype(np.float32), imagej=True)


# Each index pairs one bright-field input with one stain target; both are
# re-saved under their new names.
for j in tqdm(range(len(source_path_old))):
    _copy_as_float32(source_path_old[j], source_path_new[j])
    _copy_as_float32(target_path_old[j], target_path_new[j])

100%|██████████| 81000/81000 [1:29:48<00:00, 15.78it/s]


### End