# Convert the messy old format of the dataset to something more organized

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import tensorflow as tf
from tensorflow import keras

import numpy as np
from pathlib import Path
import zarr
import matplotlib.gridspec as gridspec
from skimage.transform import rescale, resize, downscale_local_mean
import os

home = str(Path.home())
data_root = home + '/leukosight_data/'
dataset = zarr.open(data_root + 'crops_fieldstop_filtered.zarr', 'r')
rgb_dataset = zarr.open(data_root + 'rgb_crops.zarr', 'r')
csv_path = data_root + 'crops_record_with_batch_corrections.csv'
dataframe = pd.read_csv(csv_path, index_col=0, low_memory=False)
channel_names = dataset[list(dataset)[0]]['all_blobs'].attrs['channel_names']

all_filenames =  ['{}/{}'.format(int(dataset_index), crop_name) for dataset_index, crop_name 
         in zip(dataframe['dataset_index'].to_numpy(), dataframe['blob_name'].tolist())]

In [11]:
channel_names

['DF_50',
 'F1_BV711',
 'F2_BV650',
 'F3_BV605',
 'F4_BV570',
 'F5_BV510',
 'F6_BV421',
 'autofocus',
 'led_1',
 'led_10',
 'led_100',
 'led_101',
 'led_102',
 'led_103',
 'led_104',
 'led_105',
 'led_106',
 'led_107',
 'led_108',
 'led_109',
 'led_110',
 'led_111',
 'led_112',
 'led_113',
 'led_114',
 'led_115',
 'led_116',
 'led_117',
 'led_118',
 'led_119',
 'led_121',
 'led_122',
 'led_123',
 'led_124',
 'led_125',
 'led_126',
 'led_127',
 'led_128',
 'led_129',
 'led_13',
 'led_130',
 'led_131',
 'led_132',
 'led_133',
 'led_134',
 'led_135',
 'led_136',
 'led_137',
 'led_138',
 'led_139',
 'led_14',
 'led_140',
 'led_141',
 'led_142',
 'led_143',
 'led_144',
 'led_145',
 'led_146',
 'led_147',
 'led_148',
 'led_149',
 'led_15',
 'led_150',
 'led_151',
 'led_152',
 'led_153',
 'led_154',
 'led_155',
 'led_156',
 'led_157',
 'led_158',
 'led_159',
 'led_16',
 'led_160',
 'led_161',
 'led_162',
 'led_163',
 'led_164',
 'led_165',
 'led_166',
 'led_167',
 'led_168',
 'led_169',
 'led

## Process dataframe and resave

In [69]:
df_new = dataframe.copy()
#remove stuff for old zarr structure and add in new
replicate = dataframe.dataset_index.map(lambda x: '2' if (x == 24 or x == 26)  else '1')
df_new['base_path'] = 'batch_' + dataframe.batch.map(lambda x: str(int(x))) + '/' + \
        dataframe.marker + \
        '/replicate_'  + replicate + '/'
df_new['replicate'] = replicate

del df_new['blob_name']
del df_new['dataset_index']

df_new = df_new.rename(columns = {
    "blob_x": "position_in_fov_x",
    "blob_y": "position_in_fov_y",
    "blob_radius": "detection_radius",
    "affine_corrected_target": "batch_corrected_fluor",
    "shading_correction_index": "fluor_shading_correction_index",
    "histology_match": "matched_histology_cell"
                        })

del df_new['closest_cell']
del df_new['closest_cell_edge']
del df_new['jiggler_shifts']
del df_new['valid_fluor']
del df_new['Positive_cells']
del df_new['hist_gradient']
del df_new['position_index']
del df_new['closest_histology_cell_dist']

df_new.to_csv(data_root + 'BSBCM.csv', index=False)

In [61]:
reloaded = pd.read_csv(data_root + 'BSBCM.csv', low_memory=False)
reloaded

Unnamed: 0,position_in_fov_y,position_in_fov_x,detection_radius,BV711,BV650,BV605,BV570,BV510,BV421,BV711_without_local_background_sub,...,unmixed_6+0noreg_CD16,matched_histology_cell,batch_corrected_fluor,batch,marker,imaging_date,fluor_shading_correction_index,notes,base_path,replicate
0,1223.0,556.0,20.48,241593.293020,281547.395936,228047.434386,97697.166366,319369.144081,319178.831050,251476.815136,...,6.908218,False,7.762084,2.0,CD45,2018-09-12,2.0,,batch_2/CD14/replicate_1/,1
1,906.0,1072.0,20.48,144599.008650,195401.117208,177962.813102,101960.090295,263797.947238,184864.006992,142993.268798,...,7.769230,False,6.532650,2.0,CD45,2018-09-12,2.0,,batch_2/CD14/replicate_1/,1
2,633.0,1318.0,20.48,145361.807166,213105.862980,186174.308666,97885.395090,285886.832550,180144.248221,142855.355342,...,7.027197,False,6.487333,2.0,CD45,2018-09-12,2.0,,batch_2/CD14/replicate_1/,1
3,1669.0,1722.0,20.48,130254.810136,188840.775296,175390.466579,75963.579654,262379.153000,145236.367605,169498.347724,...,6.926256,False,6.044180,2.0,CD45,2018-09-12,2.0,,batch_2/CD14/replicate_1/,1
4,1544.0,545.0,20.48,143985.694660,201888.698443,185984.240590,96205.459160,273642.199298,153889.069723,187008.991331,...,6.908851,False,6.204303,2.0,CD45,2018-09-12,2.0,,batch_2/CD14/replicate_1/,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
412936,876.0,1676.0,20.48,97289.773989,135130.135960,125692.518441,75605.243251,194324.399096,55364.225157,84829.235324,...,7.771644,False,0.011737,2.0,CD14,2018-09-10,2.0,,batch_2/CD14/replicate_1/,1
412937,691.0,938.0,12.80,97424.385243,135525.191000,125396.289227,87826.064288,186296.412509,54954.694371,84474.384151,...,7.830926,False,0.000730,2.0,CD14,2018-09-10,2.0,,batch_2/CD14/replicate_1/,1
412938,499.0,1524.0,20.48,100311.236872,127607.780182,116528.410922,74324.658972,172782.549201,50361.421565,86323.371934,...,6.908400,False,0.366781,2.0,CD14,2018-09-10,2.0,,batch_2/CD14/replicate_1/,1
412939,1480.0,737.0,20.48,102895.611266,138936.773664,132534.426780,87788.229405,194518.127531,59124.173094,28830.633118,...,7.270655,False,0.071645,2.0,CD14,2018-09-10,2.0,,batch_2/CD14/replicate_1/,1


## Process Zarr data and resave

In [12]:
new_name = 'BSBCM.zarr'

channel_names = dataset['0']['all_blobs'].attrs['channel_names']
led_array_channel_names = channel_names[:22] + [channel_names[-3]]
fluor_channel_names = channel_names[22:28]
new_file = zarr.open(data_root + new_name, mode='r+')
# new_file.attrs['fluorescence_channels'] = fluor_channel_names
# new_file.attrs['led_array_channel_names'] = led_array_channel_names

for i in range(len(dataframe)):
    print(str(i) + ' of  {}\r'.format(len(dataframe)), end='')

    entry = dataframe.iloc[i]
#     rgb_entry = rgb_dataframe.iloc[i]
    
    replicate = 2 if ((entry['dataset_index'] ==  24) or (entry['dataset_index'] ==  26)) else 1
    
    if replicate == 1:
        continue
    
    base_path = 'batch_{}/{}/replicate_{}'.format(int(entry['batch']), entry['marker'], replicate)
    base_path_wrong = 'batch_{}/{}/replicate_{}'.format(int(entry['batch']), entry['marker'], 1)
    fluor_dest = base_path + '/fluor/cell_{}'.format(i)
    lf_dest = base_path + '/led_array/cell_{}'.format(i)
    dpc_dest = base_path + '/dpc/cell_{}'.format(i)
    hist_dest = base_path + '/histology/cell_{}'.format(i)
    
    fluor_dest_wrong = base_path_wrong + '/fluor/cell_{}'.format(i)
    lf_dest_wrong = base_path_wrong + '/led_array/cell_{}'.format(i)
    dpc_dest_wrong = base_path_wrong + '/dpc/cell_{}'.format(i)
    hist_dest_wrong = base_path_wrong + '/histology/cell_{}'.format(i)

    for path, path_wrong in zip(
        [fluor_dest, lf_dest, dpc_dest, hist_dest],
        [fluor_dest_wrong, lf_dest_wrong, dpc_dest_wrong, hist_dest_wrong]):
        if path_wrong in new_file:
            new_file[path] = new_file[path_wrong]
            del new_file[path_wrong]
    
#     lf_source = str(int(entry['dataset_index'])) + '/' + str(entry['blob_name'])
#     if rgb_entry['histology_match']:
#         histology_source = rgb_entry.histology_dataset_name + '/' + rgb_entry.closest_histology_cell_name
#     else:
#         histology_source = None
    

#     fluor_data = dataset[lf_source][22:28]
#     led_array_data = np.concatenate([dataset[lf_source][:22], dataset[lf_source][-3][None, ...]])
#     dpc_data = dataset[lf_source][-1][None, ...]

#     new_file.create_dataset(name=fluor_dest, data=fluor_data.astype(np.uint16), chunks=(1, 150, 150))
#     new_file.create_dataset(name=lf_dest, data=led_array_data.astype(np.uint16), chunks=(1, 150, 150))
#     new_file.create_dataset(name=dpc_dest, data=dpc_data, chunks=(1, 150, 150))
    
#     if histology_source is not None:
# #         print(hist_dest)
#         hist_data = rgb_dataset[histology_source]
#         new_file.create_dataset(name=hist_dest, data=hist_data, chunks=None)


412940 of  412941

In [10]:
base_path = 'batch_{}/{}/replicate_{}'.format(int(entry['batch']), entry['marker'], replicate)
base_path_wrong = 'batch_{}/{}/replicate_{}'.format(int(entry['batch']), entry['marker'], 1)
fluor_dest = base_path + '/fluor/cell_{}'.format(i)
lf_dest = base_path + '/led_array/cell_{}'.format(i)
dpc_dest = base_path + '/dpc/cell_{}'.format(i)
hist_dest = base_path + '/histology/cell_{}'.format(i)

fluor_dest_wrong = base_path_wrong + '/fluor/cell_{}'.format(i)
lf_dest_wrong = base_path_wrong + '/led_array/cell_{}'.format(i)
dpc_dest_wrong = base_path_wrong + '/dpc/cell_{}'.format(i)
hist_dest_wrong = base_path_wrong + '/histology/cell_{}'.format(i)

new_file[fluor_dest_wrong]
new_file[fluor_dest] = new_file[fluor_dest_wrong]
del new_file[fluor_dest]