### Import libraries

In [1]:
import pathlib
import pandas as pd
import numpy as np

# Put the sibling ../utils directory on the import path so the project-local
# helper modules below can be imported. The path is relative, so it is resolved
# against the kernel's working directory.
import sys
sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data
from training_data_utils import get_labeled_cells

### Compile training data batches into one dataframe

In [2]:
# Directory containing the extracted training-data feature batches.
training_data_features_path = pathlib.Path("../1.idr_streams/extracted_features/training_data")
# Compile the batches found under that directory into one dataframe
# (helper imported from load_utils).
training_data = compile_mitocheck_batch_data(training_data_features_path)
# Uncomment the next line to preview the compiled dataframe:
# training_data

### Find cells with Mitocheck-assigned labels

#### Feature data may be missing for a labeled cell because its well is not hosted on IDR (it failed QC) or because Mitocheck and DeepProfiler segmented the cell differently

In [3]:
# Mitocheck metadata file handed to get_labeled_cells together with the compiled features.
features_samples_path = pathlib.Path("../mitocheck_metadata/features.samples.txt")

# get_labeled_cells is imported from training_data_utils. Running it emits a
# "No feature data derived for cell at: ..." line for each cell it could not match
# (see this cell's output).
labeled_cells = get_labeled_cells(training_data, features_samples_path)  
# Report the (rows, columns) shape of the resulting dataframe.
print(f"Shape of labeled_cells: {labeled_cells.shape}")

No feature data derived for cell at: LT0100_03, 93, 84, 241, 710
No feature data derived for cell at: LT0043_48, 166, 44, 296, 320
No feature data derived for cell at: LT0043_48, 166, 71, 279, 313
No feature data derived for cell at: LT0066_19, 287, 48, 773, 103
No feature data derived for cell at: LT0066_19, 287, 48, 471, 480
No feature data derived for cell at: LT0066_19, 287, 75, 701, 156
No feature data derived for cell at: LT0066_19, 287, 75, 1138, 689
No feature data derived for cell at: LT0066_19, 287, 75, 470, 779
No feature data derived for cell at: LT0066_19, 287, 75, 1228, 380
No feature data derived for cell at: LT0066_19, 287, 75, 958, 105
No feature data derived for cell at: LT0066_19, 287, 75, 1132, 429
No feature data derived for cell at: LT0066_19, 287, 75, 719, 347
No feature data derived for cell at: LT0066_19, 287, 87, 826, 373
No feature data derived for cell at: LT0066_19, 287, 87, 1229, 384
No feature data derived for cell at: LT0066_19, 287, 87, 587, 445
No feat

### Replace the `Shape1` and `Shape3` labels with their respective class names (`Binuclear` and `Polylobed`)
#### See [#16](https://github.com/WayScience/mitocheck_data/issues/16) for more details

In [4]:
# Swap the placeholder labels for their class names. Both replacements act on
# every matching value in the dataframe and are applied one after the other.
labeled_cells = (
    labeled_cells
    .replace("Shape1", "Binuclear")
    .replace("Shape3", "Polylobed")
)

### Preview labeled cells

In [5]:
# Bare last expression: the notebook renders the dataframe as this cell's output.
labeled_cells

Unnamed: 0,Mitocheck_Phenotypic_Class,Object_Outline,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,Large,[[396 595]\n [395 596]\n [394 596]\n [393 596]...,397.288288,618.558559,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.037762,-0.136540,-0.216995,0.019637,-0.192598,0.605110,0.391345,-0.119954,-0.002111,0.748429
1,Large,[[361 563]\n [360 564]\n [359 564]\n [358 564]...,359.535714,585.062500,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.115016,-0.255390,-0.229136,0.021084,-0.158222,0.672657,-0.057864,-0.060401,-0.166709,0.468204
2,Large,[[379 662]\n [378 663]\n [377 663]\n [376 663]...,383.282051,685.222222,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.062305,0.077424,-0.211331,0.403525,-0.183601,0.303051,-0.040686,-0.109288,-0.165507,0.999497
3,Large,[[923 515]\n [922 516]\n [921 516]\n [920 516]...,934.568807,534.385321,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.151192,-0.194984,-0.205683,0.246189,-0.170982,0.590972,-0.040195,-0.087733,-0.223349,1.189712
4,Large,[[483 96]\n [482 97]\n [481 97]\n [480 98]...,481.007143,121.978571,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.069813,-0.160207,-0.177817,-0.127709,-0.204516,0.755550,0.106907,-0.173012,-0.149358,1.199448
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2914,OutOfFocus,[[382 185]\n [381 186]\n [380 186]\n [379 186]...,383.075269,220.198925,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-0.076265,0.908651,-0.003999,-0.163269,-0.198879,-0.234834,-0.060530,-0.220327,-0.110542,1.624140
2915,OutOfFocus,[[975 277]\n [974 278]\n [973 278]\n [972 278]...,975.747253,293.868132,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.085767,0.644187,0.045952,0.000193,-0.220179,-0.199104,-0.089415,-0.133715,-0.096435,2.091706
2916,OutOfFocus,[[907 280]\n [906 281]\n [905 281]\n [904 281]...,898.614815,302.407407,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.075185,1.151727,-0.055931,-0.100885,-0.231775,-0.186390,-0.047137,-0.124412,-0.099729,1.468004
2917,OutOfFocus,[[944 267]\n [943 268]\n [942 268]\n [941 268]...,946.758621,281.689655,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.102286,1.273646,0.054357,0.003621,-0.238068,-0.190694,-0.113662,-0.198234,-0.100362,1.761842


### Save labeled training data

In [6]:
# Output directory for this notebook; create it (and any missing parents) and
# do nothing if it already exists.
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# Build the output file path with pathlib's "/" operator.
compiled_training_data_path = results_dir / "training_data.csv.gz"
# Write the labeled cells as a gzip-compressed CSV. The dataframe index is
# written as the first column (pandas default).
labeled_cells.to_csv(compiled_training_data_path, compression="gzip")