### Import libraries

In [1]:
import pathlib
import pandas as pd
import numpy as np

import sys
sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data
from training_data_utils import get_labeled_cells

### Compile training data batches into one dataframe

In [2]:
# Directory containing the merged feature files for every training-data batch.
training_data_features_path = pathlib.Path(
    "../1.idr_streams/extracted_features/training_data/merged_features/"
)

# Combine all batches found under that directory into a single dataframe.
training_data = compile_mitocheck_batch_data(training_data_features_path)

# Report the compiled dimensions, then preview the first rows.
print(training_data.shape)
training_data.head()

(64513, 1458)


Unnamed: 0,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,59a461a1-0cc1-4171-8dc1-2ba2ef695a92,767,18,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.1051,-0.102676,-0.152182,0.118111,-0.198296,0.264993,-0.063839,-0.21394,-0.15479,1.660794
1,98990ced-515c-4a2a-9aa7-92dafae2aa67,1034,17,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.083561,-0.042327,-0.174123,0.188287,-0.204177,0.223755,-0.031176,-0.157768,-0.141178,1.333314
2,d98463a9-9eec-43d1-bc87-79126f31e09c,300,15,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.065058,0.007971,-0.191604,-0.159906,-0.169495,-0.027973,-0.029784,-0.121467,-0.151359,0.921443
3,b539d689-6676-4ce8-ac94-23aa14997a04,134,17,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.080231,0.112016,-0.202462,0.285825,-0.221654,0.404947,-0.026923,0.040602,-0.134523,1.175312
4,0a3afb9c-dc00-4a57-b25a-f3af04470b2b,223,23,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.088112,-0.056467,-0.1984,0.66689,-0.220922,0.306207,-0.033069,-0.168685,-0.193175,1.500618


### Find cells with Mitocheck-assigned labels

#### Feature data may be missing for a labeled cell because its well is not hosted on IDR (failure to pass QC) or because of differences between Mitocheck and DeepProfiler segmentation

In [3]:
# Mitocheck metadata file that is handed to get_labeled_cells together with the features.
features_samples_path = pathlib.Path("../mitocheck_metadata/features.samples.txt")

# NOTE(review): the meaning of the third argument ("DP__Object_Outline") is defined by
# get_labeled_cells in training_data_utils -- confirm there before changing it.
labeled_cells = get_labeled_cells(
    training_data,
    features_samples_path,
    "DP__Object_Outline",
)
print(f"Shape of labeled_cells: {labeled_cells.shape}")

No feature data derived for cell at: LT0100_03, 93, 84, 241, 710
No feature data derived for cell at: LT0014_12, 159, 70, 911, 678
No feature data derived for cell at: LT0043_48, 166, 44, 296, 320
No feature data derived for cell at: LT0043_48, 166, 71, 279, 313
No feature data derived for cell at: LT0066_19, 287, 48, 1137, 693
No feature data derived for cell at: LT0066_19, 287, 48, 773, 103
No feature data derived for cell at: LT0066_19, 287, 48, 471, 480
No feature data derived for cell at: LT0066_19, 287, 75, 300, 522
No feature data derived for cell at: LT0066_19, 287, 75, 701, 156
No feature data derived for cell at: LT0066_19, 287, 75, 1029, 548
No feature data derived for cell at: LT0066_19, 287, 75, 1138, 689
No feature data derived for cell at: LT0066_19, 287, 75, 470, 779
No feature data derived for cell at: LT0066_19, 287, 75, 958, 105
No feature data derived for cell at: LT0066_19, 287, 75, 719, 347
No feature data derived for cell at: LT0066_19, 287, 87, 826, 373
No featu

### Replace the `Shape1` and `Shape3` labels with their respective class names (`Binuclear` and `Polylobed`)
#### See [#16](https://github.com/WayScience/mitocheck_data/issues/16) for more details

In [4]:
# Placeholder labels and the class names that replace them (applied in this order).
shape_label_to_class = {
    "Shape1": "Binuclear",
    "Shape3": "Polylobed",
}

# Apply each substitution in turn across the whole dataframe.
for shape_label, class_name in shape_label_to_class.items():
    labeled_cells = labeled_cells.replace(shape_label, class_name)

### Preview labeled cells

In [5]:
# Bare last expression: the notebook renders the labeled-cells dataframe as this cell's output.
labeled_cells

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,21da27ab-873a-41f4-ab98-49170cae9a2d,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.037779,-0.136591,-0.217011,0.019583,-0.192538,0.604850,0.391432,-0.119948,-0.002020,0.747988
1,Large,82f7949b-4ea2-45c8-8dd9-7854caf49077,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.115009,-0.255369,-0.229094,0.020964,-0.158187,0.672871,-0.057893,-0.060406,-0.166688,0.467811
2,Large,cec7234f-fe35-4411-aded-f8112bb31219,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.062292,0.077288,-0.211343,0.403338,-0.183594,0.302867,-0.040703,-0.109285,-0.165469,0.998986
3,Large,43d9e7c9-c9ec-45ce-8820-048bfb896989,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.134947,-0.187267,-0.206022,0.343387,-0.163575,0.552434,-0.036164,-0.097822,-0.232640,1.216684
4,Large,63ce6652-338e-4afd-9c77-dbc0e903bf92,477,130,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.059909,-0.188595,-0.200547,-0.128233,-0.201621,0.768074,-0.041732,-0.153650,-0.161041,1.531892
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2911,OutOfFocus,380728fc-28b0-423f-b8a7-07be1af590d9,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-0.075324,0.912534,-0.017621,-0.166189,-0.197033,-0.233582,-0.062982,-0.221632,-0.113528,1.622744
2912,OutOfFocus,30ed67c7-8de2-4d78-bce9-3fa1aff28565,975,294,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.082657,0.685206,0.000151,0.001153,-0.215521,-0.154319,-0.075875,-0.143585,-0.093561,2.256409
2913,OutOfFocus,2960b13e-6090-4592-b2a9-d1c4c1b24b50,898,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.074541,1.154035,-0.059398,-0.091709,-0.230135,-0.187006,-0.046060,-0.123263,-0.104091,1.487117
2914,OutOfFocus,fbc9ce6a-2b29-4115-b218-4ee5b8c50ac1,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.097351,1.274703,0.020771,0.007426,-0.236653,-0.184583,-0.108749,-0.199749,-0.102120,1.818080


### Save labeled training data

In [6]:
# Make sure the output directory exists (including parents) before writing into it.
results_dir = pathlib.Path("results/")
results_dir.mkdir(parents=True, exist_ok=True)

# Write the labeled cells as a gzip-compressed CSV inside the results directory.
compiled_training_data_path = results_dir / "training_data.csv.gz"
labeled_cells.to_csv(compiled_training_data_path, compression="gzip")