### Import libraries

In [1]:
import pathlib
import pandas as pd
import numpy as np

import sys
sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data
from training_data_utils import get_labeled_cells

### Compile training data batches into one dataframe

In [2]:
# Directory containing the merged feature files for the training-data batches.
training_data_features_path = pathlib.Path(
    "../1.idr_streams/extracted_features__no_ic/training_data/merged_features/"
)

# Combine every batch found under that directory into a single dataframe.
training_data = compile_mitocheck_batch_data(training_data_features_path)

# Report the compiled frame's dimensions, then preview its first rows.
print(training_data.shape)
training_data.head()

(67769, 1449)


Unnamed: 0,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,65f9458d-505f-476c-884e-66b901bcf8d2,767,17,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.138524,-0.025122,0.18597,0.067564,-0.032053,-0.179385,-0.077941,-0.198749,-0.094915,1.268368
1,73077159-1461-497b-92a9-8c7ff1ebffd5,1034,17,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.094204,-0.172604,-0.155792,0.16122,-0.21796,-0.092356,-0.043888,-0.18516,-0.14779,1.147897
2,54cb0847-f222-42f4-aea0-f1f6d9ee774b,300,15,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.073384,-0.082663,-0.175949,-0.1236,-0.167409,-0.101721,-0.034375,-0.163504,-0.135161,0.740687
3,fac84a82-544f-4ace-8f95-bc5c0439fff4,134,17,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.074092,0.109126,-0.156464,0.299307,-0.211889,0.288795,-0.036198,0.110673,-0.133284,1.158171
4,ad52ccdc-f7b1-44d8-b927-9f1adba79e03,223,23,LT0030_17,58,36,1,LT0030_17_58,LT0030_17/LT0030_17_58_36.tif,OR2H1,...,-0.108904,-0.071338,-0.184992,0.663547,-0.20541,0.06225,-0.075266,-0.213797,-0.204549,1.176204


### Find cells with Mitocheck-assigned labels

#### Feature data may be missing for a labeled cell because its well is not hosted on IDR (it failed QC) or because Mitocheck and DeepProfiler segment cells differently

In [3]:
# Mitocheck metadata file passed to get_labeled_cells to identify labeled cells.
features_samples_path = pathlib.Path("../mitocheck_metadata/features.samples.txt")

# Keep only the cells from the compiled training data that have a Mitocheck label.
# NOTE(review): the third argument looks like the name of the outline column used
# to match cells -- confirm against get_labeled_cells in training_data_utils.
labeled_cells = get_labeled_cells(
    training_data,
    features_samples_path,
    "DP__Object_Outline",
)
print(f"Shape of labeled_cells: {labeled_cells.shape}")

No feature data derived for cell at: LT0043_48, 166, 44, 1011, 156
No feature data derived for cell at: LT0043_48, 166, 52, 1068, 97
No feature data derived for cell at: LT0048_13, 69, 72, 258, 60
No feature data derived for cell at: LT0014_12, 159, 70, 911, 678
No feature data derived for cell at: LT0043_48, 166, 44, 296, 320
No feature data derived for cell at: LT0043_48, 166, 71, 279, 313
No feature data derived for cell at: LT0066_19, 287, 48, 1137, 693
No feature data derived for cell at: LT0066_19, 287, 48, 773, 103
No feature data derived for cell at: LT0066_19, 287, 75, 1029, 548
No feature data derived for cell at: LT0066_19, 287, 75, 958, 105
No feature data derived for cell at: LT0066_19, 287, 75, 1132, 429
No feature data derived for cell at: LT0066_19, 287, 75, 719, 347
No feature data derived for cell at: LT0066_19, 287, 87, 826, 373
No feature data derived for cell at: LT0066_19, 287, 87, 1229, 384
No feature data derived for cell at: LT0066_19, 287, 87, 587, 445
No feat

### Replace `Shape1` and `Shape3` with their respective classes
#### See [#16](https://github.com/WayScience/mitocheck_data/issues/16) for more details

In [4]:
# Swap the placeholder labels for their class names, in the same order as before:
# "Shape1" becomes "Binuclear", then "Shape3" becomes "Polylobed".
labeled_cells = (
    labeled_cells
    .replace("Shape1", "Binuclear")
    .replace("Shape3", "Polylobed")
)

### Preview labeled cells

In [5]:
# Bare final expression so the notebook renders the labeled frame as the cell output
# (the display elides the middle rows and columns of this wide frame).
labeled_cells

Unnamed: 0,Mitocheck_Phenotypic_Class,Cell_UUID,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,DP__efficientnet_1270,DP__efficientnet_1271,DP__efficientnet_1272,DP__efficientnet_1273,DP__efficientnet_1274,DP__efficientnet_1275,DP__efficientnet_1276,DP__efficientnet_1277,DP__efficientnet_1278,DP__efficientnet_1279
0,Large,3ba3aae1-489a-4eae-87f5-f3c31c9c91e9,397,618,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.108196,-0.132785,-0.232866,-0.080617,-0.209660,0.339120,0.310256,-0.079103,0.019787,0.026272
1,Large,c16d0e78-70db-49ff-bc48-027342fe00da,359,584,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.147134,-0.249670,-0.248986,-0.054563,-0.167180,0.556449,-0.017504,-0.047726,-0.155885,-0.120599
2,Large,4a6e4da9-941d-4a6c-968e-cb1932839313,383,685,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.089825,0.083893,-0.214583,0.365488,-0.193813,0.087384,-0.071763,-0.067414,-0.154316,0.389125
3,Large,bd03dd73-f4fb-4a25-93ce-1720e55de69b,932,532,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.158361,-0.186279,-0.202823,0.351545,-0.135864,0.554414,-0.065711,-0.051070,-0.219128,0.178317
4,Large,9721f66a-6a76-4a22-a257-e021a5da9e01,479,110,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.095762,-0.132112,-0.125235,-0.123419,-0.211807,0.663511,0.019027,-0.086416,-0.144638,0.144213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2889,OutOfFocus,898c4f3a-08ab-4447-9307-ee4e56c02b53,383,219,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,...,-0.079803,0.855090,0.091728,-0.157079,-0.198466,-0.236062,-0.061634,-0.212868,-0.097618,1.545395
2890,OutOfFocus,777fb679-c278-4895-90a0-bf3f42873f58,975,293,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.098212,0.649787,0.105673,-0.015579,-0.222207,-0.154992,-0.100405,-0.137378,-0.085548,1.930205
2891,OutOfFocus,122e3161-b1b8-4ef3-908c-0dc898ce031f,899,302,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.106724,1.086434,0.163471,-0.067443,-0.246160,-0.216338,-0.090214,-0.122389,-0.094831,0.819135
2892,OutOfFocus,8bed4ad7-4634-41f5-a7eb-85da98a97e88,946,281,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,...,-0.124820,1.069256,0.137648,-0.023456,-0.245319,-0.164458,-0.131811,-0.191671,-0.088147,1.555363


### Save labeled training data

In [6]:
# Output directory for the compiled training data; create it (and any missing
# parents) if it does not exist yet, without failing when it is already there.
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# Build the output path with pathlib's "/" join rather than formatting the
# directory into a string and re-parsing it; the resulting path is the same.
compiled_training_data_path = results_dir / "training_data.csv.gz"

# Write the labeled cells as a gzip-compressed CSV. The index is written too,
# since to_csv is called with its default index setting.
labeled_cells.to_csv(compiled_training_data_path, compression="gzip")