### Import libraries

In [1]:
import pathlib
import pandas as pd
import numpy as np

import sys
sys.path.append("../utils")
from load_utils import compile_mitocheck_batch_data
#from training_data_utils import get_labeled_cells

### Compile training data batches into one dataframe

In [2]:
# directory holding the extracted-feature batches for the training data
training_data_features_path = pathlib.Path("../1.idr_streams/extracted_features/training_data")
# compile the batches under that directory into one dataframe (helper imported from load_utils)
training_data = compile_mitocheck_batch_data(training_data_features_path)
# show a single row as a quick check of the compiled columns
training_data.head(1)

Unnamed: 0,Object_Outline,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,[[511 8]\n [510 9]\n [509 9]\n [508 9]...,512.921053,21.578947,LT0065_04,20,49,1,LT0065_04_20,LT0065_04/LT0065_04_20_49.tif,CDCA8,...,-0.048642,-0.005226,-0.151536,0.067169,-0.224536,0.094148,-0.034553,-0.091014,-0.035447,0.466544


### Find cells with Mitocheck-assigned labels

In [17]:
import ast
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

def get_frame_metadata(frame_details: str):
    """
    split a features samples movie details string into its metadata fields

    Parameters
    ----------
    frame_details : str
        movie details string from one line of the features samples file
        ex: PLLT0010_27--ex2005_05_13--sp2005_03_23--tt17--c5___P00173_01___T00082___X0397___Y0618

    Returns
    -------
    plate: str
        plate of sample (text before the first "--" with "PL" removed)
    well_num: int
        well number of sample
    frame: int
        frame of sample (number following "T", plus 1)
    center_x: int
        center x coord of cell
    center_y: int
        center y coord of cell
    """

    # "___" separates the movie info, well, frame, x and y fields
    fields = frame_details.split("___")

    # character 0 of each field is its letter tag (P, T, X, Y); the digits follow it
    well_num = int(fields[1][1:6])
    # presumably shifts a 0-based frame index to the 1-based frame used in the training data -- TODO confirm
    frame = 1 + int(fields[2][1:6])
    center_x = int(fields[3][1:])
    center_y = int(fields[4][1:])

    # plate is the first "--" separated field, e.g. "PLLT0010_27" -> "LT0010_27"
    plate = frame_details.split("--", 1)[0].replace("PL", "")

    return plate, well_num, frame, center_x, center_y

def parse_outline_data(raw_outline_data: str) -> np.array:
    """
    convert the string form of an outline extracted with IDR stream into a numpy array

    Parameters
    ----------
    raw_outline_data : str
        string of outline data: bracketed, whitespace separated "x y" pairs,
        one pair per line, wrapped in an outer pair of brackets

    Returns
    -------
    np.array
        parsed outline data, one [x, y] row per coordinate pair
    """
    # strip the outermost brackets, then break the text into one "[x y]" entry per line
    entries = raw_outline_data[1:-1].split("\n ")

    # for each entry drop its own brackets and keep the first two whitespace separated integers
    return np.array(
        [
            [int(tokens[0]), int(tokens[1])]
            for tokens in (entry[1:-1].split() for entry in entries)
        ]
    )

def center_in_outline(center_x: int, center_y: int, raw_outline_data: str) -> bool:
    """
    check whether a center coordinate is contained in a cell outline

    Parameters
    ----------
    center_x : int
        x coord of the center to check
    center_y : int
        y coord of the center to check
    raw_outline_data : str
        string of outline data, parsed with parse_outline_data

    Returns
    -------
    bool
        result of the outline polygon's contains() check for the center point
    """
    outline_coords = parse_outline_data(raw_outline_data)
    center_point = Point(center_x, center_y)
    return Polygon(outline_coords).contains(center_point)

def get_labled_cells(
    training_data: pd.DataFrame,
    features_samples_path: pathlib.Path,
    max_lines: "int | None" = 5,
) -> pd.DataFrame:
    """
    label single cells in training data with the phenotypic classes from a features samples file

    Each non-blank line of the features samples file is read as a phenotypic class and a
    movie details string separated by a tab. For each line, the first cell from the matching
    plate, well and frame whose outline contains the labeled center coordinate is kept.

    Parameters
    ----------
    training_data : pd.DataFrame
        single cell features with Metadata_Plate, Metadata_Well, Metadata_Frame
        and Object_Outline columns
    features_samples_path : pathlib.Path
        path to features samples file
    max_lines : int or None, optional
        number of lines to read from the start of the file, by default 5
        (the cap that was previously hard coded in the loop); pass None to read every line

    Returns
    -------
    pd.DataFrame
        labeled cells: a leading Mitocheck_Phenotypic_Class column followed by the
        training data columns (no rows if no cell could be labeled)
    """
    label_column = "Mitocheck_Phenotypic_Class"

    # loop-invariant lookups; well/frame are compared as strings because the parsed
    # numbers are matched through str() below
    plates = training_data["Metadata_Plate"]
    wells = training_data["Metadata_Well"].astype(str)
    frames = training_data["Metadata_Frame"].astype(str)

    labeled_cells = []
    with open(features_samples_path) as labels_file:
        for line_number, line in enumerate(labels_file):
            if max_lines is not None and line_number >= max_lines:
                break

            stripped_line = line.strip()
            if not stripped_line:
                # blank lines carry no label
                continue

            # first field is the phenotypic label, second is the movie details string
            fields = stripped_line.split("\t")
            phenotypic_class = fields[0]
            plate, well_num, frame, center_x, center_y = get_frame_metadata(fields[1])

            # get all single cell features for this particular frame
            frame_cells = training_data.loc[
                (plates == plate) & (wells == str(well_num)) & (frames == str(frame))
            ]

            # keep only the first cell whose outline contains the labeled center
            for _, row in frame_cells.iterrows():
                if center_in_outline(center_x, center_y, row["Object_Outline"]):
                    labeled_cells.append(
                        pd.concat([pd.Series({label_column: phenotypic_class}), row])
                    )
                    break

    if not labeled_cells:
        # keep the expected columns even when nothing was labeled
        return pd.DataFrame(columns=[label_column, *training_data.columns])

    return pd.DataFrame(labeled_cells)


# file with one phenotypic class and movie details string per line (tab separated)
features_samples_path = pathlib.Path("../mitocheck_metadata/features.samples.txt")

# label cells with the get_labled_cells function defined in this cell
# (it currently reads only the first 5 lines of the file)
labeled_cells = get_labled_cells(training_data, features_samples_path)
labeled_cells

Unnamed: 0,Mitocheck_Phenotypic_Class,Object_Outline,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,Large,[[396 595]\n [395 596]\n [394 596]\n [393 596]...,397.288288,618.558559,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.037762,-0.13654,-0.216995,0.019637,-0.192598,0.60511,0.391345,-0.119954,-0.002111,0.748429
1,Large,[[361 563]\n [360 564]\n [359 564]\n [358 564]...,359.535714,585.0625,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.115016,-0.25539,-0.229136,0.021084,-0.158222,0.672657,-0.057864,-0.060401,-0.166709,0.468204
2,Large,[[379 662]\n [378 663]\n [377 663]\n [376 663]...,383.282051,685.222222,LT0010_27,173,83,1,LT0010_27_173,LT0010_27/LT0010_27_173_83.tif,...,-0.062305,0.077424,-0.211331,0.403525,-0.183601,0.303051,-0.040686,-0.109288,-0.165507,0.999497
3,Large,[[923 515]\n [922 516]\n [921 516]\n [920 516]...,934.568807,534.385321,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.151192,-0.194984,-0.205683,0.246189,-0.170982,0.590972,-0.040195,-0.087733,-0.223349,1.189712
4,Large,[[483 96]\n [482 97]\n [481 97]\n [480 98]...,481.007143,121.978571,LT0013_38,42,75,1,LT0013_38_42,LT0013_38/LT0013_38_42_75.tif,...,-0.069813,-0.160207,-0.177817,-0.127709,-0.204516,0.75555,0.106907,-0.173012,-0.149358,1.199448


In [None]:
# get_labeled_cells comes from training_data_utils in ../utils (already on sys.path);
# it is imported here because the import in the first cell is commented out,
# which otherwise leaves this cell failing with a NameError on a fresh kernel
from training_data_utils import get_labeled_cells

features_path = pathlib.Path("../mitocheck_metadata/features/")
training_set_dat_path = pathlib.Path("../mitocheck_metadata/trainingset_2007_06_21.dat")

# NOTE: this overwrites the labeled_cells produced by the in-notebook get_labled_cells call
labeled_cells = get_labeled_cells(features_path, training_set_dat_path, training_data)

### Replace `Shape1` and `Shape3` with their respective classes
#### See [#16](https://github.com/WayScience/mitocheck_data/issues/16) for more details

In [None]:
# Shape1 -> Binuclear, then Shape3 -> Polylobed, applied to matching values anywhere in the dataframe
labeled_cells = (
    labeled_cells
    .replace("Shape1", "Binuclear")
    .replace("Shape3", "Polylobed")
)

### Preview labeled cells

In [None]:
# bare expression so the notebook renders the labeled cells dataframe as this cell's output
labeled_cells

### Save labeled training data

In [None]:
# create the output directory (and any missing parents) if it does not exist yet
results_dir = pathlib.Path("results/")
results_dir.mkdir(exist_ok=True, parents=True)

# join with the pathlib "/" operator instead of formatting the Path into a string and re-parsing it
compiled_training_data_path = results_dir / "training_data.csv.gz"
# write the labeled cells as a gzip-compressed CSV
labeled_cells.to_csv(compiled_training_data_path, compression="gzip")