### Import libraries


In [85]:
import pandas as pd
import numpy as np
import pathlib
from typing import Tuple, Any, List, Union

from sklearn.utils import shuffle

#### Helper functions


In [86]:
def get_training_data(load_path: pathlib.Path) -> pd.DataFrame:
    """load the DP training data csv and clean up its phenotypic class labels

    Rows labeled "ADCCM" are dropped, and the labels "Shape1" and "Shape3"
    are renamed to "Binuclear" and "Polylobed" respectively.

    Args:
        load_path (pathlib.Path): path to training data csv

    Returns:
        pd.DataFrame: cleaned training dataframe, indexed by the first csv column
    """
    raw_data = pd.read_csv(load_path, index_col=0)

    # ADCCM was not used for classification in the original paper, so exclude it
    is_used_class = raw_data["Mitocheck_Phenotypic_Class"] != "ADCCM"

    # swap the shape1/shape3 labels for their correct respective classes
    # (the replacement is applied to every column, not only the label column)
    return (
        raw_data[is_used_class]
        .replace("Shape1", "Binuclear")
        .replace("Shape3", "Polylobed")
    )


def get_X_y_data(load_path: pathlib.Path) -> Tuple[np.ndarray, np.ndarray]:
    """generate shuffled X (features) and y (labels) arrays from training data

    Args:
        load_path (pathlib.Path): path to training data csv

    Returns:
        Tuple[np.ndarray, np.ndarray]: X (2D array of feature values) and
            y (1D array of phenotypic class labels), shuffled in unison
    """
    training_data = get_training_data(load_path)

    # all features from DeepProfiler have "efficientnet" in their column name
    morphology_features = [
        col for col in training_data.columns if "efficientnet" in col
    ]

    # extract features as a 2D numpy array (one row per sample)
    X = training_data.loc[:, morphology_features].to_numpy()

    # extract phenotypic class labels as a flat 1D numpy array
    y = training_data["Mitocheck_Phenotypic_Class"].to_numpy()

    # shuffle data because as it comes from MitoCheck same labels tend to be in groups;
    # the fixed random_state keeps the shuffled order reproducible
    X, y = shuffle(X, y, random_state=0)

    return X, y


def get_random_images_indexes(training_data: pd.DataFrame, num_images: int) -> List:
    """randomly pick images and collect the dataframe indexes of their rows

    Images are identified by their "Metadata_Plate_Map_Name" value and are
    sampled without replacement using numpy's global random state.

    Args:
        training_data (pd.DataFrame): pandas dataframe of training data
        num_images (int): number of images to holdout

    Returns:
        List: indexes of all rows belonging to the held out images, grouped
            by image in the order the images were drawn
    """
    image_names = training_data["Metadata_Plate_Map_Name"]
    held_out_images = np.random.choice(
        pd.unique(image_names), size=num_images, replace=False
    )

    # flatten the per-image index lists, keeping each image's rows together
    return [
        row_index
        for held_out_image in held_out_images
        for row_index in training_data.index[image_names == held_out_image].tolist()
    ]

In [87]:
# location of the formatted training data
load_path = pathlib.Path("../1.format_data/data/training_data.csv.gz")

# number of images to holdout
num_holdout_images = 5
# ratio of data to be reserved for testing (ex 0.15 = 15%)
test_ratio = 0.15

# load the cleaned training dataframe and report its (rows, columns) shape
training_data = get_training_data(load_path=load_path)
print(training_data.shape)

(4123, 1292)


In [88]:
# seed numpy's global random state so the random data split is reproducible
# between runs (get_X_y_data likewise uses a fixed random_state of 0)
np.random.seed(0)

# remove holdout indexes: every row of the randomly chosen holdout images
holdout_image_indexes = get_random_images_indexes(training_data, num_holdout_images)
training_data = training_data.drop(pd.Index(data=holdout_image_indexes))
print(training_data.shape)

(3973, 1292)


In [89]:
# number of rows reserved for testing (the fraction is truncated toward zero)
num_test_indexes = int(training_data.shape[0] * test_ratio)

# randomly pick the test rows without replacement, then remove them
test_indexes = np.random.choice(
    training_data.index, size=num_test_indexes, replace=False
)
training_data = training_data.drop(index=test_indexes)

# whatever remains after removing holdout and test rows is the training set
train_indexes = training_data.index.to_numpy(copy=True)
print(training_data.shape)

(3378, 1292)


In [90]:
# create pandas dataframe with all indexes and their respective labels
# rows are emitted split by split: holdout first, then test, then train
split_members = (
    ("holdout", holdout_image_indexes),
    ("test", test_indexes),
    ("train", train_indexes),
)
index_data = pd.DataFrame(
    [
        {"label": split_label, "index": member}
        for split_label, members in split_members
        for member in members
    ]
)
index_data

Unnamed: 0,label,index
0,holdout,4186
1,holdout,4187
2,holdout,4188
3,holdout,4189
4,holdout,107
...,...,...
4118,train,4303
4119,train,4304
4120,train,4305
4121,train,4306


In [91]:
# make results dir for saving (no error if it already exists)
results_dir = pathlib.Path("results/")
results_dir.mkdir(parents=True, exist_ok=True)
# save indexes as tsv file; the path is joined with pathlib instead of
# string formatting so the separator is handled by the Path object
index_data.to_csv(results_dir / "data_split_indexes.tsv", sep="\t")