# Split feature data
## Create a TSV file with the row indexes of the training and testing data
### Import libraries

In [1]:
import pandas as pd
import numpy as np
import pathlib

from sklearn.utils import shuffle

import sys
sys.path.append("../utils")
from split_utils import get_features_data, get_random_images_indexes, get_representative_images, get_image_indexes

### Load data and set test split parameters

In [2]:
# fraction of the rows to reserve for testing (e.g. 0.15 -> 15%)
test_ratio = 0.15

# read the labeled data (metadata, phenotypic class and features) into a single dataframe
load_path = pathlib.Path("../0.download_data/data/training_data.csv.gz")
training_data = get_features_data(load_path)
print(training_data.shape)

(7511, 1292)


In [3]:
# preview the loaded dataframe
training_data

Unnamed: 0,Mitocheck_Phenotypic_Class,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,MetaphaseAlignment,572.214286,58.185714,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.048350,-0.721622,0.749788,-1.377590,0.454974,0.188488,0.141427,-1.553405,2.346107,-1.774278
1,Artefact,1117.070423,342.732394,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.172767,-0.290257,-0.709041,-1.431541,-0.063308,-0.412793,0.452684,-1.906647,1.962141,-0.223039
2,Artefact,1116.500000,362.000000,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.093582,-0.323180,-0.663069,-1.427502,-0.901764,-0.355080,0.418053,-2.298449,1.098266,-0.069326
3,Artefact,1106.348485,370.469697,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.943948,-0.211267,-0.346355,-1.365543,-0.276932,0.023856,0.376514,-1.700348,1.833686,-0.625385
4,MetaphaseAlignment,937.692308,521.048077,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.947300,-0.564136,0.333336,-1.584454,0.891666,1.223252,-0.359166,-0.826366,2.115734,-1.241848
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7506,OutOfFocus,383.075269,220.198925,LT0601_01,217,49,1,LT0601_01_217,LT0601_01/LT0601_01_217_49.tif,ABCB8,...,0.525202,8.110262,1.777901,-1.512628,-0.225867,-1.612982,-0.679415,-2.581475,0.501395,1.981009
7507,OutOfFocus,975.747253,293.868132,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,0.277908,5.959773,2.362540,-1.000032,-0.723652,-1.460720,-1.919148,-0.301130,0.779582,3.084642
7508,OutOfFocus,898.614815,302.407407,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,0.553313,10.086836,1.170072,-1.317000,-0.994644,-1.406541,-0.104613,-0.056216,0.714624,1.612470
7509,OutOfFocus,946.758621,281.689655,LT0603_03,2,49,1,LT0603_03_2,LT0603_03/LT0603_03_2_49.tif,failed QC,...,-0.152000,11.078217,2.460915,-0.989282,-1.141723,-1.424882,-2.959780,-1.999795,0.702134,2.306039


In [5]:
# show every row recorded for one example image
example_image = "LT0066_19/LT0066_19_287_1.tif"
training_data[training_data["Metadata_DNA"].eq(example_image)]

Unnamed: 0,Mitocheck_Phenotypic_Class,Location_Center_X,Location_Center_Y,Metadata_Plate,Metadata_Well,Metadata_Frame,Metadata_Site,Metadata_Plate_Map_Name,Metadata_DNA,Metadata_Gene,...,efficientnet_1270,efficientnet_1271,efficientnet_1272,efficientnet_1273,efficientnet_1274,efficientnet_1275,efficientnet_1276,efficientnet_1277,efficientnet_1278,efficientnet_1279
0,MetaphaseAlignment,572.214286,58.185714,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.04835,-0.721622,0.749788,-1.37759,0.454974,0.188488,0.141427,-1.553405,2.346107,-1.774278
1,Artefact,1117.070423,342.732394,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.172767,-0.290257,-0.709041,-1.431541,-0.063308,-0.412793,0.452684,-1.906647,1.962141,-0.223039
2,Artefact,1116.5,362.0,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.093582,-0.32318,-0.663069,-1.427502,-0.901764,-0.35508,0.418053,-2.298449,1.098266,-0.069326
3,Artefact,1106.348485,370.469697,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.943948,-0.211267,-0.346355,-1.365543,-0.276932,0.023856,0.376514,-1.700348,1.833686,-0.625385
4,MetaphaseAlignment,937.692308,521.048077,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.9473,-0.564136,0.333336,-1.584454,0.891666,1.223252,-0.359166,-0.826366,2.115734,-1.241848
5,Prometaphase,1305.853333,656.426667,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.581095,0.635676,-0.597231,-1.204226,0.247975,0.923955,0.060671,-2.054225,1.040119,-0.528491
6,MetaphaseAlignment,933.880597,733.970149,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.695536,-0.73057,0.421914,-1.601067,0.695151,0.728484,0.540093,-1.484884,1.702713,-0.931499
7,Prometaphase,713.292308,793.892308,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.901425,0.043393,-0.263079,-1.048446,-1.372251,-0.327306,0.10213,0.231623,0.713168,-1.066116
8,MetaphaseAlignment,1222.704225,910.295775,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,0.929166,-0.425613,0.973351,-1.182827,0.108086,0.600098,0.253736,-1.482687,1.592563,-0.342597
4898,Prometaphase,1305.84,656.453333,LT0066_19,287,1,1,LT0066_19_287,LT0066_19/LT0066_19_287_1.tif,ch-TOG,...,1.556544,0.603248,-0.607631,-1.188498,0.253803,0.875446,0.062015,-2.047122,1.071468,-0.5574


In [13]:
# Sanity check: randomly divide the images into two halves and report how many
# rows of each phenotypic class land in each half.
class_values = {"MetaphaseAlignment": 0}  # NOTE(review): never read in this cell

# seeded generator so the halves (and the printed counts) are reproducible
rng = np.random.default_rng(0)
# dropna() keeps rows without an image path out of both halves
image_locations = rng.permutation(training_data["Metadata_DNA"].dropna().unique())
half_size = len(image_locations) // 2
image_locations_first = image_locations[:half_size]
image_locations_second = image_locations[half_size:]

# count the classes once per half instead of filtering the dataframe once per image
cell_images = training_data["Metadata_DNA"]
cell_classes = training_data["Mitocheck_Phenotypic_Class"]
first_counts = cell_classes[cell_images.isin(image_locations_first)].value_counts()
second_counts = cell_classes[cell_images.isin(image_locations_second)].value_counts()

for phenotypic_class in cell_classes.unique():
    # a class that never occurs in a half is reported as 0
    first_total = first_counts.get(phenotypic_class, 0)
    second_total = second_counts.get(phenotypic_class, 0)
    print(f"Counts for {phenotypic_class}: first half: {first_total}, second half: {second_total}")

Counts for MetaphaseAlignment:
Total in first half:230
Total in second half:109
Counts for Artefact:
Total in first half:169
Total in second half:109
Counts for Prometaphase:
Total in first half:315
Total in second half:361
Counts for Interphase:
Total in first half:361
Total in second half:445
Counts for Grape:
Total in first half:228
Total in second half:320
Counts for Polylobed:
Total in first half:989
Total in second half:927
Counts for Apoptosis:
Total in first half:202
Total in second half:301
Counts for Metaphase:
Total in first half:63
Total in second half:57
Counts for Binuclear:
Total in first half:373
Total in second half:420
Counts for Elongated:
Total in first half:84
Total in second half:56
Counts for Hole:
Total in first half:106
Total in second half:90
Counts for Folded:
Total in first half:62
Total in second half:21
Counts for SmallIrregular:
Total in first half:141
Total in second half:124
Counts for ADCCM:
Total in first half:150
Total in second half:120
Counts for A

In [None]:
# test_data is a pandas dataframe holding the test split: test_ratio of the rows
# of every Mitocheck_Phenotypic_Class, sampled with a fixed seed so the saved
# indexes are the same on every run.
# NOTE(review): rows are sampled individually, so rows from the same image
# (Metadata_DNA) can end up in both splits -- confirm this is intended.
split_seed = 0
test_data = training_data.groupby("Mitocheck_Phenotypic_Class").sample(
    frac=test_ratio, random_state=split_seed
)
test_indexes = test_data.index
# remove the test rows; what remains is the training split
# NOTE: this rebinds training_data, so re-running the cell without reloading the
# data would carve a second test set out of the already reduced training set
training_data = training_data.drop(test_indexes)

train_indexes = np.array(training_data.index)
print(training_data.shape)

In [None]:
# build one record per row index, tagged with the split it belongs to
# (all test rows first, then all train rows)
split_records = [
    {"label": split_label, "index": row_index}
    for split_label, split_indexes in (("test", test_indexes), ("train", train_indexes))
    for row_index in split_indexes
]
index_data = pd.DataFrame(split_records)
index_data

### Save indexes

In [None]:
# ensure the output directory exists before writing
results_dir = pathlib.Path("indexes/")
results_dir.mkdir(parents=True, exist_ok=True)
# write the split labels as a tab-separated file
index_data.to_csv(results_dir / "data_split_indexes.tsv", sep="\t")