# Split feature data
## Create tsv file with indexes for held out data, training data, and testing data
### Import libraries

In [None]:
import pandas as pd
import numpy as np
import pathlib

from sklearn.utils import shuffle

import sys
sys.path.append("../utils")
from split_utils import get_features_data, get_random_images_indexes, get_representative_images, get_image_indexes

### Load data and set holdout/test parameters

In [None]:
# load x (features) and y (labels) dataframes
load_path = pathlib.Path("../0.download_data/data/training_data.csv.gz")
training_data = get_features_data(load_path)
print(training_data.shape)

# ratio of data to be reserved for testing (ex 0.15 = 15%)
test_ratio = 0.15

In [None]:
training_data

In [None]:
training_data.loc[training_data["Metadata_DNA"] == "LT0066_19/LT0066_19_287_1.tif"]

In [22]:
class_values = {"MetaphaseAlignment": 0}
image_locations = training_data["Metadata_DNA"].unique()
np.random.shuffle(image_locations)
num_train = int(len(image_locations)*0.85)
image_locations_train = image_locations[num_train:]
image_locations_test = image_locations[:num_train]

for phenotypic_class in training_data["Mitocheck_Phenotypic_Class"].unique():
    first_total = 0
    for image_location in image_locations_train:
        try:
            m_a_count = training_data.loc[training_data["Metadata_DNA"] == image_location]["Mitocheck_Phenotypic_Class"].value_counts()[phenotypic_class]
        except KeyError:
            continue
        first_total += m_a_count

    second_total = 0
    for image_location in image_locations_test:
        try:
            m_a_count = training_data.loc[training_data["Metadata_DNA"] == image_location]["Mitocheck_Phenotypic_Class"].value_counts()[phenotypic_class]
        except KeyError:
            continue
        second_total += m_a_count
    print(f"For {phenotypic_class}: first half has {first_total/(first_total+second_total)*100}%")

For MetaphaseAlignment: first half has 10.619469026548673%
For Artefact: first half has 6.83453237410072%
For Prometaphase: first half has 14.940828402366865%
For Interphase: first half has 15.75682382133995%
For Grape: first half has 5.839416058394161%
For Polylobed: first half has 8.507306889352819%
For Apoptosis: first half has 14.7117296222664%
For Metaphase: first half has 12.5%
For Binuclear: first half has 21.689785624211854%
For Elongated: first half has 23.57142857142857%
For Hole: first half has 19.387755102040817%
For Folded: first half has 32.53012048192771%
For SmallIrregular: first half has 16.9811320754717%
For ADCCM: first half has 31.11111111111111%
For Anaphase: first half has 34.55882352941176%
For Large: first half has 18.38235294117647%
For OutOfFocus: first half has 5.88235294117647%


In [None]:
# test_data is pandas dataframe with test split, stratified by Mitocheck_Phenotypic_Class
test_data = training_data.groupby("Mitocheck_Phenotypic_Class", group_keys=False).apply(
    lambda x: x.sample(frac=test_ratio)
)
test_indexes = test_data.index
# remove test indexes
training_data = training_data.drop(pd.Index(data=test_indexes))

train_indexes = np.array(training_data.index)
print(training_data.shape)

In [None]:
# create pandas dataframe with all indexes and their respective labels
index_data = []
for index in test_indexes:
    index_data.append({"label": "test", "index": index})
for index in train_indexes:
    index_data.append({"label": "train", "index": index})
index_data = pd.DataFrame(index_data)
index_data

### Save indexes

In [None]:
# make results dir for saving
results_dir = pathlib.Path("indexes/")
results_dir.mkdir(parents=True, exist_ok=True)
# save indexes as tsv file
index_data.to_csv(f"{results_dir}/data_split_indexes.tsv", sep="\t")