## Dividir los datos

In [5]:
import os

### Configuración del entorno

In [6]:
# Move one level up from the current working directory.
# NOTE(review): this is relative to wherever the kernel currently is, so running
# the cell more than once keeps climbing the directory tree.
os.chdir(os.pardir)

In [7]:
# Display the current working directory to check the result of the directory change.
os.getcwd()

'/Users/castiler/TFM'

### Cargar librerías:

In [4]:
from scripts.A_config import NNUNetConfig, DatasetType
import nibabel as nib
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd

# Seed both Python's and NumPy's global random generators so that operations
# drawing from them are repeatable when the notebook is run from the top.
random.seed(81)
np.random.seed(81)

## Obtener los ficheros

In [8]:
# Keep only the entries of the training images folder that end with the configured termination.
all_images = list(
    filter(
        lambda entry: entry.endswith(NNUNetConfig().TERMINATION),
        os.listdir(NNUNetConfig().train_images_dir),
    )
)
# A case id is the text before the first dot minus its last 5 characters
# (the "_0000" / "_0001" suffix); duplicates are removed and the ids are sorted.
all_ids = sorted(set(map(lambda image_name: image_name.split(".")[0][:-5], all_images)))


In [9]:
# Number of distinct case ids found among the training images.
len(all_ids)

81

In [12]:
import os
import pathlib
from typing import List, Tuple, Dict

import nibabel as nib
import numpy as np
import pandas as pd
from scipy.ndimage import label
from tqdm import tqdm

def detect_lesions(mask: nib.nifti1.Nifti1Image) -> Tuple[np.ndarray, Dict[str, List[int]]]:
    """Label every connected lesion of a mask and list the resulting ids by lesion type.

    Voxels equal to 1 are handled as basal lesions and voxels equal to 2 as new
    (follow-up) lesions. Each group is split into connected components with
    ``scipy.ndimage.label`` (default structuring element). The components of the
    new lesions are then renumbered so that their ids continue after the basal ones.

    Args:
        mask: nifti image with the mask

    Returns:
        lesions_map: integer array shaped like the mask; 0 outside lesions and a
            distinct positive id per lesion (basal ids first, then new ids)
        joint_lesions: dict with the ids of the basal lesions under 'basal' and
            the ids of the new lesions under 'new'
    """
    voxels = mask.get_fdata()
    # Connected components of each lesion type, numbered 1..count independently:
    lesions_map, basal_count = label((voxels == 1).astype(int))
    new_components, new_count = label((voxels == 2).astype(int))
    # Write the new lesions into the basal map, shifting their ids past the basal ones:
    is_new_lesion = new_components > 0
    lesions_map[is_new_lesion] = new_components[is_new_lesion] + basal_count
    joint_lesions = {
        'basal': list(range(1, basal_count + 1)),
        'new': [offset + basal_count for offset in range(1, new_count + 1)],
    }
    return lesions_map, joint_lesions

def _mean_and_median(lesion_sizes: np.ndarray) -> Tuple[float, float]:
    """Return (mean, median) of the given lesion sizes, or (NaN, NaN) when there are none."""
    if lesion_sizes.size == 0:
        # Explicit NaN instead of letting NumPy warn about an empty slice.
        return np.nan, np.nan
    return np.mean(lesion_sizes), np.median(lesion_sizes)


def analyse_cases(ids: List[str], labels_dir: pathlib.Path):
    """Analyse cases with id in 'ids' and whose labels are in directory 'labels_dir'.

    For every case the label volume is loaded, its lesions are separated with
    'detect_lesions', and the number of lesions plus the total, mean and median
    lesion size (in voxels) are computed for basal (label 1) and new (label 2)
    lesions.

    Args:
        ids: list of ids of the cases to be analysed
        labels_dir: directory where the labels of the ids are stored

    Returns:
        DataFrame with one row per case and the columns 'case_id', 'n_lesions',
        'n_basal_lesions', 'n_new_lesions' and, for both 'basal' and 'new',
        'mean_<type>_lesion_vol', 'median_<type>_lesion_vol' and
        'total_<type>_lesion_vol'. Mean and median are NaN when a case has no
        lesion of that type.
    """
    # The file termination does not change between cases, so it is read once:
    termination = NNUNetConfig().TERMINATION
    # We iterate over ids and gather all results in a list:
    analysis_results = []
    for case_id in tqdm(ids):
        # We load the labels:
        case_mask = nib.load(labels_dir / (case_id + termination))
        # We detect the lesions (both basal and new ones):
        lesions_map, lesions = detect_lesions(case_mask)
        # Total lesion size per type, straight from the label values:
        mask_data = case_mask.get_fdata()
        total_basal_lesion_vol = (mask_data == 1).sum()
        total_new_lesion_vol = (mask_data == 2).sum()
        # Size of every lesion in a single pass: position i holds the number of
        # voxels carrying lesion id i (position 0 is the background).
        n_lesions = len(lesions['basal']) + len(lesions['new'])
        voxels_per_lesion = np.bincount(lesions_map.ravel(), minlength=n_lesions + 1)
        basal_sizes = voxels_per_lesion[np.asarray(lesions['basal'], dtype=np.intp)]
        new_sizes = voxels_per_lesion[np.asarray(lesions['new'], dtype=np.intp)]
        mean_basal_lesion_vol, median_basal_lesion_vol = _mean_and_median(basal_sizes)
        mean_new_lesion_vol, median_new_lesion_vol = _mean_and_median(new_sizes)
        # And we append the results to the list, including the number of lesions:
        analysis_results.append({
            "case_id": case_id,
            "n_lesions": n_lesions,
            "n_basal_lesions": len(lesions['basal']),
            "n_new_lesions": len(lesions['new']),
            "mean_basal_lesion_vol": mean_basal_lesion_vol,
            "median_basal_lesion_vol": median_basal_lesion_vol,
            "total_basal_lesion_vol": total_basal_lesion_vol,
            "mean_new_lesion_vol": mean_new_lesion_vol,
            "median_new_lesion_vol": median_new_lesion_vol,
            "total_new_lesion_vol": total_new_lesion_vol
        })
    # Finally, we return all results as a dataframe:
    return pd.DataFrame.from_records(analysis_results)

In [None]:
# Build the per-case lesion statistics table for every case id, reading the
# label volumes from the training labels directory.
lesions_analysis = analyse_cases(ids=all_ids, labels_dir=NNUNetConfig().train_labels_dir)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
100%|██████████| 81/81 [00:42<00:00,  1.89it/s]


In [14]:
# Mean/median lesion sizes can be missing (NaN); those entries are replaced by 0.
cols_w_possible_NAs = [
    'mean_basal_lesion_vol',
    'median_basal_lesion_vol',
    'mean_new_lesion_vol',
    'median_new_lesion_vol',
]
lesions_analysis[cols_w_possible_NAs] = lesions_analysis[cols_w_possible_NAs].fillna(0)

In [15]:
# Cutting into categorical:
lesions_analysis['bl_bin'] = pd.qcut(lesions_analysis['n_basal_lesions'], 3)  # Cutting by quantiles
lesions_analysis['nl_bin'] = pd.cut(  # Cutting by specific boundaries
    lesions_analysis['n_new_lesions'],
    bins=[0, 1, 5, np.inf],
    right=False
)
# Combining both criteria:
lesions_analysis['stratification_bin'] = (
        lesions_analysis['bl_bin'].astype(str) + ' & ' + lesions_analysis['nl_bin'].astype(str)
)
# Unifying "contiguous" rare class for easier splitting:
rare_classes = ['(39.667, 75.0] & [1.0, 5.0)', '(39.667, 75.0] & [5.0, inf)']
lesions_analysis.loc[
    lesions_analysis['stratification_bin'].isin(rare_classes), 'stratification_bin'
] = ' | '.join(rare_classes)
# Factorizing:
lesions_analysis['stratification_class'] = pd.factorize(lesions_analysis['stratification_bin'])[0]

In [17]:
from sklearn.model_selection import StratifiedKFold, train_test_split

# Hold out 30 % of the cases as test set, keeping the proportion of every
# stratification class in both parts.
# random_state is fixed (same value as the seeds set at the top of the notebook)
# so that the partition no longer depends on how much of the global NumPy random
# stream was consumed before this cell: re-running it always yields the same
# split, which matters because the following cells move files on disk.
train_val_ids, test_ids = train_test_split(
    lesions_analysis['case_id'],
    test_size=0.3,
    stratify=lesions_analysis['stratification_class'],
    random_state=81,
)

In [None]:
# Create the test images and test labels folders (in that order); a folder that
# already exists is left as it is.
for test_dir_attr in ("test_images_dir", "test_labels_dir"):
    try:
        os.mkdir(getattr(NNUNetConfig(), test_dir_attr))
    except FileExistsError:
        pass

In [None]:
import shutil

for test_case in test_ids:
    # Images: the basal ("_0000") and the follow-up ("_0001") file of the case
    # go from the training images folder to the test images folder.
    for channel_suffix in ("_0000", "_0001"):
        image_name = f"{test_case}{channel_suffix}" + NNUNetConfig().TERMINATION
        shutil.move(
            NNUNetConfig().train_images_dir / image_name,
            NNUNetConfig().test_images_dir / image_name,
        )
    # Mask: the label file of the case goes to the test labels folder.
    mask_image = test_case + NNUNetConfig().TERMINATION
    shutil.move(
        NNUNetConfig().train_labels_dir / mask_image,
        NNUNetConfig().test_labels_dir / mask_image,
    )

In [None]:
import json
# Cross-validation folds generation.
# `train_val_ids.index` holds index labels of `lesions_analysis`, so the matching
# stratification classes are selected by label (.loc); positional selection
# (.iloc) would only be right while the frame keeps its default 0..n-1 index.
train_stratification_bins = lesions_analysis.loc[train_val_ids.index, 'stratification_class']

# Five stratified folds over the train/validation cases; every fold stores the
# case ids used for training and for validation.
skf = StratifiedKFold(n_splits=5)
cv_folds = [
    {
        "train": train_val_ids.iloc[train].tolist(),
        "val": train_val_ids.iloc[val].tolist()
    }
    for train, val in skf.split(X=train_val_ids, y=train_stratification_bins)
]
# Generation of the "splits_final.json" file:
with open(NNUNetConfig().dataset_preprocessed_dir / 'splits_final.json', 'w') as f:
    json.dump(cv_folds, f)