# Construct the dataset from the archive

In [28]:
import numpy as np
import pandas as pd
import os
from zipfile import ZipFile
import nibabel as nib

from tqdm import tqdm
import shutil

np.random.seed(1312)

In [29]:
export_path = "./data"

In [30]:
def read_nii(filepath):
    '''
    Reads .nii file and returns pixel array
    '''
    ct_scan = nib.load(filepath)
    array   = ct_scan.get_fdata()
    array   = np.rot90(np.array(array))
    return(array)

In [31]:
# Export all slides to npy files
GENERATE_NPY_FILES = False


if (GENERATE_NPY_FILES) :

    archive_path_part1 =  "./data_seg_part1.zip"
    archive_path_part2 =  "./data_seg_part2.zip"
    export_path_ct = "./data/ct"
    export_path_mask = "./data/mask"
    os.makedirs(export_path, exist_ok=True)
    os.makedirs(export_path_ct, exist_ok=True)
    os.makedirs(export_path_mask, exist_ok=True)
    

    # opening the zip file in READ mode
    with ZipFile(archive_path_part1, 'r') as zip:
        # extracting all the files
        print('Extracting all the files now...')
        zip.extractall("./whole_data/") # This folder will be removed after the whole process
        print('Done!')

    # with ZipFile(archive_path_part2, 'r') as zip:
    #     # extracting all the files
    #     print('Extracting all the files now...')
    #     zip.extractall("./whole_data/")
    #     print('Done!')


    # Create a meta file for nii files processing
    file_list = []
    for dirname, _, filenames in os.walk('./whole_data/'):
        for filename in filenames:
            file_list.append((dirname, filename)) 


    df_files = pd.DataFrame(file_list, columns =['dirname', 'filename']) 
    df_files.sort_values(by=['filename'], ascending=True)  


    # Map CT scan and label 

    df_files["mask_dirname"]  = ""
    df_files["mask_filename"] = ""

    for i in range(131):
        ct = f"volume-{i}.nii"
        mask = f"segmentation-{i}.nii"
        
        df_files.loc[df_files['filename'] == ct, 'mask_filename'] = mask
        df_files.loc[df_files['filename'] == ct, 'mask_dirname'] = "./whole_data/segmentations"

    # drop segment rows
    df_files = df_files[df_files.mask_filename != ''].sort_values(by=['filename']).reset_index(drop=True) 

    whole_data_df = pd.DataFrame(columns=['patient_id', 'slice_id', 'ct_path', 'mask_path'])
    print("Exporting all slices to npy files...")
    for ii in tqdm(range(len(df_files))):
        row = df_files.iloc[ii]
        #Retrieve patient ID
        patient_id = row['filename'].split('-')[1][:-4]

        #Load scan and mask
        scan = read_nii(row['dirname']+"/"+row['filename'])
        mask = read_nii(row['mask_dirname']+"/"+row['mask_filename'])

        #Save each slice as a npy file
        for slice_id in range(scan.shape[2]):
            ct_path = f'{export_path_ct}/patient_{patient_id}_slice_{slice_id}_scan.npy'
            mask_path = f'{export_path_mask}/patient_{patient_id}_slice_{slice_id}_mask.npy'
            np.save(ct_path, scan[:,:,slice_id])
            np.save(mask_path, mask[:,:,slice_id])

            whole_data_df = pd.concat([whole_data_df, 
                                       pd.DataFrame([[patient_id, slice_id, ct_path, mask_path]],
                                            columns = ['patient_id', 'slice_id', 'ct_path', 'mask_path'])],
                                      ignore_index=True)

    print("Done!")
    whole_data_df.to_csv(f'{export_path}/whole_data_df.csv', index=False)

    # Remove whole_data folder
    shutil.rmtree('./whole_data')

In [35]:
BUILD_SPLIT_DATASET = False

if (BUILD_SPLIT_DATASET) :
    test_frac = 0.2
    val_frac = 0.1

    patient_id = whole_data_df['patient_id'].unique()
    n_patient = len(patient_id)

    n_test = int(n_patient * test_frac)
    n_val = int(n_patient * val_frac)
    n_train = n_patient - n_test - n_val

    # Randomly select test/train patients
    test_patients = np.random.choice(patient_id, n_test, replace=False)
    val_patients = np.random.choice(np.setdiff1d(patient_id, test_patients), n_val, replace=False)
    train_patients = np.setdiff1d(patient_id, np.concatenate([test_patients, val_patients]))

    print("Test patients: ", test_patients)
    print("Val patients: ", val_patients)
    print("Train patients: ", train_patients)

    # Split dataset
    train_df = whole_data_df[whole_data_df['patient_id'].isin(train_patients)]
    val_df = whole_data_df[whole_data_df['patient_id'].isin(val_patients)]
    test_df = whole_data_df[whole_data_df['patient_id'].isin(test_patients)]

    # Save train/test dataset
    train_df.to_csv(f'{export_path}/train_df.csv', index=False)
    val_df.to_csv(f'{export_path}/val_df.csv', index=False)
    test_df.to_csv(f'{export_path}/test_df.csv', index=False)

Test patients:  [43 39  8 38 13  1  2 21 31 33]
Val patients:  [10  0 50  7 28]
Train patients:  [ 3  4  5  6  9 11 12 14 15 16 17 18 19 20 22 23 24 25 26 27 29 30 32 34
 35 36 37 40 41 42 44 45 46 47 48 49]


In [39]:
COMPUTE_MEAN_STD = False

if (COMPUTE_MEAN_STD) :
    # Compute mean and std of train dataset
    train_df = pd.read_csv(f'{export_path}/train_df.csv')
    train_ct_paths = train_df['ct_path'].values

    ct = np.load(train_ct_paths[0])

    ct_mean = np.mean(ct)
    ct_std = np.std(ct)
    ct_max = np.max(ct)
    ct_min = np.min(ct)


    for i in tqdm(range(1, len(train_ct_paths))):
        ct = np.load(train_ct_paths[i])

        ct_mean += np.mean(ct)
        ct_std += np.std(ct)
        ct_max = max(ct_max, np.max(ct))
        ct_min = min(ct_min, np.min(ct))


    ct_mean /= len(train_ct_paths)
    ct_std /= len(train_ct_paths)

    print("CT mean: ", ct_mean)
    print("CT std: ", ct_std)
    print("CT max: ", ct_max)
    print("CT min: ", ct_min)

    # Save mean and std
    np.save(f'{export_path}/ct_mean.npy', ct_mean)
    np.save(f'{export_path}/ct_std.npy', ct_std)

100%|██████████| 13996/13996 [01:21<00:00, 171.49it/s]

CT mean:  -678.6109391237196
CT std:  656.9885500358824
CT max:  3071.0
CT min:  -3024.0



