## Prostate158 - Exploratory data analysis

In [125]:
import nibabel as nib
import numpy as np
import matplotlib.pyplot as plt
from nilearn import plotting
import scipy.ndimage as ndi
import pandas as pd

In [167]:
# Root folders of the train and test splits under ../input.
train_dir = '../input/prostate15/prostate158_train/prostate158_train'
test_dir = '../input/prostate15/prostate158_test/prostate158_test'

# CSV manifests that sit directly inside the split folders; the training
# folder carries both the train and the validation manifest.
train_csv = train_dir + '/train.csv'
train_valid_csv = train_dir + '/valid.csv'
test_csv = test_dir + '/test.csv'

In [223]:
# Read the three manifests in one pass (order: train, validation, test).
train_df, valid_df, test_df = [
    pd.read_csv(csv_path)
    for csv_path in (train_csv, train_valid_csv, test_csv)
]

# Preview the first rows of the training manifest.
train_df.head()

Unnamed: 0,ID,t2,adc,dwi,t2_anatomy_reader1,t2_tumor_reader1,adc_tumor_reader1,t2_anatomy_reader2,adc_tumor_reader2
0,24,train/024/t2.nii.gz,train/024/adc.nii.gz,train/024/dwi.nii.gz,train/024/t2_anatomy_reader1.nii.gz,,train/024/empty.nii.gz,,
1,25,train/025/t2.nii.gz,train/025/adc.nii.gz,train/025/dwi.nii.gz,train/025/t2_anatomy_reader1.nii.gz,train/025/t2_tumor_reader1.nii.gz,train/025/adc_tumor_reader1.nii.gz,,train/025/adc_tumor_reader2.nii.gz
2,26,train/026/t2.nii.gz,train/026/adc.nii.gz,train/026/dwi.nii.gz,train/026/t2_anatomy_reader1.nii.gz,,train/026/empty.nii.gz,,
3,27,train/027/t2.nii.gz,train/027/adc.nii.gz,train/027/dwi.nii.gz,train/027/t2_anatomy_reader1.nii.gz,train/027/t2_tumor_reader1.nii.gz,train/027/adc_tumor_reader1.nii.gz,,train/027/adc_tumor_reader2.nii.gz
4,28,train/028/t2.nii.gz,train/028/adc.nii.gz,train/028/dwi.nii.gz,train/028/t2_anatomy_reader1.nii.gz,,train/028/empty.nii.gz,,


In [224]:
import math

def remove_compression_suffix(df, suffix='.gz'):
    """Return a copy of ``df`` with a trailing ``suffix`` removed from the path columns.

    Only string cells that actually end with ``suffix`` are changed, so the
    function is idempotent: running the cell twice leaves ``*.nii`` paths
    untouched instead of stripping a further extension. Non-string cells
    (e.g. NaN for a missing segmentation) and strings without the suffix are
    returned unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Manifest frame. Path columns that are not present are skipped.
    suffix : str, default '.gz'
        Trailing text to strip. An empty suffix leaves every value as is.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    columns = ['t2', 'adc', 'dwi', 't2_anatomy_reader1', 't2_tumor_reader1', 'adc_tumor_reader1', 't2_anatomy_reader2', 'adc_tumor_reader2']

    def strip_suffix(value):
        # Guard on `suffix` so an empty suffix never slices with [:-0].
        if suffix and isinstance(value, str) and value.endswith(suffix):
            return value[:-len(suffix)]
        return value

    # Work on a copy so re-running a cell never alters a frame shown earlier.
    result = df.copy()
    for column in columns:
        if column in result.columns:
            # Series.map is available in every pandas version, unlike DataFrame.map.
            result[column] = result[column].map(strip_suffix)
    return result

# Strip the compression suffix from the path columns of every split
# (order: train, validation, test).
train_df, valid_df, test_df = [
    remove_compression_suffix(split_df)
    for split_df in (train_df, valid_df, test_df)
]
train_df.head()

Unnamed: 0,ID,t2,adc,dwi,t2_anatomy_reader1,t2_tumor_reader1,adc_tumor_reader1,t2_anatomy_reader2,adc_tumor_reader2
0,24,train/024/t2.nii,train/024/adc.nii,train/024/dwi.nii,train/024/t2_anatomy_reader1.nii,,train/024/empty.nii,,
1,25,train/025/t2.nii,train/025/adc.nii,train/025/dwi.nii,train/025/t2_anatomy_reader1.nii,train/025/t2_tumor_reader1.nii,train/025/adc_tumor_reader1.nii,,train/025/adc_tumor_reader2.nii
2,26,train/026/t2.nii,train/026/adc.nii,train/026/dwi.nii,train/026/t2_anatomy_reader1.nii,,train/026/empty.nii,,
3,27,train/027/t2.nii,train/027/adc.nii,train/027/dwi.nii,train/027/t2_anatomy_reader1.nii,train/027/t2_tumor_reader1.nii,train/027/adc_tumor_reader1.nii,,train/027/adc_tumor_reader2.nii
4,28,train/028/t2.nii,train/028/adc.nii,train/028/dwi.nii,train/028/t2_anatomy_reader1.nii,,train/028/empty.nii,,


### Columns' description&ast;
* **ID** – patient MRI identifier
* **t2** – T2W sequence in NIfTI format
* **adc** – ADC sequence in NIfTI format
* **dwi** – DWI sequence in NIfTI format
* **t2_anatomy_reader1** – Anatomy segmentation of reader 1
* **t2_anatomy_reader2** – Anatomy segmentation of reader 2&ast;&ast;
* **t2_tumor_reader1** – Tumor segmentation of reader 1 on the T2W sequence
* **adc_tumor_reader1** – Tumor segmentation of reader 1 on the ADC sequence
* **adc_tumor_reader2** – Tumor segmentation of reader 2 on the ADC sequence

<details>
    <summary>Abbreviations</summary>
    - <a href="https://radiopaedia.org/articles/apparent-diffusion-coefficient-1">What is Apparent diffusion coefficient?</a>
    - <a href="https://radiopaedia.org/articles/diffusion-weighted-imaging-2">What is Diffusion-weighted imaging?</a>
</details>

###### &ast;Based on https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9679750/pdf/main.pdf, &ast;&ast;own caption


### About half of the training cases seem not to have cancerous lesions

In [225]:
# Per-split overview of missing values: for every column, the number of NaN
# cells and that number as a share of the split's rows, most-missing first.
splits = [('Train Set', train_df), ('Training Validation Set', valid_df), ('Test Set', test_df)]
for label, df in splits:
    na_vals = df.isna().sum()
    # Format the fraction of missing rows as a percentage with two decimals.
    percent_missing = (na_vals / len(df)).map('{:.2%}'.format)
    summary = pd.DataFrame({'count missing': na_vals, 'percent_missing': percent_missing})
    print(label)
    print(summary.sort_values(by='count missing', ascending=False))
    print('\n\n')

Train Set
                    count missing percent_missing
t2_anatomy_reader2            119         100.00%
adc_tumor_reader2              62          52.10%
t2_tumor_reader1               49          41.18%
ID                              0           0.00%
t2                              0           0.00%
adc                             0           0.00%
dwi                             0           0.00%
t2_anatomy_reader1              0           0.00%
adc_tumor_reader1               0           0.00%



Training Validation Set
                    count missing percent_missing
t2_anatomy_reader2             20         100.00%
adc_tumor_reader2              10          50.00%
t2_tumor_reader1                8          40.00%
ID                              0           0.00%
t2                              0           0.00%
adc                             0           0.00%
dwi                             0           0.00%
t2_anatomy_reader1              0           0.00%
adc_tumor_rea

### Samples visualisation

In [186]:
# Show the training manifest; pandas abbreviates the middle rows of a long
# frame when it is rendered as the cell output.
train_df

# NOTE(review): the draft below is disabled and would not run as written
# against the cells visible in this notebook:
#   - `os` is not imported in any visible cell,
#   - `train_imgs_dir` is not defined in any visible cell,
#   - `mri_files` holds the bare names returned by os.listdir, so
#     nib.load(mri_files[0]) would be resolved against the working directory
#     instead of the case folder (`path` is computed but never used).
# TODO: either finish the per-case mosaic plot or remove the draft.
# train_cases_dir = os.listdir(train_imgs_dir)
# train_cases_dir.sort()
# for case in train_cases_dir[:1]:
#     path = os.path.join(train_imgs_dir, case)
#     mri_files = os.listdir(f'{train_imgs_dir}/{case}')
#     img = nib.load(mri_files[0])
#     img_data = img.get_fdata()
#     plotting.plot_img(img, display_mode='mosaic', cmap='gray')
#     plt.show()



Unnamed: 0,ID,t2,adc,dwi,t2_anatomy_reader1,t2_tumor_reader1,adc_tumor_reader1,t2_anatomy_reader2,adc_tumor_reader2
0,24,train/024/t2.nii.gz,train/024/adc.nii.gz,train/024/dwi.nii.gz,train/024/t2_anatomy_reader1.nii.gz,,train/024/empty.nii.gz,,
1,25,train/025/t2.nii.gz,train/025/adc.nii.gz,train/025/dwi.nii.gz,train/025/t2_anatomy_reader1.nii.gz,train/025/t2_tumor_reader1.nii.gz,train/025/adc_tumor_reader1.nii.gz,,train/025/adc_tumor_reader2.nii.gz
2,26,train/026/t2.nii.gz,train/026/adc.nii.gz,train/026/dwi.nii.gz,train/026/t2_anatomy_reader1.nii.gz,,train/026/empty.nii.gz,,
3,27,train/027/t2.nii.gz,train/027/adc.nii.gz,train/027/dwi.nii.gz,train/027/t2_anatomy_reader1.nii.gz,train/027/t2_tumor_reader1.nii.gz,train/027/adc_tumor_reader1.nii.gz,,train/027/adc_tumor_reader2.nii.gz
4,28,train/028/t2.nii.gz,train/028/adc.nii.gz,train/028/dwi.nii.gz,train/028/t2_anatomy_reader1.nii.gz,,train/028/empty.nii.gz,,
...,...,...,...,...,...,...,...,...,...
114,154,train/154/t2.nii.gz,train/154/adc.nii.gz,train/154/dwi.nii.gz,train/154/t2_anatomy_reader1.nii.gz,train/154/t2_tumor_reader1.nii.gz,train/154/adc_tumor_reader1.nii.gz,,train/154/adc_tumor_reader2.nii.gz
115,155,train/155/t2.nii.gz,train/155/adc.nii.gz,train/155/dwi.nii.gz,train/155/t2_anatomy_reader1.nii.gz,,train/155/empty.nii.gz,,
116,156,train/156/t2.nii.gz,train/156/adc.nii.gz,train/156/dwi.nii.gz,train/156/t2_anatomy_reader1.nii.gz,train/156/t2_tumor_reader1.nii.gz,train/156/adc_tumor_reader1.nii.gz,,
117,157,train/157/t2.nii.gz,train/157/adc.nii.gz,train/157/dwi.nii.gz,train/157/t2_anatomy_reader1.nii.gz,train/157/t2_tumor_reader1.nii.gz,train/157/adc_tumor_reader1.nii.gz,,train/157/adc_tumor_reader2.nii.gz
