# Data preparation and preprocessing 👊

In [None]:
import pydicom, collections, cv2
import random, os, time, json, glob

from sklearn.model_selection import StratifiedKFold
from sklearn import model_selection as sk_model_selection
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM, BatchNormalization
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds

import PIL
# import PIL.Image
from PIL import Image

In [None]:
# Load the competition's training label table. Later cells read its
# "BraTS21ID" and "MGMT_value" columns.
train_df = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/train_labels.csv")
# Bare expression: displayed as the cell output when run in a notebook.
train_df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many rows of train_df carry each MGMT_value.
plt.figure(figsize=(5, 5))
sns.countplot(data=train_df, x="MGMT_value");

In [None]:
def load_dicom(path):
    """Read one DICOM file and return its pixels rescaled to 8-bit.

    Args:
        path: Location of the DICOM file to read.

    Returns:
        A ``numpy.ndarray`` of dtype ``uint8`` with the same shape as the
        file's pixel array. Values are min-max scaled to the 0..255 range;
        an image whose pixels are all equal comes back as all zeros.
    """
    # dcmread is pydicom's current reader; read_file is its deprecated alias.
    dicom = pydicom.dcmread(path)
    # Work in float64 so that subtracting the minimum cannot wrap around
    # when the stored pixels use a narrow signed integer type.
    data = dicom.pixel_array.astype(np.float64)
    data = data - np.min(data)
    peak = np.max(data)
    # A constant image has peak == 0 after the shift; skip the division.
    if peak != 0:
        data = data / peak
    return (data * 255).astype(np.uint8)


def visualize_sample(
    brats21id,
    slice_i,
    mgmt_value,
    types=("FLAIR", "T1w", "T1wCE", "T2w")
):
    """Show one slice of every requested scan type for a single patient.

    Args:
        brats21id: Patient identifier; it is zero-padded to five characters
            to form the patient's folder name.
        slice_i: Relative position of the slice inside each series. It is
            multiplied by the number of files in the series to get an index
            (0.5 picks the middle file). Values that would run past the end
            are clamped to the last file.
        mgmt_value: Label printed in the figure title.
        types: Names of the scan-type sub-folders to display, one panel each.

    Returns:
        None. The figure is drawn with matplotlib and shown.
    """
    plt.figure(figsize=(16, 5))
    patient_path = os.path.join(
        "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/",
        str(brats21id).zfill(5),
    )
    # One column per requested scan type instead of a fixed four.
    n_panels = len(types)
    for i, t in enumerate(types, 1):
        # Order files by the integer after the last "-" in the path, with the
        # final four characters (presumably ".dcm" - confirm) stripped first.
        t_paths = sorted(
            glob.glob(os.path.join(patient_path, t, "*")),
            key=lambda x: int(x[:-4].split("-")[-1]),
        )
        plt.subplot(1, n_panels, i)
        if t_paths:
            # Clamp so that slice_i == 1.0 selects the last file rather than
            # indexing one past the end.
            slice_index = min(int(len(t_paths) * slice_i), len(t_paths) - 1)
            data = load_dicom(t_paths[slice_index])
            plt.imshow(data, cmap="gray")
        # An empty series leaves its panel blank but still titled.
        plt.title(f"{t}", fontsize=16)
        plt.axis("off")

    plt.suptitle(f"MGMT_value: {mgmt_value}", fontsize=16)
    plt.show()

In [None]:
# Load the competition's sample submission file for inspection.
submission = pd.read_csv("../input/rsna-miccai-brain-tumor-radiogenomic-classification/sample_submission.csv")
# Writing it back out unchanged is currently disabled.
# submission.to_csv("submission.csv", index=False)
submission

In [None]:
import pathlib

# Build an image dataset from the PNG training folder. Labels are inferred
# from the sub-directory names, and only the "training" part of a seeded
# 80/20 split is kept. Image size and batch size are left at their defaults.
path = "../input/rsna-miccai-png/train"
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory=path,
    labels="inferred",
    validation_split=0.2,
    subset="training",
    seed=123,
)


In [None]:
# Class names that train_ds reports (it was created with labels='inferred').
class_names = train_ds.class_names
print(class_names)

In [None]:
# Preview the first nine images of one train_ds batch in a 3x3 grid,
# each titled with the class name its label maps to.
plt.figure(figsize=(10, 10))
first_batch = train_ds.take(1)
for images, labels in first_batch:
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        tile = images[i].numpy().astype("uint8")
        caption = class_names[labels[i]]
        plt.imshow(tile)
        plt.title(caption)
        plt.axis("off")

In [None]:
# Same dataset construction as for the full training folder, but rooted at
# patient 00000, so the inferred labels come from that folder's sub-directories.
# Image size and batch size are left at their defaults.
path2 = "../input/rsna-miccai-png/train/00000"
img_type = tf.keras.preprocessing.image_dataset_from_directory(
    directory=path2,
    labels="inferred",
    validation_split=0.2,
    subset="training",
    seed=123,
)

In [None]:
# Preview the first nine images of one img_type batch in a 3x3 grid.
# Titles must come from img_type's own class names: the global class_names
# was taken from train_ds, whose label indices refer to different folders.
type_names = img_type.class_names
plt.figure(figsize=(10, 10))
for images, labels in img_type.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(images[i].numpy().astype("uint8"))
        plt.title(type_names[labels[i]])
        plt.axis("off")

In [None]:
# Build a table with one row per patient: id, MGMT label, and the number of
# directory entries in each of the four scan-type folders.
root_dir = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/"
types = ["FLAIR","T1w","T1wCE","T2w"]
_new = []

for patient_id, mgmt in zip(train_df["BraTS21ID"], train_df["MGMT_value"]):
    # Patient folders are named with the id zero-padded to five characters.
    patient_dir = root_dir + str(patient_id).zfill(5)
    counts = [
        len(os.listdir(os.path.join(patient_dir, scan_type)))
        for scan_type in types
    ]
    _new.append([patient_id, mgmt, counts[0], counts[1], counts[2], counts[3]])

new_df = pd.DataFrame(_new)
new_df.columns = ["BraTS21Id", "MGMT_value","FLAIR", "T1w", "T1wCE", "T2w"]

new_df

## ✋ I found that the sizes differ from patient to patient ✋

In [None]:
# Open one FLAIR slice of patient 00000 and show its size.
# PIL reports size as a (width, height) tuple.
img_1_png = Image.open("../input/rsna-miccai-png/train/00000/FLAIR/Image-100.png")
img_1_png.size

In [None]:
# Open one T1wCE slice of patient 00137 and show its size.
# PIL reports size as a (width, height) tuple.
img_2_png = Image.open("../input/rsna-miccai-png/train/00137/T1wCE/Image-29.png")
img_2_png.size

## 00000 shape is (512,512)

In [None]:
# Collect the image sizes of every PNG under patient 00000 and print the
# distinct values.
path = "../input/rsna-miccai-png/train/00000"
sizes = []
folders = os.listdir(path)
for folder in folders:
    folder_path = os.path.join(path, folder)
    names = os.listdir(folder_path)
    # One size tuple per file in this sub-folder.
    sizes.extend(
        Image.open(os.path.join(folder_path, name)).size for name in names
    )
# Runs after the loop, so only the last sub-folder's listing is printed.
print(names)

print(set(sizes))

## 00137 shape is (256,256)

In [None]:
# Collect the image sizes of every PNG under patient 00137 and print the
# distinct values.
path = "../input/rsna-miccai-png/train/00137"
sizes = []
folders = os.listdir(path)
for folder in folders:
    folder_path = os.path.join(path, folder)
    names = os.listdir(folder_path)
    # One size tuple per file in this sub-folder.
    sizes.extend(
        Image.open(os.path.join(folder_path, name)).size for name in names
    )
# Runs after the loop, so only the last sub-folder's listing is printed.
print(names)

print(set(sizes))

In [None]:
# Build a table with one row per patient: id, MGMT label, and the number of
# directory entries in each of the four scan-type folders.
root_dir = "../input/rsna-miccai-brain-tumor-radiogenomic-classification/train/"
types = ["FLAIR","T1w","T1wCE","T2w"]
_new = []

for patient_id, mgmt in zip(train_df["BraTS21ID"], train_df["MGMT_value"]):
    # Patient folders are named with the id zero-padded to five characters.
    patient_dir = root_dir + str(patient_id).zfill(5)
    counts = [
        len(os.listdir(os.path.join(patient_dir, scan_type)))
        for scan_type in types
    ]
    _new.append([patient_id, mgmt, counts[0], counts[1], counts[2], counts[3]])

new_df = pd.DataFrame(_new)
new_df.columns = ["BraTS21Id", "MGMT_value","FLAIR", "T1w", "T1wCE", "T2w"]

new_df


## Check the image shapes per patient ✔

In [None]:
 if __name__ == "__main__":
     # Walk the PNG training tree and write one CSV row per image:
     # path, height, width, and a "(WxH)" size string.
     root_dir = "../input/rsna-miccai-png/train"
     sizes = []  # not used in this cell; kept so the notebook global is still reset
     csv_path = "imagetype.csv"
     # Open the output once for the whole walk instead of reopening it in
     # append mode for every image.
     with open(csv_path, 'w') as f:
         f.write('path,height,width,size\n')
         for (root, dirs, files) in os.walk(root_dir):
             print("# root : " + root)

             for file_name in files:
                 png_path = os.path.join(root, file_name)
                 # PIL's Image.size is (width, height); unpack it by name so
                 # each value lands under the matching CSV header. The
                 # context manager closes the image file right away.
                 with Image.open(png_path) as im:
                     width, height = im.size
                 f.write(f'{png_path},{height},{width},({width}x{height})\n')



## ⚠ Because there are so many files, loading them took too much time 📊
## There are over 250k images, so I decided to write the results to a CSV file 💾 

In [None]:
# Read imagetype.csv back and show its (rows, columns) shape.
imagetype_csv = pd.read_csv('./imagetype.csv')
imagetype_csv.shape

## Load the CSV file into a DataFrame 

In [None]:
# Load the same CSV into image_df. index_col=False tells pandas not to use
# the first column as the index.
image_df = pd.read_csv('./imagetype.csv',index_col = False)
image_df

## Check height and width image sizes

In [None]:
# Number of rows for each distinct value in the 'height' column.
image_df['height'].value_counts()

In [None]:
# Number of rows for each distinct value in the 'width' column.
image_df['width'].value_counts() 

In [None]:
# Number of rows for each distinct value in the 'size' column.
image_df['size'].value_counts()

### About 50% are (512,512), 20% are (192,256), and about 20% are (256,256)

In [None]:
 if __name__ == "__main__":
     # Walk the PNG test tree and write one CSV row per image:
     # path, height, width, and a "(WxH)" size string.
     root_dir = "../input/rsna-miccai-png/test"
     sizes = []  # not used in this cell; kept so the notebook global is still reset
     csv_path = "imagetype_test.csv"
     # Open the output once for the whole walk instead of reopening it in
     # append mode for every image.
     with open(csv_path, 'w') as f:
         f.write('path,height,width,size\n')
         for (root, dirs, files) in os.walk(root_dir):
             print("# root : " + root)

             for file_name in files:
                 png_path = os.path.join(root, file_name)
                 # PIL's Image.size is (width, height); unpack it by name so
                 # each value lands under the matching CSV header. The
                 # context manager closes the image file right away.
                 with Image.open(png_path) as im:
                     width, height = im.size
                 f.write(f'{png_path},{height},{width},({width}x{height})\n')



In [None]:
# Load the test-set size table. index_col=False tells pandas not to use the
# first column as the index.
image_test_df = pd.read_csv('./imagetype_test.csv',index_col = False)
image_test_df

In [None]:
# Number of rows for each distinct value in the test table's 'size' column.
image_test_df['size'].value_counts()

## I need to confirm which image sizes are best for extracting features from the images

## Ongoing: image resizing