### Import thư viện

In [None]:
%matplotlib inline
import numpy as np 
import pandas as pd
from pydicom import dcmread
import os
import scipy.ndimage
import matplotlib.pyplot as plt
from supporters import *
from PIL import Image
import SimpleITK as sitk
import nibabel as nib
import tensorflow as tf
import pickle

### 1. Đọc và tiền xử lý ảnh file dcm

In [None]:
def load_scan(path):
    """Read every DICOM slice in a directory and return them ordered along z.

    Parameters
    ----------
    path : str
        Directory that holds the slice files of one scan.

    Returns
    -------
    list
        Datasets returned by ``dcmread``, sorted by ascending
        ``ImagePositionPatient[2]``. ``SliceThickness`` of every dataset is
        overwritten with the distance between the first two sorted slices.

    Raises
    ------
    IndexError
        If fewer than two readable DICOM files are found, because the slice
        spacing cannot be derived.
    """
    # Local import: pydicom is already a dependency of this notebook.
    from pydicom.errors import InvalidDicomError

    slices = []
    for name in os.listdir(path):
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path):
            continue  # sub-folders cannot be parsed as slices
        try:
            slices.append(dcmread(file_path))
        except InvalidDicomError:
            # Non-DICOM files (e.g. an exported .nii.gz) may sit next to the
            # slices; skip them instead of aborting the whole scan.
            continue

    slices.sort(key=lambda s: float(s.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(
            slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2]
        )
    except (AttributeError, TypeError):
        # Fall back to SliceLocation only when the position tag is unusable.
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
    for s in slices:
        s.SliceThickness = slice_thickness
    return slices


def get_pixels_hu(slices):
    """Stack the slices' pixel data and rescale it with each slice's slope/intercept.

    Parameters
    ----------
    slices : sequence
        Objects exposing ``pixel_array``, ``RescaleSlope`` and
        ``RescaleIntercept``.

    Returns
    -------
    numpy.ndarray
        int16 array with one entry per slice along the first axis.
    """
    volume = np.stack([s.pixel_array for s in slices]).astype(np.int16)
    # Raw value -2000 is reset to 0 before the rescale is applied.
    volume[volume == -2000] = 0
    for idx, dcm in enumerate(slices):
        slope = dcm.RescaleSlope
        if slope != 1:
            scaled = slope * volume[idx].astype(np.float64)
            volume[idx] = scaled.astype(np.int16)
        volume[idx] += np.int16(dcm.RescaleIntercept)
    return np.array(volume, dtype=np.int16)

##### Load folder tất cả bệnh nhân và load 1 bệnh nhân 

In [None]:
# Root folder of the data set; every directory entry is treated as one patient.
INPUT_FOLDER = '../../data/PatientsDCM/'
# Sorted so that positional access (patients[0], ...) is deterministic.
patients = sorted(os.listdir(INPUT_FOLDER))

In [None]:
# Load the first entry of the sorted patient list and convert it to a rescaled int16 volume.
PAT001_scan = load_scan(INPUT_FOLDER + patients[0])
PAT001 = get_pixels_hu(PAT001_scan)
# Cell output: shape of the stacked volume.
PAT001.shape

In [None]:
# Browse the volume; explore_3D_array is presumably provided by `from supporters import *` — verify.
explore_3D_array(PAT001)

### 2. Thực hiện Logistic Regression để loại bỏ những tấm ảnh gây nhiễu (không chứa tim)

In [None]:
# Labels for the slice classifier trained below.
# NOTE(review): presumably one 0/1 row per slice, in the same order in which X is assembled — verify.
y = pd.read_csv('../../data/PatientsDCM/Postprocessing/binary_training_label.csv')
y

In [None]:
# Load the slices of the first 16 patients and stack them into one array (source of X_train / X_test)
volumes = []
for patient_idx in range(16):
    scan = load_scan(INPUT_FOLDER + patients[patient_idx])
    volumes.append(get_pixels_hu(scan))

# One entry per 2-D slice, patients concatenated in list order.
X = np.array([slice_2d for volume in volumes for slice_2d in volume])
X.shape

In [None]:
# Min-max normalisation of the whole data set to [0, 1]
X_min = float(np.min(X))
X_max = float(np.max(X))

# Convert to float BEFORE subtracting: X is int16, and `X - X_min` evaluated
# in int16 silently wraps around when the value range exceeds 32767.
X = X.astype(np.float64)
X -= X_min
X /= (X_max - X_min)
np.min(X), np.max(X)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33 % of the slices; random_state pins the shuffle so the split is reproducible.
# NOTE(review): the split is per slice, so slices of one patient can end up in both sets — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Cell output: shapes of the four partitions.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#### 2.1 Huấn luyện mô hình với Logistic Regression của scikit-learn

##### 2.1.1 Chiếu dữ liệu lên mặt phẳng 2D (PCA) để xem mức độ phân tách của dữ liệu

In [None]:
from sklearn.decomposition import PCA

# Flatten every slice to one row; note that this rebinds the global X to the 2-D array.
X = X.reshape(X.shape[0], -1)
# Project onto the first two principal components for a visual separability check.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

# plot the transformed data with different colors for different labels
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.show()

In [None]:
# Convert the label table to a NumPy array; y_train / y_test from the earlier split are not affected.
y = np.array(y)

In [None]:
from sklearn.linear_model import LogisticRegression

# Reshape the data: one flattened row of pixels per slice
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)

# Create a logistic regression object (library default hyper-parameters)
logreg = LogisticRegression()

# Train the model
# NOTE(review): y_train still comes from the DataFrame read with pd.read_csv; scikit-learn
# presumably prefers a 1-D label array here — verify.
logreg.fit(X_train, y_train)

# Predict the test set results (hard class labels, not probabilities)
y_pred = logreg.predict(X_test)

In [None]:
# Evaluation
from sklearn.metrics import accuracy_score

# Calculate accuracy: fraction of test slices whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)

def log_loss(y_true, y_pred, eps=1e-15):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (0 or 1); any shape, flattened internally.
    y_pred : array-like
        Predicted probability of class 1 for every sample; flattened
        internally.
    eps : float, optional
        Probabilities are clipped to ``[eps, 1 - eps]`` so that ``log`` never
        receives 0, which would turn the loss into ``inf``/``nan``.

    Returns
    -------
    float
        The averaged negative log-likelihood.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of samples.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Flatten both inputs: a column vector combined with a 1-D vector would
    # otherwise broadcast to an (n, n) matrix and inflate the sum.
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError("y_true and y_pred must contain the same number of samples")
    y_pred = np.clip(y_pred, eps, 1 - eps)
    n = y_true.size
    loss = -1/n * np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
    return loss

# Calculate loss
# Cross-entropy is defined on probabilities; the hard 0/1 labels in y_pred
# would feed log(0) into the formula. Column 1 of predict_proba is taken as
# the probability of the positive class (assumes labels are 0/1 — verify).
y_proba = logreg.predict_proba(X_test)[:, 1]
loss = log_loss(np.ravel(y_test), y_proba)

accuracy, loss

In [None]:

import numpy as np
import matplotlib.pyplot as plt

def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise."""
    return 1/(1 + np.exp(-x))


# Dedicated variable names: re-using `x` / `y` here would overwrite the label
# array `y` that later training cells pass to train_test_split.
sigmoid_x = np.linspace(-6, 6, 100)
sigmoid_y = sigmoid(sigmoid_x)
plt.plot(sigmoid_x, sigmoid_y)
plt.show()

In [None]:
import numpy as np

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
    return 1 / (1 + np.exp(-z))


def logistic_regression(X, y, alpha=0.01, num_iterations=100, threshold=0.4):
    """Fit binary logistic regression with full-batch gradient descent.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (include a column of ones if a bias term is wanted).
    y : array-like
        m labels (0 or 1); 1-D and column-vector inputs are both accepted.
    alpha : float, optional
        Learning rate.
    num_iterations : int, optional
        Number of gradient steps.
    threshold : float, optional
        Probability from which a sample is counted as class 1 when the
        training accuracy is tracked.

    Returns
    -------
    theta : numpy.ndarray, shape (n, 1)
        Learned parameters.
    accuracy : list of float
        Training accuracy measured before each update.
    log_loss : list of float
        Cross-entropy measured before each update.

    Raises
    ------
    ValueError
        If ``y`` does not hold exactly one label per row of ``X``.
    """
    X = np.asarray(X, dtype=np.float64)
    # Force a column vector: a 1-D y would broadcast `h - y` to (m, m).
    y = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError("y must contain exactly one label per row of X")

    theta = np.zeros((n, 1))
    accuracy = []
    log_loss = []
    eps = 1e-15  # keeps log() away from 0 once the sigmoid saturates
    for _ in range(num_iterations):
        h = sigmoid(np.dot(X, theta))
        h_safe = np.clip(h, eps, 1 - eps)
        J = (-1/m) * np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe))
        gradient = (1/m) * np.dot(X.T, (h - y))
        theta -= alpha * gradient
        y_pred = (h >= threshold).astype(int)
        accuracy.append(np.mean(y_pred == y))
        log_loss.append(J)
    return theta, accuracy, log_loss

# Reshape the data: one flattened row per slice. Separate variables are used
# on purpose; re-assigning X_train / X_test would stack one more bias column
# every time a training cell is (re-)run.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)


def _with_bias(features):
    """Prepend a column of ones unless the first column already is all ones."""
    if features.shape[1] and np.all(features[:, 0] == 1):
        return features
    return np.hstack((np.ones((features.shape[0], 1)), features))


# Add bias term to X
X_train_bias = _with_bias(X_train_flat)
X_test_bias = _with_bias(X_test_flat)

# Train the model
theta, accuracy, log_loss = logistic_regression(X_train_bias, y_train)

# Plot the accuracy and loss curves
import matplotlib.pyplot as plt

plt.plot(accuracy)
plt.title('Đường biểu diễn độ chính xác')
plt.xlabel('Số lần chạy')
plt.ylabel('Chỉ số chính xác')
plt.show()

plt.plot(log_loss)
plt.title('Đường biểu diễn hàm mất mát log')
plt.xlabel('Số lần chạy')
plt.ylabel('Chỉ số mất mát')
plt.show()

In [None]:
# Number of learned parameters: one per column of the design matrix passed to logistic_regression.
len(theta)

In [None]:
# Presumably the expected parameter count: 512 x 512 pixels plus one bias term — compare with len(theta).
512 * 512 + 1

In [None]:
import numpy as np

def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), applied element-wise."""
    return 1 / (1 + np.exp(-z))


def logistic_regression(X, y, alpha=0.01, num_iterations=1000, threshold=0.5):
    """Fit binary logistic regression with full-batch gradient descent.

    Parameters
    ----------
    X : array-like, shape (m, n)
        Design matrix (include a column of ones if a bias term is wanted).
    y : array-like
        m labels (0 or 1); 1-D and column-vector inputs are both accepted.
    alpha : float, optional
        Learning rate.
    num_iterations : int, optional
        Number of gradient steps.
    threshold : float, optional
        Probability from which a sample is counted as class 1 when the
        training accuracy is tracked.

    Returns
    -------
    theta : numpy.ndarray, shape (n, 1)
        Learned parameters.
    accuracy : list of float
        Training accuracy measured before each update.
    log_loss : list of float
        Cross-entropy measured before each update.

    Raises
    ------
    ValueError
        If ``y`` does not hold exactly one label per row of ``X``.
    """
    X = np.asarray(X, dtype=np.float64)
    # Force a column vector: a 1-D y would broadcast `h - y` to (m, m).
    y = np.asarray(y, dtype=np.float64).reshape(-1, 1)
    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError("y must contain exactly one label per row of X")

    theta = np.zeros((n, 1))
    accuracy = []
    log_loss = []
    eps = 1e-15  # keeps log() away from 0 once the sigmoid saturates
    for _ in range(num_iterations):
        h = sigmoid(np.dot(X, theta))
        h_safe = np.clip(h, eps, 1 - eps)
        J = (-1/m) * np.sum(y * np.log(h_safe) + (1 - y) * np.log(1 - h_safe))
        gradient = (1/m) * np.dot(X.T, (h - y))
        theta -= alpha * gradient
        y_pred = (h >= threshold).astype(int)
        accuracy.append(np.mean(y_pred == y))
        log_loss.append(J)
    return theta, accuracy, log_loss

# Reshape the data: one flattened row per slice. Separate variables are used
# on purpose; X_train / X_test may already carry a bias column from an earlier
# training cell, and re-assigning them would stack another one.
X_train_flat = X_train.reshape(X_train.shape[0], -1)
X_test_flat = X_test.reshape(X_test.shape[0], -1)


def _with_bias(features):
    """Prepend a column of ones unless the first column already is all ones."""
    if features.shape[1] and np.all(features[:, 0] == 1):
        return features
    return np.hstack((np.ones((features.shape[0], 1)), features))


# Add bias term to X
X_train_bias = _with_bias(X_train_flat)
X_test_bias = _with_bias(X_test_flat)

# Train the model
theta, accuracy, log_loss = logistic_regression(X_train_bias, y_train)

# Plot the accuracy and loss curves
import matplotlib.pyplot as plt

plt.plot(accuracy)
plt.title('Đường biểu diễn độ chính xác')
plt.xlabel('Số lần chạy')
plt.ylabel('Chỉ số chính xác')
plt.show()

plt.plot(log_loss)
plt.title('Đường biểu diễn hàm mất mát log')
plt.xlabel('Số lần chạy')
plt.ylabel('Chỉ số mất mát')
plt.show()

In [None]:
# Inspect the parameter vector returned by logistic_regression.
theta

In [None]:
# Flatten each slice of PAT001 into one feature row for the classifier.
# NOTE(review): logreg was fitted on min-max scaled X, while PAT001 holds the unscaled
# output of get_pixels_hu — confirm whether the same scaling must be applied here.
image = PAT001
image = image.reshape(image.shape[0],-1)
image.shape

In [None]:
import pickle

# Save the model to disk and reload
filename = '../../model/training/models/logreg_model.sav'
# Context managers make sure both file handles are flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(logreg, model_file)
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
# Predict one label per flattened slice with the model reloaded from disk.
y_pred = loaded_model.predict(image)
y_pred

In [None]:
# Locate the first contiguous run of slices predicted as 1: [start, end)
start = end = 0
n_slices = len(y_pred)

for index, value in enumerate(y_pred):
    if value == 1:
        start = end = index
        # Bounds check first: the run may extend to the very last slice.
        while end < n_slices and y_pred[end] == 1:
            end += 1
        break

start, end

#### Tạo pipeline hoàn chỉnh

In [None]:
def filtering(image, model=None):
    """Keep only the first contiguous run of slices that the classifier labels 1.

    Parameters
    ----------
    image : numpy.ndarray
        Volume whose first axis enumerates the slices.
    model : object, optional
        Fitted classifier exposing ``predict``. When omitted, the pickled
        model at '../../model/training/models/logreg_model.sav' is loaded.
        Passing it avoids re-reading the file on every call.

    Returns
    -------
    numpy.ndarray
        ``image[start:end]``; empty along the first axis when no slice is
        labelled 1.

    Notes
    -----
    The slices are handed to the model exactly as received. NOTE(review): if
    the model was trained on scaled data, the caller presumably has to apply
    the same scaling first — verify.
    """
    if model is None:
        # Load model
        filename = '../../model/training/models/logreg_model.sav'
        # NOTE: unpickling executes code; only load model files you trust.
        with open(filename, 'rb') as model_file:
            model = pickle.load(model_file)

    # Predict
    flat = image.reshape(image.shape[0], -1)
    y_pred = np.asarray(model.predict(flat)).ravel()

    # filtering
    positives = np.flatnonzero(y_pred == 1)
    if positives.size == 0:
        return image[0:0]
    start = int(positives[0])
    # The run ends at the first non-1 label after `start`, or at the end of
    # the volume when the run extends to the last slice.
    negatives_after = np.flatnonzero(y_pred[start:] != 1)
    end = start + int(negatives_after[0]) if negatives_after.size else y_pred.size
    return image[start:end]

# Replace PAT001 with the slice range kept by filtering(); the shape shows how many slices remain.
PAT001 = filtering(PAT001)
PAT001.shape

In [None]:
# The number of noisy slices is greatly reduced
explore_3D_array(PAT001)

#### 2.2 Huấn luyện mô hình với Tensorflow

In [None]:
# Normalize data with a fixed window of [-1024, 1024]
# NOTE(review): X was already min-max scaled to [0, 1] in the earlier normalization cell;
# applying this formula on top of that maps everything into roughly [0.5, 0.5005].
# Presumably the raw values should be reloaded before this cell runs — confirm.
X_max = 1024
X_min = -1024

X = (X - X_min) / (X_max - X_min)

In [None]:
from sklearn.model_selection import train_test_split
# First split: 80 % training data, 20 % held out.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Second split: 10 % of the held-out part becomes the validation set, the rest stays as test set.
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.1, random_state=42)

# Cell output: shapes of all six partitions.
X_train.shape, X_test.shape, X_val.shape, y_train.shape, y_test.shape, y_val.shape

In [None]:
# Reshape the data: one flattened row per slice
X_train = X_train.reshape(X_train.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1) # Reshape the validation data

# Define the model: two hidden ReLU layers and a sigmoid output for the binary label
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model and save the history.
# Only validation_data is passed: Keras ignores validation_split whenever
# validation_data is given, so supplying both was misleading.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val))

In [None]:
import matplotlib.pyplot as plt

# Left panel: training curves; right panel: validation curves.
plt.figure(figsize=(12,4))
plt.subplot(1,2,1)
plt.title('Tập huấn luyện')
plt.plot(history.history['accuracy'], label='Độ chính xác')
plt.plot(history.history['loss'], label='Độ mất mát')
plt.xlabel('Epoch')
plt.legend(loc='upper right')

plt.subplot(1,2,2)
plt.title('Tập đánh giá')
plt.plot(history.history['val_accuracy'], label='Độ chính xác')
plt.plot(history.history['val_loss'], label='Độ mất mát')
plt.xlabel('Epoch')
# The labels above are only rendered when a legend is requested for this panel too.
plt.legend(loc='upper right')

plt.show()

In [None]:
# Persist the trained Keras model to an .h5 file.
model.save('../../model/training/models/binary_tensor.h5')

In [None]:
# Reload the model from the file written above.
load_model_tensor = tf.keras.models.load_model('../../model/training/models/binary_tensor.h5')

In [None]:
# Flatten into a separate variable: PAT001 itself must stay 3-D because the
# circular-crop step below indexes PAT001.shape[2] and PAT001[index][x, y].
PAT001_flat = PAT001.reshape(PAT001.shape[0], -1)
y_pred = load_model_tensor.predict(PAT001_flat)
len(y_pred)

### 3. Cắt tròn với phương trình đường tròn và lưu

In [None]:
# Radius (in pixels) of the circular region that is kept.
ban_kinh = 190
# Centre of the circle; 512/2 presumably assumes 512 x 512 slices — verify.
a = b = 512/2

# Bounding box of the circle, used to crop every slice after masking.
left = top = a - ban_kinh
right = bottom = a + ban_kinh

def euclidian_distance(x, y, a, b):
    """Return the straight-line distance between the points (x, y) and (a, b)."""
    dx = x - a
    dy = y - b
    return (dx**2 + dy**2) ** 0.5

crop_imgs = []
def circling(volume=None, center=None, radius=None):
    """Mask everything outside a circle with -2000 and crop to its bounding box.

    The cropped slices are appended to the module-level list ``crop_imgs``.
    ``volume`` is modified in place (pixels outside the circle become -2000)
    and the appended crops are views into it.

    Parameters
    ----------
    volume : numpy.ndarray, optional
        3-D array with slices along the first axis; defaults to the global
        ``PAT001``.
    center : tuple of float, optional
        ``(row, column)`` centre of the circle; defaults to the globals
        ``(a, b)``.
    radius : float, optional
        Circle radius in pixels; defaults to the global ``ban_kinh``.
    """
    if volume is None:
        volume = PAT001
    cx, cy = (a, b) if center is None else center
    if radius is None:
        radius = ban_kinh

    # One boolean mask for all slices replaces the per-pixel Python loop.
    # Squared distances are compared so no square root is needed.
    rows, cols = np.ogrid[:volume.shape[1], :volume.shape[2]]
    outside = (rows - cx) ** 2 + (cols - cy) ** 2 > radius ** 2
    volume[:, outside] = -2000

    top, bottom = int(cx - radius), int(cx + radius)
    left, right = int(cy - radius), int(cy + radius)
    for img in volume:
        crop_imgs.append(img[top:bottom, left:right])
        
# Fill crop_imgs, then turn the list of cropped slices into one array.
circling()
crop_imgs = np.array(crop_imgs)
crop_imgs.shape

In [None]:
# From here on PAT001 refers to the masked and cropped volume.
PAT001 = crop_imgs
explore_3D_array(PAT001)

In [None]:
# Show a single slice; index 100 assumes the volume has at least 101 slices — verify for other patients.
plt.imshow(PAT001[100], cmap="gray")

##### Lưu lại dưới định dạng .nii.gz

In [None]:
# NOTE(review): this writes the .nii.gz into the PAT001 folder, which load_scan() lists with
# os.listdir and passes file by file to dcmread — confirm a non-DICOM file there is acceptable.
out_path = '../../data/PatientsDCM/PAT001/PAT001.nii.gz'
converted_array = np.array(PAT001, dtype=np.float32)
# Reverse the axis order before saving; the array read back with SimpleITK below is
# compared against PAT001 to check the round trip.
converted_array = np.transpose(converted_array, (2, 1, 0))

# Identity affine: no spacing or orientation information is stored in the file.
affine = np.eye(4)
nifti_file = nib.Nifti1Image(converted_array, affine)
nib.save(nifti_file, out_path)

# reread to check
raw_img_sitk = sitk.ReadImage(out_path, sitk.sitkFloat32)
raw_img_sitk = sitk.GetArrayFromImage(raw_img_sitk)
print(f'Shape of numpy array: {raw_img_sitk.shape}')

In [None]:
# Side-by-side check of the re-read file against the in-memory volume
# (explore_3D_array_comparison presumably comes from `supporters` — verify).
explore_3D_array_comparison(raw_img_sitk, PAT001)

### 4. Xem và Normalize đơn vị Hounsfield

Đơn vị Hounsfield (HU) là một đơn vị được sử dụng để thể hiện mật độ phóng xạ của vật liệu trong chụp cắt lớp vi tính (CT). Thang đo HU dựa trên mật độ phóng xạ của không khí và nước, được gán các giá trị lần lượt là -1000 HU và 0 HU. Thang đo HU dao động từ -1000 HU đối với không khí đến +3000 HU đối với xương hoặc kim loại rất dày đặc.  Các vật liệu và mô khác nhau có giá trị HU khác nhau, có thể giúp xác định và phân biệt chúng trên hình ảnh CT.

In [None]:
# Value range of the cropped volume (the -2000 mask value outside the circle is included).
print(np.max(PAT001), np.min(PAT001))

# Histogram over all voxels of the volume.
plt.hist(PAT001.flatten(), bins=80, color='c')
plt.xlabel("Đơn vị Hounsfield")
plt.ylabel("Tấn suất")
plt.show()

Tuy nhiên range Hounsfield Unit ở trên chưa chính xác trong vùng hình tròn và có một vài nhiễu +3000

In [None]:
def normalization(volume=None, radius=None):
    """Min-max normalise the voxels inside the centred circle of every slice.

    The minimum and maximum are taken over the in-circle voxels of the whole
    volume; those voxels are mapped to [0, 1]. Voxels outside the circle keep
    their value.

    Parameters
    ----------
    volume : numpy.ndarray, optional
        3-D array with slices along the first axis. When omitted, the global
        ``PAT001`` is used and rebound to the normalised result.
    radius : float, optional
        Circle radius in pixels around the slice centre
        ``(height / 2, width / 2)``; defaults to the global ``ban_kinh``.

    Returns
    -------
    numpy.ndarray
        float32 copy of the volume with the in-circle voxels normalised.
    """
    global PAT001
    use_notebook_volume = volume is None
    if use_notebook_volume:
        volume = PAT001
    if radius is None:
        radius = ban_kinh

    volume = np.asarray(volume)
    height, width = volume.shape[1], volume.shape[2]
    rows, cols = np.ogrid[:height, :width]
    # nằm trong hình tròn -> boolean mask of the pixels inside the circle
    inside = (rows - height / 2) ** 2 + (cols - width / 2) ** 2 <= radius ** 2

    # A float copy is required: writing fractions into the int16 volume would
    # truncate every normalised value to 0 or 1.
    result = volume.astype(np.float32)
    values = result[:, inside]
    if values.size:
        # Find max min
        min_val = values.min()
        max_val = values.max()
        span = max_val - min_val
        # Intensity normalization (a constant region maps to 0)
        result[:, inside] = (values - min_val) / span if span else 0.0

    if use_notebook_volume:
        PAT001 = result
    return result

Hoặc Normalize với range tùy chọn

In [None]:
from scipy import stats

# Only voxels whose value lies inside this window take part in the statistics.
in_range = (-1024, 1024)
mask = (PAT001 >= in_range [0]) & (PAT001 <= in_range [1])
# masked_array hides the entries where its mask is True, hence the inverted mask.
masked_array = np.ma.masked_array (PAT001, ~mask)
# NOTE(review): axis=0 standardises each pixel position across slices — confirm this is the
# intended axis, and that stats.zscore (rather than stats.mstats.zscore) honours the mask.
normalized_array = stats.zscore (masked_array, axis = 0)
# Masked (out-of-window) voxels are filled with 0.
normalized_array = normalized_array.filled (0)

plt.hist(normalized_array.flatten(), bins=80, color='c')
plt.xlabel("Đơn vị Hounsfield")
plt.ylabel("Tấn suất")
plt.show()

#### Normalize with keras lib

In [None]:
# Read a post-processed volume back as float32 and convert it to a NumPy array.
img = sitk.ReadImage('../../data/PatientsDCM/Postprocessing/PAT001.nii.gz', sitk.sitkFloat32)
img = sitk.GetArrayFromImage(img)

In [None]:
# Histogram of the voxel values before normalisation.
plt.hist(img.flatten(), bins=80, color='c')
plt.xlabel("Đơn vị Hounsfield")
plt.ylabel("Tấn suất")
plt.show()

In [None]:
from keras.utils import normalize

# NOTE(review): keras.utils.normalize presumably applies an L2-norm normalisation along the
# given axis (not a min-max scaling) — confirm that this is the intended transformation.
img = normalize(img, axis=1)
print(np.max(img), np.min(img))
explore_3D_array(img)

Lưu ý: Bước normalization chỉ thực hiện khi muốn train một model nào đó

### 5. Toàn bộ code chạy một lần cho các bộ PAT khác nhau

#### Các hàm hỗ trợ

In [None]:
# Root folder of the data set; every directory entry is treated as one patient.
INPUT_FOLDER = '../../data/PatientsDCM/'
# Sorted so that run() can address patients by their position in the list.
patients = sorted(os.listdir(INPUT_FOLDER))

def load_scan(path):
    """Read every DICOM slice in a directory and return them ordered along z.

    Parameters
    ----------
    path : str
        Directory that holds the slice files of one scan.

    Returns
    -------
    list
        Datasets returned by ``dcmread``, sorted by ascending
        ``ImagePositionPatient[2]``. ``SliceThickness`` of every dataset is
        overwritten with the distance between the first two sorted slices.

    Raises
    ------
    IndexError
        If fewer than two readable DICOM files are found, because the slice
        spacing cannot be derived.
    """
    # Local import: pydicom is already a dependency of this notebook.
    from pydicom.errors import InvalidDicomError

    slices = []
    for name in os.listdir(path):
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path):
            continue  # sub-folders cannot be parsed as slices
        try:
            slices.append(dcmread(file_path))
        except InvalidDicomError:
            # Non-DICOM files (e.g. an exported .nii.gz) may sit next to the
            # slices; skip them instead of aborting the whole scan.
            continue

    slices.sort(key=lambda s: float(s.ImagePositionPatient[2]))
    try:
        slice_thickness = np.abs(
            slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2]
        )
    except (AttributeError, TypeError):
        # Fall back to SliceLocation only when the position tag is unusable.
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
    for s in slices:
        s.SliceThickness = slice_thickness
    return slices


def get_pixels_hu(slices):
    """Stack the slices' pixel data and rescale it with each slice's slope/intercept.

    Parameters
    ----------
    slices : sequence
        Objects exposing ``pixel_array``, ``RescaleSlope`` and
        ``RescaleIntercept``.

    Returns
    -------
    numpy.ndarray
        int16 array with one entry per slice along the first axis.
    """
    volume = np.stack([s.pixel_array for s in slices]).astype(np.int16)
    # Raw value -2000 is reset to 0 before the rescale is applied.
    volume[volume == -2000] = 0
    for idx, dcm in enumerate(slices):
        slope = dcm.RescaleSlope
        if slope != 1:
            scaled = slope * volume[idx].astype(np.float64)
            volume[idx] = scaled.astype(np.int16)
        volume[idx] += np.int16(dcm.RescaleIntercept)
    return np.array(volume, dtype=np.int16)

def euclidian_distance(x, y, a, b):
    """Return the straight-line distance between the points (x, y) and (a, b)."""
    dx = x - a
    dy = y - b
    return (dx**2 + dy**2) ** 0.5

def filtering(image, model=None):
    """Keep only the first contiguous run of slices that the classifier labels 1.

    Parameters
    ----------
    image : numpy.ndarray
        Volume whose first axis enumerates the slices.
    model : object, optional
        Fitted classifier exposing ``predict``. When omitted, the pickled
        model at '../../model/training/models/logreg_model.sav' is loaded.
        Passing it avoids re-reading the file for every patient.

    Returns
    -------
    numpy.ndarray
        ``image[start:end]``; empty along the first axis when no slice is
        labelled 1. A run that extends to the last slice includes that slice.

    Notes
    -----
    The slices are handed to the model exactly as received. NOTE(review): if
    the model was trained on scaled data, the caller presumably has to apply
    the same scaling first — verify.
    """
    if model is None:
        # Load model
        filename = '../../model/training/models/logreg_model.sav'
        # NOTE: unpickling executes code; only load model files you trust.
        with open(filename, 'rb') as model_file:
            model = pickle.load(model_file)

    # Predict
    flat = image.reshape(image.shape[0], -1)
    y_pred = np.asarray(model.predict(flat)).ravel()

    # filtering
    positives = np.flatnonzero(y_pred == 1)
    if positives.size == 0:
        return image[0:0]
    start = int(positives[0])
    # The run ends at the first non-1 label after `start`, or at the end of
    # the volume when the run extends to the last slice.
    negatives_after = np.flatnonzero(y_pred[start:] != 1)
    end = start + int(negatives_after[0]) if negatives_after.size else y_pred.size
    return image[start:end]

#### Code chạy chính

In [None]:
def run(index_PAT):
    """Process one patient: load, filter slices, mask/crop to a circle, save as NIfTI.

    Parameters
    ----------
    index_PAT : int
        1-based position of the patient in the sorted ``patients`` list; it is
        also used (zero-padded to three digits) in the output file name
        '../../data/PatientsDCM/Postprocessing/PAT<id>.nii.gz'.
    """
    # Read the volumetric images
    PAT_scan = load_scan(INPUT_FOLDER + patients[index_PAT - 1])
    PAT = get_pixels_hu(PAT_scan)

    # Filtering with logistic regression
    PAT = filtering(PAT)

    # Circling and clipping
    ban_kinh = 190
    a = b = 512/2
    top, bottom = int(a - ban_kinh), int(a + ban_kinh)
    left, right = int(b - ban_kinh), int(b + ban_kinh)

    # One boolean mask shared by all slices replaces the per-pixel Python
    # loop; squared distances are compared so no square root is needed.
    rows, cols = np.ogrid[:PAT.shape[1], :PAT.shape[2]]
    outside = (rows - a) ** 2 + (cols - b) ** 2 > ban_kinh ** 2
    PAT[:, outside] = -2000
    crop_imgs = PAT[:, top:bottom, left:right]

    # Save into .nii.gz file
    # Format spec instead of manual '00'/'0' prefixes, which produced a
    # four-character id for indices >= 100.
    patient_id = f'{index_PAT:03d}'
    out_path = f'../../data/PatientsDCM/Postprocessing/PAT{patient_id}.nii.gz'
    converted_array = np.array(crop_imgs, dtype=np.float32)
    converted_array = np.transpose(converted_array, (2, 1, 0))
    affine = np.eye(4)
    nifti_file = nib.Nifti1Image(converted_array, affine)
    nib.save(nifti_file, out_path)

    # Done line
    print(f'{patient_id} done!')

# Loop through all patients
# Process patients 1 through 16 (the upper bound is hard-coded).
for index_PAT in range(1, 17, 1):
    run(index_PAT)

### 6. Kiểm tra kết quả

In [None]:
# Spot check: read one of the files written by run() and browse it.
img = sitk.ReadImage('../../data/PatientsDCM/Postprocessing/PAT004.nii.gz', sitk.sitkFloat32)
img = sitk.GetArrayFromImage(img)
print(img.shape)
explore_3D_array(img)