KẾT QUẢ CỦA CÁC MÔ HÌNH SAU KHI XỬ LÍ MẤT CÂN BẰNG:
https://drive.google.com/drive/folders/1jmeYUwPDNT5dxC1V9JZ5d1o0Mgmlhgal?usp=sharing

Dataset: https://www.kaggle.com/datasets/tawsifurrahman/covid19-radiography-database

Covid19-XRay-w/-ML-and-DL: https://www.kaggle.com/code/thura1601/covid19-xray-w-ml-and-dl


COVID-19 Detection using X-Ray: https://www.kaggle.com/code/chiragbmiskin/covid-19-detection-using-x-ray


Tham khảo: https://www.kaggle.com/code/kaledhoshme/probabilistic-u-net-segmentation-ambiguous-images#Introduction

# **Library**

In [1]:
from google.colab import files

import os
import pandas as pd
import numpy as np
from PIL import Image, ImageOps
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time
import tensorflow as tf
import pathlib

import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Flatten
from sklearn.metrics import classification_report
from keras.preprocessing.image import ImageDataGenerator

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
import torch
import torch.nn as nn
from imblearn.combine import SMOTEENN
from sklearn import metrics

from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='val_loss', patience=5)  # Khởi tạo callback 'es'

import tensorflow as tf
tf.config.run_functions_eagerly(True)


# **Step 1: Load Data**

In [2]:
!pip install kaggle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Chọn tệp JSON API Token từ máy tính của bạn
uploaded = files.upload()

# Di chuyển tệp JSON vào thư mục ~/.kaggle/
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


In [4]:
# Tải xuống bộ dữ liệu từ Kaggle
!kaggle datasets download -d tawsifurrahman/covid19-radiography-database

# Giải nén bộ dữ liệu
!unzip covid19-radiography-database.zip

In [5]:
# label tagging to the images
levels = ['Normal', 'COVID', 'Lung_Opacity', 'Viral Pneumonia']
path = "/content/COVID-19_Radiography_Dataset"
data_dir = os.path.join(path)

data = []
for id, level in enumerate(levels):
    for file in os.listdir(os.path.join(data_dir, level + '/images')):
        data.append(['{}/images/{}'.format(level, file), level])

data = pd.DataFrame(data, columns = ['image_file', 'result'])

data['path'] = path + '/' + data['image_file']

In [6]:
data.head()

Unnamed: 0,image_file,result,path
0,Normal/images/Normal-8518.png,Normal,/content/COVID-19_Radiography_Dataset/Normal/i...
1,Normal/images/Normal-4299.png,Normal,/content/COVID-19_Radiography_Dataset/Normal/i...
2,Normal/images/Normal-6591.png,Normal,/content/COVID-19_Radiography_Dataset/Normal/i...
3,Normal/images/Normal-1375.png,Normal,/content/COVID-19_Radiography_Dataset/Normal/i...
4,Normal/images/Normal-22.png,Normal,/content/COVID-19_Radiography_Dataset/Normal/i...


# **Step 3: Model**

Xem chi tiết: https://colab.research.google.com/drive/1uMt2jXbfas_NnREQr2FP9VtLu9Sz6h3x?usp=sharing#scrollTo=HlnN67FHw16l

## **Các tiền xử lý cơ bản**

In [8]:
pixel_img = []

for image in tqdm(data['path']):
    img=Image.open(image)
    img=ImageOps.grayscale(img)
    img=img.resize((64,64))
    img=np.asarray(img)
    img=img.reshape((64,64,1))
    pixel_img.append(img)

pixel_img = np.array(pixel_img)
label_img = data['result'].map({'Normal': 0, 'COVID': 1, 'Lung_Opacity' : 2,
                               'Viral Pneumonia' : 3})

print(pixel_img.shape, label_img.shape)

# Splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(pixel_img, label_img, 
                                                    test_size=0.2, stratify=label_img)

100%|██████████| 21165/21165 [01:13<00:00, 288.01it/s]


(21165, 64, 64, 1) (21165,)


## **Sử dụng Data Augmentation kết hợp với Oversampling trên các mô hình**

### **Model ConvNet cnn1**

In [9]:
# Khởi tạo đối tượng ImageDataGenerator với các phép biến đổi tùy chỉnh
datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rotation_range=30,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    vertical_flip=False,
    fill_mode='nearest'
)

In [9]:
'''
Mô hình ConvNet - CNN model1
'''

# Xử lý trước khi đưa vào mô hình
X_train, X_test, y_train, y_test = train_test_split(pixel_img, label_img, 
                                                    test_size=0.2, stratify=label_img)

num_classes = len(np.unique(y_train))

y_train = tf.keras.utils.to_categorical(y_train)
y_test = tf.keras.utils.to_categorical(y_test)

X_train = X_train.astype('float32') / 255
X_test = X_test.astype('float32') / 255

input_shape = (X_train.shape[1], X_train.shape[2], 1)

cnn_model1 = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235), 
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235),
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235),
        layers.Flatten(),
        layers.Dropout(0.5, seed=235),
        layers.Dense(512, activation="relu"),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

# Áp dụng Data Augmentation lên dữ liệu huấn luyện
augmented_data = datagen.flow(X_train, y_train, batch_size=128)

# Xác định số lượng mẫu trong từng lớp
unique_classes, counts = np.unique(np.argmax(y_train, axis=1), return_counts=True)
num_samples = dict(zip(unique_classes, counts))

# Tìm lớp có số lượng mẫu lớn nhất
max_samples = max(num_samples.values())

# Xác định tỷ lệ oversampling cho từng lớp
oversampling_ratio = 2
sampling_strategy = {}
for label, count in num_samples.items():
    oversampling_count = int(max_samples * oversampling_ratio)
    sampling_strategy[label] = oversampling_count

# Làm phẳng dữ liệu hình ảnh thành ma trận 2D
X_train_flat = X_train.reshape(X_train.shape[0], -1)

over_sampler = RandomOverSampler(sampling_strategy=sampling_strategy)
X_oversampling_flat, y_oversampling = over_sampler.fit_resample(X_train_flat, y_train)

# Đưa dữ liệu trở lại dạng ban đầu
X_oversampling = X_oversampling_flat.reshape(X_oversampling_flat.shape[0], X_train.shape[1], X_train.shape[2], 1)

# Biên dịch mô hình
cnn_model1.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Huấn luyện mô hình với dữ liệu đã được tăng cường và oversampling
history = cnn_model1.fit(X_oversampling, y_oversampling, epochs=50, validation_data=(X_test, y_test), callbacks=[es])



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50


In [10]:
loss, acc = cnn_model1.evaluate(X_test, y_test, verbose=0)

print("Accuracy model8 adam: %.2f%%" % (100.0 * acc))

predictions = cnn_model1.predict(X_test)
pd.crosstab(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))

Accuracy model8 adam: 91.19%


col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1888,44,85,21
1,11,699,12,1
2,137,50,1014,2
3,6,3,1,259


In [11]:
cnn_model1.save('/content/Model/cnn_model1_data_augmentation_and_oversampling.h5')

print("Classification report for classifier :\n"
      f"{metrics.classification_report(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))}\n")

Classification report for classifier :
              precision    recall  f1-score   support

           0       0.92      0.93      0.93      2038
           1       0.88      0.97      0.92       723
           2       0.91      0.84      0.88      1203
           3       0.92      0.96      0.94       269

    accuracy                           0.91      4233
   macro avg       0.91      0.92      0.92      4233
weighted avg       0.91      0.91      0.91      4233




### **Model ConvNet cnn2**

In [12]:
'''
Mô hình ConvNet - CNN model2
'''

cnn_model2 = keras.Sequential(
    [
        keras.Input(shape=input_shape),
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235), 
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235),
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235),
        layers.Conv2D(32, kernel_size=(3,3), padding="same", activation="relu"),
        layers.Conv2D(32, kernel_size=(3,3), padding="valid", activation="relu"),
        layers.MaxPooling2D(pool_size=(2,2)),
        layers.Dropout(0.2, seed=235),
        layers.Flatten(),
        layers.Dropout(0.5, seed=235),
        layers.Dense(512, activation="relu"),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

# Áp dụng Data Augmentation lên dữ liệu huấn luyện
augmented_data = datagen.flow(X_train, y_train, batch_size=128)

# Xác định số lượng mẫu trong từng lớp
unique_classes, counts = np.unique(np.argmax(y_train, axis=1), return_counts=True)
num_samples = dict(zip(unique_classes, counts))

# Tìm lớp có số lượng mẫu lớn nhất
max_samples = max(num_samples.values())

# Xác định tỷ lệ oversampling cho từng lớp
oversampling_ratio = 2
sampling_strategy = {}
for label, count in num_samples.items():
    oversampling_count = int(max_samples * oversampling_ratio)
    sampling_strategy[label] = oversampling_count

# Làm phẳng dữ liệu hình ảnh thành ma trận 2D
X_train_flat = X_train.reshape(X_train.shape[0], -1)

over_sampler = RandomOverSampler(sampling_strategy=sampling_strategy)
X_oversampling_flat, y_oversampling = over_sampler.fit_resample(X_train_flat, y_train)

# Đưa dữ liệu trở lại dạng ban đầu
X_oversampling = X_oversampling_flat.reshape(X_oversampling_flat.shape[0], X_train.shape[1], X_train.shape[2], 1)

# Biên dịch mô hình
cnn_model2.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Huấn luyện mô hình với dữ liệu đã được tăng cường và oversampling
history = cnn_model2.fit(X_oversampling, y_oversampling, epochs=50, validation_data=(X_test, y_test), callbacks=[es])



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50


In [13]:
loss, acc = cnn_model2.evaluate(X_test, y_test, verbose=0)

print("Accuracy model8 adam: %.2f%%" % (100.0 * acc))

predictions = cnn_model2.predict(X_test)
pd.crosstab(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))

Accuracy model8 adam: 91.14%


col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1852,27,128,31
1,11,690,17,5
2,118,27,1056,2
3,4,4,1,260


In [14]:
cnn_model2.save('/content/Model/cnn_model2_data_augmentation_and_oversampling.h5')

print("Classification report for classifier :\n"
      f"{metrics.classification_report(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))}\n")

Classification report for classifier :
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      2038
           1       0.92      0.95      0.94       723
           2       0.88      0.88      0.88      1203
           3       0.87      0.97      0.92       269

    accuracy                           0.91      4233
   macro avg       0.90      0.93      0.91      4233
weighted avg       0.91      0.91      0.91      4233




### **Model MLP**

In [11]:
'''
mlp_model
'''
# MLP with ReLU and dropout
mlp_model = Sequential()
mlp_model.add(Flatten(input_shape=(X_train.shape[1], X_train.shape[2], 1)))
mlp_model.add(Dense(256))
mlp_model.add(Activation('relu'))
mlp_model.add(Dropout(0.5))
mlp_model.add(Dense(num_classes))
mlp_model.add(Activation('softmax'))

mlp_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Áp dụng Data Augmentation lên dữ liệu huấn luyện
augmented_data = datagen.flow(X_train, y_train, batch_size=128)

# Xác định số lượng mẫu trong từng lớp
unique_classes, counts = np.unique(np.argmax(y_train, axis=1), return_counts=True)
num_samples = dict(zip(unique_classes, counts))

# Tìm lớp có số lượng mẫu lớn nhất
max_samples = max(num_samples.values())

# Xác định tỷ lệ oversampling cho từng lớp
oversampling_ratio = 2
sampling_strategy = {}
for label, count in num_samples.items():
    oversampling_count = int(max_samples * oversampling_ratio)
    sampling_strategy[label] = oversampling_count

# Làm phẳng dữ liệu hình ảnh thành ma trận 2D
X_train_flat = X_train.reshape(X_train.shape[0], -1)

over_sampler = RandomOverSampler(sampling_strategy=sampling_strategy)
X_oversampling_flat, y_oversampling = over_sampler.fit_resample(X_train_flat, y_train)

# Đưa dữ liệu trở lại dạng ban đầu
X_oversampling = X_oversampling_flat.reshape(X_oversampling_flat.shape[0], X_train.shape[1], X_train.shape[2], 1)

# Biên dịch mô hình
mlp_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

# Huấn luyện mô hình với dữ liệu đã được tăng cường và oversampling
history = mlp_model.fit(X_oversampling, y_oversampling, epochs=50, validation_data=(X_test, y_test), callbacks=[es])



Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


In [12]:
loss, acc = mlp_model.evaluate(X_test, y_test, verbose=0)

print("Accuracy model8 adam: %.2f%%" % (100.0 * acc))

predictions = mlp_model.predict(X_test)
pd.crosstab(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))

Accuracy model8 adam: 68.72%


col_0,0,1,2,3
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1264,479,200,95
1,39,519,144,21
2,93,230,865,15
3,4,1,3,261


In [13]:
mlp_model.save('/content/Model/mlp_model_data_augmentation_and_oversampling.h5')

print("Classification report for classifier :\n"
      f"{metrics.classification_report(np.argmax(y_test, axis=1), np.argmax(predictions, axis=1))}\n")

Classification report for classifier :
              precision    recall  f1-score   support

           0       0.90      0.62      0.74      2038
           1       0.42      0.72      0.53       723
           2       0.71      0.72      0.72      1203
           3       0.67      0.97      0.79       269

    accuracy                           0.69      4233
   macro avg       0.68      0.76      0.69      4233
weighted avg       0.75      0.69      0.70      4233


