In [1]:
import pandas as pd # 用于数据处理的库
import numpy as np # 用于计算和矢量处理的库
from glob import glob # 文件路径匹配
import matplotlib.pyplot as plt  # 用于绘制图形的库
import cv2 # 用于图像管理和计算机视觉的库
import os # 用于路径处理和操作系统连接的库
import seaborn as sns # 图形增强
# Apply seaborn's default theme with fonts scaled by 2 in a single call.
# `sns.set` is only an alias of `sns.set_theme`, so calling both was redundant.
sns.set_theme(font_scale=2)
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator # 模型的数据生成器
from keras.utils import array_to_img, img_to_array, load_img # 对图像进行处理
from keras import layers # 定义了架构的各层
from keras import Model # 将架构定义为一个模型
from keras.optimizers import RMSprop, Adam, SGD # 不同的优化器
from keras.utils import plot_model # 用于模型的图形可视化的工具
from sklearn.model_selection import train_test_split # 分离训练集和测试集.
from sklearn.metrics import classification_report # 显示所做的预测报告
from ipywidgets import interact # 交互工具

In [2]:
# Load the image metadata table (one row per image file, see output below).
df = pd.read_csv('./archive/Data_Entry_2017.csv')

# Disease names looked up in the 'Finding Labels' column.
diseases = [
    'Cardiomegaly', 'Emphysema', 'Effusion', 'Hernia', 'Nodule',
    'Pneumothorax', 'Atelectasis', 'Pleural_Thickening', 'Mass', 'Edema',
    'Consolidation', 'Infiltration', 'Fibrosis', 'Pneumonia',
]

# Add one 0/1 indicator column per disease: 1 when the disease name occurs
# in the row's 'Finding Labels' string, 0 otherwise.
finding_labels = df['Finding Labels']
for disease in diseases:
    df[disease] = [int(disease in text) for text in finding_labels]

df

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Nodule,Pneumothorax,Atelectasis,Pleural_Thickening,Mass,Edema,Consolidation,Infiltration,Fibrosis,Pneumonia
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,0,0,0,0,0,0,0,0,0,0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,0,0,0,0,0
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112115,00030801_001.png,Mass|Pneumonia,1,30801,39,M,PA,2048,2500,0.168,...,0,0,0,0,1,0,0,0,0,1
112116,00030802_000.png,No Finding,0,30802,29,M,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,0
112117,00030803_000.png,No Finding,0,30803,42,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,0
112118,00030804_000.png,No Finding,0,30804,30,F,PA,2048,2500,0.168,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Per-disease 0/1 indicator matrix (one row per image, one column per disease).
labels = df[diseases].to_numpy()

# Index every PNG under ./archive/images*/images by its file name.
image_pattern = os.path.join('.', 'archive', 'images*', 'images', '*.png')
all_image_paths = {os.path.basename(png): png for png in glob(image_pattern)}
print('Images found:', len(all_image_paths))

# Resolve each 'Image Index' file name to its path on disk
# (rows without a matching file get None from dict.get).
df['Path'] = df['Image Index'].map(all_image_paths.get)
files_list = df['Path'].tolist()

# Binary target: 1 when at least one disease indicator is set, else 0.
has_finding = df[diseases].sum(axis=1) > 0
labelB = np.array(has_finding.tolist(), dtype=int)

# NOTE: `df` is rebound here to a two-column frame (path, labels); the
# metadata columns loaded earlier are no longer reachable through `df`.
df = pd.DataFrame({'path': files_list, 'labels': labelB})
df['labels'] = df['labels'].astype(str)  # labels are stored as strings
df.head(100)

Images found: 112120


Unnamed: 0,path,labels
0,.\archive\images_001\images\00000001_000.png,1
1,.\archive\images_001\images\00000001_001.png,1
2,.\archive\images_001\images\00000001_002.png,1
3,.\archive\images_001\images\00000002_000.png,0
4,.\archive\images_001\images\00000003_000.png,1
...,...,...
95,.\archive\images_001\images\00000020_002.png,1
96,.\archive\images_001\images\00000021_000.png,1
97,.\archive\images_001\images\00000021_001.png,1
98,.\archive\images_001\images\00000022_000.png,0


In [4]:
# Single seed for every split so the subsample and the train/validation/test
# partition are identical on each run. The first split used to be unseeded,
# which made all downstream results irreproducible.
RANDOM_STATE = 1993

# Keep only a stratified 10% subsample of the data.
df,_ = train_test_split(df,test_size=0.9,stratify=df['labels'],random_state=RANDOM_STATE)

# 70% of the subsample for training, 30% for testing.
df_train,df_test = train_test_split(df,test_size=0.3,stratify=df['labels'],random_state=RANDOM_STATE)
# Hold out 30% of that training part for validation.
df_train,df_val = train_test_split(df_train,test_size=0.3,stratify=df_train['labels'],random_state=RANDOM_STATE)
print('trainning set:', df_train.shape)
print('validate set:', df_val.shape)
print('test set:', df_test.shape)

trainning set: (5493, 2)
validate set: (2355, 2)
test set: (3364, 2)


In [5]:
# Generator used only to preview data augmentation.
datagen = ImageDataGenerator(
    rescale=1./255,       # scale pixel intensities into [0, 1]
    rotation_range=20,    # random rotations of up to 20 degrees
    fill_mode='nearest',  # pixels outside the image take the nearest value
)

nimag = len(df)  # number of samples available to the preview slider

@interact(nray_num=(0,nimag-1,1))
def plot_Data_Augmentation(nray_num):
    """Show five randomly augmented versions of one image from ``df``.

    Parameters
    ----------
    nray_num : int
        Positional index into ``df`` of the image to preview (set by the
        slider).
    """
    img_path = df['path'].iloc[nray_num]
    img = load_img(img_path, target_size=(150, 150))  # load and resize the image
    x = img_to_array(img)  # numpy array of shape (150, 150, 3)
    x = x.reshape((1,) + x.shape)  # add a batch axis -> (1, 150, 150, 3)

    _, axes = plt.subplots(1, 5, figsize=(20, 5))
    # datagen.flow yields augmented batches indefinitely. Zipping with the
    # five axes (axes first) stops after exactly five batches have been drawn.
    for ax, batch in zip(axes, datagen.flow(x, batch_size=1)):
        ax.imshow(array_to_img(batch[0]))
        ax.axis('off')
    plt.show()

interactive(children=(IntSlider(value=5605, description='nray_num', max=11211), Output()), _dom_classes=('widg…

In [6]:
img_size = 128    # side length the images are resized to
batch_size = 128  # number of samples handed to the model per batch

# Training generator: only rescales intensities into [0, 1].
# Augmentation options currently disabled:
#     rotation_range=40, width_shift_range=0.2, height_shift_range=0.2,
#     shear_range=0.2, zoom_range=0.2, horizontal_flip=True
train_datagen = ImageDataGenerator(rescale=1./255)

# The test/validation generator does not need data augmentation.
test_datagen = ImageDataGenerator(rescale=1./255)

# Options shared by the training and validation flows.
flow_options = dict(
    x_col='path',
    y_col='labels',
    target_size=(img_size, img_size),
    batch_size=batch_size,
    color_mode='grayscale',
    class_mode='binary',
)

train_generator = train_datagen.flow_from_dataframe(df_train, **flow_options)
validation_generator = test_datagen.flow_from_dataframe(df_val, **flow_options)

Found 5493 validated image filenames belonging to 2 classes.
Found 2355 validated image filenames belonging to 2 classes.


In [7]:
# The input is a single-channel (grayscale) image of img_size x img_size.
img_input = layers.Input(shape=(img_size, img_size, 1))

# Feature extractor: three stages with 16, 32 and 64 filters. Each stage is
# a 3x3 ReLU convolution followed by max-pooling with a 2x2 window.
x = img_input
for n_filters in (16, 32, 64):
    x = layers.Conv2D(n_filters, 3, activation='relu')(x)
    x = layers.MaxPooling2D(2)(x)

# Classifier head: flatten the feature map, one fully connected layer of
# 512 ReLU units, dropout at rate 0.5, then a single sigmoid output node.
x = layers.Flatten()(x)
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.5)(x)
output = layers.Dense(1, activation='sigmoid')(x)

# Assemble and compile the model for binary classification.
model = Model(img_input, output)
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['acc'],
)

In [8]:
# Print the layer-by-layer summary of the model (output shapes and parameter counts).
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 128, 128, 1)]     0         
                                                                 
 conv2d (Conv2D)             (None, 126, 126, 16)      160       
                                                                 
 max_pooling2d (MaxPooling2D  (None, 63, 63, 16)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 61, 61, 32)        4640      
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 30, 30, 32)       0         
 2D)                                                             
                                                                 
 conv2d_2 (Conv2D)           (None, 28, 28, 64)        18496 

In [9]:
# Render the model architecture to "model.png", including tensor shapes and
# layer names. As the recorded output below shows, this needs both pydot and
# graphviz to be installed; without them only an installation hint is printed.
plot_model(model,
           to_file="model.png",
           show_shapes=True,
           show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [10]:
# Train for 10 epochs, validating on the validation generator after each one.
# `batch_size` is intentionally not passed: train_generator already yields
# batches of `batch_size` samples, and Keras documents that the argument must
# not be specified when the input is a generator / Sequence.
history = model.fit(train_generator,
                    epochs=10,
                    validation_data=validation_generator,
                    verbose=2)  # one summary line per epoch

Epoch 1/10
43/43 - 298s - loss: 0.6954 - acc: 0.5290 - val_loss: 0.6841 - val_acc: 0.5406 - 298s/epoch - 7s/step
Epoch 2/10
43/43 - 179s - loss: 0.6682 - acc: 0.5960 - val_loss: 0.6652 - val_acc: 0.5945 - 179s/epoch - 4s/step
Epoch 3/10
43/43 - 160s - loss: 0.6599 - acc: 0.6135 - val_loss: 0.6618 - val_acc: 0.6055 - 160s/epoch - 4s/step
Epoch 4/10
43/43 - 158s - loss: 0.6535 - acc: 0.6248 - val_loss: 0.6581 - val_acc: 0.6115 - 158s/epoch - 4s/step
Epoch 5/10
43/43 - 159s - loss: 0.6458 - acc: 0.6357 - val_loss: 0.6553 - val_acc: 0.6170 - 159s/epoch - 4s/step
Epoch 6/10
43/43 - 158s - loss: 0.6372 - acc: 0.6417 - val_loss: 0.6546 - val_acc: 0.6225 - 158s/epoch - 4s/step
Epoch 7/10
43/43 - 157s - loss: 0.6304 - acc: 0.6506 - val_loss: 0.6603 - val_acc: 0.6144 - 157s/epoch - 4s/step
Epoch 8/10
43/43 - 160s - loss: 0.6201 - acc: 0.6659 - val_loss: 0.6595 - val_acc: 0.6229 - 160s/epoch - 4s/step
Epoch 9/10
43/43 - 112s - loss: 0.6124 - acc: 0.6679 - val_loss: 0.6700 - val_acc: 0.6085 - 112s

In [11]:
Nimgs = df_test.shape[0] # total number of test images

@interact(img_num = (0,Nimgs-1,1))
def test_on_image( img_num ):
    """Predict and display the label of a single test image.

    The image at position ``img_num`` of ``df_test`` is read in grayscale,
    resized to ``img_size`` x ``img_size`` and scaled to [0, 1], matching the
    ``rescale=1./255`` applied by the generators the model was trained with.
    A predicted probability >= 0.5 is reported as 1, anything lower as 0.

    Parameters
    ----------
    img_num : int
        Positional index into ``df_test`` (set by the slider).

    Raises
    ------
    OSError
        If OpenCV cannot read the image file.
    """
    img_path = df_test['path'].iloc[img_num]
    image = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError("cv2.imread could not read image: {}".format(img_path))

    # Resize the test image to the size the model expects.
    resized_image = cv2.resize(image, (img_size, img_size))
    print(resized_image.shape)

    # Scale intensities to [0, 1]. The model was trained on rescaled inputs,
    # so feeding raw 0-255 values would make the prediction meaningless.
    model_input = resized_image.astype(np.float32).reshape(1, img_size, img_size, 1) / 255.0
    pred_pro = model.predict(model_input)[0][0]  # predicted probability of class 1
    print(pred_pro)
    pred = 1. if pred_pro >= 0.5 else 0.

    # Show the image with the true label, the predicted label and the confidence.
    plt.figure(figsize = (5,5))
    plt.imshow(resized_image, cmap = "gray")
    plt.title("Etiqueta Real: {} \t Etiqueta Estimada: {}\t Confianza : {:.2%}".format(df_test['labels'].iloc[img_num],pred,pred_pro if pred==1 else 1-pred_pro))
    plt.axis("off")
    plt.show()

interactive(children=(IntSlider(value=1681, description='img_num', max=3363), Output()), _dom_classes=('widget…

In [12]:
# The test generator only rescales intensities; no augmentation is applied.
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator =  test_datagen.flow_from_dataframe(df_test,
                                                    x_col='path',
                                                    y_col=None,
                                                    target_size=(img_size,img_size),
                                                    batch_size = batch_size,
                                                    color_mode='grayscale',
                                                    class_mode = None, # yield images only, no labels from the dataframe
                                                    shuffle = False) # keep df_test row order so pred[i] belongs to df_test.iloc[i]
pred = model.predict(test_generator) # predict over all batches of the test generator

Found 3364 validated image filenames.
