## 癌症检测

#### 流程：

1. 数据下载与预处理，得到图片为224* 224* 3大小，标准化，并且将数据类别进行one-hot编码
2. 数据增强
3. 权重初始化
4. L2正则化
5. 建立神经网络，使用VGG16，先训练bottleneck features，再将前面的卷积层加入一起训练。
6. 绘制损失曲线，并且可视化第一层的权重。
7. 将数据分为5份，交叉验证。

尝试：fancy PCA，数据预训练（无监督）

In [1]:
# download files
from os.path import isdir, isfile
from os import mkdir
from tqdm import tqdm
from urllib.request import urlretrieve
import urllib

class DLProgress(tqdm):
    last_block = 0
    def hook(self, block_num = 1, block_size = 1, total_size = None):
        self.total = total_size
        self.update((block_num-self.last_block)*block_size)
        self.last_block = block_num

headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}  

if not isfile('./train.zip'):
    with DLProgress(unit = 'B',unit_scale = True, miniters=1, desc = 'train.zip') as pbar:        
        url = 'https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/train.zip'
        req = urllib.request.Request(url=url, headers=headers)  
        urlretrieve(
            url,
            './train.zip',
            pbar.hook)
else:
    print('Training file already exists!')

if not isfile('./valid.zip'):
    with DLProgress(unit = 'B',unit_scale = True, miniters=1, desc = 'valid.zip') as pbar:
        url = 'https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/valid.zip'
        req = urllib.request.Request(url=url, headers=headers)  
        urlretrieve(
            url,
            './valid.zip',
            pbar.hook)
else:
    print('Validation file already exists!')

    
if not isfile('./test.zip'):
    with DLProgress(unit = 'B',unit_scale= True, miniters=1, desc = 'test.zip') as pbar:
        url = 'https://s3-us-west-1.amazonaws.com/udacity-dlnfd/datasets/skin-cancer/test.zip'
        req = urllib.request.Request(url=url, headers=headers)  
        urlretrieve(
            url,
            './test.zip',
            pbar.hook)
else:
    print('Test file already exists!')

valid.zip: 865MB [01:18, 11.0MB/s]                           
test.zip: 5.53GB [08:06, 11.4MB/s]                            


In [3]:
# unzip data
import zipfile

with zipfile.ZipFile('./train.zip','r') as f:
    for file in tqdm(f.namelist()):
        f.extract(file,'./')
with zipfile.ZipFile('./valid.zip','r') as f:
    for file in tqdm(f.namelist()):
        f.extract(file,'./')
with zipfile.ZipFile('./test.zip','r') as f:
    for file in tqdm(f.namelist()):
        f.extract(file,'./')

100%|██████████| 2004/2004 [00:41<00:00, 47.93it/s]
100%|██████████| 154/154 [00:06<00:00, 24.53it/s]
100%|██████████| 604/604 [00:38<00:00, 15.83it/s]


In [4]:
from glob import glob
from keras.preprocessing import image
from PIL import ImageFile
import numpy as np
from tqdm import tqdm

ImageFile.LOAD_TRUNCATED_IMAGES = True
def path_to_tensor(img_path):
    img = image.load_img(img_path, target_size = (224,224))
    x = image.img_to_array(img)
    return np.expand_dims(x, axis = 0)
def paths_to_tensor(img_paths):
    list_of_tensor = [path_to_tensor(img_path) for img_path in tqdm(img_paths)]
    return np.vstack(list_of_tensor)

  return f(*args, **kwds)
Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)


In [5]:
#split data for future test
from keras.utils import np_utils
def process_data(filepath):
    class_name = [folder for folder in glob(filepath+'/*/') if isdir(folder)]
    features = np.array([])
    labels = np.array([])
    for i in range(len(class_name)):
        print(filepath+class_name[i])
        filename = class_name[i].replace('\\','/')
        contents = paths_to_tensor(glob(filename+'/*'))
        if features.shape == (0,):
            features = contents
        else:
            features = np.concatenate((features,contents))
        if i == 0:
            labels = np.zeros((len(contents),1))
        else:
            add_label = np.array([[i]]*len(contents))
            labels = np.concatenate((labels, add_label),axis = 0)
        i += 1
    labels = labels.flatten()
    labels = np_utils.to_categorical(labels)
    print('Feature shape: ', features.shape)
    print('Label shape:', labels.shape)
    return features, labels

In [6]:
from os.path import isdir, isfile
from os import mkdir
from tqdm import tqdm
from urllib.request import urlretrieve
import urllib
train_features,train_labels = process_data('./train')
valid_features,valid_labels = process_data('./valid')
test_features,test_labels = process_data('./test')

  0%|          | 1/254 [00:00<00:37,  6.74it/s]

./train./train/seborrheic_keratosis/


100%|██████████| 254/254 [00:59<00:00,  4.24it/s]
  0%|          | 0/1372 [00:00<?, ?it/s]

./train./train/nevus/


100%|██████████| 1372/1372 [03:10<00:00,  7.20it/s]
  0%|          | 1/374 [00:00<00:56,  6.60it/s]

./train./train/melanoma/


100%|██████████| 374/374 [00:55<00:00,  6.79it/s]
  2%|▏         | 1/42 [00:00<00:05,  7.90it/s]

Feature shape:  (2000, 224, 224, 3)
Label shape: (2000, 3)
./valid./valid/seborrheic_keratosis/


100%|██████████| 42/42 [00:07<00:00,  5.41it/s]
  1%|▏         | 1/78 [00:00<00:09,  8.10it/s]

./valid./valid/nevus/


100%|██████████| 78/78 [00:21<00:00,  3.70it/s]
  0%|          | 0/30 [00:00<?, ?it/s]

./valid./valid/melanoma/


100%|██████████| 30/30 [00:12<00:00,  2.47it/s]
  1%|          | 1/90 [00:00<00:11,  7.84it/s]

Feature shape:  (150, 224, 224, 3)
Label shape: (150, 3)
./test./test/seborrheic_keratosis/


100%|██████████| 90/90 [00:15<00:00,  5.84it/s]
  0%|          | 0/393 [00:00<?, ?it/s]

./test./test/nevus/


100%|██████████| 393/393 [03:13<00:00,  2.03it/s]
  0%|          | 0/117 [00:00<?, ?it/s]

./test./test/melanoma/


100%|██████████| 117/117 [00:46<00:00,  2.50it/s]

Feature shape:  (600, 224, 224, 3)
Label shape: (600, 3)





In [9]:
# save the data
np.save('train_features.npy',train_features)
np.save('train_labels.npy',train_labels)
np.save('valid_features.npy',valid_features)
np.save('valid_labels.npy',valid_labels)
np.save('test_features.npy',test_features)
np.save('test_labels.npy',test_labels)

In [2]:
# using load command to load the data
# this step is not necessary if you have already loaded the data
import numpy as np
train_features = np.load('train_features.npy')
train_labels = np.load('train_labels.npy')
valid_features = np.load('valid_features.npy')
valid_labels = np.load('valid_labels.npy')
test_features = np.load('test_features.npy')
test_labels = np.load('test_labels.npy')

In [3]:
# standardize data
for i in range(len(train_features)):
    train_features[i] = (train_features[i].astype('float32') - 125.0)/125.0
for i in range(len(valid_features)):
    valid_features[i] = (valid_features[i].astype('float32') - 125.0)/125.0
for i in range(len(test_features)):
    test_features[i] = (test_features[i].astype('float32') - 125.0)/125.0

### autoencoder

首先建立网络进行自编码，提取数据更清晰的特征。

下面使用TensorFlow建立一个自编码器autoencoder。

In [4]:
import tensorflow as tf
inputs_ = tf.placeholder(tf.float32, (None, 224, 224, 3), name='inputs')
targets_ = tf.placeholder(tf.float32, (None, 224, 224, 3), name='targets')

# Encoder
conv1 = tf.layers.conv2d(inputs_, 16, (3,3), padding='same', activation=tf.nn.relu)
# 224x224x16
maxpool1 = tf.layers.max_pooling2d(conv1, (2,2), (2,2), padding='same')
# 112x112x16
conv2 = tf.layers.conv2d(maxpool1, 8, (3,3), padding='same', activation=tf.nn.relu)
# 112x112x8
maxpool2 = tf.layers.max_pooling2d(conv2, (2,2), (2,2), padding='same')
# 56x56x8
conv3 = tf.layers.conv2d(maxpool2, 8, (3,3), padding='same', activation=tf.nn.relu)
# 56x56x8
encoded = tf.layers.max_pooling2d(conv3, (2,2), (2,2), padding='same')
# 28x28x8

# Decoder
upsample1 = tf.image.resize_nearest_neighbor(encoded, (56,56))
# 56x56x8
conv4 = tf.layers.conv2d(upsample1, 8, (3,3), padding='same', activation=tf.nn.relu)
# 56x56x8
upsample2 = tf.image.resize_nearest_neighbor(conv4, (112,112))
# 112x112x8
conv5 = tf.layers.conv2d(upsample2, 8, (3,3), padding='same', activation=tf.nn.relu)
# 56x56x8
upsample3 = tf.image.resize_nearest_neighbor(conv5, (224,224))
# 224x224x8
conv6 = tf.layers.conv2d(upsample3, 16, (3,3), padding='same', activation=tf.nn.relu)
# 224x224x16

logits = tf.layers.conv2d(conv6, 3, (3,3), padding='same', activation=None)
# 224x224x3

decoded = tf.nn.sigmoid(logits, name='decoded')

loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=targets_, logits=logits)
cost = tf.reduce_mean(loss)
opt = tf.train.AdamOptimizer(0.001).minimize(cost)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [None]:
with tf.Session() as sess:
    epochs = 10
    batch_size = 200
    sess.run(tf.global_variables_initializer())
    for e in range(epochs):
        for ii in range(len(train_features)//batch_size):
            if ii == len(train_features)//batch_size -1:
                batch = train_features[-(len(train_features)-(ii+1)*batch_size):]
            else:
                batch = train_features[ii*batch_size:(ii+1)*batch_size]
            batch_cost, _ = sess.run([cost, opt], feed_dict={inputs_: batch,
                                                             targets_: batch})
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Training loss: {:.4f}".format(batch_cost))

Epoch: 1/10... Training loss: 0.7017
Epoch: 1/10... Training loss: 0.6857
Epoch: 1/10... Training loss: 0.6771
Epoch: 1/10... Training loss: 0.6765
Epoch: 1/10... Training loss: 0.6737
Epoch: 1/10... Training loss: 0.6731
Epoch: 1/10... Training loss: 0.6725
Epoch: 1/10... Training loss: 0.6713
Epoch: 1/10... Training loss: 0.6694


In [2]:
# save the model
saver = tf.train.Saver(max_to_keep=1)
with tf.Session() as sess:
    saver.save(sess,'autoencoder.ckpt')

NameError: name 'tf' is not defined

In [None]:
# visualize the results first
fig, axes = plt.subplots(nrows=2, ncols=5, sharex=True, sharey=True, figsize=(20,4))
imgs = train_features[:10]
reconstructed = sess.run(decoded, feed_dict={inputs_: imgs})

for images, row in zip([imgs, reconstructed], axes):
    for img, ax in zip(images, row):
        ax.imshow(img.reshape((112, 112)))
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
fig.tight_layout(pad=0.1)

In [None]:
# get results from autoencoder
with tf.Session() as sess:
    train_features_auto = sess.run(decoded, feed_dict = {inputs_: train_features})
    valid_features_auto = sess.run(decoded, feed_dict = {inputs_: valid_features})
    test_features_auto = sess.run(decoded, feed_dict = {inputs_: test_features})

In [None]:
# data augmentation
from keras.preprocessing.image import ImageDataGenerator
datagen = ImageDataGenerator(
    zoom_range = 0.2,
    horizontal_flip = True,
    shear_range = 0.2,
    vertical_flip = True,
    fill_mode = 'nearest'
)
datagen.fit(train_features)
datagen.fit(valid_features)

模型的初步训练也可以使用EarlyStopping，从而加速训练。

In [None]:
# 模型建立
from keras.models import Model
from keras.applications import vgg16
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
from keras.optimizers import SGD

# 首先要对模型进行预训练，即固定VGG16前面的权重，对后面部分进行训练
model = vgg16.VGG16(weights = 'imagenet',include_top = False, input_shape=(224, 224, 3))
print('Model loaded')
for layer in model.layers:
    layer.trainable = False

In [None]:
# add new layer and regularizers
# 版本不符，softmax函数用tf.nn.softmax代替
from keras import regularizers
from keras.layers import Dense, Flatten, Dropout
import tensorflow as tf
output_shape = train_labels.shape
input_shape  = train_features.shape
def add_new_layer(model):
    x = model.output
    x = Flatten()(x)
    x = Dense(122, activation = 'relu',kernel_regularizer = regularizers.l1(0.01))(x)
    x = Dropout(0.5)(x)
    x = Dense(50, activation = 'relu',kernel_regularizer = regularizers.l1(0.01))(x)
    x = Dropout(0.5)(x)
    x = Dense(output_shape[1], activation = tf.nn.softmax)(x)
    model = Model(input = model.input, output = x)
    return model
def freeze_all_model(model):
    for layer in model.layers:
        layer.trainable = False

In [None]:
#setting checkpointer and early stopping
from keras.callbacks import ModelCheckpoint,EarlyStopping

checkpointer = ModelCheckpoint(
                filepath = 'Cancer_best_weights.hdf5', 
                verbose=1,
                save_best_only=True)
stopper = EarlyStopping(
                monitor = 'val_acc',
                patience = 2,
                min_delta = 0.0003,
                mode = 'max')

In [None]:
# set up model
model= add_new_layer(model)
# setting hyperparameters
batch_size = 32
epochs = 10
learning_rate = 0.01

sgd = SGD(lr=learning_rate, decay=1e-4, momentum=0.9, nesterov=True)
model.compile(optimizer = 'sgd',loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
# train model
model_history = model.fit_generator(datagen.flow(
                        train_features,
                        train_labels,
                        batch_size = batch_size),
                    steps_per_epoch = train_features.shape[0]//batch_size,
                    callbacks = [checkpointer],
                    validation_data=[valid_features, valid_labels],
                    epochs = epochs,
                    shuffle = True,
                    verbose = 2)

In [None]:
# visualize the loss and accuracy to find a perfect point
# summarize history for accuracy
import matplotlib.pyplot as plt
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

### 初始训练完模型后

很容易看出，训练过程中虽然验证集损失一直在减少，但其正确率没有得到提高，很可能是因为模型受局限于卷积层固定的权重。

这主要是因为我们只训练了最后几层模型，而没有对前面的层进行训练，现在开始结合VGG16的顶层卷积层来训练模型。

In [None]:
# train 
def fine_tune(model):
    for layer in model.layers[:-11]:
        layer.trainable = False
    for layer in model.layers[-11:]:
        layer.trainable = True

In [None]:
fine_tune(model)
# setting hyperparameters
batch_size = 32
epochs = 10
learning_rate = 0.0001

sgd = SGD(lr=learning_rate, momentum=0.9, nesterov=True)
model.compile(optimizer = 'sgd',loss = 'categorical_crossentropy', metrics = ['accuracy'])
model.summary()

In [None]:
train_history_2 = model.fit