In [233]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tensorflow_addons as tfa
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [234]:
data = pd.read_csv('./Image_classification_data/data_labels_mainData.csv')
# data['isCancerous'] = data['isCancerous'].astype(str)
# data['cellType'] = data['cellType'].astype(str)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [235]:
class_samples_isCancerous = train_data['isCancerous'].value_counts()
total_samples = np.sum(class_samples_isCancerous)
class_weights_isCancerous = total_samples / class_samples_isCancerous
class_weight_dict_isCancerous = {int(k): v for k, v in class_weights_isCancerous.to_dict().items()}

class_samples_cellType = train_data['cellType'].value_counts()
total_samples = np.sum(class_samples_cellType)
class_weights_cellType = total_samples / class_samples_cellType
class_weight_dict_cellType = {int(k): v for k, v in class_weights_cellType.to_dict().items()}

In [236]:
batch_size = 32
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255
)
test_data['cellType'] = test_data['cellType'].astype(str)
validation_cellType_generator = test_datagen.flow_from_dataframe(
    test_data,
    directory='./Image_classification_data/patch_images',
    x_col='ImageName',
    y_col='cellType',
    target_size=(27, 27),
    batch_size=batch_size,
    class_mode='categorical')

Found 1980 validated image filenames belonging to 4 classes.


In [237]:
input_shape = (27, 27, 3)
num_classes = 4
l2_coeff = 0.01

model_categorical = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, padding='same',
                           kernel_regularizer=tf.keras.regularizers.l2(l2_coeff)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_coeff)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', padding='same',
                           kernel_regularizer=tf.keras.regularizers.l2(l2_coeff)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_coeff)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling2D(2, 2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_coeff)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_coeff)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])
model_categorical.summary()

Model: "sequential_56"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_100 (Conv2D)         (None, 27, 27, 32)        896       
                                                                 
 batch_normalization_150 (Ba  (None, 27, 27, 32)       128       
 tchNormalization)                                               
                                                                 
 conv2d_101 (Conv2D)         (None, 25, 25, 32)        9248      
                                                                 
 batch_normalization_151 (Ba  (None, 25, 25, 32)       128       
 tchNormalization)                                               
                                                                 
 max_pooling2d_50 (MaxPoolin  (None, 12, 12, 32)       0         
 g2D)                                                            
                                                     

# DI/HD 使用data_labels_extraData对多分类的模型进行增强
通过半监督学习。我们将采用UDA。
首先先获取额外的数据集并且进行相关处理。
我们从数据集可以观察到，没有癌症在多分类中为2。所以我们可以将不是癌症的样本之间指定其多分类的类别为2.

In [238]:
model_categorical = tf.keras.models.load_model('saved_model/model_categorical')
data_extra = pd.read_csv('./Image_classification_data/data_labels_extraData.csv')
data_extra['isCancerous'] = data_extra['isCancerous'].astype(str)
data_extra['cellType'] = np.nan
data_extra.loc[data_extra['isCancerous'] == '1', 'cellType'] = 2
data_extra_unlabeled = data_extra[data_extra['cellType'].isna()]
data_extra_labeled = data_extra[data_extra['cellType'] == 2]
train_data = pd.concat([train_data, data_extra_labeled], axis=0).reset_index(drop=True)

In [239]:
train_data['cellType'] = train_data['cellType'].astype(str)
data_extra_unlabeled['isCancerous'] = data_extra_unlabeled['isCancerous'].astype(str)
train_generator = train_datagen.flow_from_dataframe(
    train_data,
    directory='./Image_classification_data/patch_images',
    x_col='ImageName',
    y_col='cellType',
    target_size=(27, 27),
    batch_size=batch_size,
    class_mode='categorical')

unlabeled_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='nearest'
)

unlabeled_generator = unlabeled_datagen.flow_from_dataframe(
    data_extra_unlabeled,
    directory='./Image_classification_data/patch_images',
    x_col='ImageName',
    y_col='isCancerous',
    target_size=(27, 27),
    batch_size=batch_size,
    class_mode='categorical')

Found 10906 validated image filenames belonging to 4 classes.
Found 7394 validated image filenames belonging to 1 classes.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_extra_unlabeled['isCancerous'] = data_extra_unlabeled['isCancerous'].astype(str)


In [240]:
consistency_weight = 1
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.0001)

# 定义数据增强策略
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.experimental.preprocessing.RandomFlip("horizontal"),
    tf.keras.layers.experimental.preprocessing.RandomRotation(0.1),
])

# 训练循环
def apply_uda(x_batch, y_batch, x_unlabeled, model, optimizer, training=True):
    x_unlabeled_augmented = None
    if x_unlabeled is not None:
        # 对无标签数据进行数据增强
        x_unlabeled_augmented = data_augmentation(x_unlabeled)

    # 计算模型在原始无标签数据和增强无标签数据上的输出
    with tf.GradientTape() as tape:
        y_pred_labeled = model(x_batch)
        if x_unlabeled is not None:
            y_pred_unlabeled = model(x_unlabeled)
            y_pred_unlabeled_augmented = model(x_unlabeled_augmented)

        # 计算有监督损失
        supervised_loss = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(y_batch, y_pred_labeled))

        # 如果提供了无标签数据，则计算一致性损失
        if x_unlabeled is not None:
            consistency_loss = tf.reduce_mean(tf.keras.losses.mean_squared_error(y_pred_unlabeled, y_pred_unlabeled_augmented))
            total_loss = supervised_loss + consistency_weight * consistency_loss
        else:
            total_loss = supervised_loss

        # 计算准确度
        accuracy = tf.keras.metrics.categorical_accuracy(y_batch, y_pred_labeled)

    if training:
        # 反向传播和优化
        grads = tape.gradient(total_loss, model_categorical.trainable_variables)
        optimizer.apply_gradients(zip(grads, model_categorical.trainable_variables))
    return total_loss, supervised_loss, consistency_loss if x_unlabeled is not None else None, accuracy

epochs = 100

min_val_loss = float('inf')
num_epochs_no_improvement = 0
# current_learning_rate = optimizer.learning_rate.numpy()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    total_loss = 0
    total_supervised_loss = 0
    total_consistency_loss = 0
    total_accuracy = 0
    steps = 0

    with tqdm(total=len(data_extra_unlabeled) / batch_size, desc="Training", ncols=100) as progress_bar:
        for (x_batch, y_batch), (x_unlabeled, _) in zip(train_generator, unlabeled_generator):
            progress_bar.update(1)
            loss, supervised_loss, consistency_loss, accuracy = apply_uda(x_batch, y_batch, x_unlabeled, model_categorical, optimizer)
            total_loss += loss
            total_supervised_loss += supervised_loss
            total_consistency_loss += consistency_loss
            total_accuracy += tf.reduce_mean(accuracy)
            steps += 1
            # 检查是否已经处理了所有批次
            if steps * batch_size >= len(train_data):
                break

    # 计算并打印平均损失和准确率
    avg_loss = total_loss / steps
    avg_supervised_loss = total_supervised_loss / steps
    avg_consistency_loss = total_consistency_loss / steps
    avg_accuracy = total_accuracy / steps
    print(f" - loss: {avg_loss.numpy():.4f}, supervised_loss: {avg_supervised_loss.numpy():.4f}, consistency_loss: {avg_consistency_loss.numpy():.4f}, accuracy: {avg_accuracy.numpy():.4f}")

    total_loss = 0
    total_accuracy = 0
    steps = 0
    for x_batch, y_batch in validation_cellType_generator:
        loss, _, _, accuracy = apply_uda(x_batch, y_batch, None, model_categorical, optimizer, False)
        total_loss += loss
        total_accuracy += tf.reduce_mean(accuracy)
        steps += 1
        # 检查是否需要更新学习率
        # if loss < min_val_loss:
        #     min_val_loss = loss
        #     num_epochs_no_improvement = 0
        # else:
        #     num_epochs_no_improvement += 1
        #
        # if num_epochs_no_improvement >= 3:
        #     current_learning_rate *= np.sqrt(0.1)
        #     current_learning_rate = max(current_learning_rate, 0.5e-15)
        #     optimizer.learning_rate.assign(current_learning_rate)
        #     num_epochs_no_improvement = 0

        if steps * batch_size >= len(test_data):
            break

    avg_loss = total_loss / steps
    avg_accuracy = total_accuracy / steps
    print(f"Validation set： - loss: {avg_loss.numpy():.4f}, accuracy: {avg_accuracy.numpy():.4f}")

Epoch 1/100


Training: 341it [00:31, 10.99it/s]                                                                  


 - loss: 0.3758, supervised_loss: 0.3689, consistency_loss: 0.0069, accuracy: 0.8620
Validation set： - loss: 0.5323, accuracy: 0.7993
Epoch 2/100


Training: 341it [00:31, 10.97it/s]                                                                  


 - loss: 0.3577, supervised_loss: 0.3513, consistency_loss: 0.0064, accuracy: 0.8697
Validation set： - loss: 0.5333, accuracy: 0.7987
Epoch 3/100


Training: 341it [00:31, 10.94it/s]                                                                  


 - loss: 0.3552, supervised_loss: 0.3485, consistency_loss: 0.0067, accuracy: 0.8701
Validation set： - loss: 0.5328, accuracy: 0.7990
Epoch 4/100


Training:  54%|████████████████████████▋                     | 124/231.0625 [00:11<00:09, 10.85it/s]


KeyboardInterrupt: 