In [23]:
import os 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import ast

from tensorflow.config.experimental import list_physical_devices, set_memory_growth
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dropout, Dense
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Activation, Dropout, Flatten, Dense

import shutil
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger
from tensorflow.keras.models import load_model

from sklearn.metrics import accuracy_score
from sklearn.model_selection import ParameterGrid

from matplotlib.backends.backend_pdf import PdfPages
from datetime import datetime
from contextlib import redirect_stdout
from tensorflow.keras.utils import plot_model


In [24]:
# Turn on memory growth for every GPU that TensorFlow reports.
# Iterating over an empty list is a no-op, so no explicit length check is needed.
phys_devices = list_physical_devices('GPU')
for device in phys_devices:
    set_memory_growth(device, enable=True)

In [25]:
# Path to the directory that holds the training images:
train_path = os.sep.join(('histopathologic-cancer-detection', 'train'))

In [26]:
# NOTE(review): disabled module-level version of the generator setup. The same
# settings are used inside data_generator() further down in this notebook, so
# this cell has no effect when run.
#datagen_train = ImageDataGenerator(
 #   validation_split=0.2,
 #   rescale=1./255,
 #   width_shift_range=0.1,
 #   height_shift_range=0.1,
 #   horizontal_flip=True
#)
#datagen_val = ImageDataGenerator(validation_split=0.2, rescale=1./255)

In [27]:
# Saving the model information:
def save_model_info(model, cfg, models_path):
    """Write a text summary and a diagram of ``model`` under ``models_path``.

    Args:
        model: Object exposing ``summary()``; also handed to ``plot_model``.
        cfg: Configuration mapping; only ``cfg['name']`` is read and used as
            the file-name stem.
        models_path: Existing directory of the current run. The sub-directory
            ``model_infos`` is created inside it when missing.

    Side effects:
        Creates ``<models_path>/model_infos/<name>_summary.txt`` containing
        whatever ``model.summary()`` prints, and asks ``plot_model`` to write
        ``<models_path>/model_infos/<name>.png``.
    """
    info_dir = os.path.join(models_path, 'model_infos')
    if os.path.exists(info_dir) is False:
        os.mkdir(info_dir)

    # Common prefix of both output files.
    file_stem = f"{models_path}/model_infos/{cfg['name']}"

    with open(f"{file_stem}_summary.txt", "w") as summary_file:
        # model.summary() prints to stdout, so stdout is redirected into the file.
        with redirect_stdout(summary_file):
            model.summary()
        plot_model(model, to_file=f"{file_stem}.png", show_shapes=True, dpi=80)

In [28]:
# Learning-curve plotting function:
def learning_curves(histories, i, accuracy, model_accuracy, pdf):
    """Draw loss and accuracy curves of one configuration and add ONE page to ``pdf``.

    All training runs in ``histories`` are overlaid on the same pair of axes
    (left: loss, right: accuracy). Decorations, saving and closing happen once,
    after every run has been drawn. Previously they were inside the loop, so
    with several repeats the figure was saved once per repeat and closed
    before the remaining curves were drawn.

    Args:
        histories: Iterable of objects with ``.history`` (dict containing the
            keys ``'loss'``, ``'val_loss'``, ``'accuracy'``, ``'val_accuracy'``)
            and ``.epoch`` (list of zero-based epoch indices).
        i: Configuration number, shown in the figure title.
        accuracy: Ensemble accuracy as a fraction; shown as a percentage.
        model_accuracy: Per-model accuracies; shown via ``str()``.
        pdf: Object with ``savefig(fig)``; receives the finished figure.
    """
    fig = plt.figure(figsize=(10, 4))
    try:
        ax_loss = fig.add_subplot(1, 2, 1)
        ax_acc = fig.add_subplot(1, 2, 2)

        for hist in histories:
            hist_dict = hist.history
            epochs = [x + 1 for x in hist.epoch]  # 1-based epoch numbers for the x axis
            # Loss curves:
            ax_loss.plot(epochs, hist_dict['loss'], color="blue", linewidth=0.8)
            ax_loss.plot(epochs, hist_dict['val_loss'], color="red", linewidth=0.8)
            # Accuracy curves:
            ax_acc.plot(epochs, hist_dict['accuracy'], color="purple", linewidth=0.8)
            ax_acc.plot(epochs, hist_dict['val_accuracy'], color="orange", linewidth=0.8)

        ax_loss.set_title('Opetus- ja validointivirhe')
        ax_loss.set_xlabel('Epochs')
        ax_loss.set_ylabel('Loss')
        # The two labels attach to the first two lines drawn (train, validation).
        ax_loss.legend(['Opetusvirhe', 'Validointivirhe'], loc='upper right')

        ax_acc.set_title('Opetus- ja validointitarkkuus')
        ax_acc.set_xlabel('Epochs')
        ax_acc.set_ylabel('Accuracy')
        ax_acc.legend(['Opetustarkkuus', 'Validointitarkkuus'], loc='upper right')

        fig.suptitle(f"cfg nro. {i}: >>> {accuracy*100:.5f} {str(model_accuracy)}")
        fig.tight_layout()

        pdf.savefig(fig)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
        

In [29]:
# Building the CNN model
def build_model(cfg):
    """Build (but do not compile) the CNN described by ``cfg``.

    Layout: one 32-filter convolution, then for every entry of
    ``cfg['filters']`` a 2x2 max-pooling followed by two convolutions with
    that many filters, then Flatten -> Dense(256, relu) -> Dropout -> sigmoid
    output layer. The input shape is fixed to (96, 96, 3).

    The convolutions use the activation named in ``cfg['act']``. Previously
    that setting was ignored and every convolution was created without an
    activation, so the convolution stack was purely linear. When ``'act'`` is
    absent, the layers are created with ``activation=None`` as before.

    Args:
        cfg: Mapping with the keys ``'size'`` (kernel size), ``'filters'``
            (iterable of filter counts), ``'dout'`` (dropout rate),
            ``'num_classes'`` (units of the output layer) and optionally
            ``'act'`` (activation of the convolution layers).

    Returns:
        The constructed ``Model``. Its summary is printed as a side effect.
    """
    conv_activation = cfg.get('act')

    input_data = Input(shape=(96, 96, 3))

    x = input_data
    x = Conv2D(32, cfg['size'], padding='same', activation=conv_activation)(x)

    for n_filters in cfg['filters']:
        x = MaxPooling2D(2)(x)
        x = Conv2D(n_filters, cfg['size'], padding='same', activation=conv_activation)(x)
        x = Conv2D(n_filters, cfg['size'], padding='same', activation=conv_activation)(x)

    x = Flatten()(x)
    x = Dense(256, activation="relu")(x)
    x = Dropout(cfg['dout'])(x)

    output = Dense(cfg['num_classes'], activation='sigmoid')(x)
    model = Model(input_data, output)
    model.summary()

    return model

In [30]:
def data_generator():
    """Create the image generators used for training and validation.

    Both generators rescale by 1/255 and reserve a validation split of 0.2.
    Only the training generator additionally gets width/height shifts of 0.1
    and horizontal flipping.

    Returns:
        Tuple ``(datagen_train, datagen_val)``.
    """
    # Settings shared by both generators.
    split = 0.2
    scale = 1./255

    # Augmentation applied to the training generator only.
    augmentation = dict(
        width_shift_range=0.1,
        height_shift_range=0.1,
        horizontal_flip=True,
    )

    datagen_train = ImageDataGenerator(validation_split=split, rescale=scale, **augmentation)
    datagen_val = ImageDataGenerator(validation_split=split, rescale=scale)
    return datagen_train, datagen_val

In [31]:
def it_data(train_data, cfg):
    """Create the training and validation iterators over ``train_data``.

    Both iterators read the same dataframe and image directory
    (``train_path``) and differ only in the subset they select and in
    whether they shuffle.

    Args:
        train_data: Dataframe with the columns ``'id'`` and ``'label'``.
        cfg: Configuration mapping; ``cfg['b_size']`` is the batch size.

    Returns:
        Tuple ``(train_generator, val_generator)``.
    """
    datagen_train, datagen_val = data_generator()

    # Arguments that are identical for both subsets.
    shared = dict(
        dataframe=train_data,
        directory=train_path,
        class_mode='binary',
        target_size=(96, 96),
        batch_size=cfg['b_size'],
        x_col='id',
        y_col='label',
    )

    train_generator = datagen_train.flow_from_dataframe(subset='training', shuffle=True, **shared)
    val_generator = datagen_val.flow_from_dataframe(subset='validation', shuffle=False, **shared)
    return train_generator, val_generator

In [32]:
# Training the model: compile and fit:
def train_model(train_data, cfg, models_path):
    """Build, compile and fit one model for ``cfg`` and return it with its history.

    Fixes compared with the earlier version:
      * The configured ``Adam`` instance is now passed to ``compile``.
        Previously the string ``'sgd'`` was passed, so the Adam optimizer
        and ``cfg['lr']`` were never used.
      * ``ModelCheckpoint`` gets ``save_best_only=True`` so that the
        ``monitor='val_accuracy'`` / ``mode='max'`` settings take effect and
        the weights loaded after fitting are those of the best epoch, not
        simply the last one.
      * The log file is closed after writing, and the temporary checkpoint
        directory is removed even when loading the weights fails.

    Args:
        train_data: Dataframe forwarded to ``it_data``.
        cfg: Configuration mapping; this function reads ``'lr'`` and
            ``'epochs'`` and forwards the mapping to ``it_data`` and
            ``build_model``.
        models_path: Directory of the current run. ``training.log`` is
            appended to there and a temporary ``tmp`` sub-directory holds the
            checkpoint while training.

    Returns:
        Tuple ``(model, history)`` where ``history`` is the return value of
        ``model.fit``.
    """
    # Image handling, training and validation data:
    train_generator, val_generator = it_data(train_data, cfg)

    # Building the model:
    model = build_model(cfg)

    # Compiling the model with the configured optimizer instance:
    optimizer = Adam(amsgrad=True, learning_rate=cfg['lr'])
    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    # Paths used for checkpointing and logging:
    checkpoint_dir = os.path.sep.join([models_path, 'tmp'])
    checkpoint_filepath = os.path.sep.join([checkpoint_dir, 'checkpoint'])
    log_path = os.path.sep.join([models_path, 'training.log'])
    cbs_list = [
        ModelCheckpoint(
            checkpoint_filepath,
            monitor='val_accuracy',
            mode='max',
            save_best_only=True,
            save_weights_only=True,
            verbose=0,
        ),
        CSVLogger(log_path, append=True),
    ]

    # Fitting:
    history = model.fit(
        train_generator,
        steps_per_epoch=len(train_generator),
        epochs=cfg['epochs'],
        validation_data=val_generator,
        validation_steps=len(val_generator),
        verbose=1,
        callbacks=cbs_list,
    )

    # Separator line between runs in the shared training log.
    with open(log_path, 'a') as log_file:
        log_file.write(" \n")

    try:
        model.load_weights(checkpoint_filepath)
    finally:
        # ignore_errors keeps a cleanup problem from hiding a load_weights error.
        shutil.rmtree(checkpoint_dir, ignore_errors=True)

    return model, history
    

In [33]:
# Evaluating the models (predictions):
def evaluate_models(models, test_data):
    """Score every model on ``test_data`` and score their accuracy-weighted blend.

    Args:
        models: Sequence of objects exposing ``predict(generator, steps=...)``.
        test_data: Dataframe with the columns ``'id'`` and ``'label'``; the
            images are read from ``train_path``.

    Returns:
        Tuple ``(model_accuracy, accuracy_weighted)``: a NumPy array with one
        accuracy per model (rounded to 6 decimals) and the accuracy of the
        ensemble whose predictions are averaged with those accuracies as
        weights. Predictions are thresholded at 0.5.
    """
    # The iterator yields one image per batch and keeps the dataframe order
    # (shuffle=False), so predictions line up with ``test_generator.labels``.
    test_generator = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
        dataframe=test_data,
        directory=train_path,
        target_size=(96, 96),
        batch_size=1,
        class_mode='binary',
        x_col='id',
        y_col='label',
        shuffle=False
    )

    # Prediction, one array per model:
    all_preds = []
    for model in models:
        all_preds.append(model.predict(test_generator, steps=len(test_generator)))

    # Accuracy of each individual model:
    labels = test_generator.labels
    per_model = []
    for preds in all_preds:
        hard_preds = (preds > 0.5).astype(int)
        per_model.append(accuracy_score(labels, hard_preds))
    model_accuracy = np.array(per_model).round(6)

    # Accuracy of the weighted ensemble:
    blended = np.average(all_preds, axis=0, weights=model_accuracy)
    preds_weighted = (blended > 0.5).astype(int)
    accuracy_weighted = accuracy_score(labels, preds_weighted)

    return model_accuracy, accuracy_weighted

In [34]:
# Grid search:
def grid_search(train_data, test_data, cfgs, models_path, pdf, n_repeats):
    """Train and evaluate every configuration and keep the best results.

    For each configuration ``n_repeats`` models are trained, evaluated
    individually and as an accuracy-weighted ensemble, and their learning
    curves are added to ``pdf``.

    Fixes compared with the earlier version:
      * Log and config files are written through ``with`` blocks, so the
        handles are closed.
      * ``pdf`` is closed in a ``finally`` clause, so it is also closed when
        training or evaluation raises.
      * Both leaderboards are sorted (best first) after every configuration.
        Previously they were only sorted once they exceeded their size limit,
        so short searches returned them in insertion order.

    Args:
        train_data: Dataframe forwarded to ``train_model``.
        test_data: Dataframe forwarded to ``evaluate_models``.
        cfgs: Mapping ``{number: cfg}``. Each ``cfg`` gets a ``'name'`` key
            (``"CNN-<number>"``) added in place.
        models_path: Directory that receives ``training.log`` and
            ``all_config.txt`` (both appended to).
        pdf: Object with ``close()``; passed on to ``learning_curves`` and
            closed before returning.
        n_repeats: Number of models trained per configuration.

    Returns:
        Tuple ``(scores_avg, scores_best)``:
          * ``scores_avg``: at most 10 tuples
            ``(number, ensemble_accuracy, per_model_accuracies, models)``,
            sorted by ensemble accuracy, best first.
          * ``scores_best``: at most 5 tuples ``(name, accuracy, model)`` for
            individual models, sorted by accuracy, best first.
    """
    scores_avg = []
    scores_best = []

    log_path = os.path.sep.join([models_path, 'training.log'])
    config_path = os.path.sep.join([models_path, 'all_config.txt'])

    print(f"Yhteensä {len(cfgs)} konfiguraatiota, {n_repeats} toistolla, aloitetaan... {models_path}")
    try:
        for i, cfg in cfgs.items():
            cfg['name'] = f"CNN-{i}"
            with open(log_path, 'a') as log_file:
                log_file.write(f"Nykyinen konfiguraatio {i}: {cfg} \n\n")

            # Training the models:
            model_histories = [train_model(train_data, cfg, models_path) for _ in range(n_repeats)]
            # Split the (model, history) pairs into two lists:
            models = [model for model, history in model_histories]
            histories = [history for model, history in model_histories]
            save_model_info(models[0], cfg, models_path)

            # Evaluating the models:
            model_accuracy, accuracy_weighted = evaluate_models(models, test_data)

            # Plotting the curves:
            learning_curves(histories, i, accuracy_weighted, model_accuracy, pdf)

            # Leaderboard of configurations (ensemble accuracy), top 10:
            scores_avg.append((i, accuracy_weighted, model_accuracy, models))
            scores_avg.sort(key=lambda tup: tup[1], reverse=True)
            del scores_avg[10:]

            # Leaderboard of individual models, top 5:
            for j, (accuracy, model) in enumerate(zip(model_accuracy, models), 1):
                scores_best.append((f"{i}_v{j}", accuracy, model))
            scores_best.sort(key=lambda tup: tup[1], reverse=True)
            del scores_best[5:]

            # Record the configuration that was just processed:
            with open(config_path, 'a') as config_file:
                config_file.write(f'{i}: {cfg} \n')
    finally:
        pdf.close()

    return scores_avg, scores_best
        

In [35]:
# Load the data from the csv files: training and evaluation data
def get_data():
    """Read the training and evaluation tables and cast their key columns to strings.

    Reads ``train_data.csv`` and ``eval_data.csv`` from the
    ``histopathologic-cancer-detection`` directory (relative to the working
    directory) and converts the ``'id'`` and ``'label'`` columns of both
    frames to the pandas ``'string'`` dtype.

    Returns:
        Tuple ``(train_data, eval_data)`` of dataframes.
    """
    data_path = 'histopathologic-cancer-detection'

    # Both files are read first, then the columns are converted.
    train_data, eval_data = (
        pd.read_csv(data_path + file_name)
        for file_name in ('/train_data.csv', '/eval_data.csv')
    )

    for frame in (train_data, eval_data):
        for column in ('id', 'label'):
            frame[column] = frame[column].astype('string')

    return train_data, eval_data

In [36]:
# Load the parameter grid and expand it into numbered configurations.
# The file is read through a ``with`` block so the handle is closed.
with open("param_grid.txt", "r") as grid_file:
    param_grid = ast.literal_eval(grid_file.read())

cfgs = {i: cfg for i, cfg in enumerate(ParameterGrid(param_grid), 1)}
for key, value in cfgs.items():
    print(key, value)

# Loading the data
train_data, test_data_known = get_data()

# Create a new, timestamped folder for the models of this run:
time = datetime.now().strftime('%Y%m%dT%H%M')
models_path = f"gs_dnn_ensemble_{time}"
os.makedirs(models_path, exist_ok=True)

# grid_search() closes this PDF when it is done.
pdf = PdfPages(os.path.join(models_path, "learning_curves.pdf"))

# Number of repeats per configuration:
n_repeats = 1

# Grid search:
scores_avg, scores_best = grid_search(train_data, test_data_known, cfgs, models_path, pdf, n_repeats)

# Save the ensemble accuracies of the best configurations:
with open(os.path.sep.join([models_path, 'avg_accuracy.txt']), 'a') as avg_file:
    for i, accuracy_avg, model_accuracy, models in scores_avg:
        avg_file.write(f"{i}: {accuracy_avg*100:.3f} {model_accuracy}\n")

# Save the accuracies of the best individual models and the models themselves.
# The target directory is created explicitly instead of relying on the saver.
saved_models_dir = os.path.sep.join([models_path, 'models'])
os.makedirs(saved_models_dir, exist_ok=True)

with open(os.path.sep.join([models_path, 'best_accuracy.txt']), 'a') as best_file:
    for name, accuracy, model in scores_best:
        best_file.write(f"{name}: {accuracy*100:.3f}\n")
        filepath = os.path.sep.join([saved_models_dir, f'{name}_{accuracy*100:.3f}.hdf5'])
        # Skip models already written. The earlier check looked for
        # "<name>.hdf5" directly in models_path, which never matched the
        # files actually written under models/.
        if not os.path.exists(filepath):
            print(filepath)
            model.save(filepath)

1 {'act': 'relu', 'b_size': 32, 'dout': 0.6, 'epochs': 15, 'filters': [64, 128, 256], 'lr': 0.001, 'num_classes': 1, 'size': 3}
2 {'act': 'relu', 'b_size': 32, 'dout': 0.6, 'epochs': 15, 'filters': [64, 128, 256], 'lr': 0.0001, 'num_classes': 1, 'size': 3}
3 {'act': 'relu', 'b_size': 32, 'dout': 0.5, 'epochs': 15, 'filters': [64, 128, 256], 'lr': 0.001, 'num_classes': 1, 'size': 3}
4 {'act': 'relu', 'b_size': 32, 'dout': 0.5, 'epochs': 15, 'filters': [64, 128, 256], 'lr': 0.0001, 'num_classes': 1, 'size': 3}
5 {'act': 'relu', 'b_size': 32, 'dout': 0.4, 'epochs': 15, 'filters': [64, 128, 256], 'lr': 0.001, 'num_classes': 1, 'size': 3}
6 {'act': 'relu', 'b_size': 32, 'dout': 0.4, 'epochs': 15, 'filters': [64, 128, 256], 'lr': 0.0001, 'num_classes': 1, 'size': 3}
Yhteensä 6 konfiguraatiota, 1 toistolla, aloitetaan... gs_dnn_ensemble_20231210T1624
Found 114340 validated image filenames belonging to 2 classes.
Found 28584 validated image filenames belonging to 2 classes.
Model: "model_1"
__