In [None]:
import os 
import datetime
from pathlib import Path
from dotenv import load_dotenv, find_dotenv

basepath = Path(os.getcwd())
# make sure your working directory is the repository root.
if basepath.name != "idp-radio-1":
    os.chdir(basepath.parent.parent.parent)
load_dotenv(find_dotenv())

%load_ext autoreload
%autoreload 2
os.getcwd()

In [None]:
import tensorflow as tf
import tensorflow.keras as keras
import pandas as pd
import numpy as np
from skimage.transform import resize
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from keras_preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.applications import resnet_v2
from tensorflow.keras.callbacks import TensorBoard
from src.utils.save_model import save_model, model_set

In [None]:
# Print the physical devices TensorFlow can see, one device per line.
for header, device_type in (("GPUs Available:\n", 'GPU'), ("CPU Available:\n", 'CPU')):
    devices = tf.config.experimental.list_physical_devices(device_type)
    # Break the list repr after every entry so long device lists stay readable.
    print(header, str(devices).replace("),", "),\n "))

In [None]:
# Dataset location comes from the environment (populated by load_dotenv above).
# Fail early with a clear message instead of the opaque TypeError that
# Path(None) raises when the variable is missing.
_dataset_dir = os.environ.get("CHEXPERT_DATASET_DIRECTORY")
if not _dataset_dir:
    raise EnvironmentError(
        "CHEXPERT_DATASET_DIRECTORY is not set; define it in your environment "
        "or .env file so that it points to the dataset folder."
    )
DATASET_FOLDER = Path(_dataset_dir)
SEED = 17

# The dataset name and version are taken from the last two path components.
dataset = DATASET_FOLDER.parent.name
dataset_version = DATASET_FOLDER.name
# NOTE(review): the network built later in this notebook is ResNet152V2, so
# "Resnet151V2" looks like a typo. It is kept unchanged because the name is
# used for the models directory and the saved model metadata -- confirm
# before renaming.
model_name = "Resnet151V2"
model_version = "1"
# The timestamp keeps file names from separate runs from colliding.
model_filename = model_name + "_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + ".h5"
model_description = model_name + " trained on dataset " + dataset + "_" + dataset_version + "."

model_description

In [None]:
# Load the training labels and clean them in one pipeline:
#   1. missing values become 0
#   2. rows belonging to lateral images are dropped
#   3. irrelevant metadata columns are removed
#   4. uncertain labels (-1) are treated as positive (1)
data = (
    pd.read_csv(str(DATASET_FOLDER / 'train.csv'))
    .fillna(0)
    .loc[lambda frame: ~frame['Frontal/Lateral'].str.contains("Lateral")]
    .drop(columns=["Sex", "Age", "Frontal/Lateral", "AP/PA"])
    .replace(-1, 1)
)
# For a quick smoke test on a tiny subset, uncomment the next line.
#data = data[:64]

# Seed NumPy's global RNG right before splitting so both splits are
# reproducible when this cell is re-run.
np.random.seed(SEED)
# 80/20 train/test split, then 80/20 train/validation split of the remainder.
data_train, data_test = train_test_split(data, test_size=0.2)
data_train, data_val = train_test_split(data_train, test_size=0.2)


# The dev dataset and the full dataset keep the label columns at different
# positions; this slice is defined once so switching between them is a
# one-line change.
FEATURES_SLICE = slice(1, 15)  # slice(2, 16)
data_train.columns[FEATURES_SLICE]

In [None]:
# Display all column names of the training split for inspection.
data_train.columns

In [None]:
# The ImageNet weights of ResNetV2 expect inputs scaled to [-1, 1] by
# resnet_v2.preprocess_input. A plain 1/255 rescale would feed [0, 1] images
# into the frozen convolutional base, which does not match what it was trained on.
# The same preprocessing is used for all three splits so that training,
# validation and test images are treated identically.
train_datagen = ImageDataGenerator(preprocessing_function=resnet_v2.preprocess_input)
valid_datagen = ImageDataGenerator(preprocessing_function=resnet_v2.preprocess_input)
test_datagen = ImageDataGenerator(preprocessing_function=resnet_v2.preprocess_input)

In [None]:
target_size = (224, 224)

# Arguments that are identical for the train, validation and test iterators.
shared_flow_args = dict(
    directory=DATASET_FOLDER,
    x_col='Path',
    class_mode='other',
    target_size=target_size,
)

# Training and validation batches use the generator's default shuffling.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=data_train,
    y_col=list(data_train.columns[FEATURES_SLICE]),
    batch_size=32,
    **shared_flow_args,
)
valid_generator = valid_datagen.flow_from_dataframe(
    dataframe=data_val,
    y_col=list(data_val.columns[FEATURES_SLICE]),
    batch_size=32,
    **shared_flow_args,
)
# The test iterator is not shuffled and yields one image at a time, so the
# predictions can be compared row by row with data_test.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=data_test,
    y_col=list(data_test.columns[FEATURES_SLICE]),
    shuffle=False,
    batch_size=1,
    **shared_flow_args,
)

In [None]:
# Pretrained convolutional base without its ImageNet classification head.
base_model = resnet_v2.ResNet152V2(weights='imagenet', include_top=False)

# New head: global average pooling, one hidden dense layer, and a
# 14-unit sigmoid output layer.
pooled = GlobalAveragePooling2D()(base_model.output)
x = Dense(1024, activation='relu')(pooled)
prediction_layer = Dense(14, activation='sigmoid')(x)

model = Model(
    inputs=base_model.input,
    outputs=prediction_layer,
)

In [None]:
# Keep every layer of the pretrained base fixed, so only the newly added
# head layers are updated during training.
for pretrained_layer in base_model.layers:
    pretrained_layer.trainable = False

In [None]:
# Configure training: Adam with its default settings, binary cross-entropy
# loss, and accuracy as the reported metric.
adam = keras.optimizers.Adam()
model.compile(
    loss='binary_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

In [None]:
# Run the server's get_tunnels.sh helper script as a shell command.
# NOTE(review): presumably lists the remote-access tunnels (e.g. to reach
# TensorBoard) -- confirm. The absolute path only exists on that server.
! /srv/idp-radio-1/remote_access/get_tunnels.sh

In [None]:
def _batches_per_pass(generator):
    """Return how many batches cover every sample of `generator` exactly once.

    The division rounds up, so a final partial batch is included and a split
    smaller than one batch still yields one step. Plain floor division would
    drop the remainder and return 0 steps in that case.
    """
    return -(-generator.n // generator.batch_size)


# fit model
num_epochs = 3
STEP_SIZE_TRAIN = _batches_per_pass(train_generator)
STEP_SIZE_VALID = _batches_per_pass(valid_generator)
# With batch_size=1 this is the number of test images.
STEP_SIZE_TEST = _batches_per_pass(test_generator)

# TensorBoard logs go to a timestamped sub-directory of models/<model_name>,
# so every run gets its own log folder.
models_dir = Path("models/") / model_name
log_dir = models_dir / datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=str(log_dir))

# `result.history` holds the per-epoch metrics and is stored with the model
# in a later cell.
result = model.fit_generator(generator=train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=num_epochs,
                    callbacks=[tensorboard_callback])

In [None]:
# Store the trained model together with its training history and metadata.
# The returned id is used by later cells to attach evaluation results.
model_id = save_model(
    model,
    result.history,
    model_name,
    model_filename,
    model_description,
)

In [None]:
print("predicting...")
# Reset the test iterator so prediction starts at the first test sample.
test_generator.reset()
pred = model.predict_generator(
    test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)

In [None]:
# Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
pred_bool = pred >= 0.5
y_pred = pred_bool.astype(int)

# Ground-truth labels: the same column slice that was used as the targets.
label_names = list(data_test.columns[FEATURES_SLICE])
dtest = data_test.to_numpy()
y_true = dtest[:, FEATURES_SLICE].astype(int)

# Build the per-label report and attach it to the saved model entry.
report = classification_report(y_true, y_pred, target_names=label_names)
model_id = model_set(model_id, 'classification_report', report)

In [None]:
# Evaluate on the test split. The two returned values are the loss and the
# accuracy metric configured in compile().
test_metrics = model.evaluate_generator(
    test_generator,
    steps=STEP_SIZE_TEST,
    verbose=1,
)
score, acc = test_metrics
print('Test score:', score)
print('Test accuracy:', acc)
# Attach the test results to the saved model entry.
model_id = model_set(model_id, 'test', (score, acc))