# Author: Tobias

After some mistakes in the NanWrapper and the metric calculation, we had to re-evaluate all our models. We use this notebook to load previously trained models and re-evaluate their performance.

In [None]:
import os 
import datetime
import json
from pathlib import Path
from dotenv import load_dotenv, find_dotenv
import tensorflow as tf

# Resolve the directory the notebook kernel is currently running in.
basepath = Path(os.getcwd())
# Make sure your working directory is the repository root.
# If the current directory is not named "idp-radio-1", move three levels up
# (assumes the notebook lives three levels below the repository root -- TODO confirm).
if basepath.name != "idp-radio-1":
    os.chdir(basepath.parent.parent.parent)
# Load environment variables from the .env file located by find_dotenv().
load_dotenv(find_dotenv())
# Re-read the working directory, since the chdir above may have changed it.
basepath = Path(os.getcwd())

# IPython magics: enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2
# Show the resolved working directory as the cell output.
basepath

In [None]:
# Specify which GPU(s) to use. This is set before the session below is created.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Or 2, 3, etc. other than 0

# Alternative configuration that additionally caps the GPU device count (disabled):
#config = tf.compat.v1.ConfigProto(device_count={'GPU': 1}, allow_soft_placement=True, log_device_placement=True)
# TF1-compat session configuration with soft placement and device-placement logging enabled.
config = tf.compat.v1.ConfigProto(allow_soft_placement=True, log_device_placement=True)
# Let the GPU memory allocation grow as needed.
config.gpu_options.allow_growth = True
# Optional per-process GPU memory fraction (disabled):
#config.gpu_options.per_process_gpu_memory_fraction = 1.2
# Create a session with this configuration; the session object itself is not kept.
tf.compat.v1.Session(config=config)
# Show the TensorFlow version as the cell output.
tf.__version__

In [6]:
import numpy as np

import traceback
from sklearn.metrics import classification_report
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras import models
from tensorflow.keras.optimizers import Adam, SGD
from keras.utils.generic_utils import get_custom_objects

from tensorflow.keras.applications import InceptionV3, Xception, DenseNet121, InceptionResNetV2, ResNet152V2, NASNetLarge
from src.architectures.simple.simple_base import SimpleBaseArchitecture
from src.architectures.simple.load_model import *
from src.architectures.benchmarks.benchmark import Benchmark, Experiment
from src.architectures.benchmarks.benchmark_definitions import generate_benchmarks,simple_architecture_experiment, Chexpert_Benchmark, CHEXPERT_COLUMNS, METRICS, SINGLE_CLASS_METRICS
from src.metrics.metrics import F2Score, SingleClassMetric
from src.metrics.losses import WeightedBinaryCrossentropy, compute_class_weight

In [None]:
# Load every logged experiment and keep only the runs that
#   * do not have "Failed" in their name, and
#   * record a test-set size of exactly 234 samples in their benchmark.
# (An optional filter on "N12" in the run name is currently disabled.)
experiments = [
    run
    for run in get_all_experiment_logs()
    if "Failed" not in run["name"]
    and "num_samples_test" in run["benchmark"]
    and run["benchmark"]["num_samples_test"] == 234
]
# Number of experiments that survived the filters (cell output).
len(experiments)

In [None]:
# Take a shallow copy of the second-to-last filtered experiment for a single spot check.
exp_dict = experiments[-2].copy()
# Show its name together with the originally logged test AUC (cell output).
exp_dict["name"], exp_dict["test"]["auc"]

In [None]:
# Spot check: rebuild the selected experiment, re-run its test evaluation and
# compare the fresh results with the ones stored in the log.
print("Reevaluate model {} - {} ".format(exp_dict["name"], exp_dict["id"]), "\n")
exp = rebuild_experiment(exp_dict)
exp_dict["test_again"] = reevaluate(exp, new_metrics=True)

# Difference between the logged and the re-evaluated test AUC.
auc_delta = exp_dict["test"]["auc"] - exp_dict["test_again"]["auc"]
print(auc_delta)

# Sum of the values returned by difference_test_results for the two result sets.
metric_deltas = difference_test_results(exp_dict["test"], exp_dict["test_again"])
print("sum difference", sum(metric_deltas.values()))

In [None]:
# Re-evaluate every epoch of every selected experiment on the test and
# validation data and collect one result dict per (experiment, epoch) pair in
# `new_experiments`. A failure for one pair is printed and skipped so the
# remaining pairs are still processed.
new_experiments = []
for index, exp_dict in enumerate(experiments):
    for epoch_id in range(exp_dict["benchmark"]["epochs"]):
        # Shallow copy: only new top-level keys are added below, so the
        # original log entry is left untouched.
        new_exp_dict = exp_dict.copy()
        # Epoch numbers stored in the result are 1-based.
        new_exp_dict["epoch_model"] = epoch_id + 1
        try:
            print("\nReevaluate model {} - {} ".format(index, new_exp_dict["id"]))
            print("For Epoch {} ".format(new_exp_dict["epoch_model"]))

            # Presumably loads the model state saved for the given epoch --
            # verify against rebuild_experiment.
            exp = rebuild_experiment(exp_dict, epoch=new_exp_dict["epoch_model"])
            new_exp_dict["test_again"] = reevaluate(exp, new_metrics=True)
            new_exp_dict["val_again"] = reevaluate_validation(exp, new_metrics=True)
            # Logged validation metrics with the "val_" prefix stripped.
            # NOTE(review): res[-1] takes the LAST recorded value of each
            # metric regardless of epoch_id -- confirm whether the value for
            # the current epoch was intended instead.
            new_exp_dict["val"] = {
                name.replace("val_", ""): res[-1]
                for name, res in new_exp_dict["history"].items()
                if name.startswith("val_")
            }

            new_experiments.append(new_exp_dict)

            print("test_again auc is ", new_exp_dict["test_again"]["auc"])
            print("test auc is ", new_exp_dict["test"]["auc"])
            print("val_again auc is ", new_exp_dict["val_again"]["auc"])
            print("val auc is ", new_exp_dict["val"]["auc"])

            # Drop the reference to the rebuilt experiment before the next one is built.
            del exp
        except Exception:
            # Best effort: report the failure and continue with the next
            # epoch. Catching Exception (not a bare except) keeps
            # KeyboardInterrupt/SystemExit able to stop this long-running loop.
            print(traceback.format_exc())
        

In [None]:
# One summary row per re-evaluated model: logged vs. re-evaluated AUC on the
# test and validation data. "val auc" is None when no validation AUC was logged.
[
    {
        "name": entry["name"],
        "id": entry["id"],
        "epoch": entry["epoch_model"],
        "test auc": entry["test"]["auc"],
        "test again auc": entry["test_again"]["auc"],
        "val auc": entry["val"].get("auc"),
        "val again auc": entry["val_again"]["auc"],
    }
    for entry in new_experiments
]

In [None]:
# Re-evaluated test AUCs followed by the originally logged test AUCs, in the
# same order as new_experiments (cell output is a pair of lists).
(
    [entry["test_again"]["auc"] for entry in new_experiments],
    [entry["test"]["auc"] for entry in new_experiments],
)

In [32]:
# Convert every re-evaluated metric value to a built-in float
# (cast from np.float32) before the results are serialised to JSON.
for exp in new_experiments:
    for split in ("test_again", "val_again"):
        exp[split] = {metric: float(value) for metric, value in exp[split].items()}

# Write all re-evaluated experiments to a single JSON log file.
output_file = Path("/srv/idp-radio-1/logs/epoch_reevaluation_experiments.json")
with open(output_file, 'w') as handle:
    handle.write(json.dumps({"experiments": new_experiments}, indent=4))

In [28]:
# Recompute the overall re-evaluated test AUC for runs with exactly 12 label
# columns from their per-class "auc_<class>" values.
for exp in new_experiments:
    # Metric suffixes: lower-cased label names with spaces replaced by underscores.
    class_keys = [label.lower().replace(" ", "_") for label in exp["benchmark"]["label_columns"]]
    if len(class_keys) != 12:
        continue
    per_class_auc = [exp["test_again"]["auc_" + key] for key in class_keys]
    # NOTE(review): the sum over all classes is divided by len - 1, not len --
    # presumably one class contributes no usable AUC; confirm.
    exp["test_again"]["auc"] = sum(per_class_auc) / (len(class_keys) - 1)

In [None]:
# Rank all "N12" runs by their re-evaluated test AUC, best first.
# Each row is (experiment id, epoch, re-evaluated test AUC).
res = sorted(
    (
        (entry["id"], entry["epoch_model"], entry["test_again"]["auc"])
        for entry in new_experiments
        if "N12" in entry["name"]
    ),
    key=lambda row: row[2],
)[::-1]  # ascending stable sort, then reversed
res

In [None]:
# Inspect the metrics attached to the benchmark of the object currently bound to `exp`.
# NOTE(review): other cells rebind `exp` as a loop variable over result dicts, and
# the per-epoch re-evaluation loop deletes it; this cell only works while `exp` is
# still an object returned by rebuild_experiment -- confirm the execution order.
exp.benchmark.metrics