In [1]:
% matplotlib inline
import json

from tensorflow.python.keras.saving.save import load_model

from gs_ac_gan_training import f1_score_with_penalty
from utils.util import *

discriminator_folder = "../experiment_results/discriminator_0.8_test"
test_path = "../resource/test_0.2_super_pop.csv"
target_column = "Superpopulation code"

UsageError: Line magic function `%` not found.


In [None]:
# Restore the saved discriminator. `f1_score_with_penalty` is handed over as a
# custom object so that the loader can resolve it by name.
discriminator = load_model(
    os.path.join(discriminator_folder, "discriminator"),
    custom_objects=dict(f1_score_with_penalty=f1_score_with_penalty),
)

In [None]:
# Load the class-id map stored next to the model so that labels are encoded
# with the same ids here (presumably class name -> integer id -- confirm
# against the code that writes class_id_map.json).
# The file is decoded as UTF-8 explicitly; without `encoding`, open() falls
# back to a locale-dependent default.
with open(os.path.join(discriminator_folder, "class_id_map.json"), 'r', encoding='utf-8') as file:
    class_to_id = json.load(file)

class_to_id

In [None]:
# Read the test samples from the CSV configured in `test_path`.
test_set = pd.read_csv(test_path)
# The column selection is computed on the unfiltered frame. The list names the
# sample-id and label columns (presumably the columns to leave out of the
# features -- TODO confirm against get_relevant_columns).
relevant_columns = get_relevant_columns(test_set, [SAMPLE_COLUMN_NAME, target_column])
# NOTE(review): 10 looks like the minimum number of examples a class needs for
# its samples to be kept; confirm it matches the threshold used for training.
test_set = filter_samples_by_minimum_examples(10, test_set, target_column)
test_set

In [None]:
# Build the true labels from the label column using the loaded class-id map
# (only the third value returned by the helper is used), then reduce each row
# to the index of its largest entry, i.e. one integer class id per sample.
_, _, y_real = extract_y_column(class_to_id, test_set, target_column)
y_real = tensorflow.argmax(y_real, axis=1)

# Build the feature matrix and subtract uniform noise drawn from [0, 0.1) from
# every entry (presumably mirroring the noise used during training -- confirm).
# The generator is seeded so that a re-run of the notebook evaluates the model
# on exactly the same perturbed inputs and therefore reproduces every figure.
x_values = extract_x_values(test_set, relevant_columns, target_column)
noise_rng = np.random.default_rng(42)
x_values = x_values - noise_rng.uniform(0, 0.1, size=(x_values.shape[0], x_values.shape[1]))
x_values

In [None]:
# The model yields two outputs for the batch; only the second one is used.
_, class_predictions = discriminator.predict_on_batch([x_values])
# Predicted class id = position of the largest value in each row.
y_pred = tensorflow.argmax(class_predictions, axis=1)
# Tally how many samples were assigned to each predicted class id.
uniques, counts = np.unique(y_pred, return_counts=True)
class_id_to_counts = {class_id: count for class_id, count in zip(uniques, counts)}
print(class_id_to_counts)

In [None]:
# Tick labels shared by the plots below: class names ordered by their integer
# id, so that axis position i is labelled with the class whose id is i.
# Assumes class_to_id maps class name -> id in 0..n-1, as its name suggests --
# confirm against class_id_map.json.
labels = [name for name, _ in sorted(class_to_id.items(), key=lambda item: item[1])]


def plot_confusion_matrix(y_real, y_pred):
    """Plot a heat map of the confusion matrix of true vs. predicted class ids.

    Rows correspond to the true class id and columns to the predicted class
    id; cell (i, j) holds the number of samples with true id i and predicted
    id j. The matrix size and the tick labels come from the module-level
    `labels` list.

    Args:
        y_real: 1-D sequence of true class ids (anything `np.asarray` accepts).
        y_pred: 1-D sequence of predicted class ids, same length as `y_real`.
    """
    n_classes = len(labels)
    true_ids = np.asarray(y_real).astype(int)
    pred_ids = np.asarray(y_pred).astype(int)

    # np.add.at accumulates repeated (row, column) pairs, which plain fancy
    # indexing with `+=` would count only once.
    cm = np.zeros((n_classes, n_classes))
    np.add.at(cm, (true_ids, pred_ids), 1)

    # plot a heat map of the confusion matrix
    fig, ax = plt.subplots(figsize=(8, 8))
    ax.imshow(cm, cmap='Blues')
    ax.set_xticks(np.arange(n_classes))
    ax.set_yticks(np.arange(n_classes))
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

    # 'Blues' runs from near-white (low counts) to dark blue (high counts), so
    # white text is only readable on the darker half of the colour range.
    threshold = cm.max() / 2 if cm.size else 0
    for i in range(n_classes):
        for j in range(n_classes):
            text_color = 'w' if cm[i, j] > threshold else 'k'
            ax.text(j, i, int(cm[i, j]),
                    ha='center', va='center', color=text_color)
    ax.set_title('Confusion matrix')
    plt.show()

In [None]:
def plot_bar_comparison(y_real, y_pred):
    """Draw paired bars with the number of true and predicted samples per class.

    For every entry of the module-level `labels` list one blue bar (true
    count) and one orange bar (predicted count) are drawn next to each other,
    each annotated with its count.

    Args:
        y_real: indexable sequence of true class ids.
        y_pred: indexable sequence of predicted class ids, read at the same
            indices as `y_real`.
    """
    fig, ax = plt.subplots()
    n_classes = len(labels)
    true_counts = np.zeros(n_classes)
    pred_counts = np.zeros(n_classes)
    bar_width = 0.4
    x_pos = np.arange(n_classes)

    # Tally both label series sample by sample.
    for sample in range(len(y_real)):
        true_counts[int(y_real[sample])] += 1
        pred_counts[int(y_pred[sample])] += 1

    # The two bars of a class sit half a bar width left / right of its tick.
    left_x = x_pos - bar_width / 2
    right_x = x_pos + bar_width / 2
    ax.bar(left_x, true_counts, width=bar_width, color='blue', label='True label')
    ax.bar(right_x, pred_counts, width=bar_width, color='orange', label='Predicted label')
    ax.set_xlabel('Label')
    ax.set_ylabel('Number of samples')
    ax.set_title('True vs. predicted label distribution')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(labels)
    ax.legend()

    # Write each count just above its bar.
    for x_left, x_right, n_true, n_pred in zip(left_x, right_x, true_counts, pred_counts):
        ax.text(x_left, n_true + 0.1, str(int(n_true)), ha='center')
        ax.text(x_right, n_pred + 0.1, str(int(n_pred)), ha='center')

    plt.show()

In [None]:
# Confusion matrix of the true vs. predicted class ids computed above.
plot_confusion_matrix(y_real, y_pred)

In [None]:
# Per-class sample counts: true labels next to predicted labels.
plot_bar_comparison(y_real, y_pred)

In [None]:
def plot_dataframe_correlation(df):
    """Show the pairwise correlation of a DataFrame's columns as a heat map.

    Args:
        df: DataFrame whose `corr()` matrix is plotted; every cell of the
            heat map is annotated with its value.
    """
    correlations = df.corr()

    figure, axes = plt.subplots(figsize=(10, 8))
    sns.heatmap(correlations, annot=True, cmap='coolwarm', ax=axes)
    axes.set_title('Correlation between variables')
    plt.show()


# Put the true and predicted class ids side by side as two columns and plot
# their correlation.
# NOTE(review): the ids are class codes produced by argmax; a correlation
# coefficient between them depends on how ids were assigned to classes --
# confirm this figure is meaningful for the analysis.
plot_dataframe_correlation(
    pd.concat([pd.DataFrame(y_real, columns=['y_real']), pd.DataFrame(y_pred, columns=['y_pred'])], axis=1))

In [None]:
def plot_correlation(y_real, y_pred):
    """Scatter predicted against true class ids and overlay a least-squares line.

    Args:
        y_real: 1-D sequence of true class ids (anything `np.asarray` accepts).
        y_pred: 1-D sequence of predicted class ids, same length as `y_real`.
    """
    # Work on plain float arrays so that the arithmetic for the trend line does
    # not depend on how the input type (list, tensor, ...) overloads `*` / `+`.
    x = np.asarray(y_real, dtype=float)
    y = np.asarray(y_pred, dtype=float)

    fig, ax = plt.subplots()
    ax.scatter(x, y)
    # A degree-1 fit is only defined with at least two distinct x values; for
    # empty or constant input the scatter is shown without a trend line.
    if np.unique(x).size >= 2:
        m, b = np.polyfit(x, y, 1)
        ax.plot(x, m * x + b, color='red')
    ax.set_xlabel('True label')
    ax.set_ylabel('Predicted label')
    ax.set_title('Predicted vs true label correlation')
    plt.show()


# Scatter of predicted vs. true class ids with a fitted line.
plot_correlation(y_real, y_pred)

In [None]:
def plot_histogram(y_real, y_pred):
    """Overlay histograms of the true and predicted class ids.

    Both series are binned with the same edges, one bin per entry of the
    module-level `labels` list, centred on the integer class id.

    Args:
        y_real: 1-D sequence of true class ids.
        y_pred: 1-D sequence of predicted class ids.
    """
    fig, ax = plt.subplots()
    # Passing only a bin count would let each hist() call derive its edges
    # from its own min/max, so the two histograms would not line up whenever
    # one series lacks the lowest or highest class. Explicit shared edges at
    # id - 0.5 / id + 0.5 keep one aligned bin per class.
    class_ids = np.arange(len(labels))
    bin_edges = np.arange(len(labels) + 1) - 0.5
    ax.hist(np.asarray(y_real), bins=bin_edges, alpha=0.5, label='True label')
    ax.hist(np.asarray(y_pred), bins=bin_edges, alpha=0.5, label='Predicted label')
    ax.set_xticks(class_ids)
    ax.set_xticklabels(labels)
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    ax.set_title('True vs. predicted label histogram')
    ax.legend()
    plt.show()


# Overlaid histograms of the true and predicted class ids.
plot_histogram(y_real, y_pred)

In [None]:
def plot_box(y_real, y_pred):
    """Draw one box plot for the true and one for the predicted class ids.

    Args:
        y_real: sequence of true class ids (first box, 'True label').
        y_pred: sequence of predicted class ids (second box, 'Predicted label').
    """
    figure, axes = plt.subplots()
    axes.boxplot([y_real, y_pred], labels=['True label', 'Predicted label'])
    axes.set_ylabel('Label')
    axes.set_title('True vs. predicted label box plot')
    plt.show()


# Box plots of the true and predicted class ids.
plot_box(y_real, y_pred)

In [None]:
import numpy as np

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score


def plot_accuracy(y_true, y_pred):
    """Show the accuracy of the predictions as a single bar on a 0-1 axis.

    Args:
        y_true: true class ids, passed to `accuracy_score` as ground truth.
        y_pred: predicted class ids, compared against `y_true`.
    """
    accuracy = accuracy_score(y_true, y_pred)

    figure, axes = plt.subplots()
    axes.bar(["Accuracy"], [accuracy])
    # Fix the axis to the full 0-1 range so the bar height reads as a fraction.
    axes.set_ylim([0, 1])
    axes.set_ylabel("Accuracy")
    axes.set_title("Model Accuracy")
    plt.show()


# Overall accuracy of the predicted class ids against the true ones.
plot_accuracy(y_real, y_pred)
