In [1]:
from gs_ac_gan_training import prepare_test_and_fake_dataset
import catboost
from catboost import CatBoostClassifier, Pool

from utils.util import *
%matplotlib inline
# Folder that receives the CSVs and the figure written by the cells below.
output_folder = "classifier_analysis"
# CSV handed to init_dataset as the training data source.
hapt_genotypes_path = '../resource/train_0.8_super_pop.csv'
# Passed as the first argument to prepare_test_and_fake_dataset.
experiment_results = '../experiment_results/f1_score_macro'
# File name (inside output_folder) of the metrics CSV written by the evaluation function.
output_results_path = 'catboost_classification_results.csv'
# Name of the label column, forwarded to init_dataset.
target_column = 'Superpopulation code'
# init_dataset returns four items; only the (features, labels) training pair and the
# class-name -> id mapping are kept, the two middle items are discarded.
(x_train, y_train), _, _, class_to_id = init_dataset(hapt_genotypes_path=hapt_genotypes_path,
                                                     target_column=target_column, without_extra_data=True)
# Reverse lookup (id -> class name) used to translate predictions back to names.
id_to_class = {v: k for k, v in class_to_id.items()}

# Held-out data on which the classifier is scored later in the notebook.
test_dataset = prepare_test_and_fake_dataset(experiment_results, test_path="../resource/test_0.2_super_pop.csv")
# Collapse the last axis to the index of its largest entry, giving one integer class id
# per sample (presumably y_train is one-hot encoded -- TODO confirm against init_dataset).
y_train = np.argmax(y_train, axis=-1)
print(y_train)


KeyboardInterrupt



In [None]:
# Wrap the training features and integer class ids in a CatBoost Pool.
train_pool = Pool(data=x_train, label=y_train)
# Create the CatBoost classifier. verbose=100 logs every 100th iteration instead of
# all 1000, so the training log does not flood the notebook output.
clf = CatBoostClassifier(iterations=1000, learning_rate=0.1, verbose=100)
# Train the classifier
clf.fit(train_pool)

In [None]:
def evaluate_and_save_catboost_model(test_dataset, prefix=""):
    """Score the notebook-level classifier ``clf`` on ``test_dataset`` and save the results.

    Two CSV files are written into the notebook-level ``output_folder``
    (created if it does not exist yet):

    * ``<prefix>discriminator_pred_on_test.csv`` -- one row per sample with the
      predicted and real class ids and their names (via ``id_to_class``).
    * ``<prefix>`` + ``output_results_path`` -- a single row holding the output of
      ``compute_metrics`` plus an ``epoch`` column fixed to 1.

    Args:
        test_dataset: Indexable whose first element exposes ``.shape[0]`` (the
            sample count) and which ``tensorflow.data.Dataset.from_tensor_slices``
            can slice into ``(features, labels)`` pairs.
        prefix: String prepended to both output file names.

    Returns:
        None. The function is used for its file-writing side effects.
    """
    n_samples = test_dataset[0].shape[0]

    # Make sure the destination exists; DataFrame.to_csv does not create folders.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    # Shuffle buffer and batch size both equal the sample count, so the dataset is
    # emitted as one fully shuffled batch and the loop body below runs once.
    shuffled_dataset = tensorflow.data.Dataset.from_tensor_slices(test_dataset).shuffle(
        n_samples).batch(n_samples, drop_remainder=True)

    for x_batch_real, y_batch_real in shuffled_dataset:
        test_pool = Pool(np.array(x_batch_real))
        class_predictions = clf.predict(test_pool)

        # Per-sample table: ids first, then human-readable class names.
        class_classifier_results = pd.DataFrame(
            {"class_pred": np.array(class_predictions.flatten()), "class_real": np.array(y_batch_real)})
        class_classifier_results["class_name_real"] = class_classifier_results["class_real"].replace(id_to_class)
        class_classifier_results["class_name_pred"] = class_classifier_results["class_pred"].replace(id_to_class)
        class_classifier_results.to_csv(
            os.path.join(output_folder, prefix + "discriminator_pred_on_test.csv"), index=False)

        # Aggregate metrics, stored as a one-row table.
        row_results = compute_metrics(class_classifier_results["class_real"], class_classifier_results["class_pred"])
        row_results["epoch"] = 1
        class_metric_results = pd.DataFrame([row_results])

        class_metric_results.to_csv(os.path.join(output_folder, prefix + output_results_path), index=False)


In [None]:
# Score the trained classifier on test_dataset; the CSVs are written without a file-name prefix.
evaluate_and_save_catboost_model(test_dataset)

In [None]:
# Load the generated sequences through the same helper that loads the test set,
# this time with from_generated=True.
fake_dataset = prepare_test_and_fake_dataset(
    experiment_results,
    test_path="../fake_genotypes_sequences/new_sequences/full_pop/10001_genotypes.hapt",
    from_generated=True,
)

In [None]:
# Score the same classifier on fake_dataset; "fake_pop_" is prepended to both output file names.
evaluate_and_save_catboost_model(fake_dataset, prefix="fake_pop_")

In [None]:
from sklearn.metrics import cohen_kappa_score

def plot_classifications_confusion_matrix(file_paths, output_dir, experiments_rename, prefix=""):
    """Plot one confusion matrix per predictions CSV in a shared figure and save it.

    Every CSV must provide the columns ``class_name_real`` and ``class_name_pred``.
    Each panel is titled with the matching entry of ``experiments_rename`` and the
    Cohen's kappa score of that file. The figure is saved as
    ``<prefix>compare_confusion_matrix.jpg`` inside ``output_dir`` and then shown.

    Args:
        file_paths: Sequence of paths to prediction CSV files (at least one).
        output_dir: Folder the figure is written to; created if missing.
        experiments_rename: Panel titles, indexed in the same order as ``file_paths``.
        prefix: String prepended directly to the output file name.

    Raises:
        ValueError: If ``file_paths`` is empty.
    """
    n_plots = len(file_paths)
    if n_plots == 0:
        raise ValueError("file_paths must contain at least one predictions CSV")

    # One (confusion matrix, kappa, labels) triple per file. The labels are kept per
    # file so that each panel is annotated with the classes its matrix was built from.
    panels = []
    for path in file_paths:
        data = pd.read_csv(path)

        # Extract the actual and predicted population values from the dataframe
        actual_pop = data['class_name_real'].values
        predicted_pop = data['class_name_pred'].values

        # Use every class seen in either column, so a class that was only ever
        # predicted still gets its own column instead of being dropped.
        classes = np.unique(np.concatenate([actual_pop, predicted_pop]))

        cm = confusion_matrix(actual_pop, predicted_pop, labels=classes)
        kappa = cohen_kappa_score(actual_pop, predicted_pop)
        panels.append((cm, kappa, classes))

    # Lay the panels out in at most two columns.
    n_cols = min(2, n_plots)
    n_rows = int(np.ceil(n_plots / n_cols))

    # squeeze=False keeps `axes` a 2-D array even for a single panel, so flattening
    # works for any number of files.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(10, 5 * n_rows), squeeze=False)
    fig.suptitle('Confusion Matrix Comparison')
    axes = axes.flatten()

    for i, (cm, kappa, classes) in enumerate(panels):
        ax = axes[i]
        im = ax.imshow(cm, cmap=plt.cm.Blues)
        ax.set_title(f'{experiments_rename[i]}\nKappa Score: {kappa:.2f}')
        ax.set_xlabel('Predicted Label')
        ax.set_ylabel('True Label')
        ax.set_xticks(np.arange(len(classes)))
        ax.set_yticks(np.arange(len(classes)))
        ax.set_xticklabels(classes)
        ax.set_yticklabels(classes)

        # Add a colorbar
        fig.colorbar(im, ax=ax, shrink=0.6)

        # Annotate each cell with its count. Dark cells get white text and light cells
        # get black text so the numbers stay readable across the colormap.
        threshold = cm.max() / 2.0 if cm.size else 0
        for row in range(len(classes)):
            for col in range(len(classes)):
                ax.text(col, row, cm[row, col], ha='center', va='center',
                        color='white' if cm[row, col] > threshold else 'black')

    # Hide any unused subplots
    for i in range(n_plots, n_rows * n_cols):
        axes[i].axis('off')

    # savefig does not create missing folders.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save first, then show the plot
    plt.savefig(os.path.join(output_dir, prefix + "compare_confusion_matrix.jpg"))
    plt.show()

In [None]:
# Compare the classifier's confusion matrices on the test-set predictions and on the
# generated-sample predictions. The paths are built from output_folder so they stay in
# sync with the folder the evaluation cells write to.
plot_classifications_confusion_matrix(
    [
        os.path.join(output_folder, 'discriminator_pred_on_test.csv'),
        os.path.join(output_folder, 'fake_pop_discriminator_pred_on_test.csv'),
    ],
    output_folder,
    experiments_rename=['test_set', 'Gen-AC-GAN'],
    # Trailing underscore: the prefix is concatenated directly onto the file name.
    prefix="pop_classifier_comparison_",
)