In [23]:
from datetime import datetime

from scipy.stats import stats
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier

from utils.util import *
%matplotlib inline
# ---- Paths and experiment settings -------------------------------------
output_folder = "classifier_analysis"
output_results_path = 'catboost_classification_results.csv'
experiment_results = '../experiment_results/polyloss_ce_10k_pop'
hapt_genotypes_path = '../resource/train_0.8_super_pop.csv'
target_column = 'Superpopulation code'

# ---- Training data -----------------------------------------------------
# init_dataset yields four values; only the training pair and the
# class-name -> id mapping are kept, the two middle values are discarded.
(x_train, y_train), _, _, class_to_id = init_dataset(
    hapt_genotypes_path=hapt_genotypes_path,
    target_column=target_column,
    without_extra_data=True,
)
# Reverse lookup: id -> class name.
id_to_class = {class_id: class_name for class_name, class_id in class_to_id.items()}

# ---- Held-out evaluation data ------------------------------------------
test_dataset = prepare_test_and_fake_dataset(
    experiment_results,
    test_path="../resource/test_0.2_super_pop.csv",
)

# Collapse every label row to the index of its largest entry
# (presumably the labels arrive one-hot encoded -- confirm against init_dataset).
y_train = np.argmax(y_train, axis=-1)
print(y_train)


UsageError: Line magic function `%` not found.


In [None]:
# Load the generated (synthetic) genotypes through the same helper that
# prepares the test split, this time with from_generated=True.
generated_samples = prepare_test_and_fake_dataset(
    experiment_results,
    from_generated=True,
    test_path="../resource/Genome-AC-GAN By Continental Population genotypes.hapt",
)
generated_samples

In [None]:
def concatenate_fake_data(percentage, generated_samples, x_train, y_train, seed=None):
    """Append a random subset of the generated samples to the training set.

    Args:
        percentage: Fraction of the generated samples to add. ``0``, or any
            value so small that it truncates to zero samples, leaves the
            training data untouched. Values above ``1`` are capped at the
            full generated set.
        generated_samples: Pair ``(x_generated, y_generated)`` of array-likes
            with the same length along the first axis.
        x_train: Training features; the drawn features are appended on axis 0.
        y_train: Training labels; the drawn labels are appended on axis 0.
        seed: Optional seed (or anything ``numpy.random.default_rng`` accepts)
            that makes the random draw reproducible. ``None`` draws freshly.

    Returns:
        ``((x, y), num_samples)`` where ``x``/``y`` are the training arrays
        followed by the drawn generated samples, and ``num_samples`` is the
        number of generated samples that were actually added.

    Raises:
        ValueError: If ``percentage`` is negative or the two generated arrays
            differ in length.
    """
    if percentage == 0:
        return (x_train, y_train), 0
    if percentage < 0:
        raise ValueError(f"percentage must be non-negative, got {percentage}")

    # np.asarray also accepts array-likes that expose __array__.
    x_generated = np.asarray(generated_samples[0])
    y_generated = np.asarray(generated_samples[1])
    available = len(x_generated)
    if len(y_generated) != available:
        raise ValueError(
            f"generated features and labels differ in length: {available} vs {len(y_generated)}")

    # Truncate like the original int() conversion, but never ask for more
    # samples than exist.
    num_samples = min(int(percentage * available), available)
    if num_samples == 0:
        # Nothing to add: a zero-sized draw would only produce empty arrays.
        return (x_train, y_train), 0

    # Draw distinct row indices once and use them for both arrays so that
    # every feature row stays paired with its own label.
    chosen = np.random.default_rng(seed).choice(available, size=num_samples, replace=False)
    train_dataset_with_generated_data = (
        np.concatenate((x_train, x_generated[chosen]), axis=0),
        np.concatenate((y_train, y_generated[chosen]), axis=0),
    )
    return train_dataset_with_generated_data, num_samples



In [None]:
def init_models():
    """Create a fresh, unfitted instance of every classifier under comparison.

    Returns:
        dict mapping a short model label ('NB', 'KNN', 'LGR') to its
        scikit-learn estimator, in that insertion order.
    """
    return {
        'NB': MultinomialNB(),
        'KNN': KNeighborsClassifier(),
        'LGR': LogisticRegression(multi_class='multinomial', max_iter=100),
    }

In [None]:
# Result containers filled by the loop below.
rows = []  # one dict of held-out test metrics per (model, synthetic fraction)
scores = []  # per-fold cross-validation scores, appended in the same order as `rows`
test_predictions = []
number_of_iters = 2  # NOTE(review): not read anywhere in this cell
k = 5  # number of cross-validation folds (passed as cv=)
model_types = 1  # how many times the whole classifier set is re-created and evaluated
number_of_models = 0  # running count of fitted models, used for progress output
min_synthetic_percentage = 0
max_synthetic_percentage = 1
step_synthetic_percentage = 0.05

# Build the grid of synthetic fractions from an integer step count instead of
# np.arange with a float step: the grid can never overshoot the maximum, and
# rounding strips float noise (0.15000000000000002 -> 0.15) so that the stored
# values can be matched exactly later on. The tiny epsilon absorbs division
# error when the range is an exact multiple of the step.
n_percentage_steps = int(np.floor(
    (max_synthetic_percentage - min_synthetic_percentage) / step_synthetic_percentage + 1e-9)) + 1
synthetic_percentages = np.round(
    min_synthetic_percentage + step_synthetic_percentage * np.arange(n_percentage_steps), 10)

for _model_type in range(model_types):
    classifiers = init_models()
    start_time = datetime.now()
    for model_name, clf in classifiers.items():
        for synthetic_percentage in synthetic_percentages:
            train_dataset_with_generated_data, number_of_synthetic_samples = concatenate_fake_data(
                percentage=synthetic_percentage,
                generated_samples=generated_samples,
                x_train=x_train, y_train=y_train)
            x_mixed, y_mixed = train_dataset_with_generated_data

            # Fit on the full mixed training set; this fitted model is the one
            # scored on the held-out test data below.
            clf.fit(x_mixed, y_mixed)
            # k-fold scores on the same mixed training set.
            scores.append(cross_val_score(clf, x_mixed, y_mixed, cv=k))

            test_predictions = clf.predict(test_dataset[0])
            test_score = accuracy_score(test_dataset[1], test_predictions)
            test_cohen_kappa = cohen_kappa_score(test_dataset[1], test_predictions)

            # round() rather than int(): truncation would turn e.g.
            # 0.29 * 100 == 28.999999999999996 into 28.
            percent_label = int(round(float(synthetic_percentage) * 100))
            rows.append(
                {"synthetic_percentage": synthetic_percentage,
                 "samples_and_percentage": f"{number_of_synthetic_samples}\n{percent_label}%",
                 "model_name": model_name, "accuracy": test_score,
                 "cohen_kappa": test_cohen_kappa})
            number_of_models += 1
            if number_of_models % 10 == 0:
                print(f"finished train {number_of_models} models")
    end_time = datetime.now()
    duration_minutes = (end_time - start_time).total_seconds() / 60

    print(f"finished model iteration in  {duration_minutes} minutes")


In [None]:
# Collect the per-run metric dicts into one table and persist it.
results = pd.DataFrame(rows)
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs(output_folder, exist_ok=True)
results.to_csv(os.path.join(output_folder, "classifiers_results.csv"))
results

In [None]:
def compare_number_of_samples(df, output_dir="classifier_analysis"):
    """Plot test accuracy and Cohen's kappa against the amount of synthetic data.

    Args:
        df: Results table with the columns 'samples_and_percentage' (x-axis
            label), 'accuracy', 'cohen_kappa' and 'model_name'. When a numeric
            'synthetic_percentage' column is present it defines the x-axis
            order.
        output_dir: Folder the figure is written to; created if missing.

    Returns:
        dict mapping each model name to its maximum accuracy in ``df``.
    """
    # Order rows by the numeric fraction. Sorting by the string label would be
    # lexicographic ("1000\n10%" before "500\n5%"), and the plotted category
    # order follows the row order. Fall back to the label only when the
    # numeric column is unavailable.
    if 'synthetic_percentage' in df.columns:
        sorted_df = df.sort_values('synthetic_percentage', kind='stable')
    else:
        sorted_df = df.sort_values('samples_and_percentage')

    # One panel per metric, side by side.
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(18, 5))
    panels = (('accuracy', 'Accuracy'), ('cohen_kappa', 'Cohen\'s Kappa'))
    for ax, (metric_column, metric_label) in zip(axes, panels):
        sns.pointplot(x='samples_and_percentage', y=metric_column, data=sorted_df, ax=ax)
        ax.set_xlabel('Additional Synthetic Samples')
        ax.set_ylabel(metric_label)
        ax.set_title(metric_label)

    # Adjust the layout, then save before showing the figure.
    plt.tight_layout()
    os.makedirs(output_dir, exist_ok=True)  # savefig does not create folders
    plt.savefig(os.path.join(output_dir, 'classifier_with_synthetic_compare.jpg'))
    plt.show()

    # Best accuracy reached by each model across all synthetic fractions.
    return df.groupby('model_name')['accuracy'].max().to_dict()


In [None]:
# Draw and save the accuracy / Cohen's kappa comparison figure; as the last
# expression of the cell, the returned per-model maximum accuracy is displayed.
compare_number_of_samples(results)

In [None]:
def get_p_value(tmp_df, synthetic_percentage_add_compare, target_score="accuracy"):
    """Run an independent two-sample t-test of baseline runs against synthetic runs.

    The baseline group holds the rows whose 'synthetic_percentage' is 0; the
    comparison group holds the rows whose 'synthetic_percentage' matches any
    value in ``synthetic_percentage_add_compare``.

    Args:
        tmp_df: Results table with a numeric 'synthetic_percentage' column and
            a ``target_score`` column.
        synthetic_percentage_add_compare: Synthetic fractions that form the
            comparison group (e.g. ``[0.2, 0.25]``).
        target_score: Name of the metric column to compare.

    Returns:
        The two-sided p-value as a float (NaN when a group is too small for
        the test). A summary line is also printed.
    """
    # Local imports keep the function self-contained; `from scipy import stats`
    # is the supported namespace (`scipy.stats.stats` is deprecated).
    import numpy as np
    from scipy import stats

    percentages = tmp_df["synthetic_percentage"].to_numpy(dtype=float)
    targets = np.atleast_1d(np.asarray(synthetic_percentage_add_compare, dtype=float))

    # Tolerant matching: fractions produced by float arithmetic
    # (e.g. 3 * 0.05 == 0.15000000000000002) would be missed by exact equality.
    baseline_mask = np.isclose(percentages, 0.0)
    compared_mask = np.isclose(percentages[:, None], targets[None, :]).any(axis=1)

    baseline_scores = tmp_df.loc[baseline_mask, target_score]
    compared_scores = tmp_df.loc[compared_mask, target_score]

    t_stat, p_value = stats.ttest_ind(baseline_scores, compared_scores)
    m_train = baseline_scores.mean()
    m_train_s = compared_scores.mean()
    number_of_models = compared_scores.count()

    # Report the p-value itself under its own label and its complement
    # separately, so neither number can be mistaken for the other.
    print(
        f"p_value: {round(p_value, 8)}, 1 - p_value: {round(1 - p_value, 8)}, "
        f"mean_train:{m_train}, mean_train_s:{m_train_s}, number_of_models: {number_of_models}")
    return float(p_value)

In [None]:
# t-test on Cohen's kappa: rows with synthetic_percentage 0 versus rows whose
# synthetic_percentage is 0.2 or 0.25.
get_p_value(results, [0.2, 0.25], "cohen_kappa")