In [None]:
%matplotlib inline

from utils.util import *


In [None]:
# Read the classifier results table and report how many rows it contains.
results_csv_path = "../resource/classifiers_results_final.csv"
results = pd.read_csv(results_csv_path)
print(results.shape[0])

In [None]:
# Print the median, mean and row count of accuracy for every synthetic_percentage value.
accuracy_by_percentage = results.groupby('synthetic_percentage')["accuracy"]
summary_sections = (
    ("******* median *******", "median"),
    ("\n******* mean *******", "mean"),
    ("\n******* count *******", "count"),
)
for section_header, aggregation_name in summary_sections:
    print(section_header)
    # Look the aggregation up by name so each section calls the same method as before.
    print(getattr(accuracy_by_percentage, aggregation_name)())

In [None]:
def compare_number_of_samples_mean(sorted_df):
    """Plot accuracy against the synthetic-sample setting and save the figure.

    A single seaborn point plot of ``accuracy`` versus ``samples_and_percentage``
    is drawn from every row of ``sorted_df`` (rows of all models are pooled),
    written to ``classifier_analysis/classifier_with_synthetic_compare_mean.jpg``
    and then shown.

    Parameters
    ----------
    sorted_df : pandas.DataFrame
        Must provide the columns ``samples_and_percentage``, ``accuracy`` and
        ``model_name``.

    Returns
    -------
    dict
        Maps every ``model_name`` to the maximum ``accuracy`` found for it.
    """
    output_dir = "classifier_analysis"
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 5))

    # The figure has a single axis; the point plot summarises accuracy per x value.
    sns.pointplot(x='samples_and_percentage', y='accuracy', data=sorted_df, ax=ax, color='red')
    ax.set_xlabel('Additional Synthetic Samples', fontsize=20)
    ax.set_ylabel('Mean Accuracy', fontsize=20)
    ax.grid(True, color='black')
    ax.tick_params(axis='x', labelsize=16)
    ax.tick_params(axis='y', labelsize=16)

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'classifier_with_synthetic_compare_mean.jpg'), bbox_inches='tight',
                dpi=250)
    plt.show()

    # Best accuracy reached by each model across all rows.
    return sorted_df.groupby('model_name')['accuracy'].max().to_dict()


In [None]:
def compare_number_of_samples(sorted_df, metric_name="accuracy"):
    """Draw one box plot panel per model and save the combined figure.

    For every distinct ``model_name`` a panel shows the distribution of
    ``metric_name`` for each ``samples_and_percentage`` value. Each box is
    annotated with the median of its ``synthetic_percentage`` group (as a
    percentage), with the label's top edge placed at median + standard deviation.
    The figure is written to
    ``classifier_analysis/classifier_with_synthetic_compare_by_model.jpg``.

    Parameters
    ----------
    sorted_df : pandas.DataFrame
        Must provide ``model_name``, ``synthetic_percentage``,
        ``samples_and_percentage`` and the ``metric_name`` column.
    metric_name : str, default "accuracy"
        Column whose distribution is plotted and summarised.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``model_name`` and ``metric_name``;
        the latter holds the mean number of non-null ``metric_name`` rows per
        ``samples_and_percentage`` group (a row count, not a score).
    """
    output_dir = "classifier_analysis"
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # Missing model names cannot be matched with ==, so they never get a panel.
    model_names = sorted_df['model_name'].dropna().unique()
    # One panel per model (30 inches wide each, i.e. 90 for three models).
    n_panels = max(len(model_names), 1)

    # squeeze=False keeps `axes` an array even when only one panel is needed.
    fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(30 * n_panels, 20), squeeze=False)
    axes = axes.flatten()
    y_label = metric_name.replace('_', ' ').capitalize()

    for i, model_name in enumerate(model_names):
        ax = axes[i]

        # Rows of the current model, ordered by the amount of synthetic data.
        subset = sorted_df[sorted_df['model_name'] == model_name]
        subset = subset.sort_values('synthetic_percentage')

        # Standard deviation and median of the metric for every synthetic_percentage.
        spread_and_median = subset.groupby('synthetic_percentage')[metric_name].describe()[['std', '50%']].sort_index()

        sns.boxplot(x='samples_and_percentage', y=metric_name, data=subset, ax=ax, color='gray',
                    whiskerprops=dict(linestyle='dotted', linewidth=8.0, color='black'),
                    capprops=dict(linewidth=8.0, color='black'),
                    boxprops=dict(linewidth=5),
                    medianprops=dict(linewidth=8, color='black'))
        ax.set_xlabel('Additional Synthetic Samples', fontsize=55)
        ax.set_ylabel(y_label, fontsize=55)
        ax.set_title(f'{model_name}', fontsize=65, fontweight='bold', y=1.05)
        ax.tick_params(axis='x', labelsize=50)
        ax.tick_params(axis='y', labelsize=50)
        ax.grid(False)
        # NOTE(review): fixed y range; values outside 0.25-0.4 are clipped from view.
        ax.set_ylim([0.25, 0.4])
        ax.set_facecolor('white')

        # Label every box with its median.
        # NOTE(review): assumes the i-th synthetic_percentage group corresponds to
        # the i-th samples_and_percentage box on the x axis - confirm.
        for index, (std, median) in enumerate(zip(spread_and_median['std'], spread_and_median['50%'])):
            text_y_position = float(std) + float(median)
            ax.text(index, text_y_position, f'{float(median) * 100:.1f}%', ha='center', va='top', fontsize=55,
                    color='white', fontweight='bold',
                    bbox=dict(facecolor='black', edgecolor='black', boxstyle='round', pad=0.1))

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'classifier_with_synthetic_compare_by_model.jpg'),
                bbox_inches='tight', dpi=300)
    plt.show()

    # Average number of rows per samples_and_percentage group for each model.
    # After reset_index() the count column is named after metric_name, so it must
    # be selected with metric_name rather than a hard-coded "accuracy".
    rows_per_group = sorted_df.groupby(['model_name', 'samples_and_percentage'])[metric_name].count().reset_index()
    return rows_per_group.groupby('model_name')[metric_name].mean().reset_index()


In [None]:
# Build the frame used by every plot below: short model labels in a fixed display
# order (SVC, KNN, NN) and rows ordered by model, then by synthetic_percentage.
sorted_df = results.copy()

# Any model name outside the listed categories becomes NaN in the categorical column.
sorted_df["model_name"] = pd.Categorical(
    sorted_df["model_name"].replace({"NeuralNet": "NN", "KNeighborsClassifier": "KNN"}),
    categories=["SVC", "KNN", "NN"],
    ordered=True,
)

# Sort on both keys in one call: sorting by model_name alone uses a non-stable
# algorithm by default, so an earlier sort by synthetic_percentage is not
# guaranteed to survive inside each model.
sorted_df = sorted_df.sort_values(["model_name", "synthetic_percentage"])
sorted_df["model_name"]

In [None]:
# Draw and save the per-model box plots for accuracy; the summary frame returned
# by the function is displayed as this cell's output.
compare_number_of_samples(sorted_df, "accuracy")


In [None]:
# Display the rows whose synthetic_percentage is 0.
sorted_df.loc[sorted_df["synthetic_percentage"].eq(0)]

In [None]:
# Draw and save the pooled point plot; the returned dict of per-model maximum
# accuracy is displayed as this cell's output.
compare_number_of_samples_mean(sorted_df)


In [None]:
from scipy import stats


def get_p_value(tmp_df, synthetic_percentage_add_compare, target_score="accuracy"):
    """Run an independent two-sample t-test between two groups of rows.

    The first group holds the ``target_score`` values of rows whose
    ``synthetic_percentage`` is 0; the second holds the values of rows whose
    ``synthetic_percentage`` is listed in ``synthetic_percentage_add_compare``.
    The test is ``scipy.stats.ttest_ind`` with its default settings. A one-line
    summary (p-value, both group means and both group sizes) is printed.

    Parameters
    ----------
    tmp_df : pandas.DataFrame
        Must provide ``synthetic_percentage`` and the ``target_score`` column.
    synthetic_percentage_add_compare : list-like
        ``synthetic_percentage`` values that form the comparison group.
    target_score : str, default "accuracy"
        Name of the column holding the compared scores.

    Returns
    -------
    float
        The p-value of the test, so callers can use it programmatically instead
        of parsing the printed line.
    """
    percentages = tmp_df["synthetic_percentage"]
    # Select each group once instead of re-filtering the frame for every statistic.
    real_scores = tmp_df.loc[percentages == 0, target_score]
    compared_scores = tmp_df.loc[percentages.isin(synthetic_percentage_add_compare), target_score]

    _, p_value = stats.ttest_ind(real_scores, compared_scores)
    m_train = real_scores.mean()
    m_train_s = compared_scores.mean()
    # count() ignores missing scores, whereas len() below counts every row.
    number_of_models = compared_scores.count()
    print(
        f"p_value: {p_value}, mean_train:{m_train}, mean_train_s:{m_train_s}, number_of_models: {number_of_models}, number_of_models_real: {len(real_scores)}")
    return p_value

In [None]:
# t-test on the NN rows: synthetic_percentage 0 against synthetic_percentage 0.2.
get_p_value(
    tmp_df=sorted_df.loc[sorted_df["model_name"] == "NN"],
    synthetic_percentage_add_compare=[0.2],
    target_score="accuracy",
)
# Second name bound to the same frame (no copy is made).
df = sorted_df

In [None]:
# Compare the mean of every population column between rows with
# synthetic_percentage == 0 and rows with synthetic_percentage == 0.9,
# pooling all models together.
population_names = ["ACB", "GWD", "ESN", "MSL", "YRI", "LWK", "ASW"]
augmented_percentage = 0.9
pooled_columns = ['synthetic_percentage'] + population_names

# Filter the data for real_train_set
real_train_data = sorted_df[sorted_df['synthetic_percentage'] == 0][pooled_columns]

# Filter the data for train_with_aug
train_with_aug_data = sorted_df[sorted_df['synthetic_percentage'] == augmented_percentage][pooled_columns]

# Concatenate the filtered dataframes
merged_data = pd.concat([real_train_data, train_with_aug_data])

# Column-wise means of each group
mean_accuracy_real_train = merged_data[merged_data['synthetic_percentage'] == 0].mean(axis=0)
mean_accuracy_train_with_aug = merged_data[merged_data['synthetic_percentage'] == augmented_percentage].mean(axis=0)

plt.figure(figsize=(18, 8))
# plt.hist doubles as a bar chart here: one bin per population, each weighted by
# that population's mean, so the bin height equals the mean accuracy.
plt.hist(population_names, bins=len(population_names), weights=mean_accuracy_train_with_aug[population_names],
         alpha=0.5, label=f'Train with Augmentation {augmented_percentage:.0%}', color='blue', edgecolor='black')
plt.hist(population_names, bins=len(population_names), weights=mean_accuracy_real_train[population_names],
         alpha=0.5, label='Real Train Set', color='yellow', edgecolor='black')

plt.xlabel('Population Name', fontsize=22)
plt.ylabel('Mean Accuracy', fontsize=22)

# Horizontal alignment of each annotation; presumably chosen because the
# histogram bins are not centred on the category tick positions.
pos = ['left', 'left', 'center', 'center', 'center', 'right', 'right']
# Annotate the bars with mean accuracy values; the last population flips the
# vertical alignment of the two labels.
for i, mean_acc in enumerate(mean_accuracy_real_train[population_names]):
    va = 'top' if i + 1 < len(population_names) else 'bottom'
    plt.text(i, mean_acc, f'{mean_acc * 100:.2f}%', ha=pos[i], va=va, color='yellow', fontweight='bold', fontsize=21,
             bbox=dict(facecolor='gray', edgecolor='black', boxstyle='round,pad=0.2'))
for i, mean_acc in enumerate(mean_accuracy_train_with_aug[population_names]):
    va = 'bottom' if i + 1 < len(population_names) else 'top'
    plt.text(i, mean_acc, f'{mean_acc * 100:.2f}%', ha=pos[i], va=va, color='blue', fontweight='bold', fontsize=21,
             bbox=dict(facecolor='gray', edgecolor='black', boxstyle='round,pad=0.2'))

# Enlarge the tick labels on both axes
plt.tick_params(axis='x', labelsize=20)
plt.tick_params(axis='y', labelsize=20)
# A single legend call is enough; it picks up the labels of both histograms.
plt.legend(fontsize=18)

# Save under a name of its own: the per-model figure produced later in this
# notebook is written to 'classifier_with_synthetic_by_pop.jpg' and would
# otherwise overwrite this one.
output_dir = "classifier_analysis"
os.makedirs(output_dir, exist_ok=True)  # savefig does not create missing directories
plt.savefig(os.path.join(output_dir, 'classifier_with_synthetic_by_pop_overall.jpg'), bbox_inches='tight', dpi=500)

# Display the plot
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Grouped bar chart per model: mean of every population column for rows with
# synthetic_percentage == 0 next to rows with
# synthetic_percentage == SYNTHETIC_PERCENTAGE.
SYNTHETIC_PERCENTAGE = 1

population_columns = ["ACB", "GWD", "ESN", "MSL", "YRI", "LWK", "ASW"]
# Not used by this cell; kept only in case another cell reads it.
x_population_columns = [""] + population_columns
per_model_columns = ['synthetic_percentage'] + population_columns + ['model_name']

# Filter the data for real_train_set
real_train_data = sorted_df[sorted_df['synthetic_percentage'] == 0][per_model_columns]

# Filter the data for train_with_aug
train_with_aug_data = sorted_df[sorted_df['synthetic_percentage'] == SYNTHETIC_PERCENTAGE][per_model_columns]

# Concatenate the filtered dataframes
merged_data = pd.concat([real_train_data, train_with_aug_data])
merged_data = merged_data.reset_index()
# Convert population columns to numeric
merged_data[population_columns] = merged_data[population_columns].apply(pd.to_numeric)

# Missing model names cannot be matched with ==, so they never get a panel.
model_names = merged_data['model_name'].dropna().unique()
n_panels = max(len(model_names), 1)

# One panel per model (three models give the 50x14 inch figure);
# squeeze=False keeps `axes` an array even for a single panel.
fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(50 * n_panels / 3, 14), sharey=True, squeeze=False)
axes = axes.flatten()

# Bar geometry is identical for every panel.
bar_width = 0.35
bar_positions_real = range(len(population_columns))
bar_positions_fake = [pos + bar_width for pos in bar_positions_real]
# Tick under the middle of each pair of bars.
tick_positions = [pos + bar_width / 2 for pos in bar_positions_real]

is_real = merged_data['synthetic_percentage'] == 0
is_augmented = merged_data['synthetic_percentage'] == SYNTHETIC_PERCENTAGE

for i, model_name in enumerate(model_names):
    ax = axes[i]
    is_model = merged_data['model_name'] == model_name

    # Combine the masks before indexing: applying a full-length mask to an
    # already filtered frame makes pandas reindex the mask and emit a warning.
    mean_accuracy_real_train = merged_data.loc[is_model & is_real, population_columns].mean(axis=0)
    mean_accuracy_train_with_aug = merged_data.loc[is_model & is_augmented, population_columns].mean(axis=0)

    ax.bar(bar_positions_fake, mean_accuracy_train_with_aug[population_columns],
           width=bar_width, alpha=0.5, label='Augmented training set', color='blue', edgecolor='black')
    ax.bar(bar_positions_real, mean_accuracy_real_train[population_columns],
           width=bar_width, alpha=0.5, label='Original training set', color='red', edgecolor='black')

    ax.set_xlabel('Population Name', fontsize=30)
    ax.set_ylabel('Mean Accuracy', fontsize=30)
    ax.set_title(model_name, fontsize=35, fontweight='bold', y=1.05)
    ax.tick_params(axis='x', labelsize=30)
    ax.tick_params(axis='y', labelsize=30)
    ax.legend(fontsize=37, loc='upper left')
    # Fix the tick positions before labelling them so every label is tied to its
    # bar group instead of to whatever ticks the automatic locator picks.
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(population_columns, fontsize=30)

    # Annotate the bars with mean accuracy values. The label of the lower bar
    # hangs below its bar top and the label of the higher bar sits above it.
    # `va` is the only vertical-alignment keyword passed: supplying
    # `verticalalignment=` as well would be contradictory.
    for j, (mean_acc_real, mean_acc_fake) in enumerate(zip(mean_accuracy_real_train[population_columns], mean_accuracy_train_with_aug[population_columns])):
        real_pos, fake_pos = ('top', 'bottom') if mean_acc_fake > mean_acc_real else ('bottom', 'top')
        ax.text(bar_positions_real[j], mean_acc_real, f'{mean_acc_real * 100:.2f}%', ha='center', va=real_pos,
                color='white', fontweight='bold', fontsize=35,
                bbox=dict(facecolor='black', edgecolor='red', boxstyle='round,pad=0.05', linewidth=4))
        ax.text(bar_positions_fake[j], mean_acc_fake, f'{mean_acc_fake * 100:.2f}%', ha='center', va=fake_pos,
                color='white', fontweight='bold', fontsize=35,
                bbox=dict(facecolor='black', edgecolor='blue', boxstyle='round,pad=0.05', linewidth=4))

# Adjust the spacing between subplots
plt.tight_layout()

# Save the plot to a file
output_dir = "classifier_analysis"
filename = 'classifier_with_synthetic_by_pop.jpg'
os.makedirs(output_dir, exist_ok=True)  # savefig does not create missing directories
output_path = os.path.join(output_dir, filename)
plt.savefig(output_path, bbox_inches='tight', dpi=300)

# Display the plot
plt.show()


In [None]:
# Display the list of population codes assigned in the pooled per-population plotting cell.
population_names