In [None]:
import pandas as pd

# Load the raw evaluation results from 'DataOutcomes.csv'.
results_df = pd.read_csv("DataOutcomes.csv")

# Drop the timing and fold columns; they are not used anywhere below.
results_df = results_df.drop(columns=["Training Time (s)", "Prediction Time (s)", "Fold"])

# Metric columns that are averaged, formatted and exported below.
METRIC_COLUMNS = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC"]

# Rank on Test rows only. The file also carries Train rows (the
# overfitting check further down filters on "Training or Test Set"), and
# averaging both splits together would blend training-set scores into the
# ranking and the exported table.
test_results = results_df[results_df["Training or Test Set"] == "Test"]
if test_results.empty:
    raise ValueError(
        'No rows with "Training or Test Set" == "Test" found in DataOutcomes.csv; '
        "cannot rank classifiers on test performance."
    )

# Average each metric per (classifier, train-set type), then order the rows
# by metric priority: Recall first, ties broken by F1 Score, ROC AUC,
# Precision and finally Accuracy -- all descending.
grouped_results = (
    test_results
    .groupby(["Classifier Name", "Balanced or Unbalanced Train Set"])[METRIC_COLUMNS]
    .mean()
    .sort_values(
        by=["Recall", "F1 Score", "ROC AUC", "Precision", "Accuracy"],
        ascending=False,
    )
)

# Print the grouped results
print(grouped_results)

# Export to CSV

# Turn the (classifier, train-set type) MultiIndex back into ordinary columns
# so both identifiers are written to the CSV.
grouped_results = grouped_results.reset_index()

# Store every metric as a fixed 4-decimal string. Note that the metric
# columns of `grouped_results` are strings from here on.
for metric_column in METRIC_COLUMNS:
    grouped_results[metric_column] = grouped_results[metric_column].map("{:.4f}".format)

grouped_results.to_csv("sorted_classifier_results.csv", encoding='utf-8', index=False)
print("Sorted results exported to sorted_classifier_results.csv")

# Check for overfitting/underfitting
#
# For every (classifier, train-set type) pair that actually occurs in
# `results_df`, compare the mean Train accuracy with the mean Test accuracy
# and print a coarse verdict. Both thresholds are arbitrary heuristics.
OVERFIT_GAP = 0.1          # Train accuracy exceeding Test accuracy by more than this => overfitting
UNDERFIT_ACCURACY = 0.7    # Test accuracy below this => underfitting

for classifier_name in results_df["Classifier Name"].unique():
    for balanced_type in results_df["Balanced or Unbalanced Train Set"].unique():
        combo_rows = results_df[
            (results_df["Classifier Name"] == classifier_name) &
            (results_df["Balanced or Unbalanced Train Set"] == balanced_type)
        ]
        # The two loops walk the full cartesian product; skip pairs that were
        # never evaluated instead of reporting NaN accuracies for them.
        if combo_rows.empty:
            continue

        split = combo_rows["Training or Test Set"]
        train_accuracy = combo_rows.loc[split == "Train", "Accuracy"].mean()
        test_accuracy = combo_rows.loc[split == "Test", "Accuracy"].mean()

        print(f"{classifier_name} ({balanced_type}):")
        print(f"  Train Accuracy: {train_accuracy:.2f}")
        print(f"  Test Accuracy: {test_accuracy:.2f}")

        # A mean over zero rows is NaN, and every comparison with NaN is
        # False -- without this guard a pair with a missing split would fall
        # through to the "acceptable" branch.
        if pd.isna(train_accuracy) or pd.isna(test_accuracy):
            print("  Insufficient data to assess fit (missing Train or Test results)")
        elif train_accuracy > test_accuracy + OVERFIT_GAP:
            print("  Potential Overfitting Detected")
        elif test_accuracy < UNDERFIT_ACCURACY:
            print("  Potential Underfitting Detected")
        else:
            print("  Model performance is acceptable")

                                                      Accuracy  Precision  \
Classifier Name     Balanced or Unbalanced Train Set                        
XGBoost             Balanced                           1.00000    0.75375   
Logistic Regression Balanced                           0.96250    0.51625   
Adaboost            Balanced                           0.95375    0.50500   
Neural Network      Balanced                           1.00000    0.78250   
Gradient Boosting   Balanced                           0.96250    0.53875   
XGBoost             Unbalanced                         1.00000    0.97375   
Decision Tree       Balanced                           0.94875    0.52875   
Random Forest       Balanced                           0.96375    0.60000   
Naive Bayes         Balanced                           0.94500    0.51500   
                    Unbalanced                         0.98000    0.06125   
Neural Network      Unbalanced                         1.00000    0.94000   

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Relies on `grouped_results` from the previous cell: one row per
# (classifier, train-set type) with "Classifier Name",
# "Balanced or Unbalanced Train Set" and the metric columns below.

# Define metrics and classifier names
metrics = ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC"]
# Retained as a cell-level name, but deliberately NOT used as bar labels:
# its order and length come from the whole table, not from the per-type
# subset that is plotted.
classifier_names = grouped_results["Classifier Name"].unique()

# Create one bar chart per (metric, train-set type) and save it as
# "<metric>_<dataset_type>.png" in the working directory.
for metric in metrics:
    for dataset_type in ["Balanced", "Unbalanced"]:
        # Rows for the current train-set type only.
        data = grouped_results[grouped_results["Balanced or Unbalanced Train Set"] == dataset_type]
        if data.empty:
            # Nothing to plot for this train-set type; skip rather than
            # export an empty chart.
            continue

        # Take labels and heights from the SAME filtered rows so every bar
        # carries the name of the classifier it belongs to, even when the
        # subset is ordered differently from (or shorter than) the full table.
        bar_labels = data["Classifier Name"].tolist()
        # astype(float) converts the 4-decimal strings stored by the previous
        # cell back to numbers (and is a no-op on numeric columns).
        bar_heights = data[metric].astype(float).to_numpy()

        fig, ax = plt.subplots(figsize=(10, 6))  # Adjust figure size as needed
        ax.bar(bar_labels, bar_heights)
        ax.set_xlabel("Classifier")
        ax.set_ylabel(metric)
        ax.set_title(f"{metric} - {dataset_type} Dataset")
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right")  # Rotate x-axis labels
        fig.tight_layout()

        # Export the current bar chart
        fig.savefig(f"{metric}_{dataset_type}.png")  # Change file format if needed
        plt.close(fig)  # Release the figure; many are created in this loop

print("Bar charts exported successfully!")

Bar charts exported successfully!
