In [3]:
import pandas as pd
from functools import reduce
import glob 
import os
import re

In [4]:
# Each model keeps its per-run metric CSVs in its own folder.
svm_folder_path = "svm/"
rf_folder_path = "rf/"
cnn_folder_path = "cnn/"
paths = [svm_folder_path, rf_folder_path, cnn_folder_path]

summary_rows = []

for folder_path in paths:
    # The folder name prefix tells us which model produced the files.
    for prefix in ("svm", "rf", "cnn"):
        if folder_path.startswith(prefix):
            model = prefix
            break

    # File names look like "<data>_<original|oversampled>_<model>_metrics.csv".
    suffix = f"_{model}_metrics.csv"
    name_pattern = re.compile(fr"(.+?)_(original|oversampled)_{model}_metrics\.csv")

    for entry in os.listdir(folder_path):
        if not entry.endswith(suffix):
            continue
        parsed = name_pattern.match(entry)
        if parsed is None:
            continue

        data_name, sampling_type = parsed.groups()
        scores = pd.read_csv(os.path.join(folder_path, entry))

        # One summary row per file: the column-wise mean of each metric.
        summary_rows.append({
            "Model": model,
            "Data": data_name,
            "Sampling": sampling_type,
            "Avg_MCC": scores["MCC"].mean(),
            "Avg_F1_macro": scores["F1_macro"].mean(),
            "Avg_F1_weighted": scores["F1_weighted"].mean(),
        })

# Long format: one row per (model, data, sampling).
summary_df = pd.DataFrame(summary_rows)

# Wide format: one row per (data, model), with the two sampling schemes
# spread out side by side for every averaged metric.
summary_wide = summary_df.pivot(
    index=["Data", "Model"],
    columns="Sampling",
    values=["Avg_MCC", "Avg_F1_macro", "Avg_F1_weighted"],
)

# Collapse the two-level header, e.g. ("Avg_MCC", "original") -> "Avg_MCC_original".
summary_wide.columns = ["_".join(pair) for pair in summary_wide.columns]

# Turn the (Data, Model) index back into ordinary columns.
summary_wide = summary_wide.reset_index()

# Human-readable headers for the report.
column_labels = {
    "Data": "data type",
    "Model": "model",
    "Avg_MCC_original": "MCC Original",
    "Avg_MCC_oversampled": "MCC Oversampled",
    "Avg_F1_macro_original": "macro f1-score Original",
    "Avg_F1_macro_oversampled": "macro f1-score Oversampled",
    "Avg_F1_weighted_original": "weighted f1-score Original",
    "Avg_F1_weighted_oversampled": "weighted f1-score Oversampled",
}
summary_wide = summary_wide.rename(columns=column_labels)

# Fixed presentation order: MCC first, then weighted F1, then macro F1.
column_order = [
    "data type",
    "model",
    "MCC Original",
    "MCC Oversampled",
    "weighted f1-score Original",
    "weighted f1-score Oversampled",
    "macro f1-score Original",
    "macro f1-score Oversampled",
]
summary_wide = summary_wide[column_order]

# Persist the wide table.
summary_wide.to_csv("metrics_summary.csv", index=False)
print("Summary saved to metrics_summary.csv")

Summary saved to metrics_summary.csv


In [5]:
import pandas as pd
import re
from scipy.stats import ttest_ind
import os

# Ablation study: for every model, compare each ablation run against the
# model's full ("all") run with Welch's t-test, separately for the original
# and the oversampled data, then write one wide row per (Data, Model).
models = ["svm", "rf", "cnn"]
metrics = ["MCC", "F1_macro", "F1_weighted"]
results = []

for model in models:
    base_dir = f"{model}/"

    # Baselines: the "all" runs of this model, one per sampling scheme.
    full_original = pd.read_csv(f"{base_dir}all_original_{model}_metrics.csv")
    full_oversampled = pd.read_csv(f"{base_dir}all_oversampled_{model}_metrics.csv")

    # Every other "<data>_<sampling>_<model>_metrics.csv" file is an ablation run.
    for filename in os.listdir(base_dir):
        # fullmatch anchors the end of the name as well, so stray files such as
        # "x_original_svm_metrics.csv~" or "...csv.bak" are not mistaken for
        # metric files (re.match would accept them).
        match = re.fullmatch(fr"(.+?)_(original|oversampled)_{model}_metrics\.csv", filename)
        if not match or filename.startswith("all_"):
            # Not a metrics file, or one of the 'all_*' baseline files themselves.
            continue

        data_name, sampling_type = match.groups()
        ablation_df = pd.read_csv(os.path.join(base_dir, filename))

        # Compare like with like: original vs original, oversampled vs oversampled.
        full_df = full_original if sampling_type == "original" else full_oversampled

        for metric in metrics:
            # equal_var=False -> Welch's t-test (no equal-variance assumption).
            t_stat, p_val = ttest_ind(full_df[metric], ablation_df[metric], equal_var=False)
            results.append({
                "Data": data_name,
                "Model": model,
                "Metric": metric,
                "Sampling": sampling_type,   # "original" or "oversampled"
                "T-stat": round(t_stat, 4),
                "P-value": round(p_val, 4)
            })

# Fail with a clear message instead of an opaque KeyError from pivot_table
# when no ablation file was found in any model folder.
if not results:
    raise FileNotFoundError(
        "No ablation metric files named "
        "'<data>_(original|oversampled)_<model>_metrics.csv' were found in: "
        + ", ".join(f"{m}/" for m in models)
    )

# Long format: one row per (Data, Model, Metric, Sampling).
results_df = pd.DataFrame(results)

# Wide format: a single row for each (Data, Model). Each cell is fed by exactly
# one long-format row (one file per data/sampling/model), so pivot_table's
# default mean aggregation leaves the values unchanged.
t_test_pivot = results_df.pivot_table(
    index=["Data", "Model"],
    columns=["Metric", "Sampling"],
    values=["T-stat", "P-value"]
)

# Flatten the three-level header,
# e.g. ("T-stat", "MCC", "original") -> "MCC Original T-stat".
t_test_pivot.columns = [
    f"{metric} {sampling.capitalize()} {stat_type}"
    for (stat_type, metric, sampling) in t_test_pivot.columns
]

# Bring "Data" and "Model" back as columns (reassign rather than mutate in place).
t_test_pivot = t_test_pivot.reset_index()

# Preferred column order for the report.
desired_order = [
    "Data",
    "Model",
    "MCC Original T-stat", "MCC Original P-value",
    "MCC Oversampled T-stat", "MCC Oversampled P-value",
    "F1_macro Original T-stat", "F1_macro Original P-value",
    "F1_macro Oversampled T-stat", "F1_macro Oversampled P-value",
    "F1_weighted Original T-stat", "F1_weighted Original P-value",
    "F1_weighted Oversampled T-stat", "F1_weighted Oversampled P-value",
]
# Keep only the columns that actually exist, in case a metric/sampling
# combination is absent from the pivot.
final_columns = [col for col in desired_order if col in t_test_pivot.columns]

# Select and sort without inplace=True: sorting a column-sliced frame in place
# can trigger pandas' SettingWithCopyWarning.
t_test_pivot = t_test_pivot[final_columns].sort_values(by=["Data", "Model"])

# Save to CSV
t_test_pivot.to_csv("t_test_results_all_in_one_row.csv", index=False)
print("Saved to t_test_results_all_in_one_row.csv!")


Saved to t_test_results_all_in_one_row.csv!


In [7]:
import os
import re
import pandas as pd
from scipy.stats import ttest_ind

# SVM vs. RF on the full ("all") data: one Welch t-test per metric and per
# sampling scheme (original / oversampled), written out as a single wide row.
metrics = ["MCC", "F1_macro", "F1_weighted"]
results = []

# --- SVM side: both "all_*" metric files are read up front -------------------
svm_original_path = "svm/all_original_svm_metrics.csv"
svm_oversampled_path = "svm/all_oversampled_svm_metrics.csv"

df_svm_original = pd.read_csv(svm_original_path)
df_svm_oversampled = pd.read_csv(svm_oversampled_path)

# --- RF side: discover the matching "all_*" files in the rf folder -----------
base_dir = "rf/"
rf_name_pattern = re.compile(r"all_(original|oversampled)_rf_metrics\.csv")

for filename in os.listdir(base_dir):
    found = rf_name_pattern.match(filename)
    if found is None:
        continue

    sampling_type = found.group(1)  # "original" or "oversampled"
    df_rf = pd.read_csv(os.path.join(base_dir, filename))

    # Pair the RF run with the SVM run that used the same sampling scheme.
    df_svm = df_svm_original if sampling_type == "original" else df_svm_oversampled

    for metric in metrics:
        t_stat, p_val = ttest_ind(df_svm[metric], df_rf[metric], equal_var=False)
        results.append({
            "Data": "all",             # the compared files are the "all_..." ones
            "Metric": metric,
            "Model1": "svm",
            "Model2": "rf",
            "Sampling": sampling_type,
            "T-stat": round(t_stat, 4),
            "P-value": round(p_val, 4)
        })

# Long format: one row per (sampling, metric).
results_df = pd.DataFrame(results)

# Wide format: one row per Data value. Only one model pair (SVM vs RF) is
# present, so Model1/Model2 are not needed to tell the columns apart.
t_test_pivot = results_df.pivot_table(
    index=["Data"],
    columns=["Sampling", "Metric"],
    values=["T-stat", "P-value"]
)

# Flatten the three-level header,
# e.g. ("T-stat", "original", "MCC") -> "MCC Original T-stat".
flattened_cols = [
    f"{metric} {sampling.capitalize()} {stat_type}"
    for (stat_type, sampling, metric) in t_test_pivot.columns
]
t_test_pivot.columns = flattened_cols
t_test_pivot = t_test_pivot.reset_index()

# Report order: for each metric, Original (T-stat, P-value) and then
# Oversampled (T-stat, P-value).
desired_order = ["Data"] + [
    f"{metric_name} {sampling_label} {stat_label}"
    for metric_name in metrics
    for sampling_label in ("Original", "Oversampled")
    for stat_label in ("T-stat", "P-value")
]
# Drop any name from the wish list that the pivot did not produce.
final_cols = [c for c in desired_order if c in t_test_pivot.columns]
t_test_pivot = t_test_pivot[final_cols]

# Save
t_test_pivot.to_csv("t_test_results_svm_rf.csv", index=False)


In [10]:
import pandas as pd
from scipy.stats import ttest_ind

# Original vs. oversampled on the full ("all") data for ONE model.
#
# The model is named once here and drives the input paths, the "Data" label and
# the output file name, so the three cannot drift apart. Previously the label
# said "cnn_all" while the files read and written were the SVM ones.
# Change this value (e.g. to "rf" or "cnn") to run the same test for another model.
target_model = "svm"

# 1) Load the model's "all_original_*" and "all_oversampled_*" metric files
original_path = f"{target_model}/all_original_{target_model}_metrics.csv"
oversampled_path = f"{target_model}/all_oversampled_{target_model}_metrics.csv"

df_original = pd.read_csv(original_path)
df_oversampled = pd.read_csv(oversampled_path)

# 2) Define metrics to test
metrics = ["MCC", "F1_macro", "F1_weighted"]

# 3) Run Welch's t-test (equal_var=False) per metric: original vs. oversampled
results = []
for metric in metrics:
    t_stat, p_val = ttest_ind(df_original[metric], df_oversampled[metric], equal_var=False)
    results.append({
        "Data": f"{target_model}_all",   # label matches the files actually compared
        "Metric": metric,
        "T-stat": round(t_stat, 4),
        "P-value": round(p_val, 4)
    })

# 4) Convert to a DataFrame (long format: one row per metric)
results_df = pd.DataFrame(results)

# 5) Pivot so each metric has its own T-stat and P-value columns, with only 1 row
t_test_pivot = results_df.pivot_table(
    index="Data",       # single row, e.g. "svm_all"
    columns="Metric",   # "MCC", "F1_macro", "F1_weighted"
    values=["T-stat", "P-value"]
)

# Flatten the two-level header, e.g. ("T-stat", "MCC") -> "MCC T-stat"
t_test_pivot.columns = [
    f"{metric_name} {stat_type}"
    for (stat_type, metric_name) in t_test_pivot.columns
]

# Put index (Data) back as a column
t_test_pivot = t_test_pivot.reset_index()

# Preferred column order
desired_order = [
    "Data",
    "MCC T-stat", "MCC P-value",
    "F1_macro T-stat", "F1_macro P-value",
    "F1_weighted T-stat", "F1_weighted P-value",
]
# pivot_table drops columns whose values are all NaN by default; keep only the
# columns that exist so a NaN test result does not abort the cell with a KeyError.
final_cols = [c for c in desired_order if c in t_test_pivot.columns]
t_test_pivot = t_test_pivot[final_cols]

# 6) Save final CSV, named after the model that was tested
t_test_pivot.to_csv(f"t_test_results_{target_model}.csv", index=False)
