In [1]:
import os
import pandas as pd

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
)

In [None]:
def annotate_output(dataset):
    directory = f"./source/Results/{dataset}/columns/"
    clean_dataset_path = f"../../datasets/{dataset}/clean.csv"
    dirty_dataset_path = f"../../datasets/{dataset}/{dataset}.csv"

    # Load the datasets
    clean_dataset = pd.read_csv(clean_dataset_path)
    dirty_dataset = pd.read_csv(dirty_dataset_path)

    annotated_output = dirty_dataset.copy()

    # Iterate through all subfolders and files in the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".csv"):  # Ensure it's a CSV file
                # Read the outlier file
                col_output = pd.read_csv(os.path.join(root, file))

                # Get the "outliers" column
                if "outliers" in col_output.columns:
                    outliers = col_output["outliers"].dropna()

                    # Identify the column in the main dataset to match
                    column_name = os.path.splitext(file)[
                        0
                    ]  # Get file name without .csv
                    column_name = column_name.replace(
                        "_results_col", ""
                    )  # Remove suffix

                    if column_name in annotated_output.columns:
                        # Update the dataset with "synodc_outlier" for outliers
                        annotated_output.loc[
                            annotated_output[column_name].isin(outliers), column_name
                        ] = "synodc_outlier"

    # Convert non-outliers to 0 and outliers to 1
    annotated_output = annotated_output.applymap(
        lambda x: 1 if x == "synodc_outlier" else 0
    )

    # Save the updated dataset
    output_path = f"./output{dataset}/annotated_output.csv"
    annotated_output.to_csv(output_path, index=False)

    return annotated_output

In [3]:
def annotate_errors(
    df_fixed: pd.DataFrame, df_with_errors: pd.DataFrame
) -> pd.DataFrame:
    # Check if the dataframes have the same shape and columns after sorting
    if df_fixed.shape != df_with_errors.shape or not all(
        df_fixed.columns == df_with_errors.columns
    ):

        raise ValueError("Both dataframes must have the same structure.")

    # Convert both dataframes to strings for datatype-agnostic comparison
    df_fixed_str = df_fixed.astype(str)
    df_with_errors_str = df_with_errors.astype(str)

    # Create the annotation dataframe by comparing the two dataframes
    error_annotation = (df_fixed_str != df_with_errors_str).astype(int)

    return error_annotation

In [4]:
def inspect_classification(
    true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame, input_dataset: pd.DataFrame
):
    true_dataset.reset_index(drop=True)
    pred_dataset.reset_index(drop=True)
    input_dataset.reset_index(drop=True)

    true_dataset.columns = input_dataset.columns
    pred_dataset.columns = input_dataset.columns

    calc = true_dataset.add(2)
    calc_out = pred_dataset.copy()
    calc_out[calc_out == 0] = -1

    calc = calc.add(calc_out)

    # True positive calculation
    tp = calc == 4
    true_positive_df = input_dataset[tp].astype(str)
    true_positive_df = true_positive_df.replace(to_replace="nan", value=0)
    true_positive_df = true_positive_df.reset_index(drop=True)  # Remove index

    # False positive calculation
    fp = calc == 3
    false_positive_df = input_dataset[fp].astype(str)
    false_positive_df = false_positive_df.replace(to_replace="nan", value=0)
    false_positive_df = false_positive_df.reset_index(drop=True)  # Remove index

    # False negative calculation
    fn = calc == 2
    false_negative_df = input_dataset[fn].astype(str)
    false_negative_df = false_negative_df.replace(to_replace="nan", value=0)
    false_negative_df = false_negative_df.reset_index(drop=True)  # Remove index

    all_errors_df = input_dataset[fp | fn].astype(str)
    all_errors_df = all_errors_df.replace(to_replace="nan", value=0)
    all_errors_df = all_errors_df.reset_index(drop=True)  # Remove index

    return true_positive_df, false_positive_df, false_negative_df, all_errors_df

In [5]:
def calculate_metrics(true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame):
    # Flatten the dataframes to 1D arrays
    y_true = true_dataset.values.flatten()
    y_pred = pred_dataset.values.flatten()

    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Class-specific accuracy
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    class_0_accuracy = tn / (tn + fp)  # Accuracy for class 0
    class_1_accuracy = tp / (tp + fn)  # Accuracy for class 1

    # AUC scores
    roc_auc = roc_auc_score(y_true, y_pred)
    pr_auc = average_precision_score(y_true, y_pred)

    # Count of 1s in true and predicted labels
    true_positives_count = sum(y_true == 1)  # Total 1s in true labels
    predicted_positives_count = tp  # Total 1s in predictions

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1),
        "roc_auc": float(roc_auc),
        "pr_auc": float(pr_auc),
        "true_positives_count": int(true_positives_count),
        "predicted_positives_count": int(predicted_positives_count),
    }

In [None]:
# Dataset name
dataset = "rayyan"

clean_dataset = pd.read_csv(f"../../datasets/{dataset}/clean.csv")
dirty_dataset = pd.read_csv(f"../../datasets/{dataset}/{dataset}.csv")

annotated_output = annotate_output(dataset)

error_annotation = annotate_errors(clean_dataset, dirty_dataset)

true_positive_df, false_positive_df, false_negative_df, all_errors_df = (
    inspect_classification(error_annotation, annotated_output, dirty_dataset)
)

true_positive_df.to_csv(f"./output/{dataset}/tp.csv")

print(calculate_metrics(error_annotation, annotated_output))
# Paths for folders and files

{'accuracy': 0.734909090909091, 'precision': 0.2345679012345679, 'recall': 0.006613296206056387, 'f1_score': 0.012863913337846986, 'roc_auc': 0.49949220242811737, 'pr_auc': 0.2610058124658426, 'true_positives_count': 2873, 'predicted_positives_count': 19}
