In [80]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
)

In [81]:
def annotate_errors(
    df_fixed: pd.DataFrame, df_with_errors: pd.DataFrame
) -> pd.DataFrame:
    # Check if the dataframes have the same shape and columns after sorting
    if df_fixed.shape != df_with_errors.shape or not all(
        df_fixed.columns == df_with_errors.columns
    ):

        raise ValueError("Both dataframes must have the same structure.")

    # Convert both dataframes to strings for datatype-agnostic comparison
    df_fixed_str = df_fixed.astype(str)
    df_with_errors_str = df_with_errors.astype(str)

    # Create the annotation dataframe by comparing the two dataframes
    error_annotation = (df_fixed_str != df_with_errors_str).astype(int)

    return error_annotation

In [82]:
def inspect_classification(
    true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame, input_dataset: pd.DataFrame
):
    true_dataset.reset_index(drop=True)
    pred_dataset.reset_index(drop=True)
    input_dataset.reset_index(drop=True)

    true_dataset.columns = input_dataset.columns
    pred_dataset.columns = input_dataset.columns

    calc = true_dataset.add(2)
    calc_out = pred_dataset.copy()
    calc_out[calc_out == 0] = -1

    calc = calc.add(calc_out)

    # True positive calculation
    tp = calc == 4
    true_positive_df = input_dataset[tp].astype(str)
    true_positive_df = true_positive_df.replace(to_replace="nan", value=0)
    true_positive_df = true_positive_df.reset_index(drop=True)  # Remove index

    # False positive calculation
    fp = calc == 3
    false_positive_df = input_dataset[fp].astype(str)
    false_positive_df = false_positive_df.replace(to_replace="nan", value=0)
    false_positive_df = false_positive_df.reset_index(drop=True)  # Remove index

    # False negative calculation
    fn = calc == 2
    false_negative_df = input_dataset[fn].astype(str)
    false_negative_df = false_negative_df.replace(to_replace="nan", value=0)
    false_negative_df = false_negative_df.reset_index(drop=True)  # Remove index

    all_errors_df = input_dataset[fp | fn].astype(str)
    all_errors_df = all_errors_df.replace(to_replace="nan", value=0)
    all_errors_df = all_errors_df.reset_index(drop=True)  # Remove index

    return true_positive_df, false_positive_df, false_negative_df, all_errors_df

In [83]:
def calculate_metrics(true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame):
    # Flatten the dataframes to 1D arrays
    y_true = true_dataset.values.flatten()
    y_pred = pred_dataset.values.flatten()

    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    # Class-specific accuracy
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    # AUC scores
    roc_auc = roc_auc_score(y_true, y_pred)
    pr_auc = average_precision_score(y_true, y_pred)

    # Count of 1s in true and predicted labels
    true_positives_count = sum(y_true == 1)  # Total 1s in true labels
    predicted_positives_count = tp  # Total 1s in predictions

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1),
        "roc_auc": float(roc_auc),
        "pr_auc": float(pr_auc),
        "true_positives_count": int(true_positives_count),
        "predicted_positives_count": int(predicted_positives_count),
    }

In [84]:
def get_annotated_output(dirty_dataset, ill_formed_records):
    # Create a copy of the dataset to annotate
    annotated_output = dirty_dataset.copy()

    # Initialize with 0s
    annotated_output.loc[:, :] = 0

    # Set rows with ill-formed records to 1
    annotated_output.loc[ill_formed_records, :] = 1

    return annotated_output.astype(int)

In [None]:
output_file = pd.read_csv("./source/output/flights/ResultIndices.csv")
clean_dataset = pd.read_csv("../../datasets/flights/clean.csv")
dirty_dataset = pd.read_csv("../../datasets/flights/dirty.csv")

ill_formed_records_string = output_file["Row Indices"][1]

# Subtract 2 for every entry. The index for the df starts at 0, but the output starts at row 1 of text file + header
ill_formed_records = [int(index) - 2 for index in ill_formed_records_string.split(",")]

ill_formed_records = [index for index in ill_formed_records if index >= 0]

In [None]:
# Generate the annotated output
output = get_annotated_output(dirty_dataset, ill_formed_records)
output.to_csv("./output/flights/annotated_output.csv", index=False)


error_annotation = annotate_errors(clean_dataset, dirty_dataset)
error_annotation.reset_index(drop=True)


print(calculate_metrics(error_annotation, output))

(2376, 7)
(2376, 7)
{'accuracy': 0.6077441077441077, 'precision': 0.32042095835199286, 'recall': 0.29085365853658535, 'f1_score': 0.3049222245898146, 'roc_auc': 0.5158588647874184, 'pr_auc': 0.30297194278481077, 'true_positives_count': 4920, 'predicted_positives_count': 1431}
