In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
)

In [2]:
output_file_1 = "flights_20241114_101306"
output_file_2 = "flights_20241114_103536"
output_file_3 = "flights_20241114_101306"

Consolidated Output


In [3]:
output_1 = pd.read_csv(
    f"../backend/data/{output_file_1}/consolidated_error_annotations.csv"
)


output_2 = pd.read_csv(
    f"../backend/data/{output_file_2}/consolidated_error_annotations.csv"
)


output_3 = pd.read_csv(
    f"../backend/data/{output_file_3}/consolidated_error_annotations.csv"
)

In [4]:
clean_dataset = pd.read_csv(f"../backend/data/{output_file_1}/clean.csv")
dirty_dataset = pd.read_csv(f"../backend/data/{output_file_1}/dirty.csv")

In [5]:
def annotate_errors(
    df_fixed: pd.DataFrame, df_with_errors: pd.DataFrame
) -> pd.DataFrame:
    # Check if the dataframes have the same shape and columns after sorting
    if df_fixed.shape != df_with_errors.shape or not all(
        df_fixed.columns == df_with_errors.columns
    ):
        raise ValueError("Both dataframes must have the same structure.")

    # Convert both dataframes to strings for datatype-agnostic comparison
    df_fixed_str = df_fixed.astype(str)
    df_with_errors_str = df_with_errors.astype(str)

    # Create the annotation dataframe by comparing the two dataframes
    error_annotation = (df_fixed_str != df_with_errors_str).astype(int)

    return error_annotation

In [6]:
error_annotation = annotate_errors(clean_dataset, dirty_dataset)

In [7]:
def union_method(df1, df2, df3):
    """
    Combine three dataframes using the union method.
    A cell is 1 in the output if it is 1 in any of the input dataframes.
    """
    return (df1 | df2 | df3).astype(int)


def threshold_method(df1, df2, df3, threshold=0.5):
    """
    Combine three dataframes using a threshold.
    A cell is 1 in the output if it is 1 in at least `threshold` proportion of the input dataframes.
    """
    # Stack the dataframes and calculate the sum along the stack
    stacked = np.stack([df1.values, df2.values, df3.values])
    count_ones = np.sum(stacked, axis=0)

    # Calculate the threshold in terms of number of dataframes
    threshold_count = int(threshold * 3)

    # Determine if each cell meets the threshold and create a DataFrame
    result = (count_ones >= threshold_count).astype(int)

    # Return the result as a DataFrame with the same index and columns as the input dataframes
    return pd.DataFrame(result, index=df1.index, columns=df1.columns)

In [8]:
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
    average_precision_score,
)


def calculate_metrics(true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame):
    # Flatten the dataframes to 1D arrays
    y_true = true_dataset.values.flatten()
    y_pred = pred_dataset.values.flatten()

    # Basic metrics
    accuracy = accuracy_score(y_true, y_pred)

    # Handle precision, recall, and F1 score gracefully
    try:
        precision = precision_score(y_true, y_pred, zero_division=0)
        recall = recall_score(y_true, y_pred, zero_division=0)
        f1 = f1_score(y_true, y_pred, zero_division=0)
    except ValueError:
        # This case happens when y_true has no positive samples
        precision = recall = f1 = 0.0

    # Class-specific accuracy
    cm = confusion_matrix(y_true, y_pred)
    if cm.size == 4:  # If confusion matrix has 4 elements (2x2 matrix)
        tn, fp, fn, tp = cm.ravel()
    else:  # Handle cases where there's only one class in y_true
        tn = fp = fn = tp = 0
        if len(cm) == 1:
            tn = cm[0, 0] if y_true[0] == 0 else 0
            tp = cm[0, 0] if y_true[0] == 1 else 0

    # AUC scores
    if len(set(y_true)) > 1:  # AUC scores require at least two classes
        roc_auc = roc_auc_score(y_true, y_pred)
    else:
        roc_auc = None

    # Count of 1s in true and predicted labels
    predicted_positives_count = sum(y_pred == 1)  # Total 1s in true labels
    actual_positives_count = sum(y_true == 1)

    return {
        "accuracy": float(accuracy),
        "precision": float(precision),
        "recall": float(recall),
        "f1_score": float(f1),
        "roc_auc": float(roc_auc) if roc_auc is not None else None,
        "true_positives_count": int(tp),
        "true_negative_count": int(tn),
        "false_positive_count": int(fp),
        "false_negative_count": int(fn),
        "predicted_positives_count": int(predicted_positives_count),
        "actual_positives_count": int(actual_positives_count),
        "fp_rate": float(fp / len(y_pred)),
    }

In [9]:
# Column-wise metric calculation wrapper
def calculate_columnwise_metrics(
    true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame
):
    results = {}

    for column in true_dataset.columns:
        if column in pred_dataset.columns:

            metrics = calculate_metrics(true_dataset[[column]], pred_dataset[[column]])
            results[column] = metrics
        else:
            results[column] = "Column missing in predictions"

    return results

In [10]:
union_output = union_method(output_1, output_2, output_3)

threshold_output = threshold_method(output_1, output_2, output_3)

In [11]:
print(calculate_metrics(error_annotation, union_output))

{'accuracy': 0.5573593073593074, 'precision': 0.3241647465437788, 'recall': 0.45752032520325203, 'f1_score': 0.37946729602157786, 'roc_auc': 0.5284100942956151, 'true_positives_count': 2251, 'true_negative_count': 7019, 'false_positive_count': 4693, 'false_negative_count': 2669, 'predicted_positives_count': 6944, 'actual_positives_count': 4920, 'fp_rate': 0.2821669071669072}


In [12]:
print(calculate_metrics(error_annotation, threshold_output))

{'accuracy': 0.5573593073593074, 'precision': 0.3241647465437788, 'recall': 0.45752032520325203, 'f1_score': 0.37946729602157786, 'roc_auc': 0.5284100942956151, 'true_positives_count': 2251, 'true_negative_count': 7019, 'false_positive_count': 4693, 'false_negative_count': 2669, 'predicted_positives_count': 6944, 'actual_positives_count': 4920, 'fp_rate': 0.2821669071669072}


In [13]:
results = calculate_columnwise_metrics(error_annotation, union_output)

pd.DataFrame(results).to_csv("./consolidated_results.csv")

Attribute Output


In [14]:
output_1 = pd.read_csv(f"../backend/data/{output_file_1}/attribute/output.csv")

output_2 = pd.read_csv(f"../backend/data/{output_file_2}/attribute/output.csv")

output_3 = pd.read_csv(f"../backend/data/{output_file_3}/attribute/output.csv")

In [15]:
union_output = union_method(output_1, output_2, output_3)

threshold_output = threshold_method(output_1, output_2, output_3)

In [16]:
print(calculate_metrics(error_annotation, union_output))

{'accuracy': 0.545995670995671, 'precision': 0.12252510760401722, 'recall': 0.08678861788617886, 'f1_score': 0.10160618679357526, 'roc_auc': 0.4128444455551113, 'true_positives_count': 427, 'true_negative_count': 8654, 'false_positive_count': 3058, 'false_negative_count': 4493, 'predicted_positives_count': 3485, 'actual_positives_count': 4920, 'fp_rate': 0.18386243386243387}


In [17]:
print(calculate_metrics(error_annotation, threshold_output))

{'accuracy': 0.545995670995671, 'precision': 0.12252510760401722, 'recall': 0.08678861788617886, 'f1_score': 0.10160618679357526, 'roc_auc': 0.4128444455551113, 'true_positives_count': 427, 'true_negative_count': 8654, 'false_positive_count': 3058, 'false_negative_count': 4493, 'predicted_positives_count': 3485, 'actual_positives_count': 4920, 'fp_rate': 0.18386243386243387}


In [18]:
results = calculate_columnwise_metrics(error_annotation, union_output)

pd.DataFrame(results).to_csv("./attribute_results.csv")

Dependency Violation Output


In [19]:
output_1 = pd.read_csv(
    f"../backend/data/{output_file_1}/dependency_violations/output.csv"
)


output_2 = pd.read_csv(
    f"../backend/data/{output_file_2}/dependency_violations/output.csv"
)


output_3 = pd.read_csv(
    f"../backend/data/{output_file_3}/dependency_violations/output.csv"
)

In [20]:
union_output = union_method(output_1, output_2, output_3)

threshold_output = threshold_method(output_1, output_2, output_3)

In [21]:
print(calculate_metrics(error_annotation, union_output))

{'accuracy': 0.7285954785954786, 'precision': 0.5507753876938469, 'recall': 0.4475609756097561, 'f1_score': 0.49383269791433054, 'roc_auc': 0.6471069905371184, 'true_positives_count': 2202, 'true_negative_count': 9916, 'false_positive_count': 1796, 'false_negative_count': 2718, 'predicted_positives_count': 3998, 'actual_positives_count': 4920, 'fp_rate': 0.10798460798460799}


In [22]:
print(calculate_metrics(error_annotation, threshold_output))

{'accuracy': 0.7285954785954786, 'precision': 0.5507753876938469, 'recall': 0.4475609756097561, 'f1_score': 0.49383269791433054, 'roc_auc': 0.6471069905371184, 'true_positives_count': 2202, 'true_negative_count': 9916, 'false_positive_count': 1796, 'false_negative_count': 2718, 'predicted_positives_count': 3998, 'actual_positives_count': 4920, 'fp_rate': 0.10798460798460799}


In [23]:
results = calculate_columnwise_metrics(error_annotation, union_output)

pd.DataFrame(results).to_csv("./dep_viol_results.csv")

Prompt Metadata


In [42]:
import pandas as pd

# measure = "attribute"
# measure = "dependency"
measure = "dependency_violations"


# Load your DataFrames
df1 = pd.read_csv(
    f"../backend/data/rayyan_20241114_105931/{measure}/prompt_metadata.csv"
)
df2 = pd.read_csv(
    f"../backend/data/rayyan_20241114_111119/{measure}/prompt_metadata.csv"
)
df3 = pd.read_csv(
    f"../backend/data/rayyan_20241114_112300/{measure}/prompt_metadata.csv"
)


# Convert duration from MM:SS.MSS to seconds
def duration_to_seconds(duration):
    try:
        # Split minutes and seconds
        minutes, seconds = duration.split(":")
        minutes = int(minutes)
        seconds = float(seconds)  # Includes fractional seconds
        return minutes * 60 + seconds
    except Exception as e:
        print(f"Error parsing duration '{duration}': {e}")
        return 0  # Default to 0 seconds for invalid formats


# Apply conversion to all DataFrames
for df in [df1, df2, df3]:
    df["duration_seconds"] = df["elapsed_time"].apply(duration_to_seconds)


# Calculate totals for each dataset
def calculate_totals(df):
    totals = df.sum(numeric_only=True)  # Sum only numeric columns
    totals["duration_seconds"] = df["duration_seconds"].sum()
    return totals


totals1 = calculate_totals(df1)
totals2 = calculate_totals(df2)
totals3 = calculate_totals(df3)

# Combine totals into a single DataFrame
totals_df = pd.DataFrame([totals1, totals2, totals3])

# Calculate the average across datasets
averages = totals_df.mean()


# Convert total duration per dataset and average duration to MM:SS.MSS format
def seconds_to_duration(seconds):
    minutes = int(seconds // 60)
    seconds = seconds % 60
    return f"{minutes:02}:{seconds:06.3f}"  # Keeps milliseconds in the output


# Convert durations
totals_df["duration"] = totals_df["duration_seconds"].apply(seconds_to_duration)
average_duration = seconds_to_duration(averages["duration_seconds"])


# Print results
print("Totals for each dataset:")
print(totals_df)
print("\nAverage Duration Across Datasets:")
print(average_duration)
print("Total Tokens: ", averages["total_tokens"])
print("Completion Tokens: ", averages["completion_tokens"])
print("Prompt Tokens: ", averages["prompt_tokens"])

Totals for each dataset:
   completion_tokens  prompt_tokens  total_tokens  batches  duration_seconds  \
0            25461.0       145152.0      170613.0     48.0           214.507   
1            29767.0       183880.0      213647.0     60.0           260.720   
2            34225.0       200045.0      234270.0     58.0           278.790   

    duration  
0  03:34.507  
1  04:20.720  
2  04:38.790  

Average Duration Across Datasets:
04:11.339
Total Tokens:  206176.66666666666
Completion Tokens:  29817.666666666668
Prompt Tokens:  176359.0


Novel Detections


In [25]:
import pandas as pd
import numpy as np

# Example DataFrames
df1 = pd.DataFrame({"A": [1, 0, 3], "B": [0, 4, 0], "C": [5, 0, 0]})

df2 = pd.DataFrame({"A": [1, 0, 0], "B": [0, 4, 2], "C": [0, 1, 0]})

# Step 1: Find where the values differ
difference_mask = df1 != df2

# Step 2: Create a DataFrame with only differing values
differences = df1.where(difference_mask, np.nan)

# Step 3: (Optional) Replace NaN with 0 if you want a 0-filled output
differences_filled = differences.fillna(0)


print(differences_filled)

     A    B    C
0  0.0  0.0  5.0
1  0.0  0.0  0.0
2  3.0  0.0  0.0


In [57]:
tokens = 176359
cost = 0.15

print("$", float(tokens / 1000000) * cost)

$ 0.026453849999999998


In [74]:
tokens = 29818
cost = 0.60

print("$", float(tokens / 1000000) * cost)

$ 0.0178908


In [26]:
def inspect_classification(
    true_dataset: pd.DataFrame, pred_dataset: pd.DataFrame, input_dataset: pd.DataFrame
):
    true_dataset.reset_index(drop=True)
    pred_dataset.reset_index(drop=True)
    input_dataset.reset_index(drop=True)

    true_dataset.columns = input_dataset.columns
    pred_dataset.columns = input_dataset.columns

    calc = true_dataset.add(2)
    calc_out = pred_dataset.copy()
    calc_out[calc_out == 0] = -1

    calc = calc.add(calc_out)

    # True positive calculation
    tp = calc == 4
    true_positive_df = input_dataset[tp].astype(str)
    true_positive_df = true_positive_df.replace(to_replace="nan", value=0)
    true_positive_df = true_positive_df.reset_index(drop=True)  # Remove index

    # False positive calculation
    fp = calc == 3
    false_positive_df = input_dataset[fp].astype(str)
    false_positive_df = false_positive_df.replace(to_replace="nan", value=0)
    false_positive_df = false_positive_df.reset_index(drop=True)  # Remove index

    # False negative calculation
    fn = calc == 2
    false_negative_df = input_dataset[fn].astype(str)
    false_negative_df = false_negative_df.replace(to_replace="nan", value=0)
    false_negative_df = false_negative_df.reset_index(drop=True)  # Remove index

    all_errors_df = input_dataset[fp | fn].astype(str)
    all_errors_df = all_errors_df.replace(to_replace="nan", value=0)
    all_errors_df = all_errors_df.reset_index(drop=True)  # Remove index

    return true_positive_df, false_positive_df, false_negative_df, all_errors_df

In [27]:
output_file_1 = "rayyan_20241114_105931"
output_file_2 = "rayyan_20241114_111119"
output_file_3 = "rayyan_20241114_112300"

output_1 = pd.read_csv(
    f"../backend/data/{output_file_1}/consolidated_error_annotations.csv"
)


output_2 = pd.read_csv(
    f"../backend/data/{output_file_2}/consolidated_error_annotations.csv"
)


output_3 = pd.read_csv(
    f"../backend/data/{output_file_3}/consolidated_error_annotations.csv"
)

union_output = union_method(output_1, output_2, output_3)

In [28]:
dataset = "rayyan"

raha = (
    pd.read_csv(f"./tools/raha/datasets/{dataset}/annotated_cells.csv")
    # pd.read_csv(f"./tools/raha/datasets/movies_1/annotated_cells.csv")
    .astype(int).fillna(0)
)
# SynODC output
syn = (
    pd.read_csv(f"./tools/SynODC/Results/{dataset}/output/annotated_output.csv")
    .astype(int)
    .fillna(0)
)

FileNotFoundError: [Errno 2] No such file or directory: './tools/raha/datasets/rayyan/annotated_cells.csv'

In [None]:
raha_synodc = (raha | syn).fillna(0)

# Make sure the columns are in the correct order and ensure no index changes
raha_synodc = raha_synodc.astype(int)

# If you want to ensure the column order is preserved explicitly, you can reorder:
raha_synodc = raha_synodc[raha.columns]

In [None]:
dirty_dataset = pd.read_csv(f"./datasets/{dataset}/{dataset}.csv")
clean_dataset = pd.read_csv(f"./datasets/{dataset}/clean.csv")

In [None]:
error_annotation = annotate_errors(clean_dataset, dirty_dataset)

# CAED output
true_positive_df, false_positive_df, false_negative_df, all_errors_df = (
    inspect_classification(error_annotation, union_output, dirty_dataset)
)

caed_tp = true_positive_df.copy()


# Raha SynODC output
true_positive_df, false_positive_df, false_negative_df, all_errors_df = (
    inspect_classification(error_annotation, raha_synodc, dirty_dataset)
)

raha_syn_tp = true_positive_df.copy()

# Raha output
true_positive_df, false_positive_df, false_negative_df, all_errors_df = (
    inspect_classification(error_annotation, raha, dirty_dataset)
)

raha_tp = true_positive_df.copy()

# SynODC output
true_positive_df, false_positive_df, false_negative_df, all_errors_df = (
    inspect_classification(error_annotation, syn, dirty_dataset)
)

syn_tp = true_positive_df.copy()

In [None]:
# Combined
# Step 1: Find where the values differ
difference_mask = caed_tp != raha_syn_tp

# Step 2: Create a DataFrame with only differing values
differences = caed_tp.where(difference_mask, np.nan)

# Step 3: Replace NaN with 0, ensuring all values are treated as float
differences_filled = differences.fillna(0)

# Optional: Drop the first column if needed
# differences_filled.drop(columns=differences_filled.columns[0], axis=1, inplace=True)


# Step 4: Define a function to identify non-zero values robustly
def is_non_zero(value):
    try:
        # Try to cast to a float, compare to zero
        return float(value) != 0.0
    except ValueError:
        # If value cannot be converted to a float, assume it's non-zero
        return True


# Apply the function to the entire DataFrame
non_zero_mask = differences_filled.applymap(is_non_zero)

# Count the number of True values (non-zero values)
non_zero_count = non_zero_mask.sum().sum()

print(f"Count of non-zero values: {non_zero_count}")

# Step 5: Save the differences
differences_filled.to_csv(f"./novel_detections/{dataset}/novel.csv", index=False)

Count of non-zero values: 253


In [None]:
# Raha output

# Step 1: Find where the values differ
difference_mask = caed_tp != raha_tp


# Step 2: Create a DataFrame with only differing values
differences = caed_tp.where(difference_mask, np.nan)


# Step 3: Replace NaN with 0, ensuring all values are treated as float
differences_filled = differences.fillna(0)


# Optional: Drop the first column if needed
# differences_filled.drop(columns=differences_filled.columns[0], axis=1, inplace=True)


# Step 4: Define a function to identify non-zero values robustly
def is_non_zero(value):
    try:
        # Try to cast to a float, compare to zero
        return float(value) != 0.0
    except ValueError:
        # If value cannot be converted to a float, assume it's non-zero
        return True


# Apply the function to the entire DataFrame
non_zero_mask = differences_filled.applymap(is_non_zero)

# Count the number of True values (non-zero values)
non_zero_count = non_zero_mask.sum().sum()

print(f"Count of non-zero values Raha: {non_zero_count}")

# Step 5: Save the differences
# differences_filled.to_csv("./novel.csv", index=False)

Count of non-zero values Raha: 256


In [None]:
# SynODC

# Raha output
# raha = pd.read_csv(f"./tools/raha/datasets/{dataset}/tp.csv")
# # SynODC output


# raha_syn = raha.combine_first(syn).iloc[:, 1:]


# Step 1: Find where the values differ
difference_mask = caed_tp != syn_tp

# Step 2: Create a DataFrame with only differing values
differences = caed_tp.where(difference_mask, np.nan)

# Step 3: Replace NaN with 0, ensuring all values are treated as float
differences_filled = differences.fillna(0)

# Optional: Drop the first column if needed
# differences_filled.drop(columns=differences_filled.columns[0], axis=1, inplace=True)


# Step 4: Define a function to identify non-zero values robustly
def is_non_zero(value):
    try:
        # Try to cast to a float, compare to zero
        return float(value) != 0.0
    except ValueError:
        # If value cannot be converted to a float, assume it's non-zero
        return True


# Apply the function to the entire DataFrame
non_zero_mask = differences_filled.applymap(is_non_zero)

# Count the number of True values (non-zero values)
non_zero_count = non_zero_mask.sum().sum()

print(f"Count of non-zero values SynODC: {non_zero_count}")

# Step 5: Save the differences
# differences_filled.to_csv("./novel.csv", index=False)

Count of non-zero values SynODC: 516
