In [2]:

import csv
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

In [3]:
def evaluate_predictions(actual_df, prediction_df):
    """
    Evaluate the performance of predicted labels against actual labels.

    The two frames are inner-joined on "ReviewText", so only reviews present
    in both frames are scored. Only the columns needed for scoring are taken
    from each frame, which keeps the join working when the frames share other
    column names (for example a prediction file that also carries "label").

    As a side effect, a scikit-learn classification report is printed.

    Args:
        actual_df (pd.DataFrame): DataFrame containing the actual labels with "ReviewText" and "label".
        prediction_df (pd.DataFrame): DataFrame containing the predicted labels with "ReviewText" and "PredictedLabel".

    Returns:
        dict: A dictionary with the keys "accuracy", "precision", "recall"
        and "f1_score". Accuracy, precision and recall are percentages
        (0-100); "f1_score" is left on the 0-1 scale. Precision, recall and
        F1 are support-weighted averages over the classes.

    Raises:
        KeyError: If a required column is missing from either frame.
        ValueError: If no "ReviewText" matches between the frames, or if a
            matched row has a missing actual or predicted label.
    """
    key_col, true_col, pred_col = "ReviewText", "label", "PredictedLabel"

    # Fail early with a message that names the frame and the absent column(s).
    for frame_name, frame, needed in (
        ("actual_df", actual_df, (key_col, true_col)),
        ("prediction_df", prediction_df, (key_col, pred_col)),
    ):
        absent = [col for col in needed if col not in frame.columns]
        if absent:
            raise KeyError(f"{frame_name} is missing required column(s): {absent}")

    # Join only the needed columns: any other shared column name would be
    # suffixed (_x/_y) by the merge and make the label lookups below fail.
    comparison_df = pd.merge(
        actual_df[[key_col, true_col]],
        prediction_df[[key_col, pred_col]],
        on=key_col,
        how="inner",
    )

    if comparison_df.empty:
        raise ValueError("No matching ReviewText found between actual and predicted data.")

    y_true = comparison_df[true_col]
    y_pred = comparison_df[pred_col]

    # Missing labels cannot be scored; report them here instead of letting
    # the metric functions fail on them with a less helpful error.
    missing_rows = int((y_true.isna() | y_pred.isna()).sum())
    if missing_rows:
        raise ValueError(
            f"{missing_rows} matched row(s) have a missing actual or predicted label."
        )

    # Accuracy is the fraction of rows whose predicted label equals the actual one.
    accuracy = (y_true == y_pred).mean()

    # Labels are passed to scikit-learn as-is (no numeric encoding is done).
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)
    f1 = f1_score(y_true, y_pred, average="weighted", zero_division=0)

    # Per-class breakdown; zero_division=0 matches the metric calls above so
    # classes that are never predicted do not trigger warnings here either.
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, zero_division=0))

    # NOTE: "f1_score" intentionally stays on the 0-1 scale (unlike the other
    # three values) so existing callers that format it without "%" still work.
    return {
        "accuracy": accuracy * 100,
        "precision": precision * 100,
        "recall": recall * 100,
        "f1_score": f1
    }

In [17]:
# Ground-truth labels and the model's one-shot predictions
input_file = "vendor.csv"
predictions_file = "results/vendor_predictions_one_shot_full_dataset.csv"

df_input = pd.read_csv(input_file)
df_predictions = pd.read_csv(predictions_file)

# Inspect the raw values of the 'PredictedLabel' column before any clean-up:
# which distinct labels occur, and how often each one does.
unique_labels = df_predictions['PredictedLabel'].unique()
print(f"Unique labels: {unique_labels}")

label_counts = df_predictions['PredictedLabel'].value_counts()
print("Counts of each category:", label_counts, sep="\n")

# Map the shortened label seen in the predictions onto its full category name
# so that it can match the corresponding actual label.
label_mapping = {'Non-satisfaction With': 'Non-satisfaction With Vendor'}
df_predictions = df_predictions.assign(
    PredictedLabel=df_predictions['PredictedLabel'].replace(label_mapping)
)

# Score the normalised predictions against the actual labels
metrics = evaluate_predictions(df_input, df_predictions)

# Accuracy/precision/recall are printed as percentages; F1 is printed without "%".
print(
    f"Accuracy: {metrics['accuracy']:.2f}%",
    f"Precision: {metrics['precision']:.2f}%",
    f"Recall: {metrics['recall']:.2f}%",
    f"F1 Score: {metrics['f1_score']:.2f}",
    sep="\n",
)

Unique labels: ['Satisfied With Vendor' 'Non-satisfaction With' 'No Comment']
Counts of each category:
Satisfied With Vendor    1937
Non-satisfaction With     313
No Comment                  5
Name: PredictedLabel, dtype: int64

Classification Report:
                              precision    recall  f1-score   support

                  No Comment       1.00      0.00      0.00      2068
Non-satisfaction With Vendor       0.16      0.62      0.25        79
       Satisfied With Vendor       0.05      0.98      0.10       108

                    accuracy                           0.07      2255
                   macro avg       0.40      0.53      0.12      2255
                weighted avg       0.93      0.07      0.02      2255

Accuracy: 7.10%
Precision: 92.52%
Recall: 7.10%
F1 Score: 0.02
