In [1]:
import paths
from ansi_colors import *
from evaluation import evaluate_claims, evaluate_specs, extract_claims_from_directory, calculate_metrics_with_threshold

# Load the reference claims that the predictions are scored against in the cells below.
gt_dict = extract_claims_from_directory(paths.GROUND_TRUTH.CLAIMS.value)  # Ground truth
# Load the predicted claims with the same loader so both sides share one structure.
# NOTE(review): the ground-truth path is unwrapped with `.value`, while `paths.CLAIMS`
# is passed as-is — confirm that `paths.CLAIMS` is already a plain path.
claims_dict = extract_claims_from_directory(paths.CLAIMS)  # Predictions

## Evaluation method 1: Exact match

Here we consider a claim a true positive only if it is exactly the same as the corresponding ground truth claim. Even a single specification error or a single differing string means the claim is counted as an error instead of a true positive.
We measure claim Precision, Recall and F1-measure, alongside the same metrics computed over specifications.

In [2]:
# Exact-match evaluation: run the claim-level scorer first, then the spec-level
# scorer, and print the same report for each. Both scorers are unpacked as
# (precision, recall, number of files analyzed).
# The "evaluation started" headers seen in the output are not printed in this
# cell — presumably they come from the evaluate_* functions themselves.
for evaluate in (evaluate_claims, evaluate_specs):
    precision, recall, file_number = evaluate(gt_dict, claims_dict)

    # F1 is the harmonic mean of precision and recall; it is defined as 0 when
    # both are 0 so the division below can never hit a zero denominator.
    if (precision + recall) > 0:
        f1 = 2 * ((precision * recall) / (precision + recall))
    else:
        f1 = 0

    print(f"{GREEN}Done! {RESET}{file_number}{GREEN} files were analyzed{RESET}")
    print(f"{GREEN}Precision = {RESET} {precision:.2f}")
    print(f"{GREEN}Recall = {RESET} {recall:.2f}")
    print(f"{GREEN}F1 = {RESET} {f1:.2f}\n")

- [31mCLAIMS evaluation started:[0m
[32mDone! [0m31[32m files were analyzed[0m
[32mPrecision = [0m 0.53
[32mRecall = [0m 0.52
[32mF1 = [0m 0.52

- [31mSPECS evaluation started:[0m
[32mDone! [0m31[32m files were analyzed[0m
[32mPrecision = [0m 0.93
[32mRecall = [0m 0.92
[32mF1 = [0m 0.93



## Evaluation method 2: Best match with Threshold

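Here we consider a claim a true positive if the precision of its specifications against the best-matching ground truth claim is at least _threshold_ (e.g. 0.9), i.e. at least that fraction of its specifications match exactly. Specifications are treated as before.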
We measure claim Precision, Recall and F1-measure, alongside the same metrics computed over specifications.

### How It Works

1. Specification Precision for Each Claim:
    - For each predicted claim, it calculates the precision of its specifications compared to every unmatched ground truth claim.
    - The precision is calculated as:
    - $\text{Precision} = \dfrac{\text{True Positives}}{\text{True Positives} + \text{False Positives}}$
    - The claim is considered a true positive if its specification precision is greater than or equal to the threshold.

2. Claim Matching:
    - Each predicted claim is matched to the best ground truth claim based on specification precision.
    - Ground truth and predicted claims that are matched are marked to avoid duplication.

3. Update Metrics:
    - True positives, false positives, and false negatives are updated for both claims and specifications based on the matching.

4. Final Metric Calculation:
    - After processing all claims, precision, recall, and F1-score are calculated for both specifications and claims.

In [20]:
# Best-match evaluation: a predicted claim counts as a true positive when its
# specification precision against a ground-truth claim reaches this threshold.
MATCH_THRESHOLD = 0.9
metrics = calculate_metrics_with_threshold(gt_dict, claims_dict, MATCH_THRESHOLD)

# `metrics` is read through keys of the form "<kind>_precision", "<kind>_recall"
# and "<kind>_f1", with kind being "claim" or "spec"; print one report per kind.
# Keys are built with single quotes inside the f-strings: reusing the enclosing
# double quotes in a replacement field is a SyntaxError before Python 3.12.
for label, kind in (("CLAIMS", "claim"), ("SPECS", "spec")):
    print(f"- {RED}{label} evaluation:{RESET}")
    print(f"{GREEN}Done!{RESET}")
    print(f"{GREEN}Precision = {RESET} {metrics[kind + '_precision']:.2f}")
    print(f"{GREEN}Recall = {RESET} {metrics[kind + '_recall']:.2f}")
    print(f"{GREEN}F1 = {RESET} {metrics[kind + '_f1']:.2f}\n")


- [31mCLAIMS evaluation:[0m
[32mDone![0m
[32mPrecision = [0m 0.79
[32mRecall = [0m 0.87
[32mF1 = [0m 0.83

- [31mSPECS evaluation:[0m
[32mDone![0m
[32mPrecision = [0m 1.00
[32mRecall = [0m 0.98
[32mF1 = [0m 0.99

