# ROC With Majority Vote

In [None]:
import itertools
import pathlib
import statistics
from typing import Any, Literal

import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
from tqdm import tqdm

from scripts.load_working_set import load_working_set

print("Done")

Some constants.

In [None]:
# The location of the data. The path is relative, so it is resolved against the
# current working directory; later cells save their figures and the results file here.
DATA_DIR = pathlib.Path("./data")
# Fail fast with a clear message instead of erroring later when a plot is saved.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Data directory not found: '{DATA_DIR}'")

Grabbing our data.

In [None]:
# Load our data: the "fine-tuning" working set is the one the thresholds are tuned on.
# NOTE(review): presumably `sentence_evaluations` holds one list of scores per sentence
# and `sentence_realities` the matching ground-truth labels - confirm against
# load_working_set.
sentence_evaluations, sentence_realities = load_working_set("fine-tuning")

print("Done")

Create a working set of thresholds to calculate the ROC/AUROC for.

In [None]:
# Pool every individual evaluation score across all sentences, keep each
# distinct value once, and order the values ascending.
flat_sentence_evaluations = sorted(set(itertools.chain.from_iterable(sentence_evaluations)))

# A candidate threshold sits halfway between each pair of neighbouring scores.
distinct_thresholds = [
    (_lower + _upper) / 2
    for _lower, _upper in zip(flat_sentence_evaluations, flat_sentence_evaluations[1:])
]

# Make sure the two extremes are candidates as well.
# NOTE(review): this assumes the scores live in [0, 1] - confirm.
for _extreme in (0, 1):
    if _extreme not in distinct_thresholds:
        distinct_thresholds.append(_extreme)

A little prediction function to implement the majority approach.

In [None]:
def predict(_sentence_evaluations: list[float], _threshold: float) -> float:
    """Return the fraction of evaluations that vote positive at ``_threshold``.

    Every evaluation casts one vote: positive when it is greater than or equal
    to ``_threshold``, negative otherwise. The votes are averaged, so the result
    is 0 when nothing reaches the threshold and 1 when everything does.

    :param _sentence_evaluations: The individual evaluation scores of one sentence.
    :param _threshold: The cut-off an evaluation must reach to count as a positive vote.
    :return: The mean of the votes.
    :raises statistics.StatisticsError: If ``_sentence_evaluations`` is empty.
    """
    # bool(...) keeps the votes plain Python booleans, whatever the comparison returns
    _votes = [bool(_score >= _threshold) for _score in _sentence_evaluations]

    return statistics.mean(_votes)


print("Done")

Now, for each candidate threshold, we compute the vote-based prediction for every sentence, generate the ROC curve and its AUROC, then store them.

In [None]:
# Per candidate vote threshold we keep the AUROC together with the raw ROC arrays.
_data_keys = Literal["AUROC", "fpr", "tpr", "thresholds"]
threshold_to_data: dict[float, dict[_data_keys, Any]] = {}

for current_threshold in tqdm(distinct_thresholds, desc="Calculating AUROC for each threshold"):
    # One vote fraction per sentence at this candidate threshold
    sentence_predictions = [
        predict(_evaluations, current_threshold)
        for _evaluations in sentence_evaluations
    ]

    # ROC curve of those vote fractions against the ground truth, and its area
    fpr, tpr, all_vote_thresholds = roc_curve(sentence_realities, sentence_predictions)
    roc_auc = auc(fpr, tpr)

    # Remember everything needed to plot and to choose the final cut-off later
    threshold_to_data[current_threshold] = dict(
        zip(
            ("AUROC", "fpr", "tpr", "thresholds"),
            (roc_auc, fpr, tpr, all_vote_thresholds),
        )
    )

print("Done")

Finding the best threshold value via AUROC and graphing it.

In [None]:
# The winning vote threshold is the one whose ROC curve has the largest area
best_vote_threshold, best_vote_threshold_data = max(
    threshold_to_data.items(),
    key=lambda _item: _item[1]["AUROC"],
)
best_vote_threshold_auroc = best_vote_threshold_data["AUROC"]

# Pair every threshold with its AUROC for plotting
all_vote_thresholds = list(threshold_to_data)
all_vote_data = [threshold_to_data[_threshold]["AUROC"] for _threshold in all_vote_thresholds]

# Order the points by threshold so the line is drawn left to right
_ordered_points = sorted(zip(all_vote_thresholds, all_vote_data))
sorted_vote_thresholds, sorted_vote_auroc_values = zip(*_ordered_points)

# The figure title doubles as the file name of the saved image
title = "AUROC vs Vote Threshold for Majority Vote Approach"
file = DATA_DIR / (title.replace(" ", "_") + ".png")

# AUROC as a function of the vote threshold
plt.figure(figsize=(10, 6))
plt.plot(sorted_vote_thresholds, sorted_vote_auroc_values, marker="o", linestyle="-")
plt.xlabel("Threshold")
plt.ylabel("AUROC")
plt.grid(True)

# Mark the best threshold with a dashed vertical line and a red dot
plt.axvline(
    x=best_vote_threshold,
    color="red",
    linestyle="--",
    label=f"Best Threshold: {best_vote_threshold:.2f}",
)
plt.scatter(best_vote_threshold, best_vote_threshold_auroc, color="red", s=100)

plt.legend()
plt.title(title)

# Write the image to disk first, then render it in the notebook
plt.savefig(file)
plt.show()

Graph the ROC curve for this best vote threshold.  
Then use Youden's J statistic to find the best final threshold to apply to the resulting vote fraction.

In [None]:
# Find the best threshold for this threshold, kinda confusing I know...
# `best_vote_threshold` decides how each individual evaluation votes. Here we pick
# the second cut-off, the one applied to the resulting vote fraction, as the ROC
# point that maximises Youden's J statistic (J = TPR - FPR).
youden_index = best_vote_threshold_data["tpr"] - best_vote_threshold_data["fpr"]
best_threshold_youden_index = np.argmax(youden_index)
best_threshold_youden = best_vote_threshold_data["thresholds"][best_threshold_youden_index]
# NOTE(review): scikit-learn's roc_curve places a sentinel threshold at index 0 (the
# "nothing predicted positive" point). If the argmax lands there, the curve never
# rises above chance and the value printed below is that sentinel - confirm.

# Values reused several times below
_auroc = best_vote_threshold_data["AUROC"]
_best_fpr = best_vote_threshold_data["fpr"][best_threshold_youden_index]
_best_tpr = best_vote_threshold_data["tpr"][best_threshold_youden_index]
_best_j = youden_index[best_threshold_youden_index]

# And plot
plt.figure()
# The AUROC is formatted from a local: subscripting with double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
plt.plot(best_vote_threshold_data["fpr"], best_vote_threshold_data["tpr"],
         color="darkorange", lw=2,
         label=f"ROC curve (AUROC = {_auroc:.2f})")
# Chance diagonal for reference
plt.plot([0, 1], [0, 1], color="navy", lw=2, linestyle="--")

# Plot the point of the best threshold; the label shows the threshold and its J value
plt.scatter(_best_fpr, _best_tpr,
            color="red",
            label=f"Best Threshold = {best_threshold_youden:.2f} (J = {_best_j:.2f})")
# And draw a vertical line underneath, down to the chance diagonal (y = FPR):
# its length is exactly Youden's J at this point
plt.plot(
    [_best_fpr, _best_fpr],
    [_best_tpr, _best_fpr],
    "r--"
)

# Legend
plt.legend(loc="lower right")

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

# Title
title = f"Receiver Operating Characteristic (ROC) for Vote Threshold {best_vote_threshold:.2f}"
plt.title(title)

# Save the file
file = DATA_DIR / (title.replace(" ", "_") + ".png")
plt.savefig(file)

# Display
plt.show()

print(f"Best threshold = {best_threshold_youden:.2f}")

print("Done")

Now some further evaluation.

In [None]:
# Check the two tuned thresholds on the separate "evaluation" working set
sentence_evaluations_eval, sentence_realities_eval = load_working_set("evaluation")

# Step 1: turn each sentence's evaluations into a vote fraction, using the vote threshold
sentence_predictions_eval: list[float] = [
    predict(evaluations, best_vote_threshold)
    for evaluations in sentence_evaluations_eval
]

# Step 2: apply the final threshold to the vote fraction.
# This must be the Youden threshold, not the vote threshold again: the ROC curve it
# was chosen from is computed over vote fractions, and roc_curve counts a sample as
# positive when its score is >= the threshold.
classified_sentences_eval = [
    1 if p >= best_threshold_youden else 0 for p in sentence_predictions_eval
]

# And calculate some performance metrics
accuracy = accuracy_score(sentence_realities_eval, classified_sentences_eval)
precision = precision_score(sentence_realities_eval, classified_sentences_eval)
recall = recall_score(sentence_realities_eval, classified_sentences_eval)
f1 = f1_score(sentence_realities_eval, classified_sentences_eval)

# Build the report once so the console and the file cannot drift apart
report_lines = [
    f"For vote threshold {best_vote_threshold:.2f}",
    f"For final threshold {best_threshold_youden:.2f}\n",
    f"\tAccuracy:  {accuracy:.2f}",
    f"\tPrecision: {precision:.2f}",
    f"\tRecall:    {recall:.2f}",
    f"\tF1 Score:  {f1:.2f}",
]

# Tells us the secrets
for report_line in report_lines:
    print(report_line)

# File to store this
results_file = DATA_DIR / "roc_vote_results.txt"
with results_file.open("w") as f:
    for report_line in report_lines:
        print(report_line, file=f)

print()
print("Done")