In [3]:
# 📘 SecEval Ollama Benchmark Notebook (Colab)

# 📦 Install dependencies
!pip install datasets requests



In [4]:
# ⚙️ Setup
import time
import requests
from datasets import load_dataset

# 🔗 Replace this with your ngrok/localtunnel/external link
# The base URL is normalised to have NO trailing slash: request URLs are built
# as f"{OLLAMA_API}/api/generate", so a trailing slash here would produce a
# double slash in the request path.
OLLAMA_API = "https://be12-2405-201-e025-f0fb-7464-6fa0-ae1a-e93a.ngrok-free.app/".rstrip("/")  # e.g., https://abc123.loca.lt
MODEL_NAME = "qwen2.5-coder:latest"  # or your custom model name in Ollama
NUM_QUESTIONS = 30  # Set how many questions you want to evaluate

In [20]:
def format_prompt(question, choices):
    """Build the multiple-choice prompt that is sent to the model.

    Args:
        question: The question text.
        choices: Sequence of answer options. Each option is labelled
            "A.", "B.", ... according to its position. An option that already
            starts with its own positional label (for example "A: ..." as in
            the SecEval sample) has that label removed first, so the prompt
            does not show "A. A: ...".

    Returns:
        The complete prompt string, ending with "Answer:".
    """
    def relabel(index, choice):
        # Render one option as "<letter>. <text>" without duplicating a label
        # the option text may already carry.
        letter = chr(65 + index)
        text = str(choice)
        stripped = text.lstrip()
        # Only strip a prefix that is exactly this option's own letter followed
        # by ":", "." or ")" and then whitespace/end, so ordinary text that
        # merely starts with a capital letter is left alone.
        has_own_label = (
            len(stripped) >= 2
            and stripped[0] == letter
            and stripped[1] in ":.)"
            and (len(stripped) == 2 or stripped[2].isspace())
        )
        if has_own_label:
            text = stripped[2:].lstrip()
        return f"{letter}. {text}"

    formatted_choices = "\n".join(relabel(i, c) for i, c in enumerate(choices))
    return f"""You are given a multiple-choice question.

Strictly Respond ONLY with capital letter of choice: A
if multiple answers are correct respond lik : ABC (if A,B,C are correct then output shd be like this ABC no spaces or commas in middle)

Question:
{question}

Choices:
{formatted_choices}
Answer:"""
import re
def extract_answer(output):
    """Extract the chosen option letters (A-D) from a model reply.

    The reply is interpreted in three steps, stopping at the first that
    yields letters:

    1. If the whole reply is nothing but option letters and separators
       (whitespace, ",", ";", "&", "/", ".", or the word "and"), all of its
       letters are used, case-insensitively ("b", "A, C", "A and B", "ABCD").
    2. Otherwise, standalone tokens of 1-4 CAPITAL letters A-D are collected
       from the reply in its original casing. Keeping the casing prevents the
       lowercase article "a" from being read as option A.
    3. If step 2 finds nothing, the same token search is repeated
       case-insensitively ("the answer is b").

    Known limitation: a capital "A" used as a sentence-initial article, or a
    capitalised word made only of A-D letters, is still read as an answer.

    Args:
        output: Raw text returned by the model. None or empty is accepted.

    Returns:
        The unique letters found, sorted and concatenated (e.g. "ACD"),
        or "N/A" when no option letter can be found.
    """
    if not output:
        return "N/A"
    text = str(output).strip()

    def normalise(tokens):
        # Merge all tokens into one sorted, de-duplicated letter string.
        letters = sorted(set("".join(tokens)))
        return "".join(letters) if letters else "N/A"

    # Step 1: reply consists solely of option letters and separators.
    compact = re.sub(r'\bAND\b|[\s,;&/.]+', '', text.upper())
    if re.fullmatch(r'[A-D]+', compact):
        return normalise([compact])

    # Step 2: capitalised standalone tokens, single ("B") or combined ("AB").
    tokens = re.findall(r'\b[A-D]{1,4}\b', text)
    if not tokens:
        # Step 3: nothing capitalised was found; accept lowercase letters too.
        tokens = re.findall(r'\b[A-D]{1,4}\b', text.upper())
    return normalise(tokens)

In [21]:
# 📥 Load dataset
import pandas as pd
from datasets import Dataset

# 🔢 NUM_QUESTIONS is defined in the ⚙️ Setup cell. It is deliberately not
# re-assigned here, so the setup cell is the single place to change it and a
# value chosen there is not silently overridden.

print("📥 Loading SecEval dataset from Hugging Face (via hf:// protocol)...")

df = pd.read_json("hf://datasets/XuanwuAI/SecEval/questions.json")

# Fail early with a clear message if the columns used by the benchmark
# ("question", "choices", "answer") are not present in the downloaded file.
missing_columns = {"question", "choices", "answer"} - set(df.columns)
if missing_columns:
    raise KeyError(f"SecEval data is missing expected columns: {sorted(missing_columns)}")

# 🧹 Trim to the first N questions
df = df.iloc[:NUM_QUESTIONS].reset_index(drop=True)

# 🔁 Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df)

print(f"✅ Loaded {len(dataset)} questions")

# 📝 Display a sample (skipped when the selection is empty, e.g. NUM_QUESTIONS = 0)
if len(dataset) > 0:
    sample = dataset[0]
    print("\n📝 Sample question:")
    print(f"Q: {sample['question']}")
    print(f"Choices: {sample['choices']}")
    print(f"Answer: {sample['answer']}")

📥 Loading SecEval dataset from Hugging Face (via hf:// protocol)...
✅ Loaded 30 questions

📝 Sample question:
Q: You are tasked with designing a secure storage system for an Android device's hardware identifiers as part of an ID attestation implementation. Which of the following properties are essential for ensuring the system's integrity and security?
Choices: ['A: The storage must contain the original identifiers to enable the TEE to verify their authenticity during attestation.', 'B: The storage should be tamper-evident to ensure any modification is detectable, rendering the attestation invalid.', 'C: The `destroyAttestationIds()` method should be able to restore the identifier-derived data after a factory reset.', 'D: RMA facilities must not have the ability to regenerate hardware identifier-derived data to prevent unauthorized attestation.']
Answer: B


In [22]:
import json

# 🚀 Benchmark
# Sends every question in `dataset` to the Ollama /api/generate endpoint,
# compares the extracted answer letters with the reference answer, and reports
# accuracy and mean request latency.

# Upper bound (seconds) for a single generation request. Without a timeout,
# `requests` waits indefinitely if the tunnel or the model server stalls.
REQUEST_TIMEOUT_S = 300

# Strip any trailing slash from the base URL so the path never becomes
# "//api/generate".
generate_url = f"{OLLAMA_API.rstrip('/')}/api/generate"

# Report progress against the number of questions actually loaded, which can
# be smaller than NUM_QUESTIONS if the source file has fewer rows.
total_questions = len(dataset)

correct = 0
latencies = []

for i, sample in enumerate(dataset):
    print(f"\n🔄 Question {i+1}/{total_questions}")

    # Use sample directly (no json.loads needed)
    prompt = format_prompt(sample["question"], sample["choices"])

    # Send prompt to Ollama API; perf_counter is a monotonic clock meant for
    # measuring elapsed intervals.
    start = time.perf_counter()
    response = requests.post(
        generate_url,
        json={
            "model": MODEL_NAME,
            "prompt": prompt,
            "stream": False
        },
        timeout=REQUEST_TIMEOUT_S,
    )
    latency = time.perf_counter() - start

    # Surface HTTP errors (wrong URL, unknown model, expired tunnel) instead
    # of silently scoring the failed request as a wrong answer.
    response.raise_for_status()

    output_text = response.json().get("response", "")
    prediction = extract_answer(output_text)
    # Normalise the reference like the prediction (unique letters, sorted) so
    # that ordering or separators in the reference do not cause a mismatch.
    actual = "".join(sorted({ch for ch in sample["answer"].upper() if ch.isalpha()}))
    is_correct = (prediction == actual)

    correct += int(is_correct)
    latencies.append(latency)

    # 🔍 Show model prediction and correct answer
    print("📤 Model's Raw Output:", output_text.strip())
    print("📝 Prediction:", prediction)
    print("✅ Actual Answer:", actual)
    print("🎯 Correct?" , "✅ Yes" if is_correct else "❌ No")
    print("⏱️ Latency:", round(latency, 2), "s")

# Divide by the number of questions that were actually evaluated; guard the
# empty case so an empty dataset reports 0 rather than raising.
evaluated = len(latencies)
accuracy = correct / evaluated * 100 if evaluated else 0.0
avg_latency = sum(latencies) / evaluated if evaluated else 0.0

print("\n🎯 Results")
print(f"✅ Accuracy: {accuracy:.2f}%")
print(f"⏱️ Avg Latency: {avg_latency:.2f}s")



🔄 Question 1/30
📤 Model's Raw Output: ABCD
📝 Prediction: ABCD
✅ Actual Answer: B
🎯 Correct? ❌ No
⏱️ Latency: 1.7 s

🔄 Question 2/30
📤 Model's Raw Output: B
📝 Prediction: B
✅ Actual Answer: B
🎯 Correct? ✅ Yes
⏱️ Latency: 0.33 s

🔄 Question 3/30
📤 Model's Raw Output: B
📝 Prediction: B
✅ Actual Answer: B
🎯 Correct? ✅ Yes
⏱️ Latency: 0.34 s

🔄 Question 4/30
📤 Model's Raw Output: A
📝 Prediction: A
✅ Actual Answer: A
🎯 Correct? ✅ Yes
⏱️ Latency: 0.31 s

🔄 Question 5/30
📤 Model's Raw Output: A
📝 Prediction: A
✅ Actual Answer: A
🎯 Correct? ✅ Yes
⏱️ Latency: 0.33 s

🔄 Question 6/30
📤 Model's Raw Output: A
📝 Prediction: A
✅ Actual Answer: A
🎯 Correct? ✅ Yes
⏱️ Latency: 0.35 s

🔄 Question 7/30
📤 Model's Raw Output: ABC
📝 Prediction: ABC
✅ Actual Answer: AB
🎯 Correct? ❌ No
⏱️ Latency: 0.4 s

🔄 Question 8/30
📤 Model's Raw Output: A
📝 Prediction: A
✅ Actual Answer: A
🎯 Correct? ✅ Yes
⏱️ Latency: 0.37 s

🔄 Question 9/30
📤 Model's Raw Output: ABCD
📝 Prediction: ABCD
✅ Actual Answer: B
🎯 Correct? ❌ No