In [2]:
import requests

def categorize_query_with_gemma(question, model="gemma3:latest", timeout=120):
    """Ask a locally hosted model to categorize a question and suggest tags.

    Sends one non-streaming POST to ``http://localhost:11434/api/generate``
    (the URL matches Ollama's default generate endpoint) with a prompt that
    instructs the model to reply in a fixed ``Category: ...`` / ``Tags: ...``
    layout.

    Args:
        question: Question text to classify. It is interpolated into the
            prompt verbatim, between double quotes.
        model: Model name placed in the request payload.
        timeout: Seconds ``requests`` waits for the server before giving up.
            Because the request is non-streaming, this bounds the whole
            generation. Pass ``None`` to wait indefinitely, which is what
            happened before this parameter existed.

    Returns:
        The ``"response"`` field of the JSON reply (the model's raw text; it
        is not parsed or validated here) when the HTTP status is 200.
        Otherwise the status code and body are printed and ``None`` is
        returned.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within ``timeout`` seconds.
    """
    url = "http://localhost:11434/api/generate"
    headers = {"Content-Type": "application/json"}

    # Structured prompt: the rigid output format keeps the reply easy to
    # split into a category line and a tags line downstream.
    prompt = (
        f"Given the question: \"{question}\"\n"
        "Classify it into a broad category and generate 2–4 relevant tags.\n"
        "Respond only in this format:\n"
        "Category: <Category Name>\n"
        "Tags: <Comma-separated tags>\n"
    )

    payload = {
        "model": model,
        "prompt": prompt,
        # Ask for a single JSON object instead of a stream of chunks.
        "stream": False
    }

    # requests applies no timeout by default; without one a stalled server
    # would hang the notebook kernel forever.
    response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    if response.status_code == 200:
        return response.json()["response"]
    else:
        print(f"[ERROR] Status: {response.status_code}")
        print(response.text)
        return None

# Smoke test: send one sample question through categorize_query_with_gemma and
# print the model's raw reply. The call needs the generation server on
# localhost:11434 to be reachable; None is printed if it answers with a
# non-200 status.
question = "What is the famous dish sinigang in the Philippines?"
result = categorize_query_with_gemma(question)
print(result)


Category: Cuisine
Tags: Sinigang, Filipino Food, Sour Soup


In [5]:
import json
import unicodedata

# Read the evaluation records up front. UTF-8 is requested explicitly so the
# decoded text does not depend on the platform's default encoding.
with open("evaluation.json", encoding="utf-8") as eval_file:
    data = json.loads(eval_file.read())

# Whitelist of characters treated as normal for this dataset: printable ASCII,
# newline / carriage return / tab, acute-accented vowels, ñ/Ñ and curly quotes.
allowed = set(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    " .,!?;:'\"-()[]{}@#$%^&*/\\|_+=~`<>"
    "\n\r\t"
    "áéíóúÁÉÍÓÚñÑ"
    "’‘“”"
)

def is_suspicious(char):
    """Return True when ``char`` falls outside the expected character set.

    A character is accepted (False) if it is in ``allowed``, is a plain
    space/newline/carriage-return/tab, or belongs to a Unicode "Mark"
    category (combining diacritics). Anything else is flagged (True).
    """
    # Whitelisted characters and basic whitespace are never flagged.
    if char in allowed or char in (' ', '\n', '\r', '\t'):
        return False
    # Combining marks (category M*) are tolerated so decomposed diacritics
    # do not get reported; everything else is suspicious.
    return not unicodedata.category(char).startswith('M')

# Walk every string field of every record and report each character that
# is_suspicious() flags, together with its code point and Unicode name.
for record in data:
    item_id = record.get("item", "N/A")
    # Only string values are inspected; other value types are skipped.
    string_fields = (
        (field_name, text)
        for field_name, text in record.items()
        if isinstance(text, str)
    )
    for field_name, text in string_fields:
        for ch in filter(is_suspicious, text):
            code_point = f"U+{ord(ch):04X}"
            # Characters without a Unicode name are reported as UNKNOWN.
            char_name = unicodedata.name(ch, 'UNKNOWN')
            print(
                f"Item {item_id} | Field '{field_name}' | Char: '{ch}' | Unicode: {code_point} | Name: {char_name}"
            )

Item 64 | Field 'answer' | Char: 'â' | Unicode: U+00E2 | Name: LATIN SMALL LETTER A WITH CIRCUMFLEX
Item 271 | Field 'answer' | Char: '–' | Unicode: U+2013 | Name: EN DASH
Item 291 | Field 'answer' | Char: '–' | Unicode: U+2013 | Name: EN DASH
Item 301 | Field 'question' | Char: '–' | Unicode: U+2013 | Name: EN DASH
Item 301 | Field 'answer' | Char: '–' | Unicode: U+2013 | Name: EN DASH
Item 321 | Field 'answer' | Char: '–' | Unicode: U+2013 | Name: EN DASH
Item 381 | Field 'answer' | Char: '–' | Unicode: U+2013 | Name: EN DASH
Item 477 | Field 'answer' | Char: '–' | Unicode: U+2013 | Name: EN DASH
