In [None]:
import re

administration_prefix_patterns = {
    "province": [
        r"^(thành phố|t\.\s*phố|tỉnh|tp|tp\.|t\.)\s+",
        r"^(thanh pho|tinh)\s+",  # no diacritics
    ],
    "district": [
        r"^(huyện|quận|thị xã|thành phố|q\.|h\.|tx\.|tp\.)\s+",
        r"^(huyen|quan|thi xa|thanh pho)\s+",  # no diacritics
    ],
    "ward": [
        r"^(phường|xã|thị trấn|p\.|x\.|tt\.)\s+",
        r"^(phuong|xa|thi tran)\s+",  # no diacritics
    ],
}

street_keywords = [
    "số",
    "đường",
    "bis",
    "ấp",
    "khu",
    "block",
    "lô",
    "tổ",
    "ngõ",
    "hẻm",
    "phố",
    "street",
    "road",
    "avenue",
    "lane",
    "area",
    "building",
    "floor",
    "apartment",
    "căn hộ",
    "tầng",
    "toà",
    "tòa",
    "nhà",
    "villa",
    "biệt thự",
    "chung cư",
    "kdc",
    "ktx",
    "ccx",
    "c/c",
    "đs",
    "km",
    "ql",
    "tl",
]

common_province_alias_map = {
    "hcm": "hồ chí minh",
    "hn": "hà nội",
    "hnoi": "hà nội",
    "t.t.h": "thừa thiên huế",
}

# Single letter abbreviations that could be prefixes
SINGLE_LETTER_PREFIXES = {"x", "h", "q", "t", "p"}

# Build BK-tree for fast prefix matching

In [None]:
from typing import Optional, Tuple, List


def levenshtein_distance(s1: str, s2: str) -> int:
    """Calculate Levenshtein distance between two strings"""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


def normalize_for_comparison(text: str) -> str:
    """Normalize text for fuzzy matching by converting to lowercase"""
    return text.lower().strip()


class BKTreeNode:
    """Node for BK-Trie (Burkhard-Keller Trie) structure"""

    def __init__(self, word: str = None):
        self.word = word
        self.normalized_word = normalize_for_comparison(word) if word else None
        self.children = {}  # distance -> [BKTreeNode]
        self.is_terminal = word is not None

    def add_word(self, word: str):
        """Add a word to the BK-Trie"""
        if self.word is None:
            # This is root node, make it the first word
            self.word = word
            self.normalized_word = normalize_for_comparison(word)
            self.is_terminal = True
            return

        # Calculate distance from current node's word
        distance = levenshtein_distance(
            self.normalized_word, normalize_for_comparison(word)
        )

        if distance in self.children:
            # Find appropriate child to recurse into
            added = False
            for child in self.children[distance]:
                if child.word == word:
                    # Word already exists
                    return
                # Try to add to this child
                child.add_word(word)
                added = True
                break
            if not added:
                # Create new child at this distance
                self.children[distance].append(BKTreeNode(word))
        else:
            # Create new distance bucket
            self.children[distance] = [BKTreeNode(word)]

    def search(self, query: str, max_distance: int = 2) -> List[Tuple[str, int]]:
        """Search for words within max_distance of query"""
        results = []
        normalized_query = normalize_for_comparison(query)

        if self.word is not None:
            distance = levenshtein_distance(self.normalized_word, normalized_query)
            if distance <= max_distance:
                results.append((self.word, distance))

        # Search children within distance range
        for child_distance, child_nodes in self.children.items():
            # Only search children if they could contain matches
            if abs(child_distance - distance) <= max_distance:
                for child in child_nodes:
                    results.extend(child.search(query, max_distance))

        return results


class BKTree:
    """BK-Trie for efficient fuzzy string matching"""

    def __init__(self, words: List[str] = None):
        self.root = BKTreeNode()
        self.word_count = 0

        if words:
            for word in words:
                self.add_word(word)

    def add_word(self, word: str):
        """Add a word to the trie"""
        if word and word.strip():
            self.root.add_word(word.strip())
            self.word_count += 1

    def search(self, query: str, max_distance: int = 2) -> List[Tuple[str, int]]:
        """Search for words within max_distance of query"""
        if not query or not query.strip():
            return []

        results = self.root.search(query.strip(), max_distance)
        # Sort by distance (closest matches first)
        return sorted(results, key=lambda x: x[1])

    def get_best_match(
        self, query: str, max_distance: int = 2
    ) -> Optional[Tuple[str, int]]:
        """Get the best (closest) match for query"""
        results = self.search(query, max_distance)
        return results[0] if results else None

    def get_exact_match(self, query: str) -> Optional[str]:
        """Get exact match if exists"""
        results = self.search(query, max_distance=0)
        return results[0][0] if results else None

## Normalization Layer

In [None]:
def normalize_input(address_input: str) -> str:
    """
    Normalize Vietnamese address input for consistent parsing.

    This function standardizes address strings by:
    1. Cleaning whitespace, dot at last word and converting to lowercase
    2. Adding proper spacing around punctuation
    3. Expanding abbreviated prefixes with dots + adding commas
    4. Handling prefix+digit combinations + adding commas
    5. Separating concatenated prefecture words from location names
    6. Handling abbreviated concatenated prefixes + adding commas
    7. Final comma cleanup and validation

    Args:
        address_input (str): Raw Vietnamese address string

    Returns:
        str: Normalized address string ready for component extraction

    Examples:
        >>> normalize_input("P1,Q1,TPHCM")
        "p. 1, q. 1, tp. hcm"

        >>> normalize_input("p tan binh q binh thanh")
        "p. tan binh, q. binh thanh"

        >>> normalize_input("x my thành huyện cai lậy")
        "x. my thành, huyện cai lậy"
    """

    def add_commas_before_admin_prefixes(text: str) -> str:
        """
        Helper function to add commas before administrative prefixes with improved logic
        """
        # More precise admin prefix pattern - only match actual administrative prefixes
        admin_prefixes = r"\b(huyện|quận|phường|xã|thị xã|thành phố|tỉnh|thị trấn|tp\.|tx\.|tt\.|p\.|q\.|h\.|x\.|t\.)\s+"

        matches = list(re.finditer(admin_prefixes, text, flags=re.IGNORECASE))

        if matches:
            # Process matches from right to left to avoid position shifts
            for match in reversed(matches):
                start_pos = match.start()

                # Only add comma if the prefix is not at the beginning
                if start_pos > 0:
                    char_before = text[start_pos - 1]
                    chars_before_2 = (
                        text[max(0, start_pos - 2) : start_pos]
                        if start_pos >= 2
                        else ""
                    )

                    # Get the matched prefix for more precise logic
                    matched_prefix_text = text[start_pos : match.end()].lower()

                    # Skip inserting comma between 't.' and following 'p.'/'phố'/'x.' (for thành phố, thị xã)
                    is_after_t = (
                        re.search(r"\bt\.\s*$", text[:start_pos], flags=re.IGNORECASE)
                        is not None
                    )
                    is_follow_token = (
                        re.match(
                            r"^(p\.|phố|x\.)\s+",
                            matched_prefix_text,
                            flags=re.IGNORECASE,
                        )
                        is not None
                    )

                    if is_after_t and is_follow_token:
                        continue

                    # Don't add comma if already preceded by comma or if preceded by a letter (part of same word)
                    if (
                        char_before != ","
                        and chars_before_2 != ", "
                        and char_before == " "
                    ):
                        text = text[: start_pos - 1] + ", " + text[start_pos:]

        return text

    # Step 1: Basic normalization - lowercase, clean whitespace, remove "-" and dot at last word
    normalized = " ".join(address_input.lower().split())
    normalized = normalized.replace("-", "")
    normalized = normalized.rstrip(".")
    # print(f"Step 1 - Basic normalization: '{normalized}'")

    # Step 2a: Add space after commas for consistent parsing
    normalized = re.sub(r",(?=\S)", ", ", normalized)
    # print(f"Step 2a - After adding space after commas: '{normalized}'")

    # Step 2b: Add space after dots
    normalized = re.sub(r"\.(?=\S)", ". ", normalized)
    # print(f"Step 2b - After adding space after dots: '{normalized}'")

    # Step 3: Expand standalone prefix letters with dots
    # Only match letters followed by SPACE and then letters (not digits)
    normalized = re.sub(r"\b([xhqtp])\s+(?=[a-zA-ZÀ-ỹ])", r"\1. ", normalized)
    # print(f"Step 3 - After expanding standalone prefixes: '{normalized}'")

    # Step 3b: Add commas immediately after creating new prefixes
    normalized = add_commas_before_admin_prefixes(normalized)
    # print(f"Step 3b - After adding commas: '{normalized}'")

    # Step 4: Handle prefix letters next to digit
    # First handle concatenated cases like "q1p2" -> "q. 1 p. 2"
    normalized = re.sub(r"([pq])(\d{1,2})([pq]\d{1,2})", r"\1. \2 \3", normalized)
    # Then handle the general case with word boundaries
    normalized = re.sub(r"\b([pq])(\d{1,2})\b", r"\1. \2", normalized)
    # Also handle cases where digits are followed by letters (like "p1huyện")
    normalized = re.sub(r"([pq])(\d{1,2})([a-zA-ZÀ-ỹ])", r"\1. \2 \3", normalized)
    # print(f"Step 4 - After handling prefix+digit: '{normalized}'")

    # Step 4b: Add commas immediately after creating new prefixes
    normalized = add_commas_before_admin_prefixes(normalized)
    # print(f"Step 4b - After adding commas: '{normalized}'")

    # Step 5: Separate concatenated full prefecture words from location names
    normalized = re.sub(
        r"\b(huyện|quận|phường|xã|thị xã|thành phố|tỉnh|thị trấn)([a-zA-ZÀ-ỹ])",
        r"\1 \2",
        normalized,
    )
    # print(f"Step 5 - After separating concatenated words: '{normalized}'")

    # Step 6: Handle abbreviated concatenated prefixes
    # Match multi-letter abbreviations only
    normalized = re.sub(r"\b(tp|tx|tt)([a-zA-ZÀ-ỹ])", r"\1. \2", normalized)
    # print(f"Step 6 - After handling 'tp', 'tx', 'tt': '{normalized}'")

    # Step 6b: Add commas immediately after creating new prefixes
    normalized = add_commas_before_admin_prefixes(normalized)
    # print(f"Step 6b - After adding commas: '{normalized}'")

    # Step 6c: Merge split abbreviations like 't. p.' to 'tp.'
    normalized = re.sub(r"t\.\s+p\.", "tp.", normalized)
    normalized = re.sub(r"t\.\s+x\.", "tx.", normalized)
    normalized = re.sub(r"t\.\s+t\.", "tt.", normalized)
    # print(f"Step 6c - After merging split abbreviations: '{normalized}'")

    # Step 7: Final cleanup and validation
    # Remove any double spaces and trim
    normalized = re.sub(r"\s+", " ", normalized).strip()

    # Final check: ensure no double commas or misplaced commas
    normalized = re.sub(r",\s*,", ",", normalized)  # Remove double commas
    normalized = re.sub(r",\s*$", "", normalized)  # Remove trailing comma
    print(f"NORMALIZED: '{normalized}'")

    return normalized

In [4]:
def expand_abbreviations(normalized: str) -> str:
    for alias, full in common_province_alias_map.items():
        normalized = re.sub(
            r"\b" + re.escape(alias) + r"\b", full, normalized, flags=re.IGNORECASE
        )

    admin_map = {
        "p.": "phường",
        "q.": "quận",
        "h.": "huyện",
        "x.": "xã",
        "tp.": "thành phố",
        "t.ph": "thành phố",
        "tx.": "thị xã",
        "t.x.": "thị xã",
        "tt.": "thị trấn",
        "t.": "tỉnh",
    }

    for abbrev, full in sorted(admin_map.items(), key=lambda x: -len(x[0])):
        normalized = re.sub(
            re.escape(abbrev), full + " ", normalized, flags=re.IGNORECASE
        )

    normalized = re.sub(r"\s+", " ", normalized).strip()

    return normalized

In [5]:
tests = [
    "P1,Q1,TPHCM",
    "p tan binh q binh thanh",
    "x my thành huyện cai lậy",
    "hcm",
    "hn",
    "hnoi",
    "t.t.h",
    "tp hcm",
]

for test in tests:
    print(f"INPUT: '{test}'")
    normalized = normalize_input(test)
    expanded = expand_abbreviations(normalized)
    print(f"EXPANDED: '{expanded}'")
    print("-----")

INPUT: 'P1,Q1,TPHCM'
NORMALIZED: 'p. 1, q. 1, tp. hcm'
EXPANDED: 'phường 1, quận 1, thành phố hồ chí minh'
-----
INPUT: 'p tan binh q binh thanh'
NORMALIZED: 'p. tan binh, q. binh thanh'
EXPANDED: 'phường tan binh, quận binh thanh'
-----
INPUT: 'x my thành huyện cai lậy'
NORMALIZED: 'x. my thành, huyện cai lậy'
EXPANDED: 'xã my thành, huyện cai lậy'
-----
INPUT: 'hcm'
NORMALIZED: 'hcm'
EXPANDED: 'hồ chí minh'
-----
INPUT: 'hn'
NORMALIZED: 'hn'
EXPANDED: 'hà nội'
-----
INPUT: 'hnoi'
NORMALIZED: 'hnoi'
EXPANDED: 'hà nội'
-----
INPUT: 't.t.h'
NORMALIZED: 't., t. h'
EXPANDED: 'tỉnh , tỉnh h'
-----
INPUT: 'tp hcm'
NORMALIZED: 'tp hcm'
EXPANDED: 'tp hồ chí minh'
-----


## Spelling Correction Layer

In [None]:
from typing import List, Tuple


def load_vietnamese_dictionary(
    dict_path: str = "data/vietnamese_address_dictionary.txt",
) -> List[str]:
    """Load Vietnamese dictionary from file"""
    try:
        with open(dict_path, "r", encoding="utf-8") as f:
            words = [line.strip().lower() for line in f if line.strip()]
        print(f"Loaded {len(words)} words from Vietnamese dictionary")
        return words
    except FileNotFoundError:
        print(f"Dictionary file {dict_path} not found")
        return []


def build_spelling_correction_trie() -> BKTree:
    """Build BK-Trie for spelling correction using Vietnamese dictionary"""
    dictionary_words = load_vietnamese_dictionary()
    dictionary_words = dictionary_words + list(common_province_alias_map.keys())
    # Add common street keywords
    dictionary_words = dictionary_words + street_keywords
    dictionary_words = list(set(dictionary_words))
    if not dictionary_words:
        return BKTree()

    spelling_trie = BKTree(dictionary_words)
    print(f"Built spelling correction trie with {spelling_trie.word_count} words")
    return spelling_trie

In [None]:
spelling_trie = build_spelling_correction_trie()

In [None]:
def is_valid_vietnamese_word(word: str, spelling_trie: BKTree) -> bool:
    """Check if a word exists in the Vietnamese dictionary"""
    if not word or not word.strip():
        return True  # Consider empty words as valid (no correction needed)

    # Check for exact match (case-insensitive)
    exact_match = spelling_trie.get_exact_match(word.lower())
    return exact_match is not None


def calculate_word_similarity_score(original: str, candidate: str) -> float:
    """
    Calculate similarity score between original and candidate word.
    Higher score means better match.
    """
    if not original or not candidate:
        return 0.0

    # Prefer longer matches
    length_bonus = min(len(candidate), len(original)) / max(
        len(candidate), len(original)
    )

    # Prefer words that start with the same characters
    common_prefix = 0
    for i, (c1, c2) in enumerate(zip(original, candidate)):
        if c1 == c2:
            common_prefix += 1
        else:
            break
    prefix_score = common_prefix / max(len(original), len(candidate))

    # Calculate overall similarity
    edit_distance = levenshtein_distance(original, candidate)
    max_length = max(len(original), len(candidate))
    distance_score = 1 - (edit_distance / max_length) if max_length > 0 else 0

    # Combine scores (weighted)
    final_score = (distance_score * 0.6) + (prefix_score * 0.3) + (length_bonus * 0.1)
    return final_score


def correct_word(
    word: str, spelling_trie: BKTree, max_distance: int = 2
) -> Tuple[str, bool, int]:
    """
    Correct a single word using the spelling trie with improved scoring

    Args:
        word: Word to correct
        spelling_trie: BK-Trie containing dictionary words
        max_distance: Maximum edit distance for suggestions

    Returns:
        Tuple of (corrected_word, was_corrected, edit_distance)
    """
    if not word or not word.strip():
        return word, False, 0

    word_lower = word.lower().strip()

    # Skip numbers and very short words
    if word_lower.isdigit() or len(word_lower) < 2:
        return word, False, 0

    # Skip common punctuation and special characters
    if word_lower in {"-", ".", ",", "/", "\\", "(", ")", "[", "]", "{", "}"}:
        return word, False, 0

    # Check if word is already correct
    if is_valid_vietnamese_word(word_lower, spelling_trie):
        return word, False, 0

    # Find spelling corrections
    matches = spelling_trie.search(word_lower, max_distance)

    if matches:
        # Score all matches and pick the best one
        scored_matches = []
        for candidate, distance in matches:
            similarity_score = calculate_word_similarity_score(word_lower, candidate)
            # Combine distance and similarity (lower distance is better, higher similarity is better)
            combined_score = similarity_score - (
                distance * 0.1
            )  # Penalize distance slightly
            scored_matches.append(
                (candidate, distance, similarity_score, combined_score)
            )

        # Sort by combined score (descending)
        scored_matches.sort(key=lambda x: x[3], reverse=True)

        best_match, distance, similarity, combined = scored_matches[0]

        # Only suggest correction if it's reasonable
        if distance <= max_distance and similarity > 0.3:
            return best_match, True, distance

    # No good correction found
    return word, False, float("inf")


def correct_address_spelling(
    address: str, spelling_trie: BKTree, max_distance: int = 2, debug: bool = False
) -> Tuple[str, List[dict]]:
    """
    Correct spelling errors in Vietnamese address text

    Args:
        address: Input address string
        spelling_trie: BK-Trie for spell checking
        max_distance: Maximum edit distance for corrections
        debug: Whether to print debug information

    Returns:
        Tuple of (corrected_address, corrections_made)
    """
    if not address or not address.strip():
        return address, []

    # Tokenize the address while preserving punctuation and spacing
    # Split on whitespace but keep track of original spacing
    tokens = re.findall(r"\S+|\s+", address)

    corrected_tokens = []
    corrections_made = []

    for i, token in enumerate(tokens):
        if token.isspace():
            # Preserve whitespace as-is
            corrected_tokens.append(token)
            continue

        # Clean token of punctuation for spell checking
        clean_token = re.sub(
            r"[^\w\sàáạảãâầấậẩẫăằắặẳẵèéẹẻẽêềếệểễìíịỉĩòóọỏõôồốộổỗơờớợởỡùúụủũưừứựửữỳýỵỷỹđÀÁẠẢÃÂẦẤẬẨẪĂẰẮẶẲẴÈÉẸẺẼÊỀẾỆỂỄÌÍỊỈĨÒÓỌỎÕÔỒỐỘỔỖƠỜỚỢỞỠÙÚỤỦŨƯỪỨỰỬỮỲÝỴỶỸĐ]",
            "",
            token,
        )

        if clean_token:
            corrected_word, was_corrected, distance = correct_word(
                clean_token, spelling_trie, max_distance
            )

            if was_corrected:
                # Replace the clean part while preserving punctuation
                corrected_token = token.replace(clean_token, corrected_word)
                corrected_tokens.append(corrected_token)

                correction_info = {
                    "position": i,
                    "original": clean_token,
                    "corrected": corrected_word,
                    "distance": distance,
                    "full_token_original": token,
                    "full_token_corrected": corrected_token,
                }
                corrections_made.append(correction_info)

                if debug:
                    print(
                        f"Corrected: '{clean_token}' -> '{corrected_word}' (distance: {distance})"
                    )
            else:
                corrected_tokens.append(token)
        else:
            # Token is only punctuation
            corrected_tokens.append(token)

    corrected_address = "".join(corrected_tokens)

    if debug and corrections_made:
        print(f"Original: {address}")
        print(f"Corrected: {corrected_address}")
        print(f"Made {len(corrections_made)} corrections")

    return corrected_address, corrections_made


# Build the spelling correction trie if not already built
if "spelling_trie" not in locals():
    print("Building spelling correction trie...")
    spelling_trie = build_spelling_correction_trie()
    print("Spelling correction trie ready!")


## Suggestion Layer

In [None]:
def clean_component(text, component_type):
    """Remove prefix patterns from component text"""
    if not text:
        return None

    text = text.strip()
    for pattern in administration_prefix_patterns[component_type]:
        text = re.sub(pattern, "", text, flags=re.IGNORECASE).strip()

    return text if text else None


def check_single_letter_prefix(word):
    """
    Check if a word starts with a single letter prefix that could be an abbreviation.
    Returns tuple: (has_prefix, prefix_letter, remaining_word)

    Designed to catch cases like:
    - "tđắk" -> ("t", "đắk") for "tỉnh đắk"
    - "hkrông" -> ("h", "krông") for "huyện krông"
    - "qbình" -> ("q", "bình") for "quận bình"

    But NOT cases like:
    - "thị" (part of "thị trấn")
    - "thành" (part of "thành phố")
    - Standard abbreviations like "tp.", "q.", "h.", "p.", "x.", "tx.", "tt." (handled elsewhere)
    """
    # Ignore known dotted abbreviations entirely (handled by explicit rules)
    if re.match(r"^(tp|tx|tt|q|h|p|x)\.$", word, flags=re.IGNORECASE):
        return False, None, word

    if len(word) > 2 and word[0] in SINGLE_LETTER_PREFIXES:
        # Exclude full administrative words
        full_admin_prefixes = ["thành", "thị", "huyện", "quận", "phường", "tỉnh"]
        for prefix in full_admin_prefixes:
            if word.lower().startswith(prefix):
                return False, None, word

        # Require the remainder to look like a place token: has a space or VN diacritics
        remaining = word[1:]
        if not re.search(r"[à-ỹ\s]", remaining, flags=re.IGNORECASE):
            return False, None, word

        # Also avoid too-short remainders
        if len(remaining) < 2:
            return False, None, word

        return True, word[0], remaining
    return False, None, word

In [None]:
def suggest_address_components(address_input):
    """
    Heuristically suggest province, district, and ward from a (normalized) Vietnamese address.
    Processes right-to-left (common in VN addresses).
    Now includes fallback logic for parts without explicit prefixes and alias mapping.
    """

    # --- Small helpers -----------------------------------------------------
    def add_component(bucket, key, value):
        if value is None:
            return
        value = value.strip()
        if not value:
            return
        if value not in bucket[key]:
            bucket[key].append(value)

    def take_remaining_words(words, start_idx, used_indices):
        return " ".join(
            [
                w
                for j, w in enumerate(words[start_idx:], start=start_idx)
                if j not in used_indices
            ]
        )

    def mark_used_range(used_indices, start_idx, end_idx):
        for j in range(start_idx, end_idx):
            used_indices.add(j)

    def classify_and_add_by_letter(prefix_letter, text):
        nonlocal province_assigned
        if prefix_letter == "t":  # tỉnh
            add_component(result, "province", clean_component(text, "province"))
            province_assigned = True
        elif prefix_letter in ["q", "h"]:  # quận, huyện
            add_component(result, "district", clean_component(text, "district"))
        elif prefix_letter in ["p", "x"]:  # phường, xã
            add_component(result, "ward", clean_component(text, "ward"))

    def word_matches_any(patterns, word):
        return any(re.match(pat, word, flags=re.IGNORECASE) for pat in patterns)

    def contains_unrelated_keywords(text):
        """Check if text contains street/address keywords that should not be administrative components"""
        text_lower = text.lower()
        return any(keyword in text_lower for keyword in street_keywords)

    def apply_alias_mapping(result):
        """Apply known province aliases to improve province identification"""
        # Check each province and try to map aliases
        for i, province in enumerate(result["province"]):
            province_lower = province.lower().strip()

            # Direct lookup in alias map
            if province_lower in common_province_alias_map:
                result["province"][i] = common_province_alias_map[province_lower]
                continue

            # Check for partial matches or common variations
            for alias, full_name in common_province_alias_map.items():
                if alias in province_lower or province_lower in alias:
                    result["province"][i] = full_name
                    break

        return result

    # --- Output bucket -----------------------------------------------------
    result = {
        "province": [],
        "district": [],
        "ward": [],
        "normalized_raw_input": address_input,
        "remaining": None,
        "remaining_parts": [],
    }
    province_assigned = False

    # --- No-comma path: token-by-token right-to-left ----------------------
    if "," not in address_input:
        words = address_input.split()
        used_indices = set()

        for i in reversed(range(len(words))):
            if i in used_indices:
                continue

            word = words[i]

            # Handle multi-token admin prefixes: "t. p." / "t. phố" or "t. x."
            if (
                i + 1 < len(words)
                and words[i] == "t."
                and words[i + 1] in {"p.", "phố", "x."}
            ):
                remaining_text = (
                    take_remaining_words(words, i + 2, used_indices)
                    if i + 2 < len(words)
                    else ""
                )
                if words[i + 1] in {"p.", "phố"}:  # thành phố
                    if not province_assigned:
                        add_component(
                            result,
                            "province",
                            clean_component(remaining_text, "province"),
                        )
                        province_assigned = True
                    else:
                        add_component(
                            result,
                            "district",
                            clean_component(remaining_text, "district"),
                        )
                else:  # thị xã → district-level
                    add_component(
                        result, "district", clean_component(remaining_text, "district")
                    )
                mark_used_range(used_indices, i, len(words))
                continue

            # Single-letter prefix at word start (e.g., "tđắk")
            has_prefix, prefix_letter, remaining_word = check_single_letter_prefix(word)
            if has_prefix:
                component_text = " ".join(
                    [remaining_word]
                    + [
                        words[j]
                        for j in range(i + 1, len(words))
                        if j not in used_indices
                    ]
                )
                classify_and_add_by_letter(prefix_letter, component_text)
                mark_used_range(used_indices, i, len(words))
                continue

            # Standalone single-letter prefix token (e.g., "t")
            if len(word) == 1 and word in SINGLE_LETTER_PREFIXES and i < len(words) - 1:
                remaining_text = take_remaining_words(words, i, used_indices)
                if word == "t":
                    add_component(
                        result, "province", clean_component(remaining_text, "province")
                    )
                    province_assigned = True
                elif word in ["q", "h"]:
                    add_component(
                        result, "district", clean_component(remaining_text, "district")
                    )
                elif word in ["p", "x"]:
                    add_component(
                        result, "ward", clean_component(remaining_text, "ward")
                    )
                mark_used_range(used_indices, i, len(words))
                continue

            # Regular explicit prefix tokens (exact-token match lists)
            province_tokens = [r"^(thành phố|tỉnh|tp|tp\.|t\.)$"]
            district_tokens = [r"^(huyện|quận|thị xã|thành phố|q\.|h\.|tx\.|tp\.)$"]
            ward_tokens = [r"^(phường|xã|thị trấn|p\.|x\.|tt\.)$"]

            if word_matches_any(province_tokens, word):
                remaining_text = take_remaining_words(words, i, used_indices)
                add_component(
                    result, "province", clean_component(remaining_text, "province")
                )
                province_assigned = True
                mark_used_range(used_indices, i, len(words))
                continue

            if i not in used_indices and word_matches_any(district_tokens, word):
                remaining_text = take_remaining_words(words, i, used_indices)
                add_component(
                    result, "district", clean_component(remaining_text, "district")
                )
                mark_used_range(used_indices, i, len(words))
                continue

            if i not in used_indices and word_matches_any(ward_tokens, word):
                remaining_text = take_remaining_words(words, i, used_indices)
                add_component(result, "ward", clean_component(remaining_text, "ward"))
                mark_used_range(used_indices, i, len(words))
                continue

        remaining_words = [
            words[idx] for idx in range(len(words)) if idx not in used_indices
        ]
        remaining_str = " ".join(remaining_words) if remaining_words else None
        result["remaining"] = remaining_str
        result["remaining_parts"] = [remaining_str] if remaining_str else []

        # Apply alias mapping before returning
        result = apply_alias_mapping(result)
        return result

    # --- Comma-separated path: part-by-part right-to-left ------------------
    parts = [part.strip() for part in address_input.split(",")]
    parts = [part for part in parts if part]

    # Keep track of matched parts and unmatched parts
    matched_parts = []
    unmatched_parts = []

    for i, part in enumerate(reversed(parts)):
        matched = False
        words_in_part = part.split()

        # Intentionally disable single-letter heuristic in comma-separated parts
        # Rely only on explicit multi-token and prefix-pattern checks below

        if not matched:
            # Handle leading multi-token admin prefixes inside a part
            if re.match(r"^\s*t\.\s*(p\.|phố)\s+", part, flags=re.IGNORECASE):
                component_text = re.sub(
                    r"^\s*t\.\s*(p\.|phố)\s+", "", part, flags=re.IGNORECASE
                )
                if not province_assigned:
                    add_component(
                        result, "province", clean_component(component_text, "province")
                    )
                    province_assigned = True
                else:
                    add_component(
                        result, "district", clean_component(component_text, "district")
                    )
                matched = True
            elif re.match(r"^\s*t\.\s*x\.\s+", part, flags=re.IGNORECASE):
                component_text = re.sub(
                    r"^\s*t\.\s*x\.\s+", "", part, flags=re.IGNORECASE
                )
                add_component(
                    result, "district", clean_component(component_text, "district")
                )
                matched = True

            # City-level indicator at start: "thành phố" / "tp." → district if province already assigned
            elif re.match(
                r"^\s*(thành phố|t\. phố|t\. p\.|tp\.?)[\s]+", part, flags=re.IGNORECASE
            ):
                if not province_assigned:
                    add_component(result, "province", clean_component(part, "province"))
                    province_assigned = True
                else:
                    add_component(result, "district", clean_component(part, "district"))
                matched = True
            # Match explicit admin prefixes by priority
            elif any(
                re.match(p, part, flags=re.IGNORECASE)
                for p in administration_prefix_patterns["province"]
            ):
                add_component(result, "province", clean_component(part, "province"))
                province_assigned = True
                matched = True
            elif any(
                re.match(p, part, flags=re.IGNORECASE)
                for p in administration_prefix_patterns["district"]
            ):
                add_component(result, "district", clean_component(part, "district"))
                matched = True
            elif any(
                re.match(p, part, flags=re.IGNORECASE)
                for p in administration_prefix_patterns["ward"]
            ):
                add_component(result, "ward", clean_component(part, "ward"))
                matched = True

        if matched:
            matched_parts.append(part)
        else:
            unmatched_parts.append(part)

    # Fallback logic for unmatched parts (with street keyword filtering)
    # Process unmatched parts from right to left (province -> district -> ward)
    if unmatched_parts:
        # Since we processed parts in reverse order, unmatched_parts are also in reverse order
        # So the first unmatched part is the rightmost (likely province)
        for i, part in enumerate(unmatched_parts):
            # Skip parts that contain street/address keywords
            if contains_unrelated_keywords(part):
                result["remaining_parts"].append(part)
                continue

            if i == 0 and not result["province"]:
                # First unmatched part (rightmost) -> province
                add_component(result, "province", part)
            elif i == 1 and not result["district"]:
                # Second unmatched part -> district
                add_component(result, "district", part)
            elif i == 2 and not result["ward"]:
                # Third unmatched part -> ward
                add_component(result, "ward", part)
            else:
                # Any remaining parts go to remaining_parts
                result["remaining_parts"].append(part)

    # Set remaining string for any leftover parts
    if not result["remaining_parts"]:
        # If no fallback parts were used, use the old logic for remaining parts computation
        used_parts = []
        for part in parts:
            part_used = False
            words_in_part = part.split()
            if words_in_part:
                first_word = words_in_part[0]
                has_prefix, _, _ = check_single_letter_prefix(first_word)
                if has_prefix or (
                    len(first_word) == 1 and first_word in SINGLE_LETTER_PREFIXES
                ):
                    used_parts.append(part)
                    part_used = True
            if not part_used:
                if re.match(r"^\s*t\.\s*p\.\s+", part, flags=re.IGNORECASE) or re.match(
                    r"^\s*t\.\s*x\.\s+", part, flags=re.IGNORECASE
                ):
                    used_parts.append(part)
                    part_used = True
            if not part_used:
                for component_type in ["district", "province", "ward"]:
                    if any(
                        re.match(p, part, flags=re.IGNORECASE)
                        for p in administration_prefix_patterns[component_type]
                    ):
                        used_parts.append(part)
                        part_used = True
                        break

        # Add unmatched parts that were assigned by fallback logic to used_parts
        used_parts.extend(matched_parts)
        # Only add unmatched parts that don't contain street keywords and were assigned
        for i, part in enumerate(unmatched_parts):
            if not contains_unrelated_keywords(part) and i < 3:
                used_parts.append(part)

        remaining_parts = [p for p in parts if p not in used_parts]
        result["remaining_parts"] = remaining_parts

    result["remaining"] = (
        ", ".join(result["remaining_parts"]) if result["remaining_parts"] else None
    )

    # Apply alias mapping before returning
    result = apply_alias_mapping(result)

    return result


# Example result
# result = {'province': ['hồ chí minh'], 'district': ['1'], 'ward': [], 'normalized_raw_input': '161/18a cô giang, cô giang , quận 1, tp. hồ chí minh', 'remaining': 'cô giang, 161/18a cô giang', 'remaining_parts': ['cô giang', '161/18a cô giang']}

## Classification Layer

In [None]:
# from typing import List, Dict


# def load_txt_with_encoding(file_path: str) -> List[str]:
#     """Load data from TXT file with automatic encoding detection"""
#     encodings = ["utf-8-sig", "utf-8", "latin-1", "cp1252", "iso-8859-1"]

#     for encoding in encodings:
#         try:
#             with open(file_path, "r", encoding=encoding) as f:
#                 data = [line.strip() for line in f if line.strip()]
#             return data
#         except (UnicodeDecodeError, FileNotFoundError):
#             continue

#     raise Exception(f"Could not read file {file_path} with any supported encoding")


In [None]:
# def build_classification_tries() -> Tuple[BKTree, BKTree, BKTree]:
#     """Build BK-Tries for provinces, districts, and wards"""
#     print("Loading data files...")

#     # Load provinces
#     provinces = load_txt_with_encoding("./data/provinces.txt")
#     print(f"Loaded {len(provinces)} provinces")

#     # Load districts
#     districts = load_txt_with_encoding("./data/districts.txt")
#     print(f"Loaded {len(districts)} districts")

#     # Load wards
#     wards = load_txt_with_encoding("./data/wards.txt")
#     print(f"Loaded {len(wards)} wards")

#     print("\nBuilding BK-Tries...")

#     # Build tries
#     province_trie = BKTree(provinces)
#     print(f"Province trie built with {province_trie.word_count} entries")

#     district_trie = BKTree(districts)
#     print(f"District trie built with {district_trie.word_count} entries")

#     ward_trie = BKTree(wards)
#     print(f"Ward trie built with {ward_trie.word_count} entries")

#     return province_trie, district_trie, ward_trie


# def classify_address_components(
#     suggestion_result: Dict,
#     province_trie: BKTree,
#     district_trie: BKTree,
#     ward_trie: BKTree,
#     max_distance: int = 2,
# ) -> Dict:
#     """
#     Classify suggested address components using BK-Tries

#     Args:
#         suggestion_result: Result from suggest_address_components()
#         province_trie, district_trie, ward_trie: BK-Tries for classification
#         max_distance: Maximum edit distance for fuzzy matching

#     Returns:
#         Dict with classified components and confidence scores
#     """
#     result = {
#         "province": {
#             "original": suggestion_result.get("province", []),
#             "classified": [],
#             "scores": [],
#         },
#         "district": {
#             "original": suggestion_result.get("district", []),
#             "classified": [],
#             "scores": [],
#         },
#         "ward": {
#             "original": suggestion_result.get("ward", []),
#             "classified": [],
#             "scores": [],
#         },
#         "remaining": suggestion_result.get("remaining"),
#         "remaining_parts": suggestion_result.get("remaining_parts", []),
#     }

#     # Classify provinces
#     for province_suggestion in suggestion_result.get("province", []):
#         matches = province_trie.search(province_suggestion, max_distance)
#         if matches:
#             best_match, distance = matches[0]
#             confidence = max(0, 1 - (distance / len(province_suggestion)))
#             result["province"]["classified"].append(best_match)
#             result["province"]["scores"].append(
#                 {
#                     "match": best_match,
#                     "distance": distance,
#                     "confidence": confidence,
#                     "all_matches": matches[:3],  # Top 3 matches
#                 }
#             )
#         else:
#             result["province"]["classified"].append(None)
#             result["province"]["scores"].append(
#                 {
#                     "match": None,
#                     "distance": float("inf"),
#                     "confidence": 0,
#                     "all_matches": [],
#                 }
#             )

#     # Classify districts
#     for district_suggestion in suggestion_result.get("district", []):
#         matches = district_trie.search(district_suggestion, max_distance)
#         if matches:
#             best_match, distance = matches[0]
#             confidence = max(0, 1 - (distance / len(district_suggestion)))
#             result["district"]["classified"].append(best_match)
#             result["district"]["scores"].append(
#                 {
#                     "match": best_match,
#                     "distance": distance,
#                     "confidence": confidence,
#                     "all_matches": matches[:3],
#                 }
#             )
#         else:
#             result["district"]["classified"].append(None)
#             result["district"]["scores"].append(
#                 {
#                     "match": None,
#                     "distance": float("inf"),
#                     "confidence": 0,
#                     "all_matches": [],
#                 }
#             )

#     # Classify wards
#     for ward_suggestion in suggestion_result.get("ward", []):
#         matches = ward_trie.search(ward_suggestion, max_distance)
#         if matches:
#             best_match, distance = matches[0]
#             confidence = max(0, 1 - (distance / len(ward_suggestion)))
#             result["ward"]["classified"].append(best_match)
#             result["ward"]["scores"].append(
#                 {
#                     "match": best_match,
#                     "distance": distance,
#                     "confidence": confidence,
#                     "all_matches": matches[:3],
#                 }
#             )
#         else:
#             result["ward"]["classified"].append(None)
#             result["ward"]["scores"].append(
#                 {
#                     "match": None,
#                     "distance": float("inf"),
#                     "confidence": 0,
#                     "all_matches": [],
#                 }
#             )

#     return result

In [None]:
# province_trie, district_trie, ward_trie = build_classification_tries()

In [None]:
# suggestion = suggest_address_components(
#     normalize_input("QL.9, Xã Phogn Mỹ, Huyện Phong Điền, Tỉnh Thừa Thiên Huế")
# )
# result = classify_address_components(
#     suggestion, province_trie, district_trie, ward_trie
# )
# print(result)

## Test

In [None]:
# import json


# def load_test_cases(filename: str) -> List[Dict]:
#     """Load test cases from JSON file"""
#     try:
#         with open(filename, "r", encoding="utf-8") as f:
#             return json.load(f)
#     except FileNotFoundError:
#         print(f"Warning: {filename} not found, using empty list")
#         return []

In [None]:
# import time
# import csv
# import os


# def evaluate_prediction(predicted: Dict, expected: Dict) -> Dict:
#     """Evaluate a single prediction against expected results"""
#     result = {
#         "province_correct": False,
#         "district_correct": False,
#         "ward_correct": False,
#         "province_match": None,
#         "district_match": None,
#         "ward_match": None,
#     }

#     # Check province
#     if expected.get("province"):
#         predicted_provinces = predicted.get("province", {}).get("classified", [])
#         if predicted_provinces and predicted_provinces[0]:
#             predicted_province = predicted_provinces[0].lower().strip()
#             expected_province = expected["province"].lower().strip()
#             if (
#                 predicted_province == expected_province
#                 or expected_province in predicted_province
#                 or predicted_province in expected_province
#             ):
#                 result["province_correct"] = True
#                 result["province_match"] = predicted_provinces[0]

#     # Check district
#     if expected.get("district"):
#         predicted_districts = predicted.get("district", {}).get("classified", [])
#         if predicted_districts and predicted_districts[0]:
#             predicted_district = predicted_districts[0].lower().strip()
#             expected_district = expected["district"].lower().strip()
#             if (
#                 predicted_district == expected_district
#                 or expected_district in predicted_district
#                 or predicted_district in expected_district
#             ):
#                 result["district_correct"] = True
#                 result["district_match"] = predicted_districts[0]

#     # Check ward
#     if expected.get("ward"):
#         predicted_wards = predicted.get("ward", {}).get("classified", [])
#         if predicted_wards and predicted_wards[0]:
#             predicted_ward = predicted_wards[0].lower().strip()
#             expected_ward = expected["ward"].lower().strip()
#             if (
#                 predicted_ward == expected_ward
#                 or expected_ward in predicted_ward
#                 or predicted_ward in expected_ward
#             ):
#                 result["ward_correct"] = True
#                 result["ward_match"] = predicted_wards[0]

#     return result


# def safe_get_first(data_dict: Dict, component: str) -> str:
#     """Safely get the first classified item or return None"""
#     classified = data_dict.get(component, {}).get("classified", [])
#     return classified[0] if classified and classified[0] else None


# def run_comprehensive_test(
#     test_file_path: str = "test.json", max_display: int = 10
# ) -> Dict:
#     """
#     Run comprehensive test suite matching main.py pattern with timing and export
#     """
#     # Load test cases
#     test_cases = load_test_cases(test_file_path)
#     total_cases = len(test_cases)

#     if total_cases == 0:
#         return {"error": "No test cases found"}

#     print(f"\nTesting address classification with {total_cases} test cases:")
#     print("=" * 80)

#     # Initialize tracking variables
#     correct_predictions = 0
#     total_time = 0
#     max_time = 0
#     failed_cases = []
#     all_test_results = []

#     # Process test cases
#     for i, test_case in enumerate(test_cases):
#         address = test_case["text"]
#         expected = test_case["result"]
#         notes = test_case.get("notes", "")

#         # Time the prediction
#         start_time = time.perf_counter()
#         normalized_address = normalize_input(address)
#         suggestion = suggest_address_components(normalized_address)
#         predicted = classify_address_components(
#             suggestion, province_trie, district_trie, ward_trie
#         )
#         end_time = time.perf_counter()

#         processing_time = (end_time - start_time) * 1000  # Convert to ms
#         total_time += processing_time
#         max_time = max(max_time, processing_time)

#         # Evaluate prediction
#         evaluation = evaluate_prediction(predicted, expected)

#         # Check if prediction matches expected result (all components correct)
#         is_correct = (
#             evaluation["province_correct"]
#             and evaluation["district_correct"]
#             and evaluation["ward_correct"]
#         )

#         if is_correct:
#             correct_predictions += 1

#         # Prepare result data matching main.py format
#         reasons = []
#         pred_province = safe_get_first(predicted, "province")
#         pred_district = safe_get_first(predicted, "district")
#         pred_ward = safe_get_first(predicted, "ward")

#         if not evaluation["province_correct"]:
#             reasons.append("province mismatch")
#         if not evaluation["district_correct"]:
#             reasons.append("district mismatch")
#         if not evaluation["ward_correct"]:
#             reasons.append("ward mismatch")
#         if processing_time > 100:
#             reasons.append(f"time_exceeded ({processing_time:.2f} ms > 100 ms)")

#         fail_reason = "; ".join(reasons) if reasons else ""

#         test_result = {
#             "index": i + 1,
#             "input": address,
#             "normalized": normalized_address,
#             "notes": notes,
#             "expected_province": expected.get("province"),
#             "expected_district": expected.get("district"),
#             "expected_ward": expected.get("ward"),
#             "got_province": pred_province,
#             "got_district": pred_district,
#             "got_ward": pred_ward,
#             "time_ms": round(processing_time, 3),
#             "status": "PASS" if is_correct else "FAIL",
#             "fail_reason": fail_reason,
#         }

#         all_test_results.append(test_result)

#         if not is_correct:
#             failed_cases.append(test_result)

#         # Show results for first few test cases or if incorrect
#         if i < max_display or not is_correct:
#             print(f"\nTest {i + 1}: {address}")
#             print(f"Expected: {expected}")
#             print(
#                 f"Predicted: {{'province': '{pred_province}', 'district': '{pred_district}', 'ward': '{pred_ward}'}}"
#             )
#             print(f"Correct: {'✅' if is_correct else '❌'}")
#             print(f"Processing time: {processing_time:.2f}ms")
#             if notes:
#                 print(f"Notes: {notes}")

#         # Check performance requirements
#         if processing_time > 100:  # 0.1s = 100ms
#             print("⚠️  WARNING: Exceeds maximum time requirement!")
#         elif processing_time > 10:  # 0.01s = 10ms
#             print("⚠️  WARNING: Exceeds average time requirement!")

#     # Summary statistics
#     accuracy = (correct_predictions / total_cases) * 100
#     avg_time = total_time / total_cases

#     print("\n" + "=" * 80)
#     print("SUMMARY:")
#     print(f"Total test cases: {total_cases}")
#     print(f"Correct predictions: {correct_predictions}")
#     print(f"Accuracy: {accuracy:.2f}%")
#     print(f"Average processing time: {avg_time:.2f}ms")
#     print(f"Maximum processing time: {max_time:.2f}ms")

#     print("\n" + "=" * 80)
#     print("PERFORMANCE CONSTRAINTS")
#     print("=" * 80)
#     print(f"Average time ≤ 10ms: {'✓' if avg_time <= 10 else '✗'} ({avg_time:.2f} ms)")
#     print(f"Max time ≤ 100ms: {'✓' if max_time <= 100 else '✗'} ({max_time:.2f} ms)")

#     if avg_time <= 10 and max_time <= 100:
#         print("✅ All performance requirements met!")
#     else:
#         print("⚠️  Performance requirements not met!")

#     return {
#         "total_cases": total_cases,
#         "correct_predictions": correct_predictions,
#         "accuracy": accuracy,
#         "avg_time": avg_time,
#         "max_time": max_time,
#         "failed_cases": failed_cases,
#         "all_test_results": all_test_results,
#     }


# def export_test_results(test_results: Dict):
#     """Export test results to CSV and JSON files matching main.py format"""
#     if "error" in test_results:
#         print("Cannot export results - test failed to run")
#         return

#     all_test_results = test_results["all_test_results"]
#     failed_cases = test_results["failed_cases"]

#     print("\n" + "=" * 80)
#     print("EXPORTING TEST RESULTS")
#     print("=" * 80)

#     try:
#         # CSV headers matching main.py format
#         csv_headers = [
#             "index",
#             "input",
#             "normalized",
#             "notes",
#             "expected_province",
#             "expected_district",
#             "expected_ward",
#             "got_province",
#             "got_district",
#             "got_ward",
#             "time_ms",
#             "status",
#             "fail_reason",
#         ]

#         # Create output directory if it doesn't exist
#         output_dir = "output"
#         os.makedirs(output_dir, exist_ok=True)

#         # Export full test report
#         test_report_path = os.path.join(output_dir, "_test_report.csv")
#         with open(test_report_path, "w", encoding="utf-8-sig", newline="") as cf:
#             writer = csv.DictWriter(cf, fieldnames=csv_headers)
#             writer.writeheader()
#             for row in all_test_results:
#                 writer.writerow(row)
#         print(
#             f"Wrote test report CSV: {test_report_path} ({len(all_test_results)} rows)"
#         )

#         # Export failed cases only
#         failed_cases_path = os.path.join(output_dir, "_failed_cases.csv")
#         with open(failed_cases_path, "w", encoding="utf-8-sig", newline="") as cf:
#             writer = csv.DictWriter(cf, fieldnames=csv_headers)
#             writer.writeheader()
#             for row in failed_cases:
#                 writer.writerow(row)
#         print(f"Wrote failed cases CSV: {failed_cases_path} ({len(failed_cases)} rows)")

#         # Export JSON format for full report
#         json_report_path = os.path.join(output_dir, "_test_report.json")
#         with open(json_report_path, "w", encoding="utf-8") as jf:
#             json.dump(all_test_results, jf, ensure_ascii=False, indent=2)
#         print(
#             f"Wrote test report JSON: {json_report_path} ({len(all_test_results)} rows)"
#         )

#     except Exception as e:
#         print(f"Failed to export test results: {e}")


# def show_failed_cases_summary(test_results: Dict, max_show: int = 10):
#     """Display detailed information about failed test cases"""
#     if "error" in test_results:
#         return

#     failed_cases = test_results.get("failed_cases", [])

#     if not failed_cases:
#         print("🎉 No failed cases!")
#         return

#     print(f"\nFAILED CASES SUMMARY (showing first {min(max_show, len(failed_cases))}):")
#     print("=" * 80)

#     for i, case in enumerate(failed_cases[:max_show]):
#         print(f"\n[{case['index']}] {case['input']}")
#         print(
#             f"Expected - P: {case['expected_province'] or 'N/A'}, D: {case['expected_district'] or 'N/A'}, W: {case['expected_ward'] or 'N/A'}"
#         )
#         print(
#             f"Got      - P: {case['got_province'] or 'N/A'}, D: {case['got_district'] or 'N/A'}, W: {case['got_ward'] or 'N/A'}"
#         )
#         print(f"Time: {case['time_ms']}ms | Issues: {case['fail_reason']}")

#     if len(failed_cases) > max_show:
#         print(f"\n... and {len(failed_cases) - max_show} more failed cases")


# # Run the comprehensive test suite
# print("Building classification tries...")
# province_trie, district_trie, ward_trie = build_classification_tries()

# print("\nRunning comprehensive test suite...")
# test_results = run_comprehensive_test("test.json", max_display=5)

# if "error" not in test_results:
#     show_failed_cases_summary(test_results, max_show=10)
#     export_test_results(test_results)