In [220]:
# NOTE: you CAN change this cell
# If you want to use your own database, download it here
# !gdown ...


In [221]:
# NOTE: you CAN change this cell
# Add more to your needs
# you must place ALL pip install here
!pip install editdistance
!pip install pandas




In [222]:
# NOTE: you CAN change this cell
# import your library here
import time

In [264]:
import pandas as pd
import re
import editdistance  
class Solution:

    def __init__(self):
        # Load data
        self.province_data = pd.read_csv('./provinces.csv', delimiter=";").set_index('Code').to_dict()['FullName']
        self.district_data = pd.read_csv('./districts.csv', delimiter=";").set_index('FullName').to_dict()['Code']
        self.ward_data = pd.read_csv('./wards.csv', delimiter=";").to_dict(orient='records')

    def clean_prefix(self, name):
        """Xoá các tiền tố phường, xã, tỉnh, quận,... và viết hoa chữ cái đầu."""
        cleaned_name = re.sub(r'^(phường|xã|thị trấn|tỉnh|thành phố|quận|thị xã|huyện)\s+', '', name, flags=re.IGNORECASE)
        return cleaned_name.title()

    def find_best_match(self, candidate, options):
        """Tìm kết quả gần nhất """
        candidate = candidate.strip().lower()

        best_match = None
        lowest_distance = float('inf')

        prefixes = ["tỉnh", "thành phố", "huyện", "quận", "thị xã", "phường", "thị trấn", "xóm", "thôn"]

        for option in options:
            option_cleaned = option.lower()
            for prefix in prefixes:
                option_cleaned = option_cleaned.replace(f"{prefix} ", "")
            option_cleaned = option_cleaned.strip()

            distance = editdistance.eval(candidate, option_cleaned) 

            if candidate == option_cleaned:
                return option

            if distance < lowest_distance:
                lowest_distance = distance
                best_match = option

        return best_match

    def classify_address(self, address):
        """Phân loại địa chỉ thành Tỉnh, Huyện và Xã."""
        address = address.lower().strip()
        parts = [part.strip() for part in address.split(',')]

        province_found = None
        district_found = None
        ward_found = None

        if parts:
            # Tìm Tỉnh
            province_candidate = parts[-1]
            province_found = self.find_best_match(province_candidate, list(self.province_data.values()))

            province_code = None
            for code, name in self.province_data.items():
                if name == province_found:
                    province_code = code
                    break

            if len(parts) > 1 and province_code:
                district_candidate = parts[-2]
                district_found = self.find_best_match(district_candidate, list(self.district_data.keys()))
                if district_found:
                    district_code = self.district_data[district_found]

                    if len(parts) > 2:
                        ward_candidate = parts[-3]
                        ward_candidates = [ward['FullName'] for ward in self.ward_data if ward['DistrictCode'] == district_code]
                        ward_found = self.find_best_match(ward_candidate, ward_candidates)

        return {
            "province": self.clean_prefix(province_found) if province_found else '',
            "district": self.clean_prefix(district_found) if district_found else '',
            "ward": self.clean_prefix(ward_found) if ward_found else ''
        }

    def process(self, address: str):
        """Xử lý địa chỉ để trả """
        return self.classify_address(address)


In [265]:
# NOTE: DO NOT change this cell
# This cell is for downloading private test
# !rm -rf test.json
# this link is public test
!gdown --fuzzy https://drive.google.com/file/d/1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB/view?usp=sharing -O test.json

zsh:1: no matches found: https://drive.google.com/file/d/1PBt3U9I3EH885CDhcXspebyKI5Vw6uLB/view?usp=sharing


In [266]:
# NOTE: DO NOT change this cell
# This cell is for scoring

TEAM_NAME = 'DEFAULT_NAME'  # This should be your team name
EXCEL_FILE = f'{TEAM_NAME}.xlsx'

import json
import time
with open('test.json') as f:
    data = json.load(f)

summary_only = True
df = []
solution = Solution()
timer = []
correct = 0
with open('output_results.txt', 'w') as output_file:
    for test_idx, data_point in enumerate(data):
        address = data_point["text"]

        ok = 0
        try:
            start = time.perf_counter_ns()
            result = solution.process(address)
            answer = data_point["result"]
            finish = time.perf_counter_ns()
            timer.append(finish - start)
            

            """TEST """
            # Compare the result with the expected answer
            province_correct = answer["province"] == result["province"]
            district_correct = answer["district"] == result["district"]
            ward_correct = answer["ward"] == result["ward"]
            
            is_correct = province_correct and district_correct and ward_correct

            # Write the results to the output file
            output_file.write(f"Address: {address}\n")
            output_file.write(f"Result: {result}\n")
            output_file.write(f"Expected: {answer}\n")
            
            if is_correct:
                output_file.write("Status: CORRECT\n\n")
            else:
                output_file.write("Status: INCORRECT\n\n")




                
            ok += int(answer["province"] == result["province"])
            ok += int(answer["district"] == result["district"])
            ok += int(answer["ward"] == result["ward"])
            df.append([
                test_idx,
                address,
                answer["province"],
                result["province"],
                int(answer["province"] == result["province"]),
                answer["district"],
                result["district"],
                int(answer["district"] == result["district"]),
                answer["ward"],
                result["ward"],
                int(answer["ward"] == result["ward"]),
                ok,
                timer[-1] / 1_000_000_000,
            ])
        except Exception as e:
            df.append([
                test_idx,
                address,
                answer["province"],
                "EXCEPTION",
                0,
                answer["district"],
                "EXCEPTION",
                0,
                answer["ward"],
                "EXCEPTION",
                0,
                0,
                0,
            ])
            # any failure count as a zero correct
            pass
        correct += ok


        if not summary_only:
            # responsive stuff
            print(f"Test {test_idx:5d}/{len(data):5d}")
            print(f"Correct: {ok}/3")
            print(f"Time Executed: {timer[-1] / 1_000_000_000:.4f}")


print(f"-"*30)
total = len(data) * 3
score_scale_10 = round(correct / total * 10, 2)
if len(timer) == 0:
    timer = [0]
max_time_sec = round(max(timer) / 1_000_000_000, 4)
avg_time_sec = round((sum(timer) / len(timer)) / 1_000_000_000, 4)

import pandas as pd

df2 = pd.DataFrame(
    [[correct, total, score_scale_10, max_time_sec, avg_time_sec]],
    columns=['correct', 'total', 'score / 10', 'max_time_sec', 'avg_time_sec',],
)

columns = [
    'ID',
    'text',
    'province',
    'province_student',
    'province_correct',
    'district',
    'district_student',
    'district_correct',
    'ward',
    'ward_student',
    'ward_correct',
    'total_correct',
    'time_sec',
]

df = pd.DataFrame(df)
df.columns = columns

print(f'{TEAM_NAME = }')
print(f'{EXCEL_FILE = }')
print(df2)

!pip install xlsxwriter
writer = pd.ExcelWriter(EXCEL_FILE, engine='xlsxwriter')
df2.to_excel(writer, index=False, sheet_name='summary')
df.to_excel(writer, index=False, sheet_name='details')
writer.close()


------------------------------
TEAM_NAME = 'DEFAULT_NAME'
EXCEL_FILE = 'DEFAULT_NAME.xlsx'
   correct  total  score / 10  max_time_sec  avg_time_sec
0      788   1350        5.84        0.0083        0.0017
