In [2]:
import os
import json
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from datetime import datetime, timedelta

In [3]:
def get_lottery_result_mien_nam(date_str):
    """Scrape one day's result table from the xoso.com.vn "xsmn" page.

    A headless Chrome session loads
    ``https://xoso.com.vn/xsmn-<date_str>.html``; the rendered HTML is parsed
    with BeautifulSoup and the first ``<table class="table-result">`` is
    converted into nested dictionaries.

    Args:
        date_str: Text inserted verbatim into the URL and echoed in log lines.
            The collectors in this notebook pass dates formatted ``%d-%m-%Y``.

    Returns:
        ``None`` when the table is not found or when any ``Exception`` occurs
        while fetching/parsing (the error is printed, not re-raised).
        Otherwise a dict with:

        * ``"date"``: the ``date_str`` argument,
        * ``"special_prize"``: the values stored under the special-prize label
          for every province column, concatenated in column order,
        * ``"all_results"``: ``{province: {prize_name: [span texts]}}``.
    """
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    options.add_argument('--log-level=3')
    options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36')

    service = Service(log_output=os.devnull)

    try:
        driver = webdriver.Chrome(options=options, service=service)
        try:
            driver.get(f"https://xoso.com.vn/xsmn-{date_str}.html")
            # Fixed pause before reading the HTML — presumably to let the page
            # finish rendering; there is no explicit readiness check.
            time.sleep(2)
            page_source = driver.page_source
        finally:
            # Release the browser exactly once, on every exit path. This also
            # covers KeyboardInterrupt, which `except Exception` below does
            # not catch, so an interrupted notebook cell leaves no Chrome behind.
            driver.quit()

        soup = BeautifulSoup(page_source, 'html.parser')

        result_table = soup.find('table', class_='table-result')
        if result_table is None:
            print(f"[{date_str}] ‚ùå Kh√¥ng t√¨m th·∫•y b·∫£ng.")
            return None

        print(f"[{date_str}] ‚úÖ ƒê√£ t√¨m th·∫•y b·∫£ng!")

        # The first header cell is skipped; the remaining ones are used as
        # the province names, one per result column.
        province_headers = result_table.find("thead").find_all("th")[1:]
        provinces = [th.get_text(strip=True) for th in province_headers]

        all_results = {province: {} for province in provinces}

        # Each body row: its <th> is the prize name, its <td> cells line up
        # positionally with `provinces`, and every <span> is one value.
        for row in result_table.find("tbody").find_all("tr"):
            prize_name = row.find("th").get_text(strip=True)
            for i, cell in enumerate(row.find_all("td")):
                values = [span.get_text(strip=True) for span in cell.find_all("span")]
                all_results[provinces[i]][prize_name] = values

        special_prizes = []
        for province in provinces:
            special_prizes.extend(all_results[province].get("ƒêB", []))

        return {
            "date": date_str,
            "special_prize": special_prizes,
            "all_results": all_results
        }

    except Exception as e:
        # Best effort: a failed day is reported and skipped by the callers.
        print(f"[{date_str}] ‚ö† L·ªói:", e)
        return None

In [4]:
def collect_lottery_data_range(date_start, date_end, output_path, overwrite=False):
    """Scrape every day from ``date_start`` to ``date_end`` and save to JSON.

    NOTE: a later cell in this notebook defines another function with this
    same name; once that cell runs, it replaces this definition.

    Args:
        date_start: First day to collect, formatted ``%d-%m-%Y``.
        date_end: Last day to collect (inclusive), formatted ``%d-%m-%Y``.
        output_path: JSON file holding a list of per-day result dicts.
        overwrite: When true, the existing file content is ignored and every
            day in the range is fetched again; the file is then rewritten
            with only the results gathered in this call.

    Days whose ``"date"`` is already stored are skipped unless ``overwrite``
    is set. Days for which the scraper returns a falsy value are not stored.
    The whole list is written once, after the loop.
    """
    # Start from the stored list when there is one we are allowed to reuse;
    # unreadable JSON or a non-list payload counts as "nothing stored".
    all_data = []
    if not overwrite and os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as src:
            try:
                loaded = json.load(src)
            except json.JSONDecodeError:
                loaded = None
        if isinstance(loaded, list):
            all_data = loaded

    first_day = datetime.strptime(date_start, "%d-%m-%Y")
    last_day = datetime.strptime(date_end, "%d-%m-%Y")

    already_stored = {item['date'] for item in all_data}

    # Both bounds are parsed from date-only strings, so the whole-day
    # difference enumerates exactly the inclusive range (empty if reversed).
    for offset in range((last_day - first_day).days + 1):
        date_str = (first_day + timedelta(days=offset)).strftime("%d-%m-%Y")
        if not overwrite and date_str in already_stored:
            print(f"[{date_str}] ‚è© B·ªè qua v√¨ ƒë√£ c√≥ trong file.")
            continue
        fetched = get_lottery_result_mien_nam(date_str)
        if fetched:
            all_data.append(fetched)

    # Persist everything in one go.
    with open(output_path, "w", encoding="utf-8") as sink:
        json.dump(all_data, sink, ensure_ascii=False, indent=2)
        print(f"\n‚úÖ ƒê√£ l∆∞u to√†n b·ªô {len(all_data)} k·∫øt qu·∫£ v√†o '{output_path}'")

In [5]:
def collect_lottery_data_auto(output_path, end_date=None, overwrite=False):
    """Resume collection from the day after the newest date in ``output_path``.

    Reads the JSON list stored at ``output_path``, finds the latest ``"date"``
    among its entries, and asks ``collect_lottery_data_range`` to collect from
    the following day up to ``end_date``.

    Args:
        output_path: JSON file holding a list of per-day result dicts, each
            with a ``"date"`` formatted ``%d-%m-%Y``.
        end_date: Last day to collect, formatted ``%d-%m-%Y``. Defaults to
            today when ``None``.
        overwrite: Accepted for backward compatibility but not forwarded.
            The start date is derived from the stored history, and passing
            ``overwrite=True`` to ``collect_lottery_data_range`` would make it
            discard that history and rewrite the file with only the new days.

    Returns:
        ``None``. Prints a message and returns early when the file is missing,
        unreadable, empty, or already up to date.
    """
    # Load the stored list; a missing file, invalid JSON or a non-list
    # payload are all treated as "no data yet".
    all_data = []
    if os.path.exists(output_path):
        with open(output_path, "r", encoding="utf-8") as f:
            try:
                loaded = json.load(f)
            except json.JSONDecodeError:
                loaded = None
        if isinstance(loaded, list):
            all_data = loaded

    if not all_data:
        print("‚ö† File ch∆∞a c√≥ d·ªØ li·ªáu. H√£y d√πng collect_lottery_data_range() ƒë·ªÉ kh·ªüi t·∫°o.")
        return

    # Continue from the day after the newest stored date.
    last_date = max(datetime.strptime(entry['date'], "%d-%m-%Y") for entry in all_data)
    start_date = last_date + timedelta(days=1)

    # End of the range: today unless the caller supplied a date.
    if end_date is None:
        end_date = datetime.today()
    else:
        end_date = datetime.strptime(end_date, "%d-%m-%Y")

    # Nothing to collect.
    if start_date > end_date:
        print("‚è© Kh√¥ng c√≥ ng√†y m·ªõi ƒë·ªÉ c·∫≠p nh·∫≠t.")
        return

    date_start_str = start_date.strftime("%d-%m-%Y")
    date_end_str = end_date.strftime("%d-%m-%Y")
    print(f"üìÖ ƒêang thu th·∫≠p t·ª´ {date_start_str} ƒë·∫øn {date_end_str}...")
    # Always append: see the `overwrite` note in the docstring.
    collect_lottery_data_range(date_start_str, date_end_str, output_path, overwrite=False)


In [6]:
def collect_lottery_data_range(date_start=None, date_end=None, output_path="xoso_mien_nam.json", overwrite=False):
    """Scrape a range of days and store the results as a JSON list.

    Args:
        date_start: First day to collect, formatted ``%d-%m-%Y``. When
            ``None``, collection resumes on the day after the newest ``"date"``
            already stored in ``output_path``.
        date_end: Last day to collect (inclusive), formatted ``%d-%m-%Y``.
            When ``None``, defaults to yesterday (today minus one day).
        output_path: JSON file holding a list of per-day result dicts.
        overwrite: When true, the existing file content is not read; every day
            in the range is fetched and the file is rewritten with only the
            results gathered in this call.

    Returns:
        ``None``. Returns early, without touching the file, when the start
        date is after the end date.

    Raises:
        ValueError: ``date_start`` is ``None`` and no stored data is available
            to derive it from (file missing, empty, unreadable, or skipped
            because ``overwrite`` is true).

    Whatever has been collected is written to ``output_path`` even if the
    loop is interrupted (for example by stopping the notebook cell), so a
    long run can be resumed later instead of starting over.
    """
    # Load previously stored data unless it is to be replaced. Invalid JSON
    # or a non-list payload is treated as "nothing stored".
    if os.path.exists(output_path) and not overwrite:
        with open(output_path, "r", encoding="utf-8") as f:
            try:
                all_data = json.load(f)
                if not isinstance(all_data, list):
                    all_data = []
            except json.JSONDecodeError:
                all_data = []
    else:
        all_data = []

    # No explicit start: continue from the day after the newest stored date.
    if date_start is None:
        if all_data:
            last_date = max(datetime.strptime(entry["date"], "%d-%m-%Y") for entry in all_data)
            start_date = last_date + timedelta(days=1)
        else:
            raise ValueError("‚ö†Ô∏è File r·ªóng ho·∫∑c kh√¥ng t·ªìn t·∫°i. B·∫°n c·∫ßn truy·ªÅn date_start ƒë·ªÉ kh·ªüi t·∫°o d·ªØ li·ªáu.")
    else:
        start_date = datetime.strptime(date_start, "%d-%m-%Y")

    # No explicit end: stop at yesterday.
    if date_end is None:
        end_date = datetime.today() - timedelta(days=1)
    else:
        end_date = datetime.strptime(date_end, "%d-%m-%Y")

    if start_date > end_date:
        print(f"‚è© Kh√¥ng c√≥ ng√†y m·ªõi ƒë·ªÉ thu th·∫≠p. (start: {start_date.date()}, end: {end_date.date()})")
        return

    # Dates already stored, used to avoid fetching a day twice.
    existing_dates = {entry["date"] for entry in all_data}

    # Create the destination folder before scraping, so a missing directory
    # cannot make the final write fail after all the work has been done.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    current_date = start_date
    try:
        while current_date <= end_date:
            date_str = current_date.strftime("%d-%m-%Y")
            if overwrite or date_str not in existing_dates:
                result = get_lottery_result_mien_nam(date_str)
                if result:
                    all_data.append(result)
            else:
                print(f"[{date_str}] ‚è© ƒê√£ c√≥ d·ªØ li·ªáu. B·ªè qua.")
            current_date += timedelta(days=1)
    finally:
        # Persist on every exit path so an interrupted run keeps its progress;
        # any in-flight exception continues to propagate afterwards.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(all_data, f, ensure_ascii=False, indent=2)
        print(f"\n‚úÖ ƒê√£ l∆∞u {len(all_data)} k·∫øt qu·∫£ v√†o '{output_path}'")


In [None]:
# Run the collector with only output_path supplied: date_start and date_end
# keep their None defaults, so collect_lottery_data_range derives both itself.
# NOTE(review): absolute, machine-specific Windows path — consider building it
# from a configurable base directory so the notebook runs elsewhere.
output_path = r"D:\TechByte_LoDe\Lo-De-Prediction\xs_data\xsmn_data.json"
collect_lottery_data_range(output_path=output_path)

[30-06-2025] ‚úÖ ƒê√£ t√¨m th·∫•y b·∫£ng!

‚úÖ ƒê√£ l∆∞u 7080 k·∫øt qu·∫£ v√†o 'D:\TechByte_LoDe\Lo-De-Prediction\xsmn_data.json'


In [None]:
# Same collector call as the previous cell, writing to a second file.
# NOTE(review): collect_lottery_data_range fetches through
# get_lottery_result_mien_nam, whose URL is hard-coded to the "xsmn-" page, so
# this "xsmt" file is filled from the same page as the xsmn run — confirm
# whether a different page was intended here.
# NOTE(review): absolute, machine-specific Windows path.
output_path_2 = r"D:\TechByte_LoDe\Lo-De-Prediction\xs_data\xsmt_data.json"
collect_lottery_data_range(output_path=output_path_2)

[30-06-2025] ‚úÖ ƒê√£ t√¨m th·∫•y b·∫£ng!

‚úÖ ƒê√£ l∆∞u 6866 k·∫øt qu·∫£ v√†o 'D:\TechByte_LoDe\Lo-De-Prediction\xsmt_data.json'
