In [1]:
import requests
from bs4 import BeautifulSoup
import json
from urllib.parse import urljoin, urlparse, urlunparse

In [3]:


def get_all_links_from_page(page_url, custom_headers=None, base_domain="tlidb.com"):
    """
    Fetches a page and extracts all valid internal links with their text.

    Args:
        page_url (str): The URL of the page to scrape.
        custom_headers (dict, optional): Custom headers for the request.
        base_domain (str): The domain to filter links for (e.g., "tlidb.com").

    Returns:
        dict: A dictionary mapping absolute_url (str) to link_text (str).
              Returns None if the page fetch fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    if custom_headers:
        headers.update(custom_headers)

    links_map = {}
    try:
        print(f"Fetching {page_url}...")
        response = requests.get(page_url, headers=headers, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser') # Use response.content for better encoding handling
        print(f"Successfully fetched {page_url}.")
    except requests.exceptions.RequestException as e:
        print(f"Error fetching {page_url}: {e}")
        return None

    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        link_text = a_tag.get_text(strip=True)

        if not link_text:  # Skip links with no visible text
            continue
        if href.startswith('javascript:') or href.startswith('mailto:'): # Skip JS/mailto links
            continue

        absolute_href = urljoin(page_url, href) # Make URL absolute
        parsed_absolute_href = urlparse(absolute_href)

        # Filter for links belonging to the specified base_domain
        if parsed_absolute_href.netloc == base_domain:
            # Normalize by removing fragment for keying, but store full text
            url_key = urlunparse(parsed_absolute_href._replace(fragment=""))
            if url_key not in links_map: # Store first occurrence if multiple links point to same URL
                 links_map[url_key] = link_text
            # If you want to capture all variations, you might need a list of texts per URL
            # else:
            #     if isinstance(links_map[url_key], list):
            #         if link_text not in links_map[url_key]:
            #             links_map[url_key].append(link_text)
            #     elif links_map[url_key] != link_text: # Convert to list if new text found
            #         links_map[url_key] = [links_map[url_key], link_text]


    return links_map

def generate_i18n_mappings():
    """
    Generates i18n mappings by comparing links from English and Chinese homepages.
    """
    url_en = "https://tlidb.com/"
    url_cn = "https://tlidb.com/cn/"

    headers_cn = {
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8'
    }

    print("--- Scraping English Page ---")
    english_links = get_all_links_from_page(url_en)
    if english_links is None:
        print("Failed to scrape English page. Aborting.")
        return None

    print(f"Found {len(english_links)} unique English links.")
    # print("Sample English Links:", dict(list(english_links.items())[:5]))


    print("\n--- Scraping Chinese Page ---")
    chinese_links = get_all_links_from_page(url_cn, custom_headers=headers_cn)
    if chinese_links is None:
        print("Failed to scrape Chinese page. Aborting.")
        return None
    print(f"Found {len(chinese_links)} unique Chinese links.")
    # print("Sample Chinese Links:", dict(list(chinese_links.items())[:5]))


    i18n_list = []
    processed_english_hrefs = set() # To ensure one entry per English target URL

    print("\n--- Generating Mappings ---")
    for cn_abs_href, cn_text in chinese_links.items():
        parsed_cn_url = urlparse(cn_abs_href)
        cn_path = parsed_cn_url.path
        expected_en_path = ""

        if cn_path == "/cn/" or cn_path == "/cn": # Root of Chinese site
            expected_en_path = "/"
        elif cn_path.startswith("/cn/"):
            expected_en_path = "/" + cn_path[len("/cn/"):] # Remove /cn/ prefix
        else:
            # This link on the Chinese page might point directly to an English page
            # or an external site. If it's on tlidb.com but not /cn/ prefixed,
            # it's likely already an "English" path.
            if parsed_cn_url.netloc == "tlidb.com":
                expected_en_path = cn_path
            else:
                continue # Skip external links not caught earlier

        if not expected_en_path:
            continue

        # Reconstruct the expected English URL (without fragment, as keys are stored that way)
        expected_en_abs_href = urlunparse(parsed_cn_url._replace(path=expected_en_path, fragment=""))

        if expected_en_abs_href in english_links:
            if expected_en_abs_href not in processed_english_hrefs:
                en_text = english_links[expected_en_abs_href]
                i18n_list.append({
                    "name": [en_text],      # As per requested format with list
                    "name_cn": [cn_text]    # As per requested format with list
                })
                processed_english_hrefs.add(expected_en_abs_href)
            # else:
                # print(f"Skipping duplicate mapping for EN Href: {expected_en_abs_href}")
        # else:
            # print(f"No English match for CN Href: {cn_abs_href} (Expected EN: {expected_en_abs_href})")


    print(f"Generated {len(i18n_list)} i18n mappings.")
    return i18n_list



--- Scraping English Page ---
Fetching https://tlidb.com/...
Successfully fetched https://tlidb.com/.
Found 222 unique English links.

--- Scraping Chinese Page ---
Fetching https://tlidb.com/cn/...
Successfully fetched https://tlidb.com/cn/.
Found 206 unique Chinese links.

--- Generating Mappings ---
Generated 189 i18n mappings.

--- i18n Mappings (JSON) ---
[
    {
        "name": [
            "Hero"
        ],
        "name_cn": [
            "TLIDB"
        ]
    },
    {
        "name": [
            "Destiny"
        ],
        "name_cn": [
            "Destiny"
        ]
    },
    {
        "name": [
            "Active Skill"
        ],
        "name_cn": [
            "Active Skill"
        ]
    },
    {
        "name": [
            "Support Skill"
        ],
        "name_cn": [
            "Support Skill"
        ]
    },
    {
        "name": [
            "Passive Skill"
        ],
        "name_cn": [
            "Passive Skill"
        ]
    },
    {
        "name":

In [7]:
i18n_data = generate_i18n_mappings()

if i18n_data:
    json_output = json.dumps(i18n_data, ensure_ascii=False, indent=4)
    print("\n--- i18n Mappings (JSON) ---")
    print(json_output)

    with open("../data/tlidb_i18n_links.json", "w", encoding="utf-8") as f:
        f.write(json_output)
    print("\nSaved to tlidb_i18n_links.json")

--- Scraping English Page ---
Fetching https://tlidb.com/...
Successfully fetched https://tlidb.com/.
Found 222 unique English links.

--- Scraping Chinese Page ---
Fetching https://tlidb.com/cn/...
Successfully fetched https://tlidb.com/cn/.
Found 206 unique Chinese links.

--- Generating Mappings ---
Generated 189 i18n mappings.

--- i18n Mappings (JSON) ---
[
    {
        "name": [
            "Hero"
        ],
        "name_cn": [
            "TLIDB"
        ]
    },
    {
        "name": [
            "Destiny"
        ],
        "name_cn": [
            "Destiny"
        ]
    },
    {
        "name": [
            "Active Skill"
        ],
        "name_cn": [
            "Active Skill"
        ]
    },
    {
        "name": [
            "Support Skill"
        ],
        "name_cn": [
            "Support Skill"
        ]
    },
    {
        "name": [
            "Passive Skill"
        ],
        "name_cn": [
            "Passive Skill"
        ]
    },
    {
        "name":