In [None]:
import requests
import mwparserfromhell
import wikipediaapi
import json
import re
from typing import List, Tuple, Dict


########################
## Wikidata + Wikipedia helpers
########################

# --- Wikidata query for relationships ---
def get_language_relationships_wikidata(language_name):
    """
    Fetch language relationships from Wikidata using a SPARQL query.

    Notes:
    - Dialects are fetched via property P5019 ("dialect of").
      We query for items (?dialect) where ?dialect wdt:P5019 ?language.
      This avoids location/region pages like "England" or "Ancient Rome".
    """
    # SPARQL: focus on dialect-of (P5019) and keep simple parent/child placeholders.
    sparql_query = f"""
    SELECT ?parentLabel ?childLabel ?dialectLabel ?siblingLabel
    WHERE {{
      ?language rdfs:label "{language_name}"@en .
      ?language wdt:P31/wdt:P279* wd:Q34770 .  # instance/subclass of language

      # Optional: parent/child relationships (kept simple; may vary per item quality)
      OPTIONAL {{ ?language wdt:P279 ?parent . }}  # subclass of (sometimes used for family)
      OPTIONAL {{ ?child wdt:P279 ?language . }}  # children as subclasses (rare but possible)

      # Dialects: items that are 'dialect of' this language
      OPTIONAL {{ ?dialect wdt:P5019 ?language . }}

      # Siblings: other dialects of the same parent (if applicable)
      OPTIONAL {{
        ?sibling wdt:P5019 ?commonParent .
        ?language wdt:P5019 ?commonParent .
        FILTER(?sibling != ?language)
      }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    url = 'https://query.wikidata.org/sparql'
    try:
        headers = {
            'User-Agent': 'LanguageRelationshipFetcher/1.1 (educational@example.com)'
        }
        response = requests.get(url, params={'query': sparql_query, 'format': 'json'}, headers=headers, timeout=20)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error querying Wikidata: {e}")
        return {}

    results = {
        "parents": set(),
        "children": set(),
        "dialects": set(),
        "siblings": set(),
    }

    for item in data.get('results', {}).get('bindings', []):
        if 'parentLabel' in item:
            results['parents'].add(item['parentLabel']['value'])
        if 'childLabel' in item:
            results['children'].add(item['childLabel']['value'])
        if 'dialectLabel' in item:
            results['dialects'].add(item['dialectLabel']['value'])
        if 'siblingLabel' in item:
            results['siblings'].add(item['siblingLabel']['value'])

    for key in results:
        results[key] = sorted(list(results[key]))
    return results


########################
## Wikipedia infobox parsing (mwparserfromhell)
########################

# --- Original infobox parser using wikipediaapi + mwparserfromhell ---
def get_language_relationships_infobox(language_name):
    """
    Fetches language relationships from the Wikipedia infobox.
    """
    wiki_wiki = wikipediaapi.Wikipedia('LanguageTreeBuilder/1.0 (educational@example.com)', 'en')
    page = wiki_wiki.page(language_name)

    if not page.exists():
        print(f"Page for '{language_name}' not found on Wikipedia.")
        return {}

    wikicode = mwparserfromhell.parse(page.text)

    infoboxes = wikicode.filter_templates(matches=lambda t: t.name.strip().lower().startswith('infobox language'))

    if not infoboxes:
        return {}
    infobox = infoboxes[0]

    results = {"parents": set(), "dialects": set()}
    parent_params = ['family', 'fam', 'family1', 'fam1', 'ancestor', 'ancestors']
    dialect_params = ['dialects', 'varieties']

    for param in infobox.params:
        param_name = param.name.strip().lower()
        param_value = param.value.strip_code().strip()

        if any(p in param_name for p in parent_params):
            parents = [p.strip() for p in param_value.replace('\n', ',').split(',') if p.strip()]
            results['parents'].update(parents)

        if any(d in param_name for d in dialect_params):
            dialects = [d.strip() for d in param_value.replace('\n', ',').split(',') if d.strip()]
            results['dialects'].update(dialects)

    for key in results:
        results[key] = sorted(list(results[key]))
    return results



########################
## Raw wikitext dialect extractor (dia1..dia40)
########################
WIKI_API = "https://en.wikipedia.org/w/api.php"
HEADERS = {"User-Agent": "LanguageTreeNotebook/1.0 (educational)"}


def _get_page_content(title: str) -> str:
    """Fetch raw wikitext content of the given page title."""
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
    }
    try:
        r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=15)
        r.raise_for_status()
        data = r.json()
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return ""
        page = next(iter(pages.values()))
        revs = page.get("revisions")
        if not revs:
            return ""
        return revs[0].get("slots", {}).get("main", {}).get("*", "")
    except Exception:
        return ""


def _find_wiki_links(text: str) -> List[str]:
    """Return list of linked page titles from wiki link markup [[Title|...]]."""
    if not text:
        return []
    links = []
    for m in re.finditer(r"\[\[([^|#\]]+)(?:\|[^\]]*)?\]\]", text):
        t = m.group(1).strip()
        if t:
            links.append(t)
    return links


def _extract_infobox(wikitext: str) -> Dict[str, str]:
    """Extract raw key->value pairs from the Infobox (language or language family)."""
    if not wikitext:
        return {}

    start = wikitext.find("{{Infobox language")
    if start == -1:
        start = wikitext.find("{{Infobox language family")
    if start == -1:
        m = re.search(r"\{\{infobox\s+(language|language family)", wikitext, re.IGNORECASE)
        if m:
            start = m.start()
        else:
            return {}

    # Find the matching closing braces for the infobox
    pos = start + 2
    depth = 1
    end = -1
    while pos < len(wikitext):
        if wikitext[pos:pos+2] == "{{":
            depth += 1
            pos += 2
        elif wikitext[pos:pos+2] == "}}":
            depth -= 1
            pos += 2
            if depth == 0:
                end = pos
                break
        else:
            pos += 1
    if end == -1:
        return {}

    content = wikitext[start:end]
    raw: Dict[str, str] = {}
    current_key = None
    current_val_lines: List[str] = []

    for line in content.split("\n"):
        s = line.strip()
        if s.lower().startswith("{{infobox language"):
            continue
        if s.startswith("|") and "=" in s:
            if current_key is not None and current_val_lines:
                raw[current_key] = "\n".join(current_val_lines).strip()
            key, val = s[1:].split("=", 1)
            current_key = key.strip()
            current_val_lines = [val.strip()]
        elif s.startswith("|") and current_key is not None:
            current_val_lines.append(s[1:].strip())
        elif current_key is not None and not s.startswith("|"):
            current_val_lines.append(s)

    if current_key is not None and current_val_lines:
        raw[current_key] = "\n".join(current_val_lines).strip()

    return raw


def _get_page_categories(title: str) -> List[str]:
    """Fetch non-hidden categories for a Wikipedia page title."""
    params = {
        "action": "query",
        "format": "json",
        "prop": "categories",
        "titles": title,
        "clshow": "!hidden",
        "cllimit": "50",
    }
    try:
        r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=15)
        r.raise_for_status()
        data = r.json()
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return []
        page = next(iter(pages.values()))
        cats = page.get("categories", []) or []
        return [c.get("title", "") for c in cats if c.get("title")]
    except Exception:
        return []


def _is_probable_geopage(title: str) -> bool:
    """
    Heuristic: exclude pages that are likely countries/regions/geography.
    Uses categories to spot geographic topics.
    """
    cats = _get_page_categories(title)
    if not cats:
        # No categories found; don't exclude solely based on that.
        return False
    joined = " ".join(cats).lower()
    geo_keywords = [
        "country","countries","sovereign state","sovereign states","empire","empires",
        "kingdom","kingdoms","roman","province","provinces","city","cities",
        "populated places","regions of","counties of","states of","geography of",
    ]
    return any(k in joined for k in geo_keywords)


def _filter_dialect_titles(titles: List[str], language_name: str) -> List[str]:
    """
    Remove obvious non-dialect entries such as list pages, media namespaces,
    the language itself, and geographic pages (countries/regions).
    """
    out: List[str] = []
    seen = set()
    for t in titles:
        tt = (t or "").strip()
        if not tt:
            continue
        low = tt.lower()
        # Exclude list/maintenance and non-article namespaces
        if low.startswith("list of") or low.startswith("outline of") or low.startswith("history of"):
            continue
        if any(tt.startswith(ns) for ns in ("Category:", "File:", "Image:", "Template:")):
            continue
        # Exclude the language's main page variants
        if tt in {language_name, f"{language_name} language", f"{language_name} Language"}:
            continue
        # Exclude likely geo pages (countries/regions)
        if _is_probable_geopage(tt):
            continue
        if tt not in seen:
            seen.add(tt)
            out.append(tt)
    return out


def get_dialect_relationships(language_name: str) -> List[Tuple[str, str, str]]:
    """
    Return only (dialect, 'dialect_of', language_name) tuples extracted from the
    language's Wikipedia infobox. Looks at 'dialects' and 'dia1'..'dia40' fields,
    and filters out non-dialect pages (e.g., countries, lists).
    """
    if not language_name or not isinstance(language_name, str):
        return []

    # Try a few common page title variations
    candidates = [
        f"{language_name} language",
        language_name,
        f"{language_name} Language",
        f"{language_name} languages",
        f"{language_name} language family",
    ]

    wikitext = ""
    for t in candidates:
        wikitext = _get_page_content(t)
        if wikitext and ("{{Infobox language" in wikitext or "{{Infobox language family" in wikitext):
            break
    if not wikitext:
        return []

    infobox_raw = _extract_infobox(wikitext)
    if not infobox_raw:
        return []

    found: List[str] = []

    # Collect from explicit 'dialects' field if it contains links
    if "dialects" in infobox_raw:
        found.extend(_find_wiki_links(infobox_raw["dialects"]))

    # Collect from dia1..dia40 fields
    for i in range(1, 41):
        k = f"dia{i}"
        if k in infobox_raw:
            found.extend(_find_wiki_links(infobox_raw[k]))

    # Deduplicate while preserving order
    seen = set()
    ordered: List[str] = []
    for d in found:
        if d not in seen:
            seen.add(d)
            ordered.append(d)

    # Filter out non-dialect pages
    dialects = _filter_dialect_titles(ordered, language_name)

    return [(d, "dialect_of", language_name) for d in dialects]



########################
## Pipeline
########################
def get_language_relationships(language_name):
    """
    Main pipeline function to get language relationships.
    """
    print(f"--- Fetching relationships for: {language_name} ---\n")

    print("1. Querying Wikidata...")
    wikidata_results = get_language_relationships_wikidata(language_name)
    print("Done.\n")

    print("2. Parsing Wikipedia Infobox (mwparserfromhell)...")
    infobox_results = get_language_relationships_infobox(language_name)
    print("Done.\n")

    print("3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...")
    dialect_tuples = get_dialect_relationships(language_name)
    wikitext_dialects = [d for (d, _rel, _lang) in dialect_tuples]
    print(f"   Found {len(wikitext_dialects)} dialects via wikitext parser.\n")

    combined_parents = set(wikidata_results.get("parents", []))
    if "parents" in infobox_results:
        combined_parents.update(infobox_results["parents"])

    # Filter infobox-derived dialects as well
    infobox_dialects = infobox_results.get("dialects", [])
    infobox_dialects_filtered = _filter_dialect_titles(infobox_dialects, language_name)

    combined_dialects = set(wikidata_results.get("dialects", []))
    combined_dialects.update(infobox_dialects_filtered)
    combined_dialects.update(wikitext_dialects)

    return {
        "language": language_name,
        "parents": sorted(list(combined_parents)),
        "children": wikidata_results.get("children", []),
        "siblings": wikidata_results.get("siblings", []),
        "dialects": sorted(list(combined_dialects)),
    }



if __name__ == '__main__':
    language_to_check = "English"
    relationships_data = get_language_relationships(language_to_check)
    print("--- Combined Results ---")
    print(json.dumps(relationships_data, indent=2))

    print("\n" + "="*30 + "\n")


--- Fetching relationships for: English ---

1. Querying Wikidata...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
   Found 6 dialects via wikitext parser.

--- Combined Results ---
{
  "language": "English",
  "parents": [
    "Anglic"
  ],
  "children": [
    "African Nova Scotian English",
    "American Indian English",
    "Bahamian English",
    "Bajan English",
    "Basic English",
    "Bay Islands English",
    "British English",
    "Brittonicisms in English",
    "Brunei English",
    "Buddhist Hybrid English",
    "Butler English",
    "Cameroonian English",
    "Caribbean English",
    "Chancery Standard",
    "Channel Island English",
    "Cyrillic English",
    "Degawa English",
    "Denglisch",
    "Ebonics",
    "English as a second or foreign language",
    

In [17]:
# Quick checks
for lang in ["Latin", "English"]:
    print("\n===", lang, "===")
    data = get_language_relationships(lang)
    print("Dialects:", data.get("dialects", [])[:25])


=== Latin ===
--- Fetching relationships for: Latin ---

1. Querying Wikidata...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
   Found 0 dialects via wikitext parser.

Dialects: []

=== English ===
--- Fetching relationships for: English ---

1. Querying Wikidata...
   Found 0 dialects via wikitext parser.

Dialects: []

=== English ===
--- Fetching relationships for: English ---

1. Querying Wikidata...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
   Found 6 dialects via wikitext parser.

Dialects: ['Australian English', 'Caribbean English', 'Hiberno-English