In [13]:
import requests
import mwparserfromhell
import wikipediaapi
import json
import re
from typing import List, Tuple, Dict


# --- Wikidata query for relationships ---
def get_language_relationships_wikidata(language_name):
    """
    Fetches language relationships from Wikidata using a SPARQL query.
    """
    sparql_query = f"""
    SELECT ?parentLabel ?childLabel ?dialectLabel ?siblingLabel
    WHERE {{
      ?language rdfs:label "{language_name}"@en .
      ?language wdt:P31/wdt:P279* wd:Q34770 .

      OPTIONAL {{ ?language wdt:P279 ?parent . }}
      OPTIONAL {{ ?language wdt:P155 ?parent . }}
      OPTIONAL {{ ?language wdt:P220 ?parent . }}
      OPTIONAL {{ ?child wdt:P155 ?language . BIND(?language as ?parent) }}

      OPTIONAL {{ ?child wdt:P155 ?language . }}

      OPTIONAL {{ ?language wdt:P2341 ?dialect . }}
      OPTIONAL {{ ?dialect wdt:P629 ?language . }}

      OPTIONAL {{
        ?language wdt:P155 ?commonParent .
        ?sibling wdt:P155 ?commonParent .
        FILTER(?sibling != ?language)
      }}

      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
    }}
    """
    url = 'https://query.wikidata.org/sparql'
    try:
        headers = {
            'User-Agent': 'LanguageRelationshipFetcher/1.0 (educational@example.com)'
        }
        response = requests.get(url, params={'query': sparql_query, 'format': 'json'}, headers=headers)
        response.raise_for_status()
        data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Error querying Wikidata: {e}")
        return {}

    results = {
        "parents": set(),
        "children": set(),
        "dialects": set(),
        "siblings": set()
    }

    for item in data.get('results', {}).get('bindings', []):
        if 'parentLabel' in item:
            results['parents'].add(item['parentLabel']['value'])
        if 'childLabel' in item:
            results['children'].add(item['childLabel']['value'])
        if 'dialectLabel' in item:
            results['dialects'].add(item['dialectLabel']['value'])
        if 'siblingLabel' in item:
            results['siblings'].add(item['siblingLabel']['value'])

    for key in results:
        results[key] = sorted(list(results[key]))

    return results


# --- Original infobox parser using wikipediaapi + mwparserfromhell ---
def get_language_relationships_infobox(language_name):
    """
    Fetches language relationships from the Wikipedia infobox.
    """
    wiki_wiki = wikipediaapi.Wikipedia('LanguageTreeBuilder/1.0 (educational@example.com)', 'en')
    page = wiki_wiki.page(language_name)

    if not page.exists():
        print(f"Page for '{language_name}' not found on Wikipedia.")
        return {}

    wikicode = mwparserfromhell.parse(page.text)

    infoboxes = wikicode.filter_templates(matches=lambda t: t.name.strip().lower().startswith('infobox language'))

    if not infoboxes:
        return {}
    infobox = infoboxes[0]

    results = {"parents": set(), "dialects": set()}
    parent_params = ['family', 'fam', 'family1', 'fam1', 'ancestor', 'ancestors']
    dialect_params = ['dialects', 'varieties']

    for param in infobox.params:
        param_name = param.name.strip().lower()
        param_value = param.value.strip_code().strip()

        if any(p in param_name for p in parent_params):
            parents = [p.strip() for p in param_value.replace('\n', ',').split(',') if p.strip()]
            results['parents'].update(parents)

        if any(d in param_name for d in dialect_params):
            dialects = [d.strip() for d in param_value.replace('\n', ',').split(',') if d.strip()]
            results['dialects'].update(dialects)

    for key in results:
        results[key] = sorted(list(results[key]))
    return results


# --- New: Dialect-only extractor using raw Wikipedia wikitext (handles dia1..dia40) ---
WIKI_API = "https://en.wikipedia.org/w/api.php"
HEADERS = {"User-Agent": "LanguageTreeNotebook/1.0 (educational)"}


def _get_page_content(title: str) -> str:
    """Fetch raw wikitext content of the given page title."""
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "revisions",
        "rvprop": "content",
        "rvslots": "main",
    }
    try:
        r = requests.get(WIKI_API, params=params, headers=HEADERS, timeout=15)
        r.raise_for_status()
        data = r.json()
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return ""
        page = next(iter(pages.values()))
        revs = page.get("revisions")
        if not revs:
            return ""
        return revs[0].get("slots", {}).get("main", {}).get("*", "")
    except Exception:
        return ""


def _find_wiki_links(text: str) -> List[str]:
    """Return list of linked page titles from wiki link markup [[Title|...]]."""
    if not text:
        return []
    links = []
    for m in re.finditer(r"\[\[([^|#\]]+)(?:\|[^\]]*)?\]\]", text):
        t = m.group(1).strip()
        if t:
            links.append(t)
    return links


def _extract_infobox(wikitext: str) -> Dict[str, str]:
    """Extract raw key->value pairs from the Infobox (language or language family)."""
    if not wikitext:
        return {}

    start = wikitext.find("{{Infobox language")
    if start == -1:
        start = wikitext.find("{{Infobox language family")
    if start == -1:
        m = re.search(r"\{\{infobox\s+(language|language family)", wikitext, re.IGNORECASE)
        if m:
            start = m.start()
        else:
            return {}

    # Find the matching closing braces for the infobox
    pos = start + 2
    depth = 1
    end = -1
    while pos < len(wikitext):
        if wikitext[pos:pos+2] == "{{":
            depth += 1
            pos += 2
        elif wikitext[pos:pos+2] == "}}":
            depth -= 1
            pos += 2
            if depth == 0:
                end = pos
                break
        else:
            pos += 1
    if end == -1:
        return {}

    content = wikitext[start:end]
    raw: Dict[str, str] = {}
    current_key = None
    current_val_lines: List[str] = []

    for line in content.split("\n"):
        s = line.strip()
        if s.lower().startswith("{{infobox language"):
            continue
        if s.startswith("|") and "=" in s:
            if current_key is not None and current_val_lines:
                raw[current_key] = "\n".join(current_val_lines).strip()
            key, val = s[1:].split("=", 1)
            current_key = key.strip()
            current_val_lines = [val.strip()]
        elif s.startswith("|") and current_key is not None:
            current_val_lines.append(s[1:].strip())
        elif current_key is not None and not s.startswith("|"):
            current_val_lines.append(s)

    if current_key is not None and current_val_lines:
        raw[current_key] = "\n".join(current_val_lines).strip()

    return raw


def get_dialect_relationships(language_name: str) -> List[Tuple[str, str, str]]:
    """
    Return only (dialect, 'dialect_of', language_name) tuples extracted from the
    language's Wikipedia infobox. Looks at 'dialects' and 'dia1'..'dia40' fields.
    """
    if not language_name or not isinstance(language_name, str):
        return []

    # Try a few common page title variations
    candidates = [
        f"{language_name} language",
        language_name,
        f"{language_name} Language",
        f"{language_name} languages",
        f"{language_name} language family",
    ]

    wikitext = ""
    for t in candidates:
        wikitext = _get_page_content(t)
        if wikitext and ("{{Infobox language" in wikitext or "{{Infobox language family" in wikitext):
            break
    if not wikitext:
        return []

    infobox_raw = _extract_infobox(wikitext)
    if not infobox_raw:
        return []

    found: List[str] = []

    # Collect from explicit 'dialects' field if it contains links
    if "dialects" in infobox_raw:
        found.extend(_find_wiki_links(infobox_raw["dialects"]))

    # Collect from dia1..dia40 fields
    for i in range(1, 41):
        k = f"dia{i}"
        if k in infobox_raw:
            found.extend(_find_wiki_links(infobox_raw[k]))

    # Deduplicate while preserving order
    seen = set()
    dialects: List[str] = []
    for d in found:
        if d not in seen:
            seen.add(d)
            dialects.append(d)

    return [(d, "dialect_of", language_name) for d in dialects]


# --- Pipeline ---
def get_language_relationships(language_name):
    """
    Main pipeline function to get language relationships.
    """
    print(f"--- Fetching relationships for: {language_name} ---\n")

    print("1. Querying Wikidata...")
    wikidata_results = get_language_relationships_wikidata(language_name)
    print("Done.\n")

    print("2. Parsing Wikipedia Infobox (mwparserfromhell)...")
    infobox_results = get_language_relationships_infobox(language_name)
    print("Done.\n")

    print("3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...")
    dialect_tuples = get_dialect_relationships(language_name)
    wikitext_dialects = [d for (d, _rel, _lang) in dialect_tuples]
    print(f"   Found {len(wikitext_dialects)} dialects via wikitext parser.\n")

    combined_parents = set(wikidata_results.get("parents", []))
    if "parents" in infobox_results:
        combined_parents.update(infobox_results["parents"])

    combined_dialects = set(wikidata_results.get("dialects", []))
    if "dialects" in infobox_results:
        combined_dialects.update(infobox_results["dialects"])
    combined_dialects.update(wikitext_dialects)

    return {
        "language": language_name,
        "parents": sorted(list(combined_parents)),
        "children": wikidata_results.get("children", []),
        "siblings": wikidata_results.get("siblings", []),
        "dialects": sorted(list(combined_dialects)),
    }


if __name__ == '__main__':
    language_to_check = "Latin"
    relationships_data = get_language_relationships(language_to_check)
    print("--- Combined Results ---")
    print(json.dumps(relationships_data, indent=2))

    print("\n" + "="*30 + "\n")


--- Fetching relationships for: Latin ---

1. Querying Wikidata...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

2. Parsing Wikipedia Infobox (mwparserfromhell)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
Done.

3. Extracting dialects from raw Infobox wikitext (dia1..dia40)...
   Found 0 dialects via wikitext parser.

--- Combined Results ---
{
  "language": "Latin",
  "parents": [
    "Latino-Faliscan",
    "Southern European language"
  ],
  "children": [],
  "siblings": [],
  "dialects": [
    "Ancient Rome"
  ]
}


   Found 0 dialects via wikitext parser.

--- Combined Results ---
{
  "language": "Latin",
  "parents": [
    "Latino-Faliscan",
    "Southern European language"
  ],
  "children": [],
  "siblings": [],
  "dialects": [
    "Ancient Rome"
  ]
}


