In [2]:
import requests, time

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
HEADERS = {"User-Agent": "LanguageFamilyTreeBot/1.0 (https://example.com)", "Accept": "application/json"}
# API URLs
WIKIPEDIA_API: str = "https://en.wikipedia.org/w/api.php"
WIKIDATA_API: str = "https://www.wikidata.org/wiki/Special:EntityData/{}.json"
SPARQL_API: str = "https://query.wikidata.org/sparql"
WIKIDATA_QUERY_API: str = "https://www.wikidata.org/w/api.php"

MAX_QIDS_PER_CALL = 50  # Wikidata wbgetentities practical limit
MAX_RETRIES = 4
BACKOFF_BASE = 0.8
MAX_NODES = 1500  # safety cap to avoid runaway expansion


def safe_get_json(url: str, *, params: dict, headers: dict | None = None):
    """GET a JSON response with retry & backoff; return None on hard failure."""
    merged_headers = {**HEADERS, **(headers or {})}
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = requests.get(url, params=params, headers=merged_headers, timeout=20)
            status = resp.status_code
            if status == 429:  # rate limited
                wait = BACKOFF_BASE * attempt * 2
                print(f"Rate limited (429). Sleeping {wait:.2f}s ...")
                time.sleep(wait)
                continue
            if status >= 500:
                wait = BACKOFF_BASE * attempt
                print(f"Server error {status}. Retry {attempt}/{MAX_RETRIES} after {wait:.2f}s")
                time.sleep(wait)
                continue
            if status != 200:
                print(f"Non-200 status {status} for {url} params={params}")
                return None
            text = resp.text.strip()
            if not text:
                wait = BACKOFF_BASE * attempt
                print(f"Empty body. Retry {attempt}/{MAX_RETRIES} after {wait:.2f}s")
                time.sleep(wait)
                continue
            return resp.json()
        except ValueError as ve:  # JSON decode
            wait = BACKOFF_BASE * attempt
            print(f"JSON decode error attempt {attempt}: {ve}. Backing off {wait:.2f}s")
            time.sleep(wait)
        except requests.RequestException as re:
            wait = BACKOFF_BASE * attempt
            print(f"Request error attempt {attempt}: {re}. Backing off {wait:.2f}s")
            time.sleep(wait)
    return None


def chunked(iterable, size):
    it = list(iterable)
    for i in range(0, len(it), size):
        yield it[i:i + size]


def get_language_labels(qids):
    """Batch-fetch labels for a set of Q-ids (returns dict). Robust with retries & chunking."""
    qids = list({q for q in qids if q})
    if not qids:
        return {}
    results = {}
    for group in chunked(qids, MAX_QIDS_PER_CALL):
        params = {
            "action": "wbgetentities",
            "ids": "|".join(group),
            "props": "labels",
            "languages": "en",
            "format": "json",
            "origin": "*",
        }
        data = safe_get_json(WIKIDATA_QUERY_API, params=params)
        if not data:
            for q in group:  # leave unresolved
                results.setdefault(q, q)
            continue
        entities = data.get("entities", {})
        for qid, ent in entities.items():
            label = ent.get("labels", {}).get("en", {}).get("value")
            if label:
                results[qid] = label
            else:
                results.setdefault(qid, qid)
        time.sleep(0.1)  # politeness
    return results

# Simple caches
LABEL_CACHE: dict[str, str] = {}
VALID_QIDS: set[str] = set()
INVALID_QIDS: set[str] = set()


def is_valid_language(qid):
    """Check if a QID represents a valid language, dialect, or language family."""
    if not qid:
        return False
    query = f"""
    SELECT ?class WHERE {{
      wd:{qid} (wdt:P31/wdt:P279*) ?class.
    }}
    """
    response = safe_get_json(WIKIDATA_SPARQL_ENDPOINT, params={'query': query, 'format': 'json'}) or {}
    results = response.get('results', {}).get('bindings', [])
    classes = [r['class']['value'].split('/')[-1] for r in results]
    valid_classes = [
        'Q34770',    # language
        'Q33742',    # natural language
        'Q20162172', # human language
        'Q33384',    # dialect
        'Q25209536', # variety of language
        'Q1288568',  # modern language
        'Q25295',    # language family
        'Q1072694',  # constructed language
        'Q17376908', # language isolate
        'Q11755682', # proto-language
        "Q45762"
    ]
    return any(c in valid_classes for c in classes)


def is_valid_language_cached(qid: str) -> bool:
    if qid in VALID_QIDS:
        return True
    if qid in INVALID_QIDS:
        return False
    ok = is_valid_language(qid)
    (VALID_QIDS if ok else INVALID_QIDS).add(qid)
    return ok


def get_label(qid: str) -> str:
    """Return English label for a QID (falls back to QID)."""
    if not qid:
        return qid
    if qid in LABEL_CACHE:
        return LABEL_CACHE[qid]
    labels = get_language_labels([qid])
    label = labels.get(qid, qid)
    LABEL_CACHE[qid] = label
    return label


def get_wikidata_entity_id(language_name):
    """Return the Wikidata Q-identifier for a language name."""
    try:
        # First try direct page lookup
        params = {
            "action": "query",
            "titles": f"{language_name} language",
            "prop": "pageprops",
            "ppprop": "wikibase_item",
            "format": "json",
        }
        data = safe_get_json(WIKIPEDIA_API, params=params) or {}
        pages = data.get("query", {}).get("pages", {})
        for page in pages.values():
            if "pageprops" in page and "wikibase_item" in page["pageprops"]:
                return page["pageprops"]["wikibase_item"]
        # Search fallback
        params = {
            "action": "query",
            "list": "search",
            "srsearch": f"{language_name} language",
            "srlimit": 1,
            "format": "json",
        }
        data = safe_get_json(WIKIPEDIA_API, params=params) or {}
        search = data.get("query", {}).get("search", [])
        if search:
            page_title = search[0]["title"]
            params = {
                "action": "query",
                "titles": page_title,
                "prop": "pageprops",
                "ppprop": "wikibase_item",
                "format": "json",
            }
            data = safe_get_json(WIKIPEDIA_API, params=params) or {}
            pages = data.get("query", {}).get("pages", {})
            for page in pages.values():
                if "pageprops" in page and "wikibase_item" in page["pageprops"]:
                    return page["pageprops"]["wikibase_item"]
    except Exception as e:
        print(f"Error getting QID for {language_name}: {e}")
    return None


def get_parents(entity_id):
    query = f"""
    SELECT ?parent ?parentLabel WHERE {{
      wd:{entity_id} wdt:P279 ?parent.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    response = safe_get_json(WIKIDATA_SPARQL_ENDPOINT, params={'query': query, 'format': 'json'}) or {}
    results = response.get('results', {}).get('bindings', [])
    return [(r['parent']['value'].split('/')[-1], r['parentLabel']['value']) for r in results]


def get_children_by_p527(entity_id):
    query = f"""
    SELECT ?child ?childLabel WHERE {{
      wd:{entity_id} wdt:P527 ?child.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    response = safe_get_json(WIKIDATA_SPARQL_ENDPOINT, params={'query': query, 'format': 'json'}) or {}
    results = response.get('results', {}).get('bindings', [])
    return [(r['child']['value'].split('/')[-1], r['childLabel']['value']) for r in results]


def get_children(entity_id):
    query = f"""
    SELECT ?child ?childLabel WHERE {{
      ?child wdt:P279 wd:{entity_id}.
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """
    response = safe_get_json(WIKIDATA_SPARQL_ENDPOINT, params={'query': query, 'format': 'json'}) or {}
    results = response.get('results', {}).get('bindings', [])
    return [(r['child']['value'].split('/')[-1], r['childLabel']['value']) for r in results]


def build_language_family_tree(entity_id, depth, current_depth=1, visited=None):
    if visited is None:
        visited = set()
    if current_depth > depth or entity_id in visited:
        return []
    if len(visited) >= MAX_NODES:
        print("Max node cap reached; stopping expansion.")
        return []

    visited.add(entity_id)
    relations: list[tuple[str, str, str]] = []
    current_label = get_label(entity_id)

    # Parents (superclasses). Filter invalid before using.
    parents = get_parents(entity_id)
    if entity_id == "Q1680":  # debug for English
        print("Parents of English:", parents)
    for parent_id, parent_label in parents:
        if is_valid_language_cached(parent_id):
            relations.append((current_label, "Child of", parent_label))
            relations.extend(build_language_family_tree(parent_id, depth, current_depth + 1, visited))

    # Children by P527 (parts/members)
    for child_id, child_label in get_children_by_p527(entity_id):
        if child_id != entity_id and is_valid_language_cached(child_id):
            relations.append((child_label, "Child of", current_label))
            relations.extend(build_language_family_tree(child_id, depth, current_depth + 1, visited))

    # Children by reverse P279 (subclasses)
    for child_id, child_label in get_children(entity_id):
        if child_id != entity_id and child_id not in visited and is_valid_language_cached(child_id):
            relations.append((child_label, "Child of", current_label))
            relations.extend(build_language_family_tree(child_id, depth, current_depth + 1, visited))

    return relations


def get_language_family(language_name, depth):
    entity_id = get_wikidata_entity_id(language_name)
    if not entity_id:
        raise ValueError(f"Language '{language_name}' not found in Wikidata.")
    # Root language is forced valid to ensure at least a starting point
    VALID_QIDS.add(entity_id)
    relations = build_language_family_tree(entity_id, depth)
    unique_relations = list({(r[0], r[1], r[2]) for r in relations})
    formatted_relations = [{"language1": rel[0], "relationship": rel[1], "language2": rel[2]} for rel in unique_relations]
    return formatted_relations

# Example usage
family_tree = get_language_family("English", 2)
print(f"Entries: {len(family_tree)}")
print(family_tree[:10])  # preview first 10

Entries: 214
[{'language1': 'English language in Ukraine', 'relationship': 'Child of', 'language2': 'languages of Ukraine'}, {'language1': 'received pronunciation', 'relationship': 'Child of', 'language2': 'Standard English'}, {'language1': 'Gustavia English', 'relationship': 'Child of', 'language2': 'Caribbean English'}, {'language1': 'Palauan English', 'relationship': 'Child of', 'language2': 'English'}, {'language1': 'Jewish English varieties', 'relationship': 'Child of', 'language2': 'Jewish languages'}, {'language1': 'Quebec English', 'relationship': 'Child of', 'language2': 'Canadian English'}, {'language1': 'Zambian English', 'relationship': 'Child of', 'language2': 'English'}, {'language1': 'Belizean English', 'relationship': 'Child of', 'language2': 'Caribbean English'}, {'language1': 'English in the Netherlands', 'relationship': 'Child of', 'language2': 'English'}, {'language1': 'Pacific Northwest English', 'relationship': 'Child of', 'language2': 'North American English'}]


In [None]:
# (Reserved cell) – helper / scratch space.
# Visualization cells will follow below.


In [3]:
# Build a NetworkX graph from the family_tree list of dicts (language1, relationship, language2)
try:
    import networkx as nx
except ImportError:
    raise SystemExit("Please install networkx: pip install networkx")

from collections import defaultdict, deque

if 'family_tree' not in globals():
    raise RuntimeError("family_tree not defined – run the retrieval cell first.")

def build_graph(relations):
    G = nx.DiGraph()
    for rel in relations:
        l1 = rel["language1"]
        l2 = rel["language2"]
        label = rel["relationship"]
        # Edge direction: parent -> child (reverse of 'Child of')
        if label == "Child of":
            parent, child = l2, l1
        else:
            parent, child = l1, l2
        G.add_node(parent)
        G.add_node(child)
        G.add_edge(parent, child, relationship=label)
    return G

family_graph = build_graph(family_tree)
print(f"Nodes: {family_graph.number_of_nodes()}, Edges: {family_graph.number_of_edges()}")

# Derive depths (heuristic BFS from nodes with no predecessors)
roots = [n for n in family_graph.nodes() if family_graph.in_degree(n) == 0]
node_depth = {n: 0 for n in roots}
for r in roots:
    q = deque([(r, 0)])
    while q:
        node, d = q.popleft()
        for child in family_graph.successors(node):
            if child not in node_depth or d + 1 < node_depth[child]:
                node_depth[child] = d + 1
                q.append((child, d + 1))
print("Depth levels computed (sample):", list(node_depth.items())[:10])

Nodes: 193, Edges: 214
Depth levels computed (sample): [('languages of Ukraine', 0), ('Jewish languages', 0), ('languages of Sweden', 0), ('languages of the Falkland Islands', 0), ('languages of Europe', 0), ('languages of Switzerland', 0), ('languages of Puerto Rico', 0), ('languages of Lebanon', 0), ('Anglo-Frisian', 0), ('languages of Spain', 0)]


In [4]:
# Interactive visualization with PyVis (HTML)
try:
    from pyvis.network import Network
except ImportError:
    raise SystemExit("Please install pyvis: pip install pyvis")

if 'family_graph' not in globals():
    raise RuntimeError("family_graph not defined – run the graph build cell first.")

palette = [
    '#1f77b4', '#2ca02c', '#ff7f0e', '#9467bd', '#8c564b',
    '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
]

net = Network(height='750px', width='100%', directed=True, notebook=True, bgcolor='#ffffff', font_color='#222222')
net.toggle_physics(True)
net.set_options("""
var options = {
  physics: { stabilization: { iterations: 150 }, barnesHut: { gravitationalConstant: -4000 } },
  edges: { arrows: { to: { enabled: true } }, smooth: { type: 'dynamic' } }
}
""")

for node in family_graph.nodes():
    depth = globals().get('node_depth', {}).get(node, 0)
    color = palette[depth % len(palette)]
    net.add_node(node, label=node, color=color, level=depth)

for u, v, data in family_graph.edges(data=True):
    rel = data.get('relationship', '')
    net.add_edge(u, v, title=rel)

net.show('language_family_tree.html')
print("Generated language_family_tree.html (open in sidebar or a browser).")



JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

In [6]:
# ASCII tree fallback (choose roots heuristically: nodes with no incoming edges)
from collections import defaultdict

if 'family_graph' not in globals():
    raise RuntimeError("family_graph not defined – run the graph build cell first.")

incoming = defaultdict(int)
for u, v in family_graph.edges():
    incoming[v] += 1

roots = [n for n in family_graph.nodes() if incoming[n] == 0] or list(family_graph.nodes())[:1]

PRINT_LIMIT = 400  # avoid huge console spam
printed = 0

def print_tree(node, prefix="", visited=None):
    global printed
    if printed >= PRINT_LIMIT:
        return
    if visited is None:
        visited = set()
    depth = globals().get('node_depth', {}).get(node, 0)
    print(f"{prefix}{node} [d={depth}]")
    printed += 1
    if node in visited:
        print(prefix + "  (cycle)")
        return
    visited.add(node)
    children = list(family_graph.successors(node))
    for i, child in enumerate(children):
        if printed >= PRINT_LIMIT:
            print("... output truncated ...")
            return
        is_last = i == len(children) - 1
        connector = "└─ " if is_last else "├─ "
        print_tree(child, prefix + connector, visited)

for r in roots:
    if printed >= PRINT_LIMIT:
        break
    print_tree(r)
if printed >= PRINT_LIMIT:
    print(f"Truncated after {PRINT_LIMIT} nodes.")

languages of Ukraine [d=0]
└─ English language in Ukraine [d=1]
Jewish languages [d=0]
└─ Jewish English varieties [d=1]
languages of Sweden [d=0]
└─ English language in Sweden [d=1]
languages of the Falkland Islands [d=0]
└─ Falkland Islands English [d=1]
languages of Europe [d=0]
└─ English language in Europe [d=1]
languages of Switzerland [d=0]
└─ English language in Switzerland [d=1]
languages of Puerto Rico [d=0]
└─ English language in Puerto Rico [d=1]
languages of Lebanon [d=0]
└─ English language in Lebanon [d=1]
Anglo-Frisian [d=0]
└─ Anglic [d=1]
└─ ├─ Yola [d=2]
└─ ├─ Middle English [d=2]
└─ ├─ ├─ Kentish Dialect [d=3]
└─ ├─ ├─ Southern Dialect [d=3]
└─ ├─ ├─ Central Middle English [d=3]
└─ ├─ ├─ Southern Middle English [d=3]
└─ ├─ ├─ East Midland Dialect [d=3]
└─ ├─ ├─ Northern Middle English [d=3]
└─ ├─ ├─ West Midland Dialect [d=3]
└─ ├─ ├─ Late Middle English [d=3]
└─ ├─ ├─ Northern Dialect [d=3]
└─ ├─ ├─ Midland Middle English [d=3]
└─ ├─ ├─ Q1877420 [d=3]
└─ ├─ └─ Earl

In [16]:
from SPARQLWrapper import SPARQLWrapper, JSON

class WikidataLanguageValidator:
    def __init__(self):
        self.sparql = SPARQLWrapper("https://query.wikidata.org/sparql")
        self.valid_types = {
            "language": "wd:Q34770",          # language
            "language_family": "wd:Q25295", # language family
            "dialect": "wd:Q33384"  ,           # dialect
            "extinct_language":"wd:Q38058796",
            "dead_language":"wd:Q45762"
        }

    def validate_qid(self, qid):
        # SPARQL query to check instance of and subclasses for valid types
        query = f"""
        SELECT ?type ?typeLabel WHERE {{
          VALUES ?item {{ wd:{qid} }} 
          {{
            ?item wdt:P31 ?type.
          }} UNION {{
            ?item wdt:P279 ?type.
          }}
          VALUES ?validType {{ { ' '.join(self.valid_types.values())} }}
          FILTER(?type IN (?validType))
          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        LIMIT 1
        """
        
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)
        results = self.sparql.query().convert()

        if results["results"]["bindings"]:
            type_uri = results["results"]["bindings"][0]["type"]["value"]
            type_label = results["results"]["bindings"][0]["typeLabel"]["value"]

            for key, val in self.valid_types.items():
                if type_uri.endswith(val.split(":")[1]):
                    return True, key

        return False, None

# Example usage:
validator = WikidataLanguageValidator()
valid, classification = validator.validate_qid("Q56395")  # e.g., "English language" QID = Q1860
print(valid, classification)


True extinct_language


In [18]:
import requests

WIKIDATA_SPARQL_ENDPOINT = "https://query.wikidata.org/sparql"
HEADERS = {"User-Agent": "LanguageFamilyTreeBot/1.0 (https://example.com)"}

def get_distribution_map_image(qid):
    query = f"""
    SELECT ?image WHERE {{
      wd:{qid} wdt:P1846 ?image.
    }}
    """
    response = requests.get(WIKIDATA_SPARQL_ENDPOINT, headers=HEADERS, params={'query': query, 'format': 'json'})
    results = response.json().get('results', {}).get('bindings', [])
    if results:
        # The image URL usually is in the form of a Wikimedia Commons URL or file name
        image_url = results[0]['image']['value']
        return image_url
    else:
        return None

# Example usage:
qid = 'Q1860'  # English language QID
image = get_distribution_map_image(qid)
print(image)


http://commons.wikimedia.org/wiki/Special:FilePath/Anglospeak%20%28subnational%20version%29.svg


In [3]:
import requests
import json
import re
from typing import Dict, Any, Optional, List
from urllib.parse import quote

class LanguageInfoboxExtractor:
    """
    A robust class to extract language information from Wikipedia infoboxes
    using QID (Wikidata identifier) with multiple fallback methods.
    """
    
    def __init__(self):
        self.wikidata_api = "https://www.wikidata.org/w/api.php"
        self.wikipedia_api = "https://en.wikipedia.org/w/api.php"
        self.sparql_endpoint = "https://query.wikidata.org/sparql"
        
    def extract_language_info(self, qid: str) -> Dict[str, Any]:
        """
        Main function to extract language information from infobox using QID.
        
        Args:
            qid (str): Wikidata QID (e.g., 'Q1860' for English)
            
        Returns:
            Dict containing extracted language information
        """
        try:
            # Method 1: Try Wikidata API first (most reliable)
            wikidata_info = self._get_wikidata_info(qid)
            
            # Method 2: Get Wikipedia page and extract infobox
            wikipedia_info = self._get_wikipedia_infobox(qid)
            
            # Method 3: Use SPARQL for comprehensive data
            sparql_info = self._get_sparql_info(qid)
            
            # Merge all information with priority: SPARQL > Wikidata > Wikipedia
            merged_info = self._merge_info(sparql_info, wikidata_info, wikipedia_info)
            merged_info['qid'] = qid
            
            return merged_info
            
        except Exception as e:
            return {"error": f"Failed to extract information for {qid}: {str(e)}"}
    
    def _get_wikidata_info(self, qid: str) -> Dict[str, Any]:
        """Extract language information using Wikidata API."""
        try:
            params = {
                'action': 'wbgetentities',
                'format': 'json',
                'ids': qid,
                'props': 'labels|descriptions|claims|sitelinks',
                'languages': 'en',
                'sitefilter': 'enwiki'
            }
            
            response = requests.get(self.wikidata_api, params=params, timeout=10)
            data = response.json()
            
            if 'entities' not in data or qid not in data['entities']:
                return {}
                
            entity = data['entities'][qid]
            
            # Extract relevant language properties
            info = {
                'name': entity.get('labels', {}).get('en', {}).get('value', ''),
                'description': entity.get('descriptions', {}).get('en', {}).get('value', ''),
                'wikipedia_url': self._get_wikipedia_url(entity.get('sitelinks', {})),
            }
            
            # Extract claims for language-specific properties
            claims = entity.get('claims', {})
            
            # Map common language properties based on Template:Infobox language
            property_mapping = {
                'P31': 'instance_of',           # Instance of
                'P279': 'subclass_of',          # Subclass of  
                'P17': 'country',               # Country/states
                'P495': 'country_of_origin',    # Country of origin
                'P2341': 'indigenous_to',       # Indigenous to/region
                'P1098': 'speakers',            # Number of speakers
                'P1999': 'pronunciation',       # Pronunciation
                'P1705': 'native_name',         # Native name
                'P138': 'named_after',          # Named after
                'P361': 'part_of',              # Part of (language family)
                'P527': 'has_parts',            # Has parts (dialects)
                'P155': 'follows',              # Follows (early forms)
                'P156': 'followed_by',          # Followed by
                'P460': 'said_to_be_same_as',   # Said to be same as
                'P1412': 'languages_spoken',    # Languages spoken
                'P103': 'native_language',      # Native language
                'P282': 'writing_system',       # Writing system/script
                'P37': 'official_language_in',  # Official language in
                'P1885': 'minority_language_in', # Minority language in
                'P1018': 'language_regulatory_body', # Regulated by/agency
                'P219': 'iso_639_1',           # ISO 639-1 code
                'P218': 'iso_639_2',           # ISO 639-2 code  
                'P220': 'iso_639_3',           # ISO 639-3 code
                'P1394': 'linguist_list',      # Linguist List code
                'P1233': 'glottolog',          # Glottolog code
                'P3823': 'aiatsis',            # AIATSIS code
                'P1216': 'guthrie_code',       # Guthrie code
                'P3133': 'linguasphere',       # Linguasphere code
                'P5755': 'ietf_tag',           # IETF language tag
                'P625': 'coordinates',          # Geographic coordinates
                'P18': 'image',                 # Image
                'P242': 'locator_map',          # Locator map
            }
            
            for prop_id, prop_name in property_mapping.items():
                if prop_id in claims:
                    info[prop_name] = self._extract_claim_values(claims[prop_id])
            
            return info
            
        except Exception as e:
            print(f"Wikidata API error: {e}")
            return {}
    
    def _get_wikipedia_infobox(self, qid: str) -> Dict[str, Any]:
        """Extract infobox from Wikipedia page using QID."""
        try:
            # First get Wikipedia page title from QID
            page_title = self._get_wikipedia_title(qid)
            if not page_title:
                return {}
                
            # Get page content with infobox
            params = {
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'wikitext',
                'section': 0
            }
            
            response = requests.get(self.wikipedia_api, params=params, timeout=10)
            data = response.json()
            
            if 'parse' not in data or 'wikitext' not in data['parse']:
                return {}
                
            wikitext = data['parse']['wikitext']['*']
            
            # Extract infobox using regex
            infobox_data = self._parse_infobox(wikitext)
            
            return infobox_data
            
        except Exception as e:
            print(f"Wikipedia infobox error: {e}")
            return {}
    
    def _get_sparql_info(self, qid: str) -> Dict[str, Any]:
        """Get comprehensive language information using SPARQL."""
        try:
            query = f"""
            PREFIX wd: <http://www.wikidata.org/entity/>
            PREFIX wdt: <http://www.wikidata.org/prop/direct/>
            PREFIX wikibase: <http://wikiba.se/ontology#>
            PREFIX bd: <http://www.bigdata.com/rdf#>
            
            SELECT ?property ?propertyLabel ?value ?valueLabel WHERE {{
              wd:{qid} ?predicate ?value .
              ?property wikibase:directClaim ?predicate .
              FILTER(?predicate != wdt:P31 || ?value != wd:Q34770)
              SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            }}
            LIMIT 200
            """
            
            headers = {
                'Accept': 'application/sparql-results+json',
                'User-Agent': 'Language-Infobox-Extractor/1.0'
            }
            
            response = requests.get(
                self.sparql_endpoint,
                params={'query': query},
                headers=headers,
                timeout=15
            )
            
            data = response.json()
            
            if 'results' not in data or 'bindings' not in data['results']:
                return {}
                
            sparql_info = {}
            for binding in data['results']['bindings']:
                prop_label = binding.get('propertyLabel', {}).get('value', '')
                value_label = binding.get('valueLabel', {}).get('value', '')
                
                if prop_label and value_label:
                    if prop_label not in sparql_info:
                        sparql_info[prop_label] = []
                    if value_label not in sparql_info[prop_label]:
                        sparql_info[prop_label].append(value_label)
            
            return sparql_info
            
        except Exception as e:
            print(f"SPARQL error: {e}")
            return {}
    
    def _get_wikipedia_url(self, sitelinks: Dict) -> str:
        """Extract Wikipedia URL from sitelinks."""
        if 'enwiki' in sitelinks:
            title = sitelinks['enwiki']['title']
            return f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
        return ""
    
    def _get_wikipedia_title(self, qid: str) -> Optional[str]:
        """Get Wikipedia page title from QID."""
        try:
            params = {
                'action': 'wbgetentities',
                'format': 'json',
                'ids': qid,
                'props': 'sitelinks',
                'sitefilter': 'enwiki'
            }
            
            response = requests.get(self.wikidata_api, params=params, timeout=10)
            data = response.json()
            
            if ('entities' in data and qid in data['entities'] and 
                'sitelinks' in data['entities'][qid] and 
                'enwiki' in data['entities'][qid]['sitelinks']):
                return data['entities'][qid]['sitelinks']['enwiki']['title']
                
        except Exception:
            pass
        return None
    
    def _extract_claim_values(self, claims: List) -> List[str]:
        """Extract values from Wikidata claims."""
        values = []
        for claim in claims:
            try:
                if 'mainsnak' in claim and 'datavalue' in claim['mainsnak']:
                    datavalue = claim['mainsnak']['datavalue']
                    if datavalue['type'] == 'wikibase-entityid':
                        # For entity references, we'd need another API call to get labels
                        entity_id = datavalue['value']['id']
                        values.append(f"wd:{entity_id}")
                    elif datavalue['type'] == 'string':
                        values.append(datavalue['value'])
                    elif datavalue['type'] == 'quantity':
                        values.append(str(datavalue['value']['amount']))
                    elif datavalue['type'] == 'time':
                        values.append(datavalue['value']['time'])
            except Exception:
                continue
        return values
    
    def _parse_infobox(self, wikitext: str) -> Dict[str, Any]:
        """Parse infobox from Wikipedia wikitext."""
        infobox_info = {}
        
        # Look for infobox language template
        infobox_pattern = r'{{\\s*[Ii]nfobox\\s+[Ll]anguage[^}]*?}}'
        match = re.search(infobox_pattern, wikitext, re.DOTALL)
        
        if not match:
            return {}
            
        infobox_text = match.group(0)
        
        # Extract parameters using more robust regex
        lines = infobox_text.split('\n')
        current_param = None
        current_value = ""
        
        for line in lines[1:-1]:  # Skip first {{ and last }}
            line = line.strip()
            if line.startswith('|'):
                # Save previous parameter
                if current_param:
                    infobox_info[current_param.lower().strip()] = current_value.strip()
                
                # Start new parameter
                if '=' in line:
                    param, value = line[1:].split('=', 1)
                    current_param = param.strip()
                    current_value = value.strip()
                else:
                    current_param = line[1:].strip()
                    current_value = ""
            else:
                # Continue current parameter value
                if current_param:
                    current_value += " " + line
        
        # Save last parameter
        if current_param:
            infobox_info[current_param.lower().strip()] = current_value.strip()
        
        # Clean up values
        for key, value in infobox_info.items():
            # Remove templates and links
            value = re.sub(r'{{[^}]*}}', '', value)
            value = re.sub(r'\\[\\[[^\\]]*\\]\\]', '', value)
            value = re.sub(r'\\s+', ' ', value)
            infobox_info[key] = value.strip()
        
        return infobox_info
    
    def _merge_info(self, sparql_info: Dict, wikidata_info: Dict, wikipedia_info: Dict) -> Dict[str, Any]:
        """Merge information from all sources with structured output."""
        merged = {
            'extraction_methods': [],
            'basic_info': {},
            'language_family': {},
            'speakers_info': {},
            'geographic_info': {},
            'language_codes': {},
            'writing_system': {},
            'official_status': {},
            'dialects': {},
            'infobox_fields': {},
            'all_properties': {}
        }
        
        # Track which methods provided data
        if sparql_info:
            merged['extraction_methods'].append('SPARQL')
        if wikidata_info:
            merged['extraction_methods'].append('Wikidata API')
        if wikipedia_info:
            merged['extraction_methods'].append('Wikipedia infobox')
        
        # Merge basic information
        if wikidata_info:
            merged['basic_info'] = {
                'name': wikidata_info.get('name', ''),
                'native_name': wikidata_info.get('native_name', []),
                'description': wikidata_info.get('description', ''),
                'wikipedia_url': wikidata_info.get('wikipedia_url', '')
            }
            
            # Categorize information
            merged['language_family'] = {
                'part_of': wikidata_info.get('part_of', []),
                'has_parts': wikidata_info.get('has_parts', [])
            }
            
            merged['speakers_info'] = {
                'speakers': wikidata_info.get('speakers', [])
            }
            
            merged['geographic_info'] = {
                'country': wikidata_info.get('country', []),
                'indigenous_to': wikidata_info.get('indigenous_to', []),
                'coordinates': wikidata_info.get('coordinates', [])
            }
            
            merged['language_codes'] = {
                'iso_639_1': wikidata_info.get('iso_639_1', []),
                'iso_639_2': wikidata_info.get('iso_639_2', []),
                'iso_639_3': wikidata_info.get('iso_639_3', []),
                'glottolog': wikidata_info.get('glottolog', []),
                'linguist_list': wikidata_info.get('linguist_list', []),
                'linguasphere': wikidata_info.get('linguasphere', []),
                'ietf_tag': wikidata_info.get('ietf_tag', [])
            }
            
            merged['writing_system'] = {
                'script': wikidata_info.get('writing_system', [])
            }
            
            merged['official_status'] = {
                'official_in': wikidata_info.get('official_language_in', []),
                'minority_in': wikidata_info.get('minority_language_in', []),
                'regulated_by': wikidata_info.get('language_regulatory_body', [])
            }
        
        # Add Wikipedia infobox raw data
        if wikipedia_info:
            merged['infobox_fields'] = wikipedia_info
        
        # Add all SPARQL properties
        if sparql_info:
            merged['all_properties'] = sparql_info
        
        return merged

# Convenience function for easy usage
def extract_language_info_by_qid(qid: str) -> Dict[str, Any]:
    """
    Extract language information from Wikipedia infobox using QID.
    
    Args:
        qid (str): Wikidata QID (e.g., 'Q1860' for English, 'Q150' for French)
        
    Returns:
        Dict containing comprehensive language information
        
    Example:
        >>> info = extract_language_info_by_qid('Q1860')  # English
        >>> print(info['basic_info']['name'])
        'English'
        >>> print(info['language_codes']['iso_639_1'])
        ['en']
    """
    extractor = LanguageInfoboxExtractor()
    return extractor.extract_language_info(qid)

# Helper function to get QID from language name
def get_qid_from_language_name(language_name: str) -> Optional[str]:
    """
    Get QID from language name using Wikidata search.
    
    Args:
        language_name (str): Name of the language (e.g., 'English', 'French')
        
    Returns:
        QID string if found, None otherwise
    """
    try:
        search_url = "https://www.wikidata.org/w/api.php"
        params = {
            'action': 'wbsearchentities',
            'format': 'json',
            'language': 'en',
            'type': 'item',
            'search': language_name,
            'limit': 5
        }
        
        response = requests.get(search_url, params=params, timeout=10)
        data = response.json()
        
        if 'search' in data and data['search']:
            # Look for items that are instances of language or natural language
            for item in data['search']:
                description = item.get('description', '').lower()
                if any(keyword in description for keyword in ['language', 'linguistic']):
                    return item['id']
                    
        return None
        
    except Exception as e:
        print(f"Error searching for QID: {e}")
        return None

# Testing function
def test_extraction():
    """Test the extraction with common language QIDs."""
    test_cases = {
        'Q1860': 'English',
        'Q150': 'French', 
        'Q188': 'German',
        'Q1321': 'Spanish',
        'Q7737': 'Russian',
        'Q9058': 'Chinese',
        'Q5287': 'Japanese'
    }
    
    results = {}
    for qid, lang_name in test_cases.items():
        print(f"Testing {lang_name} ({qid})...")
        try:
            info = extract_language_info_by_qid(qid)
            if 'error' not in info:
                print(f"✓ Successfully extracted data for {lang_name}")
                print(f"  Methods used: {', '.join(info.get('extraction_methods', []))}")
                print(f"  Basic info: {info.get('basic_info', {}).get('name', 'Unknown')}")
            else:
                print(f"✗ Error for {lang_name}: {info['error']}")
            results[qid] = info
        except Exception as e:
            print(f"✗ Failed to extract {lang_name}: {e}")
            results[qid] = {"error": str(e)}
    
    return results

if __name__ == "__main__":
    # Example usage
    print("Language Infobox Extractor")
    print("=" * 50)
    
    # Test with English
    qid = "Q1860"  # English
    print(f"\nExtracting information for QID: {qid}")
    
    try:
        language_info = extract_language_info_by_qid(qid)
        print(json.dumps(language_info, indent=2, ensure_ascii=False))
    except Exception as e:
        print(f"Error: {e}")
    
    # Optional: Run tests for multiple languages
    # print("\n" + "=" * 50)
    # print("Running tests for multiple languages...")
    # test_results = test_extraction()

Language Infobox Extractor

Extracting information for QID: Q1860
Wikidata API error: Expecting value: line 1 column 1 (char 0)
{
  "extraction_methods": [
    "SPARQL"
  ],
  "basic_info": {},
  "language_family": {},
  "speakers_info": {},
  "geographic_info": {},
  "language_codes": {},
  "writing_system": {},
  "official_status": {},
  "dialects": {},
  "infobox_fields": {},
  "all_properties": {
    "country": [
      "Samoa",
      "Solomon Islands",
      "Vanuatu",
      "Papua New Guinea",
      "Palau",
      "Nauru",
      "Federated States of Micronesia",
      "Marshall Islands",
      "Kiribati",
      "Fiji",
      "Guyana",
      "Trinidad and Tobago",
      "Saint Vincent and the Grenadines",
      "Saint Lucia",
      "Saint Kitts and Nevis",
      "Jamaica",
      "Grenada",
      "The Bahamas",
      "Antigua and Barbuda",
      "Dominica",
      "Jersey",
      "Israel",
      "Lebanon",
      "Maldives",
      "Malaysia",
      "Oman",
      "Pakistan",
      "Sri

In [20]:
import requests
import re
from typing import Dict, Any, Optional
import json

class WikipediaInfoboxExtractor:
    """
    A focused class to extract language information specifically from Wikipedia infoboxes.
    Uses only Wikipedia API and infobox parsing - no Wikidata or SPARQL.
    """
    
    def __init__(self):
        self.wikipedia_api = "https://en.wikipedia.org/w/api.php"
        
    def extract_infobox_by_qid(self, qid: str) -> Dict[str, Any]:
        """
        Extract infobox data using Wikidata QID to find the Wikipedia page.
        
        Args:
            qid (str): Wikidata QID (e.g., 'Q1860' for English)
            
        Returns:
            Dict containing parsed infobox fields
        """
        try:
            # Get Wikipedia page title from QID
            page_title = self._get_wikipedia_title_from_qid(qid)
            if not page_title:
                return {"error": f"No Wikipedia page found for QID: {qid}"}
            
            return self.extract_infobox_by_title(page_title)
            
        except Exception as e:
            return {"error": f"Failed to extract infobox for {qid}: {str(e)}"}
    
    def extract_infobox_by_title(self, page_title: str) -> Dict[str, Any]:
        """
        Extract infobox data from Wikipedia page by title.
        
        Args:
            page_title (str): Wikipedia page title (e.g., 'English language')
            
        Returns:
            Dict containing parsed infobox fields
        """
        try:
            # Get page wikitext
            wikitext = self._get_page_wikitext(page_title)
            if not wikitext:
                return {"error": f"Could not retrieve page content for: {page_title}"}
            
            # Extract and parse infobox
            infobox_data = self._parse_language_infobox(wikitext)
            
            if not infobox_data:
                return {"error": f"No language infobox found on page: {page_title}"}
            
            # Add metadata
            result = {
                "page_title": page_title,
                "wikipedia_url": f"https://en.wikipedia.org/wiki/{page_title.replace(' ', '_')}",
                "infobox_type": "language",
                "fields": infobox_data,
                "field_count": len(infobox_data)
            }
            
            return result
            
        except Exception as e:
            return {"error": f"Failed to extract infobox from {page_title}: {str(e)}"}
    
    def _get_wikipedia_title_from_qid(self, qid: str) -> Optional[str]:
        """Get Wikipedia page title from Wikidata QID."""
        try:
            wikidata_api = "https://www.wikidata.org/w/api.php"
            params = {
                'action': 'wbgetentities',
                'format': 'json',
                'ids': qid,
                'props': 'sitelinks',
                'sitefilter': 'enwiki'
            }
            
            response = requests.get(wikidata_api, params=params, timeout=10)
            data = response.json()
            
            if ('entities' in data and qid in data['entities'] and 
                'sitelinks' in data['entities'][qid] and 
                'enwiki' in data['entities'][qid]['sitelinks']):
                return data['entities'][qid]['sitelinks']['enwiki']['title']
                
        except Exception:
            pass
        return None
    
    def _get_page_wikitext(self, page_title: str) -> Optional[str]:
        """Get raw wikitext from Wikipedia page."""
        try:
            params = {
                'action': 'parse',
                'format': 'json',
                'page': page_title,
                'prop': 'wikitext',
                'section': 0  # Only get the first section where infobox usually is
            }
            HEADERS = {"User-Agent": "LanguageFamilyTreeService/1.0 (https://example.com)", "Accept": "application/json"}
    
            response = requests.get(self.wikipedia_api, params=params, timeout=10)
            data = response.json()
            
            if 'parse' in data and 'wikitext' in data['parse']:
                return data['parse']['wikitext']['*']
                
        except Exception as e:
            print(f"Error getting wikitext: {e}")
            
        return None
    
    def _parse_language_infobox(self, wikitext: str) -> Dict[str, str]:
        """
        Parse Template:Infobox language from wikitext.
        
        Returns:
            Dictionary with infobox field names as keys and values as strings
        """
        # Look for infobox language template - case insensitive
        infobox_pattern = r'{{\\s*[Ii]nfobox\\s+[Ll]anguage\\s*([^{}]*(?:{[^{}]*}[^{}]*)*)}}'
        
        match = re.search(infobox_pattern, wikitext, re.DOTALL)
        if not match:
            return {}
        
        infobox_content = match.group(1)
        
        # Parse parameters using a more robust approach
        fields = {}
        
        # Split by | but be careful about nested templates and links
        parts = self._split_infobox_params(infobox_content)
        
        for part in parts:
            part = part.strip()
            if '=' in part:
                key, value = part.split('=', 1)
                key = key.strip()
                value = value.strip()
                
                if key and value:
                    # Clean the value
                    cleaned_value = self._clean_infobox_value(value)
                    if cleaned_value:  # Only add non-empty values
                        fields[key] = cleaned_value
        
        return fields
    
    def _split_infobox_params(self, content: str) -> list:
        """
        Split infobox content by | while respecting nested templates and links.
        """
        parts = []
        current_part = ""
        brace_depth = 0
        bracket_depth = 0
        
        i = 0
        while i < len(content):
            char = content[i]
            
            if char == '{':
                if i + 1 < len(content) and content[i + 1] == '{':
                    brace_depth += 1
                    current_part += '{{'
                    i += 2
                    continue
            elif char == '}':
                if i + 1 < len(content) and content[i + 1] == '}':
                    brace_depth -= 1
                    current_part += '}}'
                    i += 2
                    continue
            elif char == '[':
                if i + 1 < len(content) and content[i + 1] == '[':
                    bracket_depth += 1
                    current_part += '[['
                    i += 2
                    continue
            elif char == ']':
                if i + 1 < len(content) and content[i + 1] == ']':
                    bracket_depth -= 1
                    current_part += ']]'
                    i += 2
                    continue
            elif char == '|' and brace_depth == 0 and bracket_depth == 0:
                if current_part.strip():
                    parts.append(current_part.strip())
                current_part = ""
                i += 1
                continue
            
            current_part += char
            i += 1
        
        # Add the last part
        if current_part.strip():
            parts.append(current_part.strip())
        
        return parts
    
    def _clean_infobox_value(self, value: str) -> str:
        """
        Clean infobox value by removing wiki markup but preserving content.
        """
        # Remove comments
        value = re.sub(r'<!--.*?-->', '', value, flags=re.DOTALL)
        
        # Handle templates - remove simple ones, extract content from others
        def replace_template(match):
            template_content = match.group(1)
            # For simple formatting templates, try to extract useful content
            if '|' in template_content:
                parts = template_content.split('|')
                # Return the last part which is usually the display text
                return parts[-1].strip()
            return ''
        
        value = re.sub(r'{{([^{}]+)}}', replace_template, value)
        
        # Handle links - extract display text or target
        def replace_link(match):
            link_content = match.group(1)
            if '|' in link_content:
                # [[target|display text]] -> display text
                return link_content.split('|')[-1].strip()
            else:
                # [[target]] -> target
                return link_content.strip()
        
        value = re.sub(r'\\[\\[([^\\]]+)\\]\\]', replace_link, value)
        
        # Remove remaining markup
        value = re.sub(r"'{2,}", '', value)  # Remove bold/italic markup
        value = re.sub(r'<[^>]+>', '', value)  # Remove HTML tags
        value = re.sub(r'\\n+', ' ', value)  # Replace newlines with spaces
        value = re.sub(r'\\s+', ' ', value)  # Normalize whitespace
        
        return value.strip()
    
    def get_infobox_template_fields(self) -> Dict[str, str]:
        """
        Return a mapping of common Template:Infobox language fields and their descriptions.
        Based on the official documentation.
        """
        return {
            # Basic information
            "name": "Language name",
            "altname": "Alternative name",
            "nativename": "Native name",
            "pronunciation": "Pronunciation in IPA",
            "acceptance": "Questionable acceptance/status",
            
            # Geographic and demographic
            "states": "Countries where mainly spoken",
            "state": "Alias for states", 
            "region": "Geographic regions",
            "ethnicity": "Associated ethnic groups",
            "speakers": "Number of native speakers",
            "date": "Date of speaker estimate",
            "dateprefix": "Text before date",
            "ref": "Reference for speaker data",
            "speakers2": "Second line of speaker data",
            "era": "Era of use (historical languages)",
            "extinct": "Date/info about extinction",
            "revived": "Revival information",
            
            # Language classification
            "familycolor": "Language family color code",
            "family": "Language family description",
            "fam1": "Broadest language family",
            "fam2": "More specific subfamily", 
            "fam3": "Even more specific group",
            "fam4": "Fourth level classification",
            "fam5": "Fifth level classification",
            "fam6": "Sixth level classification",
            "fam7": "Seventh level classification",
            "fam8": "Eighth level classification",
            "fam9": "Ninth level classification",
            "fam10": "Tenth level classification",
            "fam11": "Eleventh level classification",
            "fam12": "Twelfth level classification",
            "fam13": "Thirteenth level classification",
            "fam14": "Fourteenth level classification",
            "fam15": "Fifteenth level classification",
            "ancestor": "Ancestral form",
            "ancestor2": "Second ancestral form",
            "ancestor3": "Third ancestral form",
            "ancestor4": "Fourth ancestral form",
            "ancestor5": "Fifth ancestral form",
            "ancestor6": "Sixth ancestral form",
            "ancestor7": "Seventh ancestral form",
            "ancestor8": "Eighth ancestral form",
            "protoname": "Proto-language name",
            
            # Dialects and standards
            "dialects": "Dialect information",
            "dia1": "First dialect",
            "dia2": "Second dialect",
            "dia3": "Third dialect",
            "dia4": "Fourth dialect",
            "dia5": "Fifth dialect",
            "listclass": "List class for dialects",
            "standards": "Standard forms",
            "stand1": "First standard",
            "stand2": "Second standard",
            "stand3": "Third standard",
            "stand4": "Fourth standard",
            "stand5": "Fifth standard",
            "stand6": "Sixth standard",
            
            # Writing and communication
            "script": "Writing system",
            "sign": "Sign language forms",
            "posteriori": "A posteriori sources (conlangs)",
            
            # Official status
            "nation": "Countries where official",
            "minority": "Countries where minority language",
            "agency": "Regulatory body",
            "development_body": "Development organization",
            
            # Language codes
            "iso1": "ISO 639-1 code",
            "iso1comment": "ISO 639-1 comment",
            "iso2": "ISO 639-2 code",
            "iso2b": "ISO 639-2 bibliographic code",
            "iso2t": "ISO 639-2 terminological code",
            "iso2comment": "ISO 639-2 comment",
            "iso3": "ISO 639-3 code",
            "iso3comment": "ISO 639-3 comment",
            "iso6": "ISO 639-6 code",
            "isoexception": "ISO exception type",
            "lc1": "First dialect ISO code",
            "ld1": "First dialect name",
            "lc2": "Second dialect ISO code",
            "ld2": "Second dialect name",
            "linglist": "Linguist List code",
            "lingname": "Linguist List name",
            "linglist2": "Second Linguist List code",
            "lingname2": "Second Linguist List name",
            "glotto": "Glottolog code",
            "glottorefname": "Glottolog reference name",
            "glotto2": "Second Glottolog code",
            "glottorefname2": "Second Glottolog reference name",
            "aiatsis": "AIATSIS code",
            "aiatsisname": "AIATSIS name",
            "aiatsis2": "Second AIATSIS code",
            "aiatsisname2": "Second AIATSIS name",
            "guthrie": "Guthrie code (Bantu)",
            "ELP": "Endangered Languages Project",
            "ELPname": "ELP name",
            "ELP2": "Second ELP link",
            "ELPname2": "Second ELP name",
            "glottopedia": "Glottopedia code",
            "lingua": "Linguasphere code",
            "lingua_ref": "Linguasphere reference",
            "ietf": "IETF language tag",
            
            # Media
            "image": "Image file name",
            "imagescale": "Image scale",
            "imagealt": "Image alt text",
            "imagecaption": "Image caption",
            "imageheader": "Image header",
            "map": "Map file name",
            "mapscale": "Map scale",
            "mapalt": "Map alt text",
            "mapcaption": "Map caption",
            "map2": "Second map",
            "mapalt2": "Second map alt text",
            "mapcaption2": "Second map caption",
            "pushpin_map": "Pushpin map",
            "pushpin_image": "Pushpin image",
            "pushpin_map_alt": "Pushpin map alt text",
            "pushpin_map_caption": "Pushpin map caption",
            "pushpin_mapsize": "Pushpin map size",
            "pushpin_label": "Pushpin label",
            "pushpin_label_position": "Pushpin label position",
            "coordinates": "Geographic coordinates",
            
            # Constructed languages
            "creator": "Language creator",
            "created": "Year created",
            "setting": "Usage setting",
            
            # Speaker information
            "speakers_label": "Speaker label override",
            "refname": "Reference name",
            
            # Formatting
            "boxsize": "Infobox width override",
            "fontcolor": "Font color override",
            "module": "Embedded module",
            "notice": "Footer notice"
        }

# Convenience functions for easy usage

def extract_infobox_by_qid(qid: str) -> Dict[str, Any]:
    """
    Extract Wikipedia language infobox using QID.
    
    Args:
        qid (str): Wikidata QID (e.g., 'Q1860' for English)
        
    Returns:
        Dict with infobox fields and metadata
        
    Example:
        >>> result = extract_infobox_by_qid('Q1860')
        >>> print(result['fields']['name'])  # 'English'
        >>> print(result['fields']['iso1'])  # 'en'
    """
    extractor = WikipediaInfoboxExtractor()
    return extractor.extract_infobox_by_qid(qid)

def extract_infobox_by_title(page_title: str) -> Dict[str, Any]:
    """
    Extract Wikipedia language infobox by page title.
    
    Args:
        page_title (str): Wikipedia page title (e.g., 'English language')
        
    Returns:
        Dict with infobox fields and metadata
        
    Example:
        >>> result = extract_infobox_by_title('French language')
        >>> print(result['fields']['name'])  # 'French'
        >>> print(result['fields']['nativename'])  # 'français'
    """
    extractor = WikipediaInfoboxExtractor()
    return extractor.extract_infobox_by_title(page_title)

def get_supported_fields() -> Dict[str, str]:
    """
    Get list of supported infobox fields with descriptions.
    
    Returns:
        Dict mapping field names to descriptions
        
    Example:
        >>> fields = get_supported_fields()
        >>> print(fields['name'])  # 'Language name'
        >>> print(fields['speakers'])  # 'Number of native speakers'
    """
    extractor = WikipediaInfoboxExtractor()
    return extractor.get_infobox_template_fields()

def search_language_pages(search_term: str, limit: int = 10) -> list:
    """
    Search for language pages on Wikipedia.
    
    Args:
        search_term (str): Search term (e.g., 'French language')
        limit (int): Maximum number of results
        
    Returns:
        List of page titles
    """
    try:
        api_url = "https://en.wikipedia.org/w/api.php"
        params = {
            'action': 'opensearch',
            'format': 'json',
            'search': search_term + ' language',
            'limit': limit,
            'namespace': 0,
            'suggest': True
        }
        
        response = requests.get(api_url, params=params, timeout=10)
        data = response.json()
        
        if len(data) >= 2:
            return data[1]  # Return the list of titles
            
    except Exception as e:
        print(f"Search error: {e}")
    
    return []

# Testing function
def test_infobox_extraction():
    """Test infobox extraction with various languages."""
    test_cases = [
        ('Q1860', 'English'),
        ('Q150', 'French'),
        ('Q188', 'German'),
        ('Q1321', 'Spanish'),
        ('Q7737', 'Russian')
    ]
    
    results = {}
    for qid, lang_name in test_cases:
        print(f"\\nTesting {lang_name} ({qid})...")
        try:
            result = extract_infobox_by_qid(qid)
            if 'error' not in result:
                field_count = result.get('field_count', 0)
                print(f"✓ Extracted {field_count} infobox fields")
                if 'fields' in result and 'name' in result['fields']:
                    print(f"  Language name: {result['fields']['name']}")
                if 'fields' in result and 'speakers' in result['fields']:
                    print(f"  Speakers: {result['fields']['speakers']}")
            else:
                print(f"✗ Error: {result['error']}")
            results[qid] = result
        except Exception as e:
            print(f"✗ Exception: {e}")
            results[qid] = {"error": str(e)}
    
    return results

if __name__ == "__main__":
    print("Wikipedia Language Infobox Extractor")
    print("=" * 50)
    
    # Example: Extract English language infobox
    print("\\nExtracting English language infobox (Q1860)...")
    result = extract_infobox_by_qid('Q1860')
    
    if 'error' not in result:
        print(f"✓ Successfully extracted infobox from: {result['page_title']}")
        print(f"✓ Found {result['field_count']} fields")
        print("\\nSample fields:")
        for key, value in list(result['fields'].items())[:8]:
            # Truncate long values for display
            display_value = value[:100] + "..." if len(value) > 100 else value
            print(f"  {key}: {display_value}")
        
        print(f"\\n✓ Wikipedia URL: {result['wikipedia_url']}")
    else:
        print(f"✗ Error: {result['error']}")
    
    # Show some supported fields
    print("\\n" + "=" * 50)
    print("Some supported Template:Infobox language fields:")
    fields = get_supported_fields()
    important_fields = ['name', 'nativename', 'speakers', 'family', 'script', 'iso1', 'iso3']
    for field in important_fields:
        if field in fields:
            print(f"  {field}: {fields[field]}")
    print(f"\\n... and {len(fields) - len(important_fields)} more fields supported")

Wikipedia Language Infobox Extractor
\nExtracting English language infobox (Q1860)...
✗ Error: No Wikipedia page found for QID: Q1860
Some supported Template:Infobox language fields:
  name: Language name
  nativename: Native name
  speakers: Number of native speakers
  family: Language family description
  script: Writing system
  iso1: ISO 639-1 code
  iso3: ISO 639-3 code
\n... and 121 more fields supported
