In [4]:
import requests
from bs4 import BeautifulSoup

LANGUAGE = "Spanish_language"
url = f"https://en.wikipedia.org/wiki/{LANGUAGE}"
headers = {"User-Agent": "Mozilla/5.0 (compatible; MyPythonApp/1.0)"}

res = requests.get(url, headers=headers)
soup = BeautifulSoup(res.text, "html.parser")

# Find infobox
infobox = soup.find("table", {"class": "infobox"})
rows = infobox.find_all("tr")

language_family = []
for row in rows:
    if row.th and "Language family" in row.th.get_text():
        # Extract nested hierarchy
        items = row.td.find_all(["a", "b"])
        language_family = [i.get_text(strip=True) for i in items]
        break

print("Language Family Hierarchy (from HTML):", " → ".join(language_family))


Language Family Hierarchy (from HTML): Indo-European → Italic → Latino-Faliscan → Latin → Romance → Italo-Western → Western Romance → Ibero-Romance → West Iberian → Castilian → [2] → [3] → Spanish


In [11]:
import re

def clean_language_family_text(text):
    """
    Clean language family text by removing various edge cases:
    - Reference numbers like [1], [2], [citation needed]
    - Extra whitespace and newlines
    - Common Wikipedia artifacts
    """
    if not text:
        return ""
    
    # Remove reference citations like [1], [2], [citation needed], [dubious], etc.
    text = re.sub(r'\[[\d\w\s\-,;:]+\]', '', text)
    
    # Remove common Wikipedia notation artifacts
    text = re.sub(r'\(disputed\)', '', text)
    text = re.sub(r'\(questionable\)', '', text)
    text = re.sub(r'\(uncertain\)', '', text)
    text = re.sub(r'\s*\?\s*', '', text)  # Remove question marks
    
    # Clean up whitespace
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single space
    text = text.strip()
    
    return text

def extract_language_family_improved(language_name):
    """
    Extract language family hierarchy from Wikipedia with improved edge case handling
    """
    url = f"https://en.wikipedia.org/wiki/{language_name}"
    headers = {"User-Agent": "Mozilla/5.0 (compatible; MyPythonApp/1.0)"}
    
    try:
        res = requests.get(url, headers=headers)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")
        
        # Find infobox
        infobox = soup.find("table", {"class": "infobox"})
        if not infobox:
            return []
        
        rows = infobox.find_all("tr")
        
        for row in rows:
            # Look for language family row
            if row.th and "Language family" in row.th.get_text():
                td = row.td
                if not td:
                    continue
                
                # Method 1: Try to extract from nested structure (ul/li)
                language_family = []
                lists = td.find_all(['ul', 'ol'])
                
                if lists:
                    # Extract from nested lists
                    for lst in lists:
                        items = lst.find_all('li')
                        for item in items:
                            # Get text from links or bold text
                            links = item.find_all('a')
                            if links:
                                for link in links:
                                    text = clean_language_family_text(link.get_text())
                                    if text and text not in language_family:
                                        language_family.append(text)
                            else:
                                # Fallback to getting all text
                                text = clean_language_family_text(item.get_text())
                                if text and text not in language_family:
                                    language_family.append(text)
                else:
                    # Method 2: Extract from direct links and bold text
                    elements = td.find_all(['a', 'b', 'strong'])
                    for element in elements:
                        text = clean_language_family_text(element.get_text())
                        if text and text not in language_family:
                            language_family.append(text)
                
                # Method 3: Fallback - parse the entire text and split
                if not language_family:
                    full_text = clean_language_family_text(td.get_text())
                    # Try to split by common separators
                    parts = re.split(r'[→>]|\s+', full_text)
                    language_family = [part.strip() for part in parts if part.strip()]
                
                # Final cleanup - remove empty strings and duplicates while preserving order
                cleaned_family = []
                for item in language_family:
                    if item and item not in cleaned_family:
                        cleaned_family.append(item)
                
                return cleaned_family
                
    except Exception as e:
        print(f"Error extracting language family for {language_name}: {e}")
        return []
    
    return []

# Test with Spanish
LANGUAGE = "Latino-Faliscan_language"
language_family_improved = extract_language_family_improved(LANGUAGE)
print("Improved Language Family Hierarchy:", " → ".join(language_family_improved))
print("Number of levels:", len(language_family_improved))

Error extracting language family for Latino-Faliscan_language: 404 Client Error: Not Found for url: https://en.wikipedia.org/wiki/Latino-Faliscan_language
Improved Language Family Hierarchy: 
Number of levels: 0
