In [6]:
# Let's create the pipeline without SPARQLWrapper first, then show how to install it
# This version will use direct HTTP requests to the SPARQL endpoint

import requests
import json
import time
import re
from typing import Dict, List, Optional, Tuple
import pandas as pd
import urllib.parse

class LanguageRelationshipExtractor:
    """
    A comprehensive pipeline for extracting language relationships from Wikipedia/Wikidata.
    Supports genetic descent edges, dialect edges, and sibling relationships.
    """
    
    def __init__(self):
        # Initialize endpoints and headers
        self.wikidata_api = "https://www.wikidata.org/w/api.php"
        self.wikipedia_api = "https://en.wikipedia.org/w/api.php"
        self.sparql_endpoint = "https://query.wikidata.org/sparql"
        
        # Common headers for API requests
        self.headers = {
            'User-Agent': 'LanguageRelationshipExtractor/1.0 (https://example.com/contact)',
            'Accept': 'application/json'
        }
        
        # Key Wikidata properties for language relationships
        self.language_properties = {
            'instance_of': 'P31',           # Instance of
            'subclass_of': 'P279',          # Subclass of  
            'language_family': 'P220',      # Language family
            'parent_language': 'P155',      # Follows (parent language)
            'child_language': 'P156',       # Followed by (child language)
            'dialect_of': 'P629',           # Dialect of
            'has_dialect': 'P2341',         # Has dialect
            'related_language': 'P2596',    # Related language
            'writing_system': 'P282',       # Writing system used
            'iso_639_1': 'P218',           # ISO 639-1 code
            'iso_639_2': 'P219',           # ISO 639-2 code
            'iso_639_3': 'P220',           # ISO 639-3 code (Note: this overwrites language_family, we'll fix this)
            'glottolog_id': 'P1394',       # Glottolog ID
        }
        
        # Fix the ISO code mapping
        self.language_properties['iso_639_3'] = 'P220'
        self.language_properties['language_family'] = 'P25295'  # Corrected
        
        # Language-related classes in Wikidata
        self.language_classes = {
            'language': 'Q34770',
            'natural_language': 'Q33742',
            'constructed_language': 'Q33215',
            'dead_language': 'Q45762',
            'extinct_language': 'Q45762',
            'language_family': 'Q25295',
            'language_isolate': 'Q34770',
            'dialect': 'Q33384',
            'variety_of_language': 'Q33384'
        }
    
    def execute_sparql_query(self, query: str) -> Dict:
        """
        Execute SPARQL query using direct HTTP requests.
        """
        try:
            params = {
                'query': query,
                'format': 'json'
            }
            
            response = requests.get(
                self.sparql_endpoint, 
                params=params, 
                headers=self.headers,
                timeout=30
            )
            response.raise_for_status()
            return response.json()
            
        except Exception as e:
            print(f"Error executing SPARQL query: {e}")
            return {}
    
    def get_wikidata_entity_by_name(self, language_name: str) -> Optional[str]:
        """
        Get Wikidata entity ID (Q-number) for a language by name.
        """
        try:
            params = {
                'action': 'wbsearchentities',
                'format': 'json',
                'language': 'en',
                'search': language_name,
                'type': 'item',
                'limit': 10
            }
            
            response = requests.get(self.wikidata_api, params=params, headers=self.headers)
            response.raise_for_status()
            data = response.json()
            
            # Look for exact matches or language-related items
            for item in data.get('search', []):
                description = item.get('description', '').lower()
                if any(term in description for term in ['language', 'dialect', 'tongue']):
                    return item['id']
            
            # If no language-specific match, return first result
            if data.get('search'):
                return data['search'][0]['id']
                
        except Exception as e:
            print(f"Error searching for entity {language_name}: {e}")
        
        return None
    
    def get_language_relationships_sparql(self, entity_id: str) -> Dict:
        """
        Extract language relationships using SPARQL queries.
        """
        relationships = {
            'genetic_descent': {
                'parents': [],
                'children': [],
                'ancestors': [],
                'descendants': []
            },
            'dialects': {
                'dialect_of': [],
                'has_dialects': []
            },
            'siblings': [],
            'language_family': [],
            'metadata': {}
        }
        
        # Query 1: Basic language information and direct relationships
        basic_query = f"""
        SELECT DISTINCT ?prop ?propLabel ?value ?valueLabel WHERE {{
            wd:{entity_id} ?prop ?value .
            ?property wikibase:directClaim ?prop .
            FILTER(?prop IN (wdt:P31, wdt:P279, wdt:P220, wdt:P155, wdt:P156, wdt:P629, wdt:P2341, wdt:P2596))
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        """
        
        try:
            results = self.execute_sparql_query(basic_query)
            
            for result in results.get("results", {}).get("bindings", []):
                prop = result["prop"]["value"].split('/')[-1]
                value_id = result["value"]["value"].split('/')[-1]
                value_label = result.get("valueLabel", {}).get("value", value_id)
                
                # Map properties to relationship types
                if prop == 'P155':  # Parent language (follows)
                    relationships['genetic_descent']['parents'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P156':  # Child language (followed by)
                    relationships['genetic_descent']['children'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P629':  # Dialect of
                    relationships['dialects']['dialect_of'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P2341':  # Has dialect
                    relationships['dialects']['has_dialects'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P220':  # Language family
                    relationships['language_family'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P31' or prop == 'P279':  # Instance of / Subclass of
                    relationships['metadata'][prop] = {
                        'id': value_id, 'label': value_label
                    }
                    
        except Exception as e:
            print(f"Error in basic SPARQL query: {e}")
        
        return relationships
    
    def get_wikipedia_infobox_data(self, language_name: str) -> Dict:
        """
        Extract language relationship data from Wikipedia infoboxes.
        """
        try:
            # First, find the Wikipedia page
            search_params = {
                'action': 'query',
                'format': 'json',
                'list': 'search',
                'srsearch': f"{language_name} language",
                'srnamespace': 0,
                'srlimit': 5
            }
            
            response = requests.get(self.wikipedia_api, params=search_params, headers=self.headers)
            response.raise_for_status()
            search_data = response.json()
            
            if not search_data.get('query', {}).get('search'):
                return {}
            
            # Get the page content
            page_title = search_data['query']['search'][0]['title']
            content_params = {
                'action': 'query',
                'format': 'json',
                'titles': page_title,
                'prop': 'revisions',
                'rvprop': 'content',
                'rvslots': 'main'
            }
            
            response = requests.get(self.wikipedia_api, params=content_params, headers=self.headers)
            response.raise_for_status()
            content_data = response.json()
            
            # Extract infobox data
            pages = content_data.get('query', {}).get('pages', {})
            for page_id, page_data in pages.items():
                if 'revisions' in page_data:
                    content = page_data['revisions'][0]['slots']['main']['*']
                    return self._parse_language_infobox(content)
            
        except Exception as e:
            print(f"Error extracting Wikipedia infobox data: {e}")
        
        return {}
    
    def _parse_language_infobox(self, content: str) -> Dict:
        """
        Parse language infobox content to extract relationships.
        """
        relationships = {
            'family': [],
            'ancestors': [],
            'dialects': [],
            'related': []
        }
        
        # Look for infobox language or language family templates
        infobox_patterns = [
            r'\{\{Infobox language(.*?)\}\}',
            r'\{\{Infobox language family(.*?)\}\}'
        ]
        
        for pattern in infobox_patterns:
            matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
            for match in matches:
                # Extract family information
                family_matches = re.findall(r'fam\d+\s*=\s*\[\[(.*?)\]\]', match)
                relationships['family'].extend(family_matches)
                
                # Extract ancestor information
                ancestor_matches = re.findall(r'ancestor\d*\s*=\s*\[\[(.*?)\]\]', match)
                relationships['ancestors'].extend(ancestor_matches)
                
                # Extract dialect information
                dialect_matches = re.findall(r'dia\d+\s*=\s*\[\[(.*?)\]\]', match)
                relationships['dialects'].extend(dialect_matches)
        
        return relationships

# Create a demonstration function
def create_example_extraction():
    """
    Create an example extraction for demonstration purposes.
    """
    extractor = LanguageRelationshipExtractor()
    
    # Example: Extract relationships for Spanish
    print("Example: Extracting relationships for 'Spanish'")
    print("-" * 50)
    
    # Step 1: Get Wikidata entity
    entity_id = extractor.get_wikidata_entity_by_name("English")
    if entity_id:
        print(f"Found Wikidata entity: {entity_id}")
        
        # Step 2: Get basic relationships via SPARQL
        relationships = extractor.get_language_relationships_sparql(entity_id)
        
        print("\nRelationships found:")
        print(json.dumps(relationships, indent=2))
        
        return relationships
    else:
        print("Could not find entity for Spanish")
        return {}

# Initialize the pipeline
print("Language Relationship Extraction Pipeline")
print("=" * 50)

extractor = LanguageRelationshipExtractor()

print("\nKey Wikidata Properties for Language Relationships:")
for name, prop_id in extractor.language_properties.items():
    print(f"  {name}: {prop_id}")

print("\nKey Language Classes in Wikidata:")
for name, class_id in extractor.language_classes.items():
    print(f"  {name}: {class_id}")

# Run example
print("\n" + "=" * 50)
print("EXAMPLE EXTRACTION")
print("=" * 50)
example_result = create_example_extraction()

Language Relationship Extraction Pipeline

Key Wikidata Properties for Language Relationships:
  instance_of: P31
  subclass_of: P279
  language_family: P25295
  parent_language: P155
  child_language: P156
  dialect_of: P629
  has_dialect: P2341
  related_language: P2596
  writing_system: P282
  iso_639_1: P218
  iso_639_2: P219
  iso_639_3: P220
  glottolog_id: P1394

Key Language Classes in Wikidata:
  language: Q34770
  natural_language: Q33742
  constructed_language: Q33215
  dead_language: Q45762
  extinct_language: Q45762
  language_family: Q25295
  language_isolate: Q34770
  dialect: Q33384
  variety_of_language: Q33384

EXAMPLE EXTRACTION
Example: Extracting relationships for 'Spanish'
--------------------------------------------------
Found Wikidata entity: Q1860

Relationships found:
{
  "genetic_descent": {
    "parents": [],
    "children": [],
    "ancestors": [],
    "descendants": []
  },
  "dialects": {
    "dialect_of": [],
    "has_dialects": [
      {
        "id": 

In [25]:
# Let's create a comprehensive Python pipeline for extracting language relationships from Wikipedia/Wikidata
# This will include all the methods mentioned and create a complete solution

import requests
from SPARQLWrapper import SPARQLWrapper, JSON
import json
import time
import re
from typing import Dict, List, Optional, Tuple
import pandas as pd

class LanguageRelationshipExtractor:
    """
    A comprehensive pipeline for extracting language relationships from Wikipedia/Wikidata.
    Supports genetic descent edges, dialect edges, and sibling relationships.
    """
    
    def __init__(self):
        # Initialize endpoints and headers
        self.wikidata_api = "https://www.wikidata.org/w/api.php"
        self.wikipedia_api = "https://en.wikipedia.org/w/api.php"
        self.sparql_endpoint = "https://query.wikidata.org/sparql"
        
        # Initialize SPARQL wrapper
        self.sparql = SPARQLWrapper(self.sparql_endpoint)
        self.sparql.setReturnFormat(JSON)
        
        # Common headers for API requests
        self.headers = {
            'User-Agent': 'LanguageRelationshipExtractor/1.0 (https://example.com/contact)'
        }
        
        # Key Wikidata properties for language relationships
        self.language_properties = {
            'instance_of': 'P31',           # Instance of
            'subclass_of': 'P279',          # Subclass of  
            'language_family': 'P220',      # Language family
            'parent_language': 'P155',      # Follows (parent language)
            'child_language': 'P156',       # Followed by (child language)
            'dialect_of': 'P629',           # Dialect of
            'has_dialect': 'P2341',         # Has dialect
            'related_language': 'P2596',    # Related language
            'writing_system': 'P282',       # Writing system used
            'iso_639_1': 'P218',           # ISO 639-1 code
            'iso_639_2': 'P219',           # ISO 639-2 code
            'iso_639_3': 'P220',           # ISO 639-3 code
            'glottolog_id': 'P1394',       # Glottolog ID
        }
        
        # Language-related classes in Wikidata
        self.language_classes = {
            'language': 'Q34770',
            'natural_language': 'Q33742',
            'constructed_language': 'Q33215',
            'dead_language': 'Q45762',
            'extinct_language': 'Q45762',
            'language_family': 'Q25295',
            'language_isolate': 'Q34770',
            'dialect': 'Q33384',
            'variety_of_language': 'Q33384'
        }
    
    def get_wikidata_entity_by_name(self, language_name: str) -> Optional[str]:
        """
        Get Wikidata entity ID (Q-number) for a language by name.
        """
        try:
            params = {
                'action': 'wbsearchentities',
                'format': 'json',
                'language': 'en',
                'search': language_name,
                'type': 'item',
                'limit': 10
            }
            
            response = requests.get(self.wikidata_api, params=params, headers=self.headers)
            response.raise_for_status()
            data = response.json()
            
            # Look for exact matches or language-related items
            for item in data.get('search', []):
                description = item.get('description', '').lower()
                if any(term in description for term in ['language', 'dialect', 'tongue']):
                    return item['id']
            
            # If no language-specific match, return first result
            if data.get('search'):
                return data['search'][0]['id']
                
        except Exception as e:
            print(f"Error searching for entity {language_name}: {e}")
        
        return None
    
    def get_language_relationships_sparql(self, entity_id: str) -> Dict:
        """
        Extract language relationships using SPARQL queries.
        """
        relationships = {
            'genetic_descent': {
                'parents': [],
                'children': [],
                'ancestors': [],
                'descendants': []
            },
            'dialects': {
                'dialect_of': [],
                'has_dialects': []
            },
            'siblings': [],
            'language_family': [],
            'metadata': {}
        }
        
        # Query 1: Basic language information and direct relationships
        basic_query = f"""
        SELECT DISTINCT ?prop ?propLabel ?value ?valueLabel WHERE {{
            wd:{entity_id} ?prop ?value .
            ?property wikibase:directClaim ?prop .
            FILTER(?prop IN (wdt:P31, wdt:P279, wdt:P220, wdt:P155, wdt:P156, wdt:P629, wdt:P2341, wdt:P2596))
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        """
        
        try:
            self.sparql.setQuery(basic_query)
            results = self.sparql.query().convert()
            
            for result in results["results"]["bindings"]:
                prop = result["prop"]["value"].split('/')[-1]
                value_id = result["value"]["value"].split('/')[-1]
                value_label = result.get("valueLabel", {}).get("value", value_id)
                
                # Map properties to relationship types
                if prop == 'P155':  # Parent language (follows)
                    relationships['genetic_descent']['parents'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P156':  # Child language (followed by)
                    relationships['genetic_descent']['children'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P629':  # Dialect of
                    relationships['dialects']['dialect_of'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P2341':  # Has dialect
                    relationships['dialects']['has_dialects'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P220':  # Language family
                    relationships['language_family'].append({
                        'id': value_id, 'label': value_label
                    })
                elif prop == 'P31' or prop == 'P279':  # Instance of / Subclass of
                    relationships['metadata'][prop] = {
                        'id': value_id, 'label': value_label
                    }
                    
        except Exception as e:
            print(f"Error in basic SPARQL query: {e}")
        
        # Query 2: Find sibling languages (same language family)
        if relationships['language_family']:
            family_id = relationships['language_family'][0]['id']
            sibling_query = f"""
            SELECT DISTINCT ?sibling ?siblingLabel WHERE {{
                ?sibling wdt:P220 wd:{family_id} .
                ?sibling wdt:P31/wdt:P279* wd:Q34770 .
                FILTER(?sibling != wd:{entity_id})
                SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
            }}
            LIMIT 20
            """
            
            try:
                self.sparql.setQuery(sibling_query)
                results = self.sparql.query().convert()
                
                for result in results["results"]["bindings"]:
                    sibling_id = result["sibling"]["value"].split('/')[-1]
                    sibling_label = result.get("siblingLabel", {}).get("value", sibling_id)
                    relationships['siblings'].append({
                        'id': sibling_id, 'label': sibling_label
                    })
                    
            except Exception as e:
                print(f"Error in sibling SPARQL query: {e}")
        
        # Query 3: Transitive ancestors and descendants
        ancestor_query = f"""
        SELECT DISTINCT ?ancestor ?ancestorLabel WHERE {{
            wd:{entity_id} wdt:P155+ ?ancestor .
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        LIMIT 10
        """
        
        descendant_query = f"""
        SELECT DISTINCT ?descendant ?descendantLabel WHERE {{
            ?descendant wdt:P155+ wd:{entity_id} .
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        LIMIT 10
        """
        
        # Execute ancestor query
        try:
            self.sparql.setQuery(ancestor_query)
            results = self.sparql.query().convert()
            
            for result in results["results"]["bindings"]:
                ancestor_id = result["ancestor"]["value"].split('/')[-1]
                ancestor_label = result.get("ancestorLabel", {}).get("value", ancestor_id)
                relationships['genetic_descent']['ancestors'].append({
                    'id': ancestor_id, 'label': ancestor_label
                })
        except Exception as e:
            print(f"Error in ancestor SPARQL query: {e}")
        
        # Execute descendant query
        try:
            self.sparql.setQuery(descendant_query)
            results = self.sparql.query().convert()
            
            for result in results["results"]["bindings"]:
                descendant_id = result["descendant"]["value"].split('/')[-1]
                descendant_label = result.get("descendantLabel", {}).get("value", descendant_id)
                relationships['genetic_descent']['descendants'].append({
                    'id': descendant_id, 'label': descendant_label
                })
        except Exception as e:
            print(f"Error in descendant SPARQL query: {e}")
        
        return relationships
    
    def get_wikipedia_infobox_data(self, language_name: str) -> Dict:
        """
        Extract language relationship data from Wikipedia infoboxes.
        """
        try:
            # First, find the Wikipedia page
            search_params = {
                'action': 'query',
                'format': 'json',
                'list': 'search',
                'srsearch': f"{language_name} language",
                'srnamespace': 0,
                'srlimit': 5
            }
            
            response = requests.get(self.wikipedia_api, params=search_params, headers=self.headers)
            response.raise_for_status()
            search_data = response.json()
            
            if not search_data.get('query', {}).get('search'):
                return {}
            
            # Get the page content
            page_title = search_data['query']['search'][0]['title']
            content_params = {
                'action': 'query',
                'format': 'json',
                'titles': page_title,
                'prop': 'revisions',
                'rvprop': 'content',
                'rvslots': 'main'
            }
            
            response = requests.get(self.wikipedia_api, params=content_params, headers=self.headers)
            response.raise_for_status()
            content_data = response.json()
            
            # Extract infobox data
            pages = content_data.get('query', {}).get('pages', {})
            for page_id, page_data in pages.items():
                if 'revisions' in page_data:
                    content = page_data['revisions'][0]['slots']['main']['*']
                    return self._parse_language_infobox(content)
            
        except Exception as e:
            print(f"Error extracting Wikipedia infobox data: {e}")
        
        return {}
    
    def _parse_language_infobox(self, content: str) -> Dict:
        """
        Parse language infobox content to extract relationships.
        """
        relationships = {
            'family': [],
            'ancestors': [],
            'dialects': [],
            'related': []
        }
        
        # Look for infobox language or language family templates
        infobox_patterns = [
            r'\{\{Infobox language(.*?)\}\}',
            r'\{\{Infobox language family(.*?)\}\}'
        ]
        
        for pattern in infobox_patterns:
            matches = re.findall(pattern, content, re.DOTALL | re.IGNORECASE)
            for match in matches:
                # Extract family information
                family_match = re.search(r'fam\d+\s*=\s*\[\[(.*?)\]\]', match)
                if family_match:
                    relationships['family'].append(family_match.group(1))
                
                # Extract ancestor information
                ancestor_match = re.search(r'ancestor\d*\s*=\s*\[\[(.*?)\]\]', match)
                if ancestor_match:
                    relationships['ancestors'].append(ancestor_match.group(1))
                
                # Extract dialect information
                dialect_matches = re.findall(r'dia\d+\s*=\s*\[\[(.*?)\]\]', match)
                relationships['dialects'].extend(dialect_matches)
        
        return relationships
    
    def get_wikidata_direct_api(self, entity_id: str) -> Dict:
        """
        Get language data directly from Wikidata API.
        """
        try:
            params = {
                'action': 'wbgetentities',
                'ids': entity_id,
                'format': 'json',
                'props': 'claims|labels'
            }
            
            response = requests.get(self.wikidata_api, params=params, headers=self.headers)
            response.raise_for_status()
            data = response.json()
            
            if entity_id not in data.get('entities', {}):
                return {}
            
            entity = data['entities'][entity_id]
            claims = entity.get('claims', {})
            
            # Extract relevant claims
            relationships = {}
            for prop_name, prop_id in self.language_properties.items():
                if prop_id in claims:
                    relationships[prop_name] = []
                    for claim in claims[prop_id]:
                        if 'mainsnak' in claim and 'datavalue' in claim['mainsnak']:
                            value = claim['mainsnak']['datavalue']['value']
                            if isinstance(value, dict) and 'id' in value:
                                relationships[prop_name].append(value['id'])
                            elif isinstance(value, str):
                                relationships[prop_name].append(value)
            
            return relationships
            
        except Exception as e:
            print(f"Error with Wikidata direct API: {e}")
            return {}
    
    def extract_all_relationships(self, language_name: str) -> Dict:
        """
        Main method to extract all language relationships using all available methods.
        """
        print(f"Extracting relationships for: {language_name}")
        
        # Step 1: Get Wikidata entity ID
        entity_id = self.get_wikidata_entity_by_name(language_name)
        if not entity_id:
            print(f"Could not find Wikidata entity for {language_name}")
            return {}
        
        print(f"Found Wikidata entity: {entity_id}")
        
        # Step 2: Extract relationships using different methods
        results = {
            'entity_id': entity_id,
            'language_name': language_name,
            'sparql_data': {},
            'wikipedia_infobox': {},
            'wikidata_api': {},
            'combined_relationships': {
                'genetic_descent': {'parents': [], 'children': [], 'ancestors': [], 'descendants': []},
                'dialects': {'dialect_of': [], 'has_dialects': []},
                'siblings': [],
                'language_family': []
            }
        }
        
        # SPARQL method
        print("Extracting via SPARQL...")
        results['sparql_data'] = self.get_language_relationships_sparql(entity_id)
        time.sleep(1)  # Rate limiting
        
        # Wikipedia infobox method
        print("Extracting from Wikipedia infobox...")
        results['wikipedia_infobox'] = self.get_wikipedia_infobox_data(language_name)
        time.sleep(1)  # Rate limiting
        
        # Wikidata API method
        print("Extracting via Wikidata API...")
        results['wikidata_api'] = self.get_wikidata_direct_api(entity_id)
        time.sleep(1)  # Rate limiting
        
        # Step 3: Combine and deduplicate results
        results['combined_relationships'] = self._combine_relationships(results)
        
        return results
    
    def _combine_relationships(self, results: Dict) -> Dict:
        """
        Combine relationships from all sources and remove duplicates.
        """
        combined = {
            'genetic_descent': {'parents': [], 'children': [], 'ancestors': [], 'descendants': []},
            'dialects': {'dialect_of': [], 'has_dialects': []},
            'siblings': [],
            'language_family': []
        }
        
        # Add SPARQL results
        sparql_data = results.get('sparql_data', {})
        if sparql_data:
            for category in combined:
                if category in sparql_data:
                    if isinstance(sparql_data[category], dict):
                        for subcategory in combined[category]:
                            if subcategory in sparql_data[category]:
                                combined[category][subcategory].extend(sparql_data[category][subcategory])
                    elif isinstance(sparql_data[category], list):
                        combined[category].extend(sparql_data[category])
        
        # Remove duplicates
        for category in combined:
            if isinstance(combined[category], dict):
                for subcategory in combined[category]:
                    combined[category][subcategory] = self._deduplicate_list(combined[category][subcategory])
            elif isinstance(combined[category], list):
                combined[category] = self._deduplicate_list(combined[category])
        
        return combined
    
    def _deduplicate_list(self, items: List) -> List:
        """
        Remove duplicates from a list of dictionaries or strings.
        """
        if not items:
            return []
        
        if isinstance(items[0], dict):
            seen = set()
            unique_items = []
            for item in items:
                identifier = item.get('id', str(item))
                if identifier not in seen:
                    seen.add(identifier)
                    unique_items.append(item)
            return unique_items
        else:
            return list(set(items))
    
    def save_results_csv(self, results: Dict, filename: str):
        """
        Save results to CSV format for easy analysis.
        """
        rows = []
        
        # Flatten the relationship data
        combined = results.get('combined_relationships', {})
        
        # Genetic descent relationships
        for parent in combined['genetic_descent']['parents']:
            rows.append({
                'source_language': results['language_name'],
                'source_id': results['entity_id'],
                'relationship_type': 'parent',
                'target_language': parent.get('label', ''),
                'target_id': parent.get('id', ''),
                'relationship_category': 'genetic_descent'
            })
        
        for child in combined['genetic_descent']['children']:
            rows.append({
                'source_language': results['language_name'],
                'source_id': results['entity_id'],
                'relationship_type': 'child',
                'target_language': child.get('label', ''),
                'target_id': child.get('id', ''),
                'relationship_category': 'genetic_descent'
            })
        
        # Dialect relationships
        for dialect_parent in combined['dialects']['dialect_of']:
            rows.append({
                'source_language': results['language_name'],
                'source_id': results['entity_id'],
                'relationship_type': 'dialect_of',
                'target_language': dialect_parent.get('label', ''),
                'target_id': dialect_parent.get('id', ''),
                'relationship_category': 'dialect'
            })
        
        # Sibling relationships
        for sibling in combined['siblings']:
            rows.append({
                'source_language': results['language_name'],
                'source_id': results['entity_id'],
                'relationship_type': 'sibling',
                'target_language': sibling.get('label', ''),
                'target_id': sibling.get('id', ''),
                'relationship_category': 'sibling'
            })
        
        # Language family
        for family in combined['language_family']:
            rows.append({
                'source_language': results['language_name'],
                'source_id': results['entity_id'],
                'relationship_type': 'member_of_family',
                'target_language': family.get('label', ''),
                'target_id': family.get('id', ''),
                'relationship_category': 'language_family'
            })
        
        df = pd.DataFrame(rows)
        df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")

# Example usage and demonstration
print("Language Relationship Extraction Pipeline")
print("=" * 50)

# Initialize the extractor
extractor = LanguageRelationshipExtractor()

# Show the key properties and classes we'll be using
print("\nKey Wikidata Properties for Language Relationships:")
for name, prop_id in extractor.language_properties.items():
    print(f"  {name}: {prop_id}")

print("\nKey Language Classes in Wikidata:")
for name, class_id in extractor.language_classes.items():
    print(f"  {name}: {class_id}")
rels=extractor.extract_all_relationships("English")
print(rels)
print("\nPipeline created successfully!")
print("Ready to extract language relationships.")

# Flatten `rels` into {language_Name1, relationship, Language_Name2} triples
source_lang = rels.get('language_name', 'Unknown')
cr = rels.get('combined_relationships', {}) or {}

triples = []


def _label_of(item):
    if isinstance(item, dict):
        return item.get('label') or item.get('id') or str(item)
    return str(item)

# Genetic descent: parents, children, ancestors, descendants
for rel_name in ['parents', 'children', 'ancestors', 'descendants']:
    for tgt in (cr.get('genetic_descent', {}) or {}).get(rel_name, []) or []:
        triples.append({
            'language_Name1': source_lang,
            'relationship': rel_name[:-1] if rel_name.endswith('s') else rel_name,
            'Language_Name2': _label_of(tgt)
        })

# Dialects
for rel_name in ['dialect_of', 'has_dialects']:
    for tgt in (cr.get('dialects', {}) or {}).get(rel_name, []) or []:
        name = 'has_dialect' if rel_name == 'has_dialects' else 'dialect_of'
        triples.append({
            'language_Name1': source_lang,
            'relationship': name,
            'Language_Name2': _label_of(tgt)
        })

# Siblings
for tgt in cr.get('siblings', []) or []:
    triples.append({
        'language_Name1': source_lang,
        'relationship': 'sibling',
        'Language_Name2': _label_of(tgt)
    })

# Language family
for tgt in cr.get('language_family', []) or []:
    triples.append({
        'language_Name1': source_lang,
        'relationship': 'language_family',
        'Language_Name2': _label_of(tgt)
    })

# Deduplicate
seen = set()
relation_triples = []
for t in triples:
    key = (t['language_Name1'], t['relationship'], t['Language_Name2'])
    if key not in seen:
        seen.add(key)
        relation_triples.append(t)

# Print in requested compact form
for t in relation_triples:
    print(f"{{{t['language_Name1']}, {t['relationship']}, {t['Language_Name2']}}}")

relation_triples

Language Relationship Extraction Pipeline

Key Wikidata Properties for Language Relationships:
  instance_of: P31
  subclass_of: P279
  language_family: P220
  parent_language: P155
  child_language: P156
  dialect_of: P629
  has_dialect: P2341
  related_language: P2596
  writing_system: P282
  iso_639_1: P218
  iso_639_2: P219
  iso_639_3: P220
  glottolog_id: P1394

Key Language Classes in Wikidata:
  language: Q34770
  natural_language: Q33742
  constructed_language: Q33215
  dead_language: Q45762
  extinct_language: Q45762
  language_family: Q25295
  language_isolate: Q34770
  dialect: Q33384
  variety_of_language: Q33384
Extracting relationships for: English
Found Wikidata entity: Q1860
Extracting via SPARQL...
Found Wikidata entity: Q1860
Extracting via SPARQL...
Extracting from Wikipedia infobox...
Extracting from Wikipedia infobox...
Extracting via Wikidata API...
Extracting via Wikidata API...
{'entity_id': 'Q1860', 'language_name': 'English', 'sparql_data': {'genetic_descent'

[{'language_Name1': 'English',
  'relationship': 'has_dialect',
  'Language_Name2': 'England'},
 {'language_Name1': 'English',
  'relationship': 'language_family',
  'Language_Name2': 'eng'}]