# In [None]:

# ontology_schema_extractor



import json
import re
from collections import defaultdict
from datetime import datetime

from rdflib import Graph, Namespace, RDF, RDFS, OWL

class OntologySchemaReader:
    """Read an RDF/OWL ontology and extract a compact, top-level schema.

    The reader parses the ontology with rdflib and collects:

    * ``classes`` -- local names of top-level classes,
    * ``object_properties`` / ``data_properties`` / ``properties`` -- local
      names of top-level properties,
    * ``property_domains`` / ``property_ranges`` -- property name -> list of
      class names,
    * ``class_hierarchy`` -- superclass name -> list of subclass names.

    All results are keyed by *local name* (the part of the URI after the last
    ``#`` or ``/``), so terms from different namespaces that share a local
    name are merged.
    """

    # Vocabulary namespaces whose terms are built-ins, not ontology content.
    _SYSTEM_NAMESPACES = (
        'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'http://www.w3.org/2000/01/rdf-schema#',
        'http://www.w3.org/2002/07/owl#',
        'http://www.w3.org/2001/XMLSchema#',
    )
    # Local names of "root" classes; inheriting from one keeps a class top-level.
    _GENERIC_SUPERCLASSES = frozenset({'Thing', 'Resource', 'Class', 'Property'})
    # A property is dropped when one of its words starts with any of these.
    _EXCLUDED_PROPERTY_WORDS = (
        'annotation', 'meta', 'internal', 'system', 'owl', 'rdf', 'rdfs',
    )
    # Names of this length or longer are rejected as not human-meaningful.
    _MAX_NAME_LENGTH = 50
    # Splits camelCase / PascalCase / snake_case / kebab-case names into words.
    _WORD_RE = re.compile(r'[A-Z]+(?=[A-Z][a-z])|[A-Z]?[a-z]+|[A-Z]+|\d+')

    def __init__(self, ontology_file_path):
        """Create a reader for ``ontology_file_path``; nothing is parsed yet."""
        self.ontology_file_path = ontology_file_path
        self.graph = Graph()
        self.classes = set()
        self.properties = set()
        self.object_properties = set()
        self.data_properties = set()
        self.property_domains = defaultdict(list)
        self.property_ranges = defaultdict(list)
        self.class_hierarchy = defaultdict(list)

    def load_ontology(self):
        """Parse the ontology file into ``self.graph``.

        The serialization format is not specified here; it is left to
        ``Graph.parse`` to determine.

        Returns:
            True on success, False if parsing raised (the error is printed).
        """
        try:
            print(f" Loading ontology from: {self.ontology_file_path}")
            self.graph.parse(self.ontology_file_path)
            print(f" Successfully loaded ontology with {len(self.graph)} triples")
            return True
        except Exception as e:
            # Top-level boundary: report the problem and let the caller decide.
            print(f" Error loading ontology: {e}")
            return False

    @classmethod
    def _is_valid_name(cls, name):
        """Return True if ``name`` looks like a human-meaningful identifier.

        A valid name is non-empty, does not start with ``_``, is shorter than
        ``_MAX_NAME_LENGTH`` and is alphanumeric apart from ``_`` and ``-``.

        Names starting with ``N`` are deliberately NOT rejected here: doing so
        would drop real terms such as ``Node`` or ``Name``. Generated
        blank-node ids are already filtered by ``_get_local_name``.
        """
        return bool(
            name
            and not name.startswith('_')
            and len(name) < cls._MAX_NAME_LENGTH
            and name.replace('_', '').replace('-', '').isalnum()
        )

    @classmethod
    def _is_core_property(cls, prop_name):
        """Return True unless ``prop_name`` contains a technical/system word.

        The name is split into words (camelCase, underscores, hyphens) and is
        rejected when any word starts with an excluded pattern. Matching on
        word starts rather than raw substrings avoids false positives such as
        ``hasKnowledge`` (contains "owl") or ``cardFormat`` (contains "rdf").
        """
        words = cls._WORD_RE.findall(prop_name)
        return not any(
            word.lower().startswith(cls._EXCLUDED_PROPERTY_WORDS)
            for word in words
        )

    def _named_subjects(self, rdf_type):
        """Return the valid local names of all subjects typed as ``rdf_type``."""
        names = set()
        for subject in self.graph.subjects(RDF.type, rdf_type):
            name = self._get_local_name(subject)
            if self._is_valid_name(name):
                names.add(name)
        return names

    def extract_classes(self):
        """Extract the meaningful top-level classes of the ontology.

        A class is top-level when it has no *named, non-system* superclass
        other than itself. Superclasses for which ``_get_local_name`` returns
        None (``owl:Thing``, ``rdfs:Resource``, anonymous restrictions, ...)
        do not demote a class. A class is also kept when one of its
        superclasses has a generic local name such as ``Thing``.

        Returns:
            The set of top-level class names (also stored in ``self.classes``).
        """
        print(" Extracting top-level classes...")

        all_classes = (
            self._named_subjects(OWL.Class) | self._named_subjects(RDFS.Class)
        )

        # (subclass, superclass) local-name pairs for the classes we know about.
        # The superclass name is None for system terms and blank nodes.
        subclass_pairs = []
        for subclass, superclass in self.graph.subject_objects(RDFS.subClassOf):
            subclass_name = self._get_local_name(subclass)
            if subclass_name in all_classes:
                subclass_pairs.append(
                    (subclass_name, self._get_local_name(superclass))
                )

        # Only a real, named parent makes a class "not top-level".
        subclasses = {
            sub for sub, sup in subclass_pairs
            if sup is not None and sup != sub
        }
        self.classes = all_classes - subclasses

        # Classes hanging directly off a generic root (e.g. schema:Thing)
        # still count as top-level.
        for sub, sup in subclass_pairs:
            if sup in self._GENERIC_SUPERCLASSES:
                self.classes.add(sub)

        excluded = len(all_classes) - len(self.classes)
        print(f"   Found {len(self.classes)} meaningful top-level classes (excluded {excluded} lower-level subclasses)")
        return self.classes

    def extract_properties(self):
        """Extract the meaningful top-level object and data properties.

        A property is top-level when it has no named, non-system
        super-property other than itself. Properties whose name contains a
        technical word (see ``_is_core_property``) are dropped.

        Returns:
            The set of all kept property names (also in ``self.properties``).
        """
        print(" Extracting top-level properties...")

        all_object_props = self._named_subjects(OWL.ObjectProperty)
        all_data_props = self._named_subjects(OWL.DatatypeProperty)

        # Built-in parents such as owl:topObjectProperty resolve to None and
        # therefore do not turn a property into a sub-property.
        sub_properties = set()
        for subprop, superprop in self.graph.subject_objects(RDFS.subPropertyOf):
            subprop_name = self._get_local_name(subprop)
            superprop_name = self._get_local_name(superprop)
            if subprop_name and superprop_name and subprop_name != superprop_name:
                sub_properties.add(subprop_name)

        self.object_properties = {
            p for p in all_object_props - sub_properties
            if self._is_core_property(p)
        }
        self.data_properties = {
            p for p in all_data_props - sub_properties
            if self._is_core_property(p)
        }
        self.properties = self.object_properties | self.data_properties

        total_filtered = (len(all_object_props) + len(all_data_props)) - len(self.properties)
        print(f"   Found {len(self.object_properties)} meaningful object properties")
        print(f"   Found {len(self.data_properties)} meaningful data properties")
        print(f"   Total meaningful properties: {len(self.properties)} (filtered out {total_filtered} system/blank properties)")
        return self.properties

    def _collect_property_links(self, predicate, target):
        """Fill ``target`` with property -> class names linked via ``predicate``.

        Only properties in ``self.properties`` and classes in ``self.classes``
        are recorded, and each class name is stored at most once per property.
        """
        for prop, value in self.graph.subject_objects(predicate):
            prop_name = self._get_local_name(prop)
            value_name = self._get_local_name(value)
            if (prop_name in self.properties
                    and value_name in self.classes
                    and value_name not in target[prop_name]):
                target[prop_name].append(value_name)

    def extract_property_domains_ranges(self):
        """Record domains and ranges of the selected top-level properties.

        Requires ``extract_classes`` and ``extract_properties`` to have run.
        Only domains/ranges that are top-level classes are kept, so datatype
        ranges (e.g. ``xsd:string``) are not recorded. Results from a previous
        call are discarded first, so the method can be re-run safely.
        """
        print(" Extracting property domains and ranges for top-level properties...")

        self.property_domains = defaultdict(list)
        self.property_ranges = defaultdict(list)
        self._collect_property_links(RDFS.domain, self.property_domains)
        self._collect_property_links(RDFS.range, self.property_ranges)

        print(f"   Found domain info for {len(self.property_domains)} top-level properties")
        print(f"   Found range info for {len(self.property_ranges)} top-level properties")

    def extract_class_hierarchy(self):
        """Record subclass relations where both ends are top-level classes.

        Requires ``extract_classes`` to have run. Results from a previous call
        are discarded first.

        Returns:
            Mapping of superclass name -> list of subclass names
            (also stored in ``self.class_hierarchy``).
        """
        print(" Extracting top-level class hierarchy...")

        self.class_hierarchy = defaultdict(list)
        for subclass, superclass in self.graph.subject_objects(RDFS.subClassOf):
            subclass_name = self._get_local_name(subclass)
            superclass_name = self._get_local_name(superclass)

            if (subclass_name in self.classes
                    and superclass_name in self.classes
                    and subclass_name not in self.class_hierarchy[superclass_name]):
                self.class_hierarchy[superclass_name].append(subclass_name)

        hierarchy_count = sum(len(v) for v in self.class_hierarchy.values())
        print(f"   Found {hierarchy_count} top-level subclass relationships")

        return self.class_hierarchy

    def _get_local_name(self, uri):
        """Return the local name of ``uri``, or None if it should be ignored.

        None is returned for ``None`` input, blank-node-like identifiers,
        terms in the RDF/RDFS/OWL/XSD namespaces, empty local names and local
        names that look like generated ids.

        NOTE: blank nodes are recognised by string heuristics only, so a blank
        node with a short hand-written label can slip through. An
        ``isinstance`` check against rdflib's ``BNode`` would be exact.
        """
        if uri is None:
            return None

        uri_str = str(uri)

        # Blank nodes: explicit "_:" prefix, or a long id starting with "N".
        if uri_str.startswith('_:') or (uri_str.startswith('N') and len(uri_str) > 10):
            return None

        # Built-in vocabulary terms are not part of the ontology's own schema.
        if uri_str.startswith(self._SYSTEM_NAMESPACES):
            return None

        if '#' in uri_str:
            local_name = uri_str.rsplit('#', 1)[-1]
        elif '/' in uri_str:
            local_name = uri_str.rsplit('/', 1)[-1]
        else:
            local_name = uri_str

        # Blank-node-like id used as the local part of a URI.
        if (local_name.startswith('N') and len(local_name) > 20
                and local_name[1:].isalnum()):
            return None

        # Very long alphanumeric names are assumed to be generated ids.
        if len(local_name) > 30 and local_name.replace('-', '').replace('_', '').isalnum():
            return None

        # A URI ending in '#' or '/' has no local name.
        return local_name or None

    def build_schema_string(self):
        """Build the textual schema (classes, relationships, hierarchy).

        Every section is emitted in sorted order so the string is stable
        across runs. A property is listed with ``(domain -> range)`` pairs
        only when it has both a domain and a range; otherwise by name only.

        Returns:
            The schema as a newline-joined string ('' if nothing was extracted).
        """
        print(" Building schema string...")

        schema_parts = []

        if self.classes:
            sorted_classes = sorted(self.classes)
            schema_parts.append(f"Ontology Classes:\n- {', '.join(sorted_classes)}")

        if self.properties:
            schema_parts.append("\nOntology Relationships:")
            for prop in sorted(self.properties):
                domains = sorted(self.property_domains.get(prop, []))
                ranges = sorted(self.property_ranges.get(prop, []))

                if domains and ranges:
                    for domain in domains:
                        for range_item in ranges:
                            schema_parts.append(f"- {prop} ({domain} -> {range_item})")
                else:
                    schema_parts.append(f"- {prop}")

        if self.class_hierarchy:
            schema_parts.append("\nClass Hierarchy:")
            for superclass, subclasses in sorted(self.class_hierarchy.items()):
                for subclass in sorted(subclasses):
                    schema_parts.append(f"- {subclass} subClassOf {superclass}")

        return "\n".join(schema_parts)

    def extract_all(self):
        """Load the ontology and run every extraction step in order.

        Returns:
            False if the ontology could not be loaded, True otherwise.
        """
        if not self.load_ontology():
            return False

        self.extract_classes()
        self.extract_properties()
        self.extract_property_domains_ranges()
        self.extract_class_hierarchy()

        return True

    def print_schema_summary(self):
        """Print statistics and sorted listings of the extracted schema."""
        print("\n" + "="*60)
        print(" ONTOLOGY SCHEMA SUMMARY")
        print("="*60)

        print("\n Statistics:")
        print(f"   • Top-Level Classes: {len(self.classes)}")
        print(f"   • Top-Level Properties: {len(self.properties)}")
        print(f"   • Top-Level Object Properties: {len(self.object_properties)}")
        print(f"   • Top-Level Data Properties: {len(self.data_properties)}")
        print(f"   • Properties with Domains: {len(self.property_domains)}")
        print(f"   • Properties with Ranges: {len(self.property_ranges)}")
        print(f"   • Top-Level Hierarchical Relations: {sum(len(v) for v in self.class_hierarchy.values())}")

        print("\n Top-Level Classes:")
        for cls in sorted(self.classes):
            print(f"   • {cls}")

        print("\n Top-Level Properties:")
        for prop in sorted(self.properties):
            prop_type = ""
            if prop in self.object_properties:
                prop_type = " (Object Property)"
            elif prop in self.data_properties:
                prop_type = " (Data Property)"

            domain_range = ""
            domains = sorted(self.property_domains.get(prop, []))
            ranges = sorted(self.property_ranges.get(prop, []))
            if domains and ranges:
                domain_range = f" [{', '.join(domains)} -> {', '.join(ranges)}]"

            print(f"   • {prop}{prop_type}{domain_range}")

        if self.class_hierarchy:
            print("\n Top-Level Class Hierarchy:")
            for superclass, subclasses in sorted(self.class_hierarchy.items()):
                print(f"   • {superclass}:")
                for subclass in sorted(subclasses):
                    print(f"     └── {subclass}")

    def save_schema(self, output_file="ontology_schema.json"):
        """Write the extracted schema to ``output_file`` as UTF-8 JSON.

        Name collections are written as sorted lists so the file is
        reproducible. ``extraction_timestamp`` is the local wall-clock time
        formatted as ``YYYY-MM-DD HH:MM:SS.ffffff``.

        Raises:
            OSError: if ``output_file`` cannot be opened for writing.
        """
        schema_data = {
            "ontology_file": self.ontology_file_path,
            # Standard-library clock: no dependency on an optional global `pd`.
            "extraction_timestamp": str(datetime.now()),
            "classes": sorted(self.classes),
            "object_properties": sorted(self.object_properties),
            "data_properties": sorted(self.data_properties),
            "all_properties": sorted(self.properties),
            "property_domains": dict(self.property_domains),
            "property_ranges": dict(self.property_ranges),
            "class_hierarchy": dict(self.class_hierarchy),
            "schema_string": self.build_schema_string(),
            "statistics": {
                "total_classes": len(self.classes),
                "total_properties": len(self.properties),
                "object_properties": len(self.object_properties),
                "data_properties": len(self.data_properties)
            }
        }

        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(schema_data, f, indent=2, ensure_ascii=False)

        print(f"\n Schema saved to: {output_file}")

def main():
    """Interactively extract a top-level ontology schema and save it.

    Prompts for the path of an ontology file, runs the full extraction,
    prints the summary and the schema string, and writes the result to the
    default JSON file. Prints a message and returns early when no path is
    entered or when extraction fails.
    """
    print(" ONTOLOGY SCHEMA EXTRACTOR")
    print("=" * 40)

    # Ask the user which ontology file to process
    source_path = input(" Enter path to your ontology file (.owl, .ttl, .rdf, etc.): ").strip()
    if source_path == "":
        print(" No file path provided")
        return

    reader = OntologySchemaReader(source_path)
    if not reader.extract_all():
        print("Failed to extract ontology schema")
        return

    divider = "=" * 60

    # Human-readable overview first
    reader.print_schema_summary()

    # Then the prompt-ready schema text used for KG extraction
    prompt_schema = reader.build_schema_string()
    print("\n SCHEMA STRING FOR KG EXTRACTION:")
    print(divider)
    print(prompt_schema)
    print(divider)

    # Persist everything to the default JSON file
    reader.save_schema()

    closing_lines = (
        "\n SUCCESS! Top-level ontology schema extracted successfully.",
        " Use the 'schema_string' from the JSON file in your KG extraction code.",
        " This schema contains only the core/top-level concepts for cleaner extraction.",
    )
    for line in closing_lines:
        print(line)

# Run the interactive extractor only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()