In [55]:
import re
import xml.sax
from collections import deque

import networkx as nx

class PageHandler(xml.sax.ContentHandler):
    """SAX handler for the first pass: register page titles as graph nodes.

    For every ``<page>`` element whose ``<ns>`` text is ``"0"``, the stripped
    ``<title>`` text is passed to ``graph.add_node``. Pages with any other
    namespace value are ignored.

    Attributes:
        graph: object exposing ``add_node(title)``; it is mutated in place.
        current_tag: name of the most recently opened element.
        title: text collected for the current page's ``<title>``.
        ns: text collected for the current page's ``<ns>``.
        page_count: number of pages added to the graph so far.
    """

    def __init__(self, graph):
        self.graph = graph
        self.page_count = 0
        self.current_tag = ""
        self.title, self.ns = "", ""

    def startElement(self, tag, attributes):
        """Remember the opened tag and clear per-page buffers on ``<page>``."""
        self.current_tag = tag
        if tag != "page":
            return
        self.title, self.ns = "", ""

    def characters(self, content):
        """Append character data to the buffer matching the current tag."""
        # current_tag is not cleared when an element closes, so whitespace
        # following </title> or </ns> also lands here; it is stripped later.
        tag = self.current_tag
        if tag == "title":
            self.title = self.title + content
        elif tag == "ns":
            self.ns = self.ns + content

    def endElement(self, tag):
        """When a page closes, add its title as a node if its ns is "0"."""
        if tag != "page":
            return
        if self.ns.strip() != "0":
            return

        self.graph.add_node(self.title.strip())
        self.page_count += 1

        # Progress message every 100000 accepted pages.
        reached_milestone = self.page_count % 100000 == 0
        if reached_milestone:
            print(f"{self.page_count} pages added to the graph...")

class LinkHandler(xml.sax.ContentHandler):
    """SAX handler for the second pass: turn wiki links into graph edges.

    Each time a ``<text>`` element closes, the most recently read ``<title>``
    is taken as the source page. Every ``[[target]]`` / ``[[target|label]]``
    found in the text becomes an edge ``source -> target``, but only when both
    endpoints are already present in ``graph`` (no new nodes are created).

    Link targets are normalised before the lookup so that they can match a
    page title:
      * a ``#section`` fragment is dropped (``[[Page#History]]`` -> ``Page``);
      * underscores are treated as spaces (``[[Some_Page]]`` -> ``Some Page``);
      * surrounding whitespace is stripped.
    A link that is only a fragment (``[[#top]]``) has no target and is skipped.

    Attributes:
        graph: object supporting ``in`` and ``add_edge(source, target)``.
        current_tag: name of the most recently opened element.
        in_text_tag: True while inside a ``<text>`` element.
        title: text collected for the latest ``<title>``.
        text: full content of the last completed ``<text>`` element.
        page_count: number of ``<text>`` elements processed.
    """

    # Group 1 captures the raw link target; an optional "|label" is ignored.
    _WIKILINK_RE = re.compile(r"\[\[([^\]\|]+)(?:\|[^\]]+)?\]\]")

    def __init__(self, graph):
        super().__init__()
        self.graph = graph
        self.current_tag = ""
        self.in_text_tag = False
        self.title = ""
        self.text = ""
        self.page_count = 0
        # Text arrives in many small chunks; collecting them in a list and
        # joining once avoids repeated copying of a growing string.
        self._text_chunks = []

    def startElement(self, tag, attributes):
        """Track the opened tag and reset the buffer it is about to fill."""
        self.current_tag = tag
        if tag == "text":
            self.in_text_tag = True
            self.text = ""
            self._text_chunks = []
        elif tag == "title":
            self.title = ""

    def characters(self, content):
        """Buffer character data for the ``<text>`` body and the ``<title>``."""
        if self.in_text_tag:
            self._text_chunks.append(content)
        if self.current_tag == "title":
            self.title += content

    def endElement(self, tag):
        """On ``</text>``, extract links and add edges between known pages."""
        if tag != "text":
            return

        self.in_text_tag = False
        self.page_count += 1
        self.text = "".join(self._text_chunks)
        self._text_chunks = []

        source = self.title.strip()
        if source not in self.graph:
            return

        for raw_target in self._WIKILINK_RE.findall(self.text):
            target = self._normalize_target(raw_target)
            # An empty target means the link pointed at a section of the
            # current page only; there is no other page to connect to.
            if target and target in self.graph:
                self.graph.add_edge(source, target)

    @staticmethod
    def _normalize_target(raw_target):
        """Reduce a raw link target to the page-title form used for lookup."""
        page_part = raw_target.split("#", 1)[0]
        return page_part.replace("_", " ").strip()

if __name__ == "__main__":
    # Location of the wiki XML dump. Both passes below must read the same
    # file, so the path is defined once instead of being repeated.
    DUMP_PATH = "arywiki-20240920-pages-articles.xml"

    graph = nx.DiGraph()

    print("Starting analysis of XML file...")

    # Step 1: collect page titles and add them to the graph as nodes.
    parser = xml.sax.make_parser()
    handler = PageHandler(graph)
    parser.setContentHandler(handler)
    parser.parse(DUMP_PATH)

    print(f"{len(graph.nodes)} nodes added to the graph.")

    # Step 2: re-read the dump and add links between known pages as edges.
    # This needs a second pass because a link can point to a page that
    # appears later in the file than the page containing the link.
    parser = xml.sax.make_parser()
    handler = LinkHandler(graph)
    parser.setContentHandler(handler)
    parser.parse(DUMP_PATH)

    print("Analysis finished.")

Starting analysis of XML file...
30674 nodes added to the graph.
Analysis finished.


In [56]:
def BFS(graph, starting_node, ending_node):
    """Return a shortest path from ``starting_node`` to ``ending_node``.

    The search is breadth-first over ``graph.neighbors(node)``, so the
    returned path has the minimum number of hops.

    Args:
        graph: object exposing ``neighbors(node)`` that yields the nodes
            directly reachable from ``node``.
        starting_node: node where the search begins.
        ending_node: node to reach.

    Returns:
        The path as a list of nodes, starting with ``starting_node`` and
        ending with ``ending_node`` (``[starting_node]`` when both are the
        same node). An empty list when ``ending_node`` cannot be reached.
    """
    # predecessor doubles as the "already discovered" set: a node is recorded
    # the first time it is seen, which is also along a shortest route to it.
    predecessor = {starting_node: None}
    queue = deque([starting_node])
    found = starting_node == ending_node

    while queue and not found:
        curr = queue.popleft()
        for neigh in graph.neighbors(curr):
            if neigh in predecessor:
                continue
            predecessor[neigh] = curr
            if neigh == ending_node:
                # First discovery is already optimal in BFS; stop exploring.
                found = True
                break
            queue.append(neigh)

    if not found:
        return []

    # Walk the predecessor chain backwards from the target, then reverse it.
    path = [ending_node]
    while path[-1] != starting_node:
        path.append(predecessor[path[-1]])
    path.reverse()
    return path

In [59]:
# Look up a shortest link path between two article titles.
# Nothing is printed when either title is absent from the graph.
starting_node = 'وان پيس'
ending_node = 'لإمبراطورية الرومانية'
if all(endpoint in graph for endpoint in (starting_node, ending_node)):
    path = BFS(graph, starting_node, ending_node)
    print(path)

['وان پيس', 'لمغريب', 'لإمبراطورية الرومانية']
