In [1]:
import os
import bz2
import random
import linecache
import xml.etree.ElementTree as ET

In [74]:
def extractWikiPageIDs(filePath, idMap):
    """Extract the title and ID of every page in a bz2-compressed XML dump.

    The dump is read line by line. The lines from one containing ``<page>``
    up to one containing ``</page>`` are joined and parsed as a single XML
    element, so the whole dump never has to be held in memory.

    Args:
        filePath: Path to the bz2-compressed XML dump.
        idMap: Dictionary that is filled in place with ``title -> id``
            entries (both strings; the title keeps its original case).

    Side effects:
        Writes one ``"<lower-cased title> <id>"`` line per page to
        ``data/txt/wikiPageIDs.txt`` under the current working directory
        (the directory is created if it does not exist) and prints a
        completion message.
    """
    outputPath = os.getcwd() + "/data/txt/wikiPageIDs.txt"
    os.makedirs(os.path.dirname(outputPath), exist_ok = True)

    # Context managers guarantee both files are closed even if decoding or
    # XML parsing raises part-way through the dump.
    with bz2.open(filePath, "rb") as readFile, \
            open(outputPath, "w", encoding = "utf-8") as writeFile:
        pageLines = []
        for rawLine in readFile:
            line = rawLine.decode("utf-8")

            if "<page>" in line:
                # Start of a new page: discard anything collected so far
                # (dump header, stray lines between pages).
                pageLines = [line]
                continue

            pageLines.append(line)
            if "</page>" not in line:
                continue

            root = ET.fromstring("".join(pageLines))
            pageLines = []

            # Only direct children of <page> are inspected, so the <id>
            # elements nested inside <revision> are not picked up.
            # findtext() yields "" for a missing or empty element, which
            # avoids the None.lower() / str + None crashes.
            pageID = root.findtext("id", default = "") or ""
            title = root.findtext("title", default = "") or ""

            idMap[title] = pageID
            writeFile.write(title.lower() + " " + pageID + "\n")

    print("ID Extraction Completed!")

In [75]:
def _resolveLinkTarget(target, idMap):
    """Return the page ID a wikilink target refers to, or None if unknown.

    The exact target is tried first. If that fails, the target is normalised
    the way MediaWiki resolves links on Wikipedia: the ``#section`` anchor is
    dropped, underscores become spaces, surrounding whitespace is stripped
    and the first character is upper-cased.
    """
    if target in idMap:
        return idMap[target]

    normalized = target.split("#")[0].replace("_", " ").strip()
    if not normalized:
        # e.g. "[[#Section]]" — a link within the same page.
        return None
    normalized = normalized[0].upper() + normalized[1:]
    return idMap.get(normalized)


def buildWikiGraph(filePath):
    """Build an adjacency list of page IDs from a bz2-compressed XML dump.

    The dump is read twice: first by ``extractWikiPageIDs`` to build the
    ``title -> id`` map, then here to resolve every ``[[...]]`` link in the
    text of each page to the ID of the linked page.

    Args:
        filePath: Path to the bz2-compressed XML dump.

    Side effects:
        Writes ``data/txt/adjacencyList.txt`` under the current working
        directory. Each line is ``"<page id> <linked id> <linked id> ..."``;
        links whose target is not a known page title are omitted. Also
        triggers the side effects of ``extractWikiPageIDs`` and prints a
        completion message.
    """
    idMap = {}
    extractWikiPageIDs(filePath, idMap)

    outputPath = os.getcwd() + "/data/txt/adjacencyList.txt"

    # Context managers guarantee both files are closed even if decoding or
    # XML parsing raises part-way through the dump.
    with bz2.open(filePath, "rb") as readFile, \
            open(outputPath, "w", encoding = "utf-8") as writeFile:
        pageLines = []
        for rawLine in readFile:
            line = rawLine.decode("utf-8")

            if "<page>" in line:
                pageLines = [line]
                continue

            pageLines.append(line)
            if "</page>" not in line:
                continue

            root = ET.fromstring("".join(pageLines))
            pageLines = []

            # Direct child <id> only; "" when missing or empty so write()
            # never receives None.
            pageID = root.findtext("id", default = "") or ""

            # If a page carries several revisions, the last <text> wins.
            content = ""
            for textNode in root.iterfind("revision/text"):
                content = textNode.text

            writeFile.write(pageID)
            if content:
                # Everything between "[[" and the first "]]" or "|" is the
                # link target; the piece before the first "[[" is not a link.
                for chunk in content.split("[[")[1:]:
                    target = chunk.split("]]")[0].split("|")[0]
                    linkedID = _resolveLinkTarget(target, idMap)
                    if linkedID is not None:
                        writeFile.write(" " + linkedID)
            writeFile.write("\n")

    print("WikiGraph Build Completed!")

In [8]:
def printWikiPages(wikiPages, iterations, k):
    """Translate ranked page IDs to titles and write the top-``k`` report.

    Args:
        wikiPages: List of page-ID strings ordered from highest to lowest
            rank. It is updated in place: every ID found in
            ``data/txt/wikiPageIDs.txt`` is replaced by its title; IDs that
            are not found are left unchanged.
        iterations: Number of random-walk iterations, written to the report
            header.
        k: Number of ranked entries to write.

    Side effects:
        Reads ``data/txt/wikiPageIDs.txt`` and writes
        ``data/txt/results.txt`` (both under the current working directory),
        then prints a completion message. Entries whose title contains
        ``category:``, ``wikipedia:``, ``file:``, ``template:`` or
        ``portal:`` are skipped and do not consume a rank.
    """
    baseDir = os.getcwd()
    idsPath = baseDir + "/data/txt/wikiPageIDs.txt"
    resultsPath = baseDir + "/data/txt/results.txt"

    # Collect the titles for the requested IDs in one pass. Keeping them in
    # a separate dict (instead of overwriting wikiPages while scanning)
    # prevents an already-resolved numeric title such as "1984" from being
    # mistaken for a page ID on a later line.
    wanted = set(wikiPages)
    titleByID = {}
    with open(idsPath, "r", encoding = "utf-8") as readFile:
        for line in readFile:
            # Each line is "<title> <id>"; the title may contain spaces, so
            # split on the last one. rstrip keeps the ID intact when the
            # final line has no trailing newline.
            pageName, separator, pageID = line.rstrip("\n").rpartition(" ")
            if not separator:
                continue
            if pageID in wanted and pageID not in titleByID:
                titleByID[pageID] = pageName
                if len(titleByID) == len(wanted):
                    break

    for i, pageID in enumerate(wikiPages):
        wikiPages[i] = titleByID.get(pageID, pageID)

    excludedMarkers = ("category:", "wikipedia:", "file:", "template:", "portal:")
    with open(resultsPath, "w", encoding = "utf-8") as writeFile:
        writeFile.write("Total number of iterations: " + str(iterations) + "\n\n")

        counter = 1
        for w in wikiPages:
            if any(marker in w for marker in excludedMarkers):
                continue

            writeFile.write(f"Rank {counter}: " + w + "\n")
            if counter == k:
                break
            counter += 1

    print("Results Generation Completed!")

In [10]:
def pageRank(filePath, k, iterations = 100000000, teleportationProbability = 1):
    """Estimate page importance with a random walk over an adjacency list.

    The walk starts at a random page that has at least one outgoing link.
    At every step it either teleports to a uniformly chosen page that has
    already been visited, or follows a uniformly chosen outgoing link of the
    current page. A page without outgoing links always teleports. Pages are
    ranked by how often they were visited.

    Args:
        filePath: Path to the adjacency list; each line is
            ``"<page id> <linked id> ..."``.
        k: Number of ranked pages to report. The ``10 * k - 1`` most visited
            pages are handed to ``printWikiPages`` so that enough remain
            after it filters out non-article pages.
        iterations: Number of random-walk steps.
        teleportationProbability: Teleport threshold out of 10. A step
            teleports when ``random.randint(1, 10)`` is at most this value,
            so the default ``1`` means a 10% chance.

    Raises:
        ValueError: If no line of the adjacency list has an outgoing link
            (including an empty file), because no start page can be chosen.

    Side effects:
        Prints a progress message and calls ``printWikiPages``, which writes
        the report file.
    """
    # Map every page ID to the 1-based line number of its adjacency line.
    indexMap = {}
    lineCount = 0
    hasOutLinks = False
    with open(filePath, "r", encoding = "utf-8") as readFile:
        for lineCount, line in enumerate(readFile, start = 1):
            fields = line.rstrip("\n").split(" ")
            indexMap[fields[0]] = lineCount
            if len(fields) > 1:
                hasOutLinks = True

    if not hasOutLinks:
        # Without this guard the start-page search below would never end.
        raise ValueError("adjacency list contains no page with outgoing links: " + filePath)

    # linecache keeps file contents per filename; drop a stale copy in case
    # the adjacency list was regenerated since it was last read.
    linecache.checkcache(filePath)

    visitCount = {}
    visited = []

    # Pick a random start page that has at least one outgoing link.
    while True:
        fields = linecache.getline(filePath, random.randint(1, lineCount)).split(" ")
        if len(fields) > 1:
            break
    currentNode = fields[0]
    visitCount[currentNode] = 1
    visited.append(currentNode)

    for _ in range(iterations):
        if random.randint(1, 10) <= teleportationProbability:
            # Teleport: jump to a random page seen so far.
            nextNode = random.choice(visited)
            visitCount[nextNode] += 1
        else:
            fields = linecache.getline(filePath, indexMap[currentNode]).rstrip("\n").split(" ")
            if len(fields) == 1:
                # Dead end: no outgoing links, so teleport instead.
                nextNode = random.choice(visited)
                visitCount[nextNode] += 1
            else:
                # fields[0] is the page itself; the rest are its links.
                nextNode = fields[random.randint(1, len(fields) - 1)]
                if nextNode in visitCount:
                    visitCount[nextNode] += 1
                else:
                    visitCount[nextNode] = 1
                    visited.append(nextNode)
        currentNode = nextNode

    print("Random Walk Completed!")

    # Keep the most visited pages, stopping once 10 * k - 1 were collected.
    counter = 1
    wikiPages = []
    for pageID, _ in sorted(visitCount.items(), key = lambda kv: kv[1], reverse = True):
        if counter == 10 * k:
            break

        wikiPages.append(pageID)
        counter += 1

    printWikiPages(wikiPages, iterations, k)

In [None]:
# Stage 1: turn the compressed Wikipedia dump into an adjacency list.
filePath = f"{os.getcwd()}/data/bz2/enwiki-latest-pages-articles.xml.bz2"
buildWikiGraph(filePath)

# Stage 2: rank the pages of that adjacency list and report the top k.
k = 100
filePath = f"{os.getcwd()}/data/txt/adjacencyList.txt"
pageRank(filePath, k)