In [2]:
import re
import io
import time

### Parsovanie rozlišovacích stránok z SQL dumpu riadok po riadku

In [5]:
# Matches one "(page_id,namespace,'title'" row prefix of the SQL dump whose
# title ends with "_(rozlišovacia_stránka)".
#   group 1 -> page_id
#   group 2 -> page_title (as written in the dump, SQL escapes left untouched)
# The title part consumes either a non-quote/non-backslash character or a
# backslash-escaped character, so it can never run past the closing quote into
# the next row, and titles containing commas or escaped quotes stay intact.
DISAMBIGUATION_REGEX = r"\(([0-9]+),[0-9]+,'((?:[^'\\]|\\.)*_\(rozlišovacia_stránka\))'"

# page_id (str) -> page_title of every disambiguation page found in the dump.
disambiguation_pages = {}

# The dump is read line by line so the whole file never has to fit in memory.
with open("skwiki-latest-page.sql", encoding="utf-8") as f:
    for line in f:
        # With two capture groups findall() yields (page_id, page_title) tuples.
        for page_id, page_name in re.findall(DISAMBIGUATION_REGEX, line):
            disambiguation_pages[page_id] = page_name

disambiguation_pages

{'5843': 'Merkúr_(rozlišovacia_stránka)',
 '5866': 'Mars_(rozlišovacia_stránka)',
 '5871': 'Saturn_(rozlišovacia_stránka)',
 '6088': 'Venuša_(rozlišovacia_stránka)',
 '6507': 'Mesiac_(rozlišovacia_stránka)',
 '6727': 'Neptún_(rozlišovacia_stránka)',
 '6731': 'Pluto_(rozlišovacia_stránka)',
 '6734': 'Jupiter_(rozlišovacia_stránka)',
 '8237': 'Stupava_(rozlišovacia_stránka)',
 '10084': 'Pôda_(rozlišovacia_stránka)',
 '13826': 'Opera_(rozlišovacia_stránka)',
 '14217': 'Hviezdna_sústava_(rozlišovacia_stránka)',
 '14600': 'Pyramída_(rozlišovacia_stránka)',
 '14601': 'Ihlan_(rozlišovacia_stránka)',
 '14919': 'Apple_(rozlišovacia_stránka)',
 '18926': 'Bunka_(rozlišovacia_stránka)',
 '21733': 'Loď_(rozlišovacia_stránka)',
 '23406': 'Írsko_(rozlišovacia_stránka)',
 '23407': 'Írsko_(rozlišovacia_stránka)',
 '27548': 'Potok_(rozlišovacia_stránka)',
 '28217': 'Mexiko_(rozlišovacia_stránka)',
 '30857': 'Nitra_(rozlišovacia_stránka)',
 '31645': 'Čechy_(rozlišovacia_stránka)',
 '33145': 'Čína_(rozliš

### Streamové parsovanie súboru skwiki-latest-pages-articles.xml

In [6]:
import xml.sax
import csv

class PageHandler(xml.sax.ContentHandler):
    """SAX content handler that exports disambiguation pages to a CSV file.

    While the XML is streamed, the handler buffers the ``<id>``, ``<title>``
    and ``<text>`` of the current ``<page>`` element. When the page ends and
    its ID is one of the known disambiguation page IDs, one CSV row is written:
    ``id, title, line_1, url_1, line_2, url_2, ...`` where each ``line_n`` is a
    piece of the page text matched by ``LINE_REGEX`` and ``url_n`` is the URL
    built from the first wiki link on that line (empty string when the line
    has no link).

    Args:
        disambiguation_ids: Optional container of page IDs (strings) to
            export. When omitted, the module-level ``disambiguation_pages``
            mapping is looked up at the time each page ends.
        file_name: Optional path of the CSV file to write. Defaults to
            ``FILE_NAME``.
    """

    # Text from a "*" up to the end of that line.
    LINE_REGEX = r"\*.*"
    # First [[...]] wiki link; non-greedy so that only one link is captured
    # even when a line contains several.
    URL_REGEX = r"\[\[(.*?)\]\]"
    DESC_REGEX = ""

    FILE_NAME = "vinf_export.csv"

    # Names of the XML elements the handler reacts to; ID, TEXT and TITLE are
    # also used as keys of ``self.buffer``.
    ID = "id"
    TEXT = "text"
    TITLE = "title"
    PAGE = "page"
    REVISION = "revision"

    def __init__(self, disambiguation_ids=None, file_name=None):
        super().__init__()
        if file_name is not None:
            self.FILE_NAME = file_name
        self.disambiguation_ids = disambiguation_ids

        self.buffer = {}
        self.current_tag = ""
        self.in_page = False
        self.in_revision = False

        # newline="" lets the csv module control line endings itself, as the
        # csv documentation requires for files handed to csv.writer.
        self.file = open(self.FILE_NAME, 'w', encoding="utf-8", newline="")
        self.writer = csv.writer(self.file)

    def parseDisambiguationText(self):
        """Return all LINE_REGEX matches found in the buffered page text."""
        return re.findall(self.LINE_REGEX, self.buffer.get(self.TEXT, ""))

    def parseUrl(self, line):
        """Build the article URL from the first wiki link in ``line``.

        Only the link target (the part before an optional ``|label``) is used
        and spaces are replaced by underscores.

        Returns:
            The URL string, or ``""`` when the line has no usable wiki link.
        """
        match = re.search(self.URL_REGEX, line)
        if match is None:
            return ""
        target = match.group(1).split("|", 1)[0].strip()
        if not target:
            return ""
        return "sk.wikipedia.org/wiki/" + target.replace(" ", "_")

    def parseDisambiguationPage(self):
        """Write the buffered page as one CSV row."""
        desc_lines = self.parseDisambiguationText()

        row = [self.buffer.get(self.ID, ""), self.buffer.get(self.TITLE, "")]
        # Each description line is immediately followed by its URL.
        for line in desc_lines:
            row.append(line)
            row.append(self.parseUrl(line))

        self.writer.writerow(row)

    def startElement(self, tag, attributes):
        """Remember the element being opened and track page/revision scope."""
        self.current_tag = tag
        if tag == self.PAGE:
            self.in_page = True
        elif tag == self.REVISION:
            self.in_revision = True

    def endElement(self, tag):
        """Close page/revision scope and export a finished disambiguation page."""
        if tag == self.REVISION:
            self.in_revision = False
        elif tag == self.PAGE:
            self.in_page = False
            wanted_ids = self.disambiguation_ids
            if wanted_ids is None:
                # Default: IDs collected at module level from the SQL dump.
                wanted_ids = disambiguation_pages
            page_id = self.buffer.get(self.ID)
            if page_id is not None and page_id in wanted_ids:
                self.parseDisambiguationPage()
            self.buffer = {}

        # Text that follows a closing tag belongs to no tracked element.
        self.current_tag = ""

    def characters(self, content):
        """Append a chunk of character data to the buffer of the current tag.

        The parser may deliver the text of a single element in several
        chunks, so chunks are concatenated in arrival order.
        """
        tag = self.current_tag
        if tag == self.ID:
            # Page ID only: <id> elements nested inside <revision> are skipped.
            if self.in_revision:
                return
        elif tag != self.TITLE and tag != self.TEXT:
            return
        self.buffer[tag] = self.buffer.get(tag, "") + content

    def endDocument(self):
        """Close the CSV file once the whole document has been processed."""
        if self.file and not self.file.closed:
            self.file.close()
                
parser = xml.sax.make_parser()
# Namespace processing is switched off, so the handler is called with the
# plain element names as they appear in the XML.
parser.setFeature(xml.sax.handler.feature_namespaces, 0)
page_handler = PageHandler()
parser.setContentHandler(page_handler)

try:
    parser.parse("skwiki-latest-pages-articles.xml")
finally:
    # The handler closes its CSV file in endDocument(), which is not reached
    # when parsing stops with an error; close it here so rows written so far
    # are flushed and the file handle is released either way.
    if not page_handler.file.closed:
        page_handler.file.close()

In [1]:
pip install whoosh

Collecting whoosh
  Downloading Whoosh-2.7.4-py2.py3-none-any.whl (468 kB)
Installing collected packages: whoosh
Successfully installed whoosh-2.7.4
Note: you may need to restart the kernel to use updated packages.


In [2]:
from whoosh.index import *
from whoosh.fields import *
from whoosh.analysis import *
from whoosh.support.charset import accent_map
import os
# schema = Schema(title=TEXT(stored=True), path=ID(stored=True), content=TEXT)
# ix = create_in("indexdir", schema)
# writer = ix.writer()
# writer.add_document(title=u"First document", path=u"/a",
#                      content=u"This is the first document we've added!")
# writer.add_document(title=u"Second document", path=u"/b",
#                      content=u"The second one is even more interesting!")
# writer.commit()
# from whoosh.qparser import QueryParser
# with ix.searcher() as searcher:
#     query = QueryParser("content", ix.schema).parse("first")
#     results = searcher.search(query)
#     results[0]


In [3]:
# Analyzer chain for the indexed text: tokenize, lowercase, then map characters
# through accent_map (presumably accented -> unaccented, so that queries typed
# without diacritics still match; confirm against the Whoosh charset docs).
content_analyzer = RegexTokenizer() | LowercaseFilter() | CharsetFilter(accent_map)
schema = Schema(content=TEXT(stored=True, analyzer=content_analyzer))

os.makedirs("index", exist_ok=True)
# create_in() returns the newly created index already opened, so no separate
# open_dir() call is needed afterwards.
ix = create_in("index", schema)

# The CSV is opened before the writer so that a missing file cannot leave the
# index write lock held. Used as a context manager, the writer commits on
# success and cancels (releasing the lock) when an exception is raised.
with open("vinf_export.csv", 'r', encoding='utf-8') as file:
    with ix.writer() as writer:
        for line in file:
            # Every non-blank CSV line (one exported page) is one document.
            if line.strip():
                writer.add_document(content=line)

In [18]:
from whoosh.qparser import QueryParser

# Parse the query string against the "content" field of the index schema.
qp = QueryParser("content", schema=ix.schema)
q = qp.parse(u"film")

# The searcher is closed automatically when the block is left.
with ix.searcher() as searcher:
    results = searcher.search(q)
    for hit in results:
        # One hit per block, followed by the same blank lines as two
        # consecutive print calls (hit, then a lone newline) would emit.
        print(hit, end="\n\n\n")

<Hit {'content': '286913,Frankenstein (rozlišovacia stránka),* román [[Frankenstein]],sk.wikipedia.org/wiki/Frankenstein,* film [[Frankenstein (film Kevina Connora)]],sk.wikipedia.org/wiki/Frankenstein_(film_Kevina_Connora),* film [[Frankenstein (film Marcusa Nispela)]],sk.wikipedia.org/wiki/Frankenstein_(film_Marcusa_Nispela),* film [[Frankenstein (film Kennetha Branagha)]],sk.wikipedia.org/wiki/Frankenstein_(film_Kennetha_Branagha)\n'}>


<Hit {'content': '351072,Moulin Rouge (rozlišovacia stránka),"* francúzsky kabaret v Paríži, pozri [[Moulin Rouge]]",sk.wikipedia.org/wiki/Moulin_Rouge,"* film z roku 2001, pozri [[Moulin Rouge (film z roku 2001)|Moulin Rouge! (film z roku 2001)]]",sk.wikipedia.org/wiki/Moulin_Rouge_(film_z_roku_2001),"* film z roku 1952, pozri [[Moulin Rouge (film z roku 1952)]]",sk.wikipedia.org/wiki/Moulin_Rouge_(film_z_roku_1952),"* film z roku 1934, pozri [[Moulin Rouge (film z roku 1934)]]",sk.wikipedia.org/wiki/Moulin_Rouge_(film_z_roku_1934),"* film z roku 1