In [51]:
import os
from string import punctuation
from pymorphy2 import MorphAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from whoosh.analysis import Filter, Tokenizer, Token, StopFilter

from whoosh.lang.snowball.russian import RussianStemmer
from whoosh.qparser import QueryParser, OrGroup
from whoosh.index import create_in
from whoosh.fields import *
from whoosh import scoring

In [1]:
# Create the index directory. exist_ok=True keeps this cell re-runnable:
# a plain os.mkdir raises FileExistsError once the directory is present.
os.makedirs('indexdir', exist_ok=True)

In [52]:
class MyTokenizer(Tokenizer):
    """Whoosh tokenizer that splits text with NLTK's ``word_tokenize``.

    Every raw token has leading and trailing ASCII punctuation stripped, and
    only tokens that are purely alphabetic afterwards are emitted.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Callable that turns a string into a sequence of raw token strings.
        self.tokenizer = word_tokenize

    def __call__(self, value, positions = False, chars = False,
                 keeporiginal = False, removestops = True,
                 start_pos = 0, start_char = 0, mode='',
                 **kwargs):
        """Yield a ``Token`` for every alphabetic word found in ``value``.

        Args:
            value: The text to tokenize.
            positions: When true, set ``pos`` on each token.
            chars: When true, set ``startchar``/``endchar`` on each token.
            keeporiginal: When true, set ``original`` to the stripped word.
            removestops: Forwarded to ``Token``.
            start_pos: Base value added to every token position.
            start_char: Base value added to every character offset.
            mode: Forwarded to ``Token``.
            **kwargs: Accepted and ignored.

        Yields:
            The same ``Token`` instance, mutated in place for each word.
        """
        t = Token(positions, chars, removestops = removestops, mode=mode)
        # Index in ``value`` just past the most recently located raw token.
        cursor = 0
        for pos, raw in enumerate(self.tokenizer(value)):
            word = raw.strip(punctuation)
            is_word = word.isalpha()

            if chars:
                # Find the token in the source text so that whitespace between
                # tokens and stripped leading punctuation are reflected in the
                # reported offsets.
                word_start = cursor
                found = value.find(raw, cursor)
                if found >= 0:
                    lead = len(raw) - len(raw.lstrip(punctuation))
                    word_start = found + lead
                    cursor = found + len(raw)
                elif is_word:
                    # The tokenizer returned text that does not occur verbatim
                    # in ``value``; fall back to searching for the bare word.
                    found = value.find(word, cursor)
                    if found >= 0:
                        word_start = found
                        cursor = found + len(word)

            if not is_word:
                continue

            t.text = word
            if keeporiginal:
                t.original = word
            if positions:
                # ``pos`` counts every raw token, including discarded ones,
                # so emitted positions may contain gaps.
                t.pos = start_pos + pos
            if chars:
                t.startchar = start_char + word_start
                t.endchar = t.startchar + len(word)
            yield t
    

class MyFilter(Filter):
    """Whoosh filter that lemmatizes tokens and drops Russian stop words.

    Each token's text is replaced by the ``normal_form`` of the first parse
    returned by pymorphy2's ``MorphAnalyzer``; tokens whose resulting text is
    in NLTK's Russian stop-word list are not passed on.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.morph = MorphAnalyzer()
        # A frozenset gives constant-time membership checks; the original list
        # was scanned linearly for every token.
        self.stopwords = frozenset(stopwords.words('russian'))

    def __call__(self, tokens):
        """Yield the lemmatized, non-stop-word tokens from ``tokens``.

        Args:
            tokens: Iterable of token objects exposing a mutable ``text``.

        Yields:
            The incoming token objects, with ``text`` rewritten in place.
        """
        for token in tokens:
            token.text = self.morph.parse(token.text.strip())[0].normal_form
            if token.text not in self.stopwords:
                yield token

In [53]:
# Snowball stemmer instance; the analyzer built below does not use it.
stemmer_ru = RussianStemmer()

# Analysis pipeline: tokenize first, then lemmatize and remove stop words.
analyzer = MyTokenizer() | MyFilter()

# Only `title` and `path` are stored; `content` is indexed but not stored.
schema = Schema(
    content=TEXT(analyzer=analyzer),
    path=ID(stored=True),
    title=TEXT(stored=True, analyzer=analyzer),
)

# Create the index in the "indexdir" directory and open a writer on it.
ix = create_in("indexdir", schema)
writer = ix.writer()

In [54]:
directory = './texts'

# Index every file found under `directory`: the file name becomes the title,
# the joined path becomes the path, and the file text becomes the content.
try:
    for address, dirs, files in os.walk(directory):
        for name in files:
            path = os.path.join(address, name)
            # Read the file fully, then close it before handing the text to
            # the writer.
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read()
            writer.add_document(
                title = name,
                path = path,
                content = text
            )
except BaseException:
    # Close the writer without committing so a failed or interrupted run does
    # not leave it open; then let the error propagate.
    writer.cancel()
    raise
else:
    writer.commit()

In [58]:
# Read the file before opening the writer, so a missing or unreadable file
# cannot leave an open writer behind. The encoding matches the one used when
# indexing the ./texts directory.
# NOTE(review): if fourth_doc.txt was already under ./texts when the directory
# was indexed, this adds a second copy of it — confirm that is intended.
with open('./texts/fourth_doc.txt', encoding='utf-8') as f:
    text = f.read()

# As a context manager the writer commits on success and cancels on error.
with ix.writer() as writer:
    writer.add_document(
        title = 'fourth_doc.txt',
        path = './texts/fourth_doc.txt',
        content = text
    )

In [70]:
# Parse queries against the "content" field, OR-ing the terms together.
qu = QueryParser("content", ix.schema,  group=OrGroup)
with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    query = qu.parse("знания и умения")
    # terms=True records which query terms matched in each hit.
    results = searcher.search(query, terms=True)
    if results.has_matched_terms():
        print(results.matched_terms())
        for hit in results:
            # Each matched term is a tuple whose last item is the term bytes.
            print([term[-1].decode('utf-8') for term in hit.matched_terms()])
    # Indexing results[0] fails when nothing matched, so guard it.
    if results.is_empty():
        print('No documents matched the query.')
    else:
        print(results[0])

{('content', b'\xd1\x83\xd0\xbc\xd0\xb5\xd0\xbd\xd0\xb8\xd0\xb5'), ('content', b'\xd0\xb7\xd0\xbd\xd0\xb0\xd0\xbd\xd0\xb8\xd0\xb5')}
['умение', 'знание']
['знание']
<Hit {'path': './texts/second_doc.txt', 'title': 'second_doc.txt'}>


In [21]:
# Gather every term indexed for the "content" field as a decoded string.
with ix.searcher(weighting=scoring.TF_IDF()) as searcher:
    searcher_content = []
    for raw_term in searcher.lexicon("content"):
        searcher_content.append(raw_term.decode('utf-8'))

False

In [22]:
# Check whether the inflected form 'таргетом' appears verbatim among the
# decoded "content" terms collected in `searcher_content`.
'таргетом' in searcher_content

True