# In [1]:
import re 
import bisect

import pymorphy2

from collections import defaultdict, namedtuple, Counter
from itertools import chain

from tqdm import tqdm, tqdm_notebook

# One posting-list entry: the id of a document together with the list of
# token positions at which a given word occurs in that document.
# Being a tuple, entries compare by doc_id first, which is what keeps the
# posting lists built with bisect ordered by document.
DocEntry = namedtuple('DocEntry', ['doc_id', 'positions'])

class Parser:
    """Split text into words and reduce each word to a normal form.

    Normalization is delegated to a ``pymorphy2.MorphAnalyzer`` instance
    stored in ``self.analyzer``.
    """

    # Compiled once for all instances.  The pattern is a raw string: the
    # previous non-raw literal contained the invalid escape sequence "\W",
    # which modern Python reports as a warning at compile time.
    _WORD_SEPARATOR = re.compile(r'\W+')

    def __init__(self):
        self.analyzer = pymorphy2.MorphAnalyzer()

    def parse_text(self, text):
        """Return the normalized words of *text*, in their original order.

        The text is split on runs of non-word characters; empty fragments
        (produced by leading/trailing separators or an empty input) are
        dropped.  For every remaining word the first normal form reported
        by the analyzer is used.

        Args:
            text: The string to tokenize.

        Returns:
            A list of normalized words; empty if *text* contains no words.
        """
        tokens = self._WORD_SEPARATOR.split(text)
        return [self.analyzer.normal_forms(token)[0] for token in tokens if token]


class InvertedIndex:
    """Positional inverted index: normalized word -> sorted posting list.

    Each posting list holds one ``DocEntry`` per (word, document) pair,
    ordered by tuple comparison (``doc_id`` first).
    """

    def __init__(self):
        # Tokenizer/normalizer applied to every indexed document.
        self.parser = Parser()
        # word -> list of DocEntry, kept sorted.
        self.dict = defaultdict(list)
        # doc_id -> original, unparsed text.
        self.texts = dict()

    def add_document(self, doc_id, text):
        """Parse *text* and record the positions of each of its words.

        Positions are the zero-based indexes of the words in the parser's
        output.  No de-duplication is performed: adding the same *doc_id*
        again inserts additional entries next to the existing ones.

        Args:
            doc_id: Identifier of the document; must be orderable against
                the ids already stored in the index.
            text: Raw document text.
        """
        self.texts[doc_id] = text

        # Group the positions of every distinct word of this document.
        positions_by_word = defaultdict(list)
        for pos, word in enumerate(self.parser.parse_text(text)):
            positions_by_word[word].append(pos)

        # insort_left keeps each posting list sorted as entries arrive.
        for word, positions in positions_by_word.items():
            bisect.insort_left(self.dict[word], DocEntry(doc_id, positions))

    def get_postings(self, word):
        """Return the posting list stored for *word*.

        The word is looked up verbatim; it is not passed through the parser.

        Args:
            word: The index term to look up.

        Returns:
            The stored list of entries, or an empty list if the word is
            not in the index.
        """
        # Use .get() rather than indexing: indexing the defaultdict would
        # insert an empty posting list for every unknown word that is queried.
        return self.dict.get(word, [])

    def merge(self, other_index):
        """Merge the texts and posting lists of *other_index* into this index.

        *other_index* is not modified.

        Args:
            other_index: Another ``InvertedIndex`` whose content is added.
        """
        self.texts.update(other_index.texts)

        # Complexity: for each word the two posting lists (sizes n and m) are
        # already sorted, so the concatenation consists of two ordered runs.
        # CPython's list.sort() exploits existing runs, so this is expected to
        # cost close to O(n + m) per word rather than O((n + m) log(n + m)).
        for word, postings in other_index.dict.items():
            if not postings:
                # Do not create empty entries for words without postings.
                continue
            merged = self.dict[word]
            merged.extend(postings)
            merged.sort()
    

# In [None]:
from multiprocessing import Process, Queue
import zipfile
import codecs

def index_worker(in_queue, out_queue):
    """Worker loop: index the lines read from *in_queue*, then emit the index.

    Items are taken from *in_queue* until a ``None`` sentinel arrives.  Each
    item is expected to be a ``'<doc_id>\\t<text>'`` string; items that do not
    split into exactly two tab-separated fields are ignored.  When the
    sentinel is seen, the index built by this worker is put on *out_queue*.

    Args:
        in_queue: Queue delivering input lines and, finally, ``None``.
        out_queue: Queue that receives the resulting ``InvertedIndex``.
    """
    partial_index = InvertedIndex()

    # iter(callable, sentinel) keeps calling in_queue.get() until it
    # returns the None end-of-input marker.
    for record in iter(in_queue.get, None):
        fields = record.split('\t')
        if len(fields) != 2:
            continue
        partial_index.add_document(fields[0], fields[1])

    out_queue.put(partial_index)

    
def index_multiproc(num_workers=3, archive_path='data/texts.zip',
                    member='texts.txt', max_docs=500):
    """Build an inverted index from a zipped text file using worker processes.

    Lines of the archive member are decoded as UTF-8 and handed to
    ``index_worker`` processes through a bounded queue.  Every worker builds
    its own index; the partial indexes are then merged into one.

    Args:
        num_workers: Number of worker processes to start (at least 1).
        archive_path: Path of the zip archive to read.
        member: Name of the text file inside the archive.
        max_docs: Maximum number of lines to index; ``None`` indexes all.

    Returns:
        The merged ``InvertedIndex``.

    Raises:
        ValueError: If *num_workers* is smaller than 1.
    """
    if num_workers < 1:
        # Without a consumer the bounded input queue would fill up and block.
        raise ValueError('num_workers must be at least 1')

    from itertools import islice

    # Bounded so the reader cannot run arbitrarily far ahead of the workers.
    in_queue = Queue(maxsize=100)
    out_queue = Queue()

    index = InvertedIndex()

    workers = []
    try:
        for _ in range(num_workers):
            worker = Process(target=index_worker, args=(in_queue, out_queue))
            worker.start()
            workers.append(worker)

        with zipfile.ZipFile(archive_path) as zf:
            with zf.open(member, 'r') as f:
                lines = islice(codecs.iterdecode(f, 'utf-8'), max_docs)
                for line in tqdm_notebook(lines):
                    in_queue.put(line)

        # One end-of-input sentinel per worker.
        for _ in workers:
            in_queue.put(None)

        # Collect the partial indexes *before* joining: a process that has
        # put data on a queue may not exit until that data has been consumed.
        for _ in workers:
            index.merge(out_queue.get())
    except BaseException:
        # Feeding or merging failed (e.g. the archive is missing).  Do not
        # leave workers blocked on the queues, which would also keep the
        # parent from exiting.
        for worker in workers:
            worker.terminate()
        # Unread input may still be buffered; don't wait for it at exit.
        in_queue.cancel_join_thread()
        raise
    finally:
        for worker in workers:
            worker.join()

    return index

# Entry-point guard: build the index only when this file is executed as a
# script, not when it is imported.  With multiprocessing start methods that
# re-import the main module in each child, code that starts processes has to
# stay behind this guard.
if __name__ == '__main__':
    index = index_multiproc()