## Import Dependencies 

In [1]:
from __future__ import unicode_literals

import re
import sys
import multiprocessing

import xml.etree.ElementTree as ET
import untangle

from hazm import *
from typing import List, NewType
from tqdm import tqdm
import pickle

In [2]:
# Raise the interpreter recursion limit to 10**6, far above CPython's usual
# default of 1000.
# NOTE(review): presumably needed for pickling / processing deeply nested
# parsed pages -- confirm; a limit this high can crash the interpreter on
# runaway recursion instead of raising RecursionError.
sys.setrecursionlimit(10 ** 6)

In [3]:
# Shared hazm helpers, created once and reused by normalize_text,
# tokenize_text and stem_tokens below.
normalizer = Normalizer()
word_tokenizer = WordTokenizer(separate_emoji=True)
stemmer = Stemmer()


def normalize_text(raw_text: str) -> str:
    """Return ``raw_text`` after running it through the shared normalizer."""
    cleaned = normalizer.normalize(raw_text)
    return cleaned


def tokenize_text(normalized_text: str) -> List[str]:
    """Split already-normalized text into tokens with the shared tokenizer."""
    words = word_tokenizer.tokenize(normalized_text)
    return words


# Matches a token made up solely of the punctuation characters
# . ! ? ; \ and - (one or more of them). Compiled once at import time.
_PUNCTUATION_TOKEN = re.compile(r"[.!?;\\-]+")


def remove_punctuation(tokens: List[str]) -> List[str]:
    """Return ``tokens`` without the tokens that are pure punctuation.

    A token is dropped only when *every* character in it is one of
    ``. ! ? ; \\ -``. Tokens that merely start with such a character
    (for example ``"-5"`` or ``".net"``) are kept; anchoring the match
    only at the start of the token would discard them as well.

    Args:
        tokens: Tokens to filter; the list itself is not modified.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    return [
        token
        for token in tokens
        if not _PUNCTUATION_TOKEN.fullmatch(token)
    ]


def stem_tokens(tokens: List[str]) -> List[str]:
    """Return a new list holding the stem of each token, in order."""
    return [stemmer.stem(word) for word in tokens]


# Lazily created hazm Lemmatizer shared by every lemmatize_tokens call.
_lemmatizer = None


def lemmatize_tokens(tokens: List[str]) -> List[str]:
    """Return a new list holding the lemma of each token, in order.

    The hazm ``Lemmatizer`` is built on the first call and reused
    afterwards instead of being re-created for every call: constructing
    it is comparatively expensive (hazm loads its word and verb lists),
    and this function runs once per text being prepared.

    Args:
        tokens: Tokens to lemmatize; the list itself is not modified.

    Returns:
        A new list with one lemma per input token.
    """
    global _lemmatizer
    if _lemmatizer is None:
        _lemmatizer = Lemmatizer()
    return [_lemmatizer.lemmatize(token) for token in tokens]


def prepare_text(
    raw_text: str,
    stem: bool = True,
    del_punctuation: bool = False,
    lemmatize: bool = False,
    debug: bool = False,
) -> List[str]:
    """Normalize and tokenize ``raw_text``, then apply the enabled stages.

    Args:
        raw_text: Text to process.
        stem: Stem every token (on by default).
        del_punctuation: Drop punctuation tokens before stemming.
        lemmatize: Lemmatize every token (runs after stemming when both
            are enabled).
        debug: Print the normalized text before tokenizing.

    Returns:
        The processed tokens.
    """
    cleaned = normalize_text(raw_text)
    if debug:
        print(cleaned)

    result = tokenize_text(cleaned)

    # Optional stages, applied in exactly this order when switched on.
    stages = (
        (del_punctuation, remove_punctuation),
        (stem, stem_tokens),
        (lemmatize, lemmatize_tokens),
    )
    for enabled, stage in stages:
        if enabled:
            result = stage(result)
    return result

In [4]:
# Distinct static type for document identifiers; at runtime a DocId is a
# plain int (typing.NewType adds no wrapper object).
DocId = NewType("DocId", int)

In [5]:
class Document:
    """A corpus page together with its pre-processed title and body tokens.

    Both token lists are computed eagerly in ``__init__`` by calling
    ``prepare_text`` with its default options.

    Attributes:
        doc_id: Identifier of the page.
        title: Raw page title.
        text: Raw page text.
        title_tokens: Tokens produced from ``title``.
        text_tokens: Tokens produced from ``text``.
    """

    # Declared as annotations only. A class-level ``= []`` default would be
    # a single list object shared by every instance, and ``__init__``
    # always assigns per-instance values anyway.
    doc_id: DocId
    title: str
    text: str
    title_tokens: List[str]
    text_tokens: List[str]

    def __init__(self, doc_id: DocId, title: str, text: str):
        """Store the raw fields and tokenize ``title`` and ``text``."""
        self.doc_id = doc_id
        self.title = title
        self.text = text
        self.title_tokens = prepare_text(title)
        self.text_tokens = prepare_text(text)

In [6]:
class PostingListItem:
    """Positions of one token inside one document.

    Positions are tracked separately for the ``title`` and ``text`` fields.
    """

    def __init__(self, doc_id: DocId):
        """Create an entry for ``doc_id`` with no recorded positions."""
        self.doc_id = doc_id
        self.title_positions, self.text_positions = [], []

    def add_to_positions(self, field: str, position: int):
        """Append ``position`` to the ``<field>_positions`` list.

        ``field`` selects the attribute by name, so it has to be
        ``"title"`` or ``"text"``; any other value raises AttributeError.
        """
        target = getattr(self, f"{field}_positions")
        target.append(position)

    def __str__(self):
        """Return a multi-line, human-readable dump of the entry."""
        return f"""
        Document ID: {self.doc_id}
        Title Positions: {self.title_positions}
        Text Positions: {self.text_positions}
        """

In [7]:
def create_doc(page, debug: bool = False):
    """Build a ``Document`` from one parsed page element.

    Args:
        page: Parsed page exposing ``id.cdata``, ``title.cdata`` and
            ``revision.text.cdata``.
        debug: Print a message before and after the document is built.

    Returns:
        The new ``Document``; its id is the page id converted to ``int``.
    """
    if debug:
        print(f"Document {page.id.cdata} started!")

    document = Document(
        doc_id=int(page.id.cdata),
        title=page.title.cdata,
        text=page.revision.text.cdata,
    )

    if debug:
        print(f"Document {page.id.cdata} done!")
    return document

In [8]:
def create_documents(
    docs_path: str = "./data/Persian.xml", multiprocess: bool = False
) -> List[Document]:
    """Parse the XML dump at ``docs_path`` into ``Document`` objects.

    Args:
        docs_path: Path of the XML file; its pages are read from
            ``mediawiki.page``.
        multiprocess: Build the documents in a ``multiprocessing.Pool``
            (pages are sent to worker processes) instead of sequentially
            with a progress bar.

    Returns:
        All documents, sorted by ascending ``doc_id``.

    Raises:
        Exception: Whatever ``create_doc`` raises for a page; in
            multiprocess mode the worker's exception is re-raised here.
    """
    pages = untangle.parse(docs_path).mediawiki.page

    if multiprocess:
        # The context manager releases the worker processes even when a
        # page fails; ``map`` blocks until every result is available.
        with multiprocessing.Pool() as pool:
            documents = pool.map(create_doc, pages)
    else:
        documents = [create_doc(page) for page in tqdm(pages)]

    return sorted(documents, key=lambda document: document.doc_id)

In [9]:
def construct_positional_indexes(
    docs_path: str = "./data/Persian.xml", multiprocess: bool = False
):
    """Load the corpus and return its documents.

    Despite the name, this only forwards to ``create_documents`` and
    returns the resulting document list; no index is built here.
    """
    return create_documents(docs_path, multiprocess)

In [10]:
# Load the default corpus sequentially. Despite the function's name, the
# result is the list of Document objects, not an index.
documents = construct_positional_indexes()

100%|██████████| 1572/1572 [00:15<00:00, 99.66it/s] 


In [11]:
# Persist the processed documents to documents.pickle in the working
# directory. Pickle files should only ever be loaded from trusted sources.
with open("documents.pickle", "wb") as f:
    pickle.dump(documents, f)

In [12]:
# Positional index: token -> list of PostingListItem, one per document that
# contains the token (in its title, its text, or both).
corpus_index = {}
for document in tqdm(documents):
    # Postings of the current document only, keyed by token; title and text
    # positions of a token share one PostingListItem.
    token_positional_list_item_dict = {}
    for field in ("title", "text"):
        field_tokens = getattr(document, f"{field}_tokens")
        for idx, token in enumerate(field_tokens):
            posting = token_positional_list_item_dict.get(token)
            if posting is None:
                posting = PostingListItem(document.doc_id)
                token_positional_list_item_dict[token] = posting
            posting.add_to_positions(field, idx)
    # Merge this document's postings into the corpus-wide index; postings
    # are appended in the order the documents are iterated.
    for token, posting in token_positional_list_item_dict.items():
        corpus_index.setdefault(token, []).append(posting)

100%|██████████| 1572/1572 [00:08<00:00, 186.23it/s]


In [88]:
from src.main import construct_positional_indexes 

In [87]:
# Put ../MIR at the front of the module search path.
# NOTE(review): presumably what makes `src.main` importable -- the cell
# numbers suggest this ran before the `from src.main import ...` cell even
# though it appears after it here; confirm.
sys.path.insert(0, "../MIR")