In [1]:
import gc
from collections import defaultdict

import morfeusz2
from tqdm.auto import tqdm

In [2]:
def read_text(lines):
    """Split a flat sequence of lines into per-article contents and titles.

    A line containing ``"TITLE:"`` starts a new article; every following
    non-blank line (stripped) belongs to that article until the next title.

    Args:
        lines: iterable of text lines.

    Returns:
        A ``(contents, titles)`` tuple. ``contents`` is a list of lists of
        stripped, non-empty lines. ``titles`` is a list of single-element
        lists ``[title]``, so titles can go through the same helpers as
        contents. ``contents[i]`` is the body of ``titles[i]``, even when that
        body is empty.

    Note:
        Non-blank lines that appear before the first title are still emitted
        as a leading content entry with no matching title (as before).
    """
    contents = []
    titles = []
    content = []
    seen_title = False
    for line in tqdm(lines):
        # NOTE(review): the marker is matched anywhere in the line, but the
        # title is sliced from a fixed offset of 7 characters ("TITLE: ").
        # Presumably the marker always sits at the start of the line - confirm.
        if "TITLE:" in line:
            # Close the previous article. Once a title has been seen, the body
            # is stored even if empty; otherwise an article without text would
            # shift every later document id relative to `titles`.
            if seen_title or content:
                contents.append(content)
            seen_title = True
            titles.append([line[7:].strip()])
            content = []
        else:
            stripped = line.strip()
            if stripped:
                content.append(stripped)
    # Flush the last article (or a title-less preamble if no title was found).
    if seen_title or content:
        contents.append(content)
    return contents, titles

In [3]:
def index_documents(texts):
    """Join the fragments of every document into one space-separated string.

    Args:
        texts: sized iterable of documents, each an iterable of strings.

    Returns:
        A list with one joined string per document, in input order.
    """
    return [" ".join(fragments) for fragments in tqdm(texts)]

In [15]:
def process_texts(texts):
    """Turn every document into the set of its lower-cased alphabetic tokens.

    Each document is an iterable of strings. Every string is split on
    whitespace, tokens that are not purely alphabetic (``str.isalpha``) are
    discarded, and the rest are lower-cased.

    Args:
        texts: iterable of documents, each an iterable of strings.

    Returns:
        A list with one ``set`` of tokens per document, in input order.
    """
    processed_texts = []
    for text in tqdm(texts):
        # A single flat comprehension replaces `sum(list_of_lists, [])`, which
        # re-copies the accumulated list on every step (quadratic time).
        # `split()` without arguments already ignores surrounding whitespace.
        tokens = {
            word.lower()
            for fragment in text
            for word in fragment.split()
            if word.isalpha()
        }
        processed_texts.append(tokens)
    return processed_texts

In [16]:
def lematize_text(text, morph):
    """Return the distinct base forms the analyser reports for ``text``.

    Args:
        text: iterable of token strings; joined with single spaces before
            being analysed.
        morph: object exposing ``analyse(str)`` that yields
            ``(start, end, interpretation)`` triples, where
            ``interpretation[1]`` is the base form.

    Returns:
        A list of unique base forms, in no particular order. Anything from the
        first ``":"`` onwards in a base form is cut off.
    """
    sentence = " ".join(text)
    lemmas = set()
    for _start, _end, interpretation in morph.analyse(sentence):
        base_form = interpretation[1]
        # Keep only the part before the first colon.
        lemmas.add(base_form.partition(":")[0])
    return list(lemmas)

In [17]:
def lematize_texts(texts, morph):
    """Apply ``lematize_text`` to every document, showing a progress bar.

    Args:
        texts: sized iterable of documents (each an iterable of tokens).
        morph: analyser object forwarded unchanged to ``lematize_text``.

    Returns:
        A list with one list of base forms per document, in input order.
    """
    return [lematize_text(tokens, morph) for tokens in tqdm(texts)]

In [18]:
def prepare_postinglists(texts):
    """Build an inverted index mapping each word to the documents holding it.

    Args:
        texts: indexable, sized sequence of documents, each an iterable of
            words.

    Returns:
        A ``defaultdict(list)`` mapping every word to the list of document
        positions, stored as strings, in ascending order of position.
    """
    index = defaultdict(list)
    for doc_id in tqdm(range(len(texts))):
        # Ids are kept as strings so callers can ", ".join() them directly.
        doc_key = str(doc_id)
        for term in texts[doc_id]:
            index[term].append(doc_key)
    return index

In [4]:
# Read the whole corpus into memory as a list of lines.
# The encoding is pinned so decoding does not depend on the platform locale.
# NOTE(review): assumes the dump is UTF-8 - confirm against the data source.
with open("../data/fp_wiki.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

In [5]:
# Split the raw lines into per-article content lists and their titles.
contents, titles = read_text(lines)

  0%|          | 0/6074907 [00:00<?, ?it/s]

In [9]:
# Tokenise article bodies and titles separately (one token set per document).
processed_contents = process_texts(contents)
processed_titles = process_texts(titles)

  0%|          | 0/1208362 [00:00<?, ?it/s]

  0%|          | 0/1208362 [00:00<?, ?it/s]

In [10]:
# Inverted index over the exact (non-lemmatised) tokens of the article bodies.
contents_exact_postinglists = prepare_postinglists(processed_contents)
# Tokens may contain non-ASCII letters (str.isalpha accepts them), so the
# output encoding is pinned instead of relying on the platform default.
with open('../outputs/contents_exact_postinglists.txt', 'w', encoding='utf-8') as f:
    for k, v in contents_exact_postinglists.items():
        # Only words occurring in fewer than 1000 documents are written out.
        if len(v) < 1000:
            f.write(f'{k}: {", ".join(v)} \n')

  0%|          | 0/1208362 [00:00<?, ?it/s]

In [None]:
# Inverted index over the exact (non-lemmatised) tokens of the titles.
titles_exact_postinglists = prepare_postinglists(processed_titles)
# Tokens may contain non-ASCII letters (str.isalpha accepts them), so the
# output encoding is pinned instead of relying on the platform default.
with open('../outputs/titles_exact_postinglists.txt', 'w', encoding='utf-8') as f:
    for k, v in titles_exact_postinglists.items():
        # The frequency cut-off uses the word's posting list in the article
        # *contents*, not in the titles. `.get` is used so that looking up a
        # missing word does not insert an empty entry into the defaultdict.
        counts_in_contents = contents_exact_postinglists.get(k)
        if counts_in_contents is None or len(counts_in_contents) < 1000:
            f.write(f'{k}: {", ".join(v)} \n')

In [12]:
# Analyser instance that is passed to the lemmatisation helpers.
morph = morfeusz2.Morfeusz()

In [13]:
# Lemmatise the token set of every article body (one list of base forms each).
lematized_contents = lematize_texts(processed_contents, morph)

  0%|          | 0/1208362 [00:00<?, ?it/s]

In [16]:
# Inverted index over the lemmatised tokens of the article bodies.
contents_lematized_postinglists = prepare_postinglists(lematized_contents)
# Base forms may contain non-ASCII letters, so the output encoding is pinned
# instead of relying on the platform default.
with open('../outputs/contents_lematized_postinglists.txt', 'w', encoding='utf-8') as f:
    for k, v in contents_lematized_postinglists.items():
        # Only base forms occurring in fewer than 1000 documents are written.
        if len(v) < 1000:
            f.write(f'{k}: {", ".join(v)} \n')

  0%|          | 0/1208362 [00:00<?, ?it/s]

In [22]:
# Lemmatise the token set of every title (one list of base forms each).
lematized_titles = lematize_texts(processed_titles, morph)

  0%|          | 0/1208362 [00:00<?, ?it/s]

In [25]:
# Inverted index over the lemmatised tokens of the titles.
titles_lematized_postinglists = prepare_postinglists(lematized_titles)
# Base forms may contain non-ASCII letters, so the output encoding is pinned
# instead of relying on the platform default.
with open('../outputs/titles_lematized_postinglists.txt', 'w', encoding='utf-8') as f:
    for k, v in titles_lematized_postinglists.items():
        # The frequency cut-off uses the base form's posting list in the
        # article *contents*, not in the titles. `.get` is used so that looking
        # up a missing key does not insert an empty entry into the defaultdict.
        counts_in_contents = contents_lematized_postinglists.get(k)
        if counts_in_contents is None or len(counts_in_contents) < 1000:
            f.write(f'{k}: {", ".join(v)} \n')

  0%|          | 0/1208362 [00:00<?, ?it/s]