In [58]:
from zipfile import ZIP_DEFLATED, ZipFile
import io

# Read the document texts from the zip archive.
def read_texts(path='../task2/texts.zip'):
    """Read every file stored in a zip archive as UTF-8 text.

    Directory entries are skipped: they carry no text, and reading them would
    add empty documents that shift the 1-based numbering used by the index.

    Args:
        path: location of the archive. Defaults to the path this notebook
            has always used, so existing ``read_texts()`` calls are unchanged.

    Returns:
        list[str]: file contents, in the order the archive lists them.
    """
    data = []
    # The compression argument only matters when writing, so it is not passed.
    with ZipFile(path, 'r') as archive:
        for info in archive.infolist():
            if info.is_dir():
                continue
            with io.TextIOWrapper(archive.open(info), encoding="utf-8") as f:
                data.append(f.read())
    print(f'posts length = {len(data)}')
    return data

In [59]:
from nltk.corpus import stopwords
import string

# Load the stop words.
def read_stop_words():
    """Return the NLTK Russian stop-word list extended with punctuation.

    Returns:
        list[str]: the words from ``stopwords.words("russian")`` followed by
        every character of ``string.punctuation``.
    """
    words = stopwords.words("russian")
    # Extending with a string adds its characters one by one.
    words.extend(string.punctuation)
    return words

In [60]:
from nltk import word_tokenize

# Extract tokens from a text.
def get_tokens(text, stop_words):
    """Split a text into lower-cased alphabetic tokens without stop words.

    Hyphens are replaced by spaces before tokenising, so hyphenated words
    become separate tokens.

    Args:
        text: the text to tokenise.
        stop_words: collection of lower-case words to discard.

    Returns:
        list[str]: tokens in order of appearance; only tokens made entirely
        of letters are kept.
    """
    # A set makes each stop-word test constant time instead of a list scan.
    stop_set = set(stop_words)
    words = word_tokenize(text.replace("-", " "), language="russian")
    lowered = (word.lower() for word in words)
    # isalpha() is False for the empty string, so no separate emptiness check.
    return [word for word in lowered if word not in stop_set and word.isalpha()]

In [61]:
import pymorphy2

# Module-level pymorphy2 analyzer; get_lemma and Operation.token look it up by this global name.
morph = pymorphy2.MorphAnalyzer()

# Get the normal form of a word.
def get_lemma(token):
    """Return the normal form of ``token`` taken from the analyzer's first parse.

    Args:
        token: a single word.

    Returns:
        The ``normal_form`` attribute of ``morph.parse(token)[0]``.
    """
    first_parse = morph.parse(token)[0]
    return first_parse.normal_form

In [62]:
# Build the inverted index of terms from the texts.
def get_inverted_index(texts):
    """Map every lemma to the list of documents that contain it.

    Documents are numbered from 1 in the order they appear in ``texts``.
    Each document contributes a lemma at most once.

    Args:
        texts: iterable of document strings.

    Returns:
        dict[str, list[int]]: lemma -> ascending document numbers, with the
        keys inserted in sorted order.
    """
    stop_words = read_stop_words()
    postings = {}
    for doc_id, text in enumerate(texts, start=1):
        doc_tokens = get_tokens(text, stop_words)
        # dict.fromkeys removes duplicates while keeping first-seen order.
        unique_lemmas = dict.fromkeys(get_lemma(token) for token in doc_tokens)
        for lemma in unique_lemmas:
            postings.setdefault(lemma, []).append(doc_id)
    return {lemma: postings[lemma] for lemma in sorted(postings)}

In [63]:
import json

# Write the index to a file.
def write_index(index):
    """Save ``index`` as JSON to ``index.json`` in the current directory.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters as they are instead of escaping them.

    Args:
        index: JSON-serialisable object (here: the inverted index dict).
    """
    with open('index.json', mode='w', encoding='utf8') as target:
        target.write(json.dumps(index, indent=4, ensure_ascii=False))

In [64]:
# Read the index.
def read_index():
    """Load and return the JSON document stored in ``index.json``.

    The file is read as UTF-8 from the current directory.
    """
    with open('index.json', mode='r', encoding='utf8') as source:
        raw = source.read()
    return json.loads(raw)

In [65]:
# Build the index for the texts and write it to a file.
inverted_index = get_inverted_index(texts=read_texts())
write_index(index=inverted_index)

posts length = 100


In [66]:
import enum

# Operations for the boolean search.
class Operations(enum.Enum):
    """Operators and brackets recognised in a boolean query.

    The value of each member is the text that denotes it in a query string;
    the same text is also stored on the member as ``title``.
    """

    AND = " "
    OR = "or"
    NOT = "-"
    START_BRACKET = "("
    END_BRACKET = ")"

    def __init__(self, title):
        # Enum calls __init__ with the member's value.
        self.title = title

In [67]:
# Class for a single operation.
class Operation:
    """One operator of a parsed query together with the word it applies to."""

    def __init__(self, operation=None, parent=None):
        self.parent = parent
        self.operation = operation
        # Not read anywhere in the visible part of the notebook.
        self.included_operation = None
        # Raw word text; the parser appends characters to it one at a time.
        self.current_word: str = ""

    def __str__(self):
        return f'[operation = {self.operation} current_word = {self.current_word}]'

    def __repr__(self):
        return self.__str__()

    # Get the normal form of the word held by this operation.
    def token(self):
        """Return the lemma of ``current_word``, or ``""`` if no word is set."""
        if self.current_word == "":
            return self.current_word
        return get_lemma(self.current_word)

In [68]:
# Class for groupings.
class Group:
    """An ordered list of query nodes, optionally nested inside a parent Group."""

    def __init__(self, parent=None):
        self.parent = parent
        self.operations = []
        if parent is None:
            return
        # A nested group registers itself in its parent's operation list.
        parent.operations.append(self)

    def __str__(self):
        rendered = f'\n[operations = {self.operations}\n]'
        return rendered

    def __repr__(self):
        return self.__str__()

In [69]:
# Build the structure of a query.
def parse_request(request):
    """Parse a boolean query string into a tree of Group / Operation nodes.

    The query is lower-cased and stripped, then scanned character by character:
      * letters are collected into the current word;
      * a space after a word closes it (adjacent words are AND-ed);
      * the word "or" starts an OR operation for the following word;
      * "-" starts a NOT operation for the following word;
      * "(" opens a nested Group and ")" returns to the enclosing one.
    Any other character (digits, other punctuation) is ignored.

    A query word that is a stop word is discarded at the space that follows
    it: the collected text is cleared and the same Operation goes on to
    collect the next word.

    Args:
        request: the raw query string.

    Returns:
        Group: the top-level group. Its ``operations`` list holds Operation
        and nested Group objects in query order. Operations with an empty
        word may appear; the resolver treats them as operator markers.
    """
    request = request.lower().strip()
    # Fix: the original filter ended with `and not ["or", "-"]`. A non-empty
    # list is always truthy, so that condition was always False, the list was
    # always empty, and the stop-word branch in the loop could never run.
    # The stop words are now loaded once instead of once per query word.
    known_stop_words = set(read_stop_words())
    stop_words = {
        word for word in request.split(" ")
        if word in known_stop_words and word not in ("or", "-")
    }
    group = Group()
    top_group = group
    current_oper = Operation()
    for ch in request:
        if ch == Operations.START_BRACKET.title:
            if current_oper.operation is None:
                current_oper.operation = Operations.AND
            group.operations.append(current_oper)
            # Group(parent) appends itself to the parent's operations.
            group = Group(group)
            current_oper = Operation()
        elif ch == Operations.END_BRACKET.title:
            group.operations.append(current_oper)
            group = group.parent
            current_oper = Operation(Operations.AND)
        elif ch == Operations.NOT.title:
            current_oper = Operation(Operations.NOT)
        elif ch.isalpha():
            if current_oper.operation is None:
                current_oper.operation = Operations.AND
            current_oper.current_word += ch
        elif ch == Operations.AND.title and current_oper.current_word.isalpha():
            if current_oper.current_word in stop_words:
                current_oper.current_word = ""
            else:
                if current_oper.current_word == Operations.OR.title:
                    # The word "or" is an operator, not a search term.
                    current_oper.current_word = ""
                    group.operations.append(current_oper)
                    current_oper = Operation(Operations.OR)
                    current_oper.current_word = ""
                else:
                    if current_oper not in group.operations:
                        group.operations.append(current_oper)
                    current_oper = Operation(Operations.AND)
                    group.operations.append(current_oper)
    if current_oper not in group.operations:
        group.operations.append(current_oper)
    return top_group

In [77]:
def _all_documents(index):
    """Return the set of every document id found in any posting list of ``index``."""
    return {doc_id for postings in index.values() for doc_id in postings}


# Evaluate a parsed query.
def resolve_request(request_struct, index):
    """Evaluate a parsed query against an inverted index.

    The operations of ``request_struct`` are processed left to right:
      * an Operation with an empty word is remembered as the pending operator
        for the next Group, NOT word or OR word;
      * words found in the stop-word list are skipped;
      * AND word: the first word with a non-empty posting list seeds the
        result, every later AND word is intersected with it;
      * NOT word: its documents are removed from the result. With no pending
        operator and nothing seeded yet, they are removed from the set of all
        indexed documents instead;
      * OR word: its documents are added, but only when the pending operator
        is AND;
      * a nested Group is resolved recursively and combined according to the
        pending operator.

    Args:
        request_struct: Group produced by ``parse_request``.
        index: inverted index mapping a lemma to a list of document ids.

    Returns:
        set: ids of the matching documents.
    """
    result = set()
    current_operator = None
    result_set = False
    # Loaded once per call; the original rebuilt the list for every operation.
    stop_words = set(read_stop_words())
    for operation in request_struct.operations:
        if isinstance(operation, Group):
            group_result = resolve_request(operation, index)
            if current_operator is not None:
                if current_operator.operation == Operations.AND:
                    # NOTE(review): a group result never raises result_set, so
                    # a word following a leading group is unioned, not
                    # intersected. Left as in the original; confirm intent.
                    if result_set:
                        result = result.intersection(group_result)
                    else:
                        result = result.union(group_result)
                if current_operator.operation == Operations.NOT:
                    if group_result is not None:
                        excluded = set(group_result)
                        if not result_set:
                            result = _all_documents(index).difference(excluded)
                        else:
                            result = result.difference(excluded)
                if current_operator.operation == Operations.OR:
                    result = result.union(group_result)
                current_operator = None
        elif isinstance(operation, Operation):
            # Lemmatise once instead of on every comparison.
            token = operation.token()
            if token == "":
                current_operator = operation
            elif token not in stop_words:
                # Fix: every lookup uses the ``index`` argument; the original
                # read the global ``inverted_index`` and ignored the parameter.
                if operation.operation == Operations.AND:
                    docs = index.get(token, set())
                    if not result_set and len(docs) > 0:
                        result = result.union(docs)
                        result_set = True
                    else:
                        result = result.intersection(docs)
                elif operation.operation == Operations.NOT:
                    docs = index.get(token, None)
                    if docs is not None:
                        excluded = set(docs)
                        if current_operator is not None:
                            if current_operator.operation == Operations.AND:
                                result = result.difference(excluded)
                            current_operator = None
                        else:
                            if not result_set:
                                result = _all_documents(index).difference(excluded)
                            else:
                                result = result.difference(excluded)
                elif operation.operation == Operations.OR:
                    # Fix: default to an empty list; the original called
                    # len(None) for a term that is not in the index.
                    docs = index.get(token, [])
                    if len(docs) > 0:
                        if current_operator is not None:
                            if current_operator.operation == Operations.AND:
                                result = result.union(docs)
                            current_operator = None
    return result

In [78]:
# Boolean search using the index.
def search_request(request, inverted_index):
    """Parse ``request`` and evaluate it against ``inverted_index``.

    Args:
        request: raw query string.
        inverted_index: index passed through to ``resolve_request``.

    Returns:
        Whatever ``resolve_request`` returns for the parsed query.
    """
    return resolve_request(parse_request(request), inverted_index)

In [109]:
# Read the urls.
def read_urls(path='../task1/index.txt'):
    """Read the document-number -> URL table from a text file.

    The second whitespace-separated field of each line is taken as the URL
    and stored under the 1-based number of that line. The first field is not
    read (presumably a document number; verify against the file producer).

    Lines with fewer than two fields, such as a trailing blank line, are
    skipped; the original raised ``IndexError`` on them. Splitting on any
    whitespace also strips a trailing ``\\r`` from CRLF files.

    Args:
        path: location of the table. Defaults to the path this notebook has
            always used, so existing ``read_urls()`` calls are unchanged.

    Returns:
        dict[int, str]: line number -> URL.
    """
    result = {}
    # Explicit encoding, consistent with the other file helpers here.
    with open(path, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            fields = line.split()
            if len(fields) < 2:
                continue
            result[line_no] = fields[1]
    return result

In [86]:
# Write the search results to a file.
def write_results(results):
    """Save ``results`` as JSON to ``results.json`` in the current directory.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters as they are instead of escaping them.

    Args:
        results: JSON-serialisable object (here: query -> list of hits).
    """
    with open('results.json', mode='w', encoding='utf8') as target:
        target.write(json.dumps(results, indent=4, ensure_ascii=False))

In [113]:
# Load the index and the urls, run the sample queries, print and save the hits.
inverted_index = read_index()
urls = read_urls()
requests = [
    "сообщество (программистов -хабр) OR разработчики",
    "сообщество",
    "разработчики",
    "программистов",
    "блогеры",
    "(блогеров OR программистов)",
    "зная человека",
    "зная человека можно or (программистов or инженеров) -программистов",
    "уволить с работы",
    "сдать проект",
    "сдать проект or дедлайн",
    "хочу получить повышение",
    "хочу получить повышение на работе",
    "хочу получить повышение -проект",
    "хочу получит повышение -зарплата",
    "хочу получить повышение без -работы"
]
results = {}
for request in requests:
    # One {document number: url} dict per hit, in ascending document order.
    results[request] = [
        {doc_id: urls[doc_id]}
        for doc_id in sorted(search_request(request, inverted_index))
    ]
for key, result in results.items():
    print(f'{key} = {result}')
write_results(results)

сообщество (программистов -хабр) OR разработчики = [{4: 'https://habr.com/ru/post/206900/'}, {6: 'https://habr.com/ru/post/534242/'}, {11: 'https://habr.com/ru/post/183674/'}, {12: 'https://habr.com/ru/post/522524/'}, {14: 'https://habr.com/ru/post/347760/'}, {15: 'https://habr.com/ru/post/543692/'}, {16: 'https://habr.com/ru/post/147042/'}, {20: 'https://habr.com/ru/post/129640/'}, {22: 'https://habr.com/ru/post/316912/'}, {23: 'https://habr.com/ru/post/439766/'}, {25: 'https://habr.com/ru/post/273249/'}, {27: 'https://habr.com/ru/post/415841/'}, {28: 'https://habr.com/ru/post/117367/'}, {30: 'https://habr.com/ru/post/507498/'}, {36: 'https://habr.com/ru/post/86394/'}, {39: 'https://habr.com/ru/post/541114/'}, {40: 'https://habr.com/ru/post/146730/'}, {41: 'https://habr.com/ru/post/142140/'}, {42: 'https://habr.com/ru/post/275841/'}, {43: 'https://habr.com/ru/post/438514/'}, {44: 'https://habr.com/ru/post/278827/'}, {45: 'https://habr.com/ru/post/147828/'}, {46: 'https://habr.com/ru/p