In [4]:
import json

def read_index():
    """Load the TF-IDF index from ``../task4/tf_idf.json``.

    Returns:
        The parsed JSON document. The rest of this notebook uses it as a dict
        of ``term -> {"doc_<n>": {"idf": ..., "tf-idf": ...}}``.
    """
    with open('../task4/tf_idf.json', 'r', encoding='utf-8') as index_file:
        loaded_index = json.load(index_file)
    return loaded_index

In [5]:
from nltk.corpus import stopwords
import string


def read_stop_words():
    """Return the tokens to discard: NLTK Russian stop words plus punctuation.

    Returns:
        list: the NLTK ``"russian"`` stop words followed by every character
        of ``string.punctuation``.
    """
    stop_words = stopwords.words("russian")
    # Extending a list with a str appends each character individually.
    stop_words.extend(string.punctuation)
    return stop_words

In [6]:
from nltk import word_tokenize


def get_tokens(text, stop_words):
    """Split ``text`` into lowercase alphabetic tokens, dropping stop words.

    Hyphens are replaced by spaces before tokenizing, so hyphenated words are
    split into their parts.

    Args:
        text: raw text to tokenize.
        stop_words: container of tokens to discard (compared after lowercasing).

    Returns:
        list[str]: lowercased tokens that are not stop words and consist only
        of alphabetic characters.
    """
    raw_tokens = word_tokenize(text.replace("-", " "), language="russian")
    lowered = (token.lower() for token in raw_tokens)
    # str.isalpha() is False for the empty string, so no separate check is needed.
    return [
        token
        for token in lowered
        if token not in stop_words and token.isalpha()
    ]

In [7]:
import pymorphy2

# Shared pymorphy2 analyzer, created once when this cell runs; get_lemma reads
# this module-level name.
morph = pymorphy2.MorphAnalyzer()

def get_lemma(token):
    """Return the normal form (lemma) of ``token``.

    Only the first parse returned by the module-level ``morph`` analyzer is
    used.
    """
    first_parse = morph.parse(token)[0]
    return first_parse.normal_form

In [8]:
import math

# методы для вычисления tf, idf, tf-idf

def compute_tf(term_count, all_terms_count):
    """Return the term frequency ``term_count / all_terms_count``.

    The result is rounded to 6 decimal places.

    Raises:
        ZeroDivisionError: if ``all_terms_count`` is zero.
    """
    frequency = term_count / float(all_terms_count)
    return round(frequency, 6)


def compute_idf(docs_count, all_docs):
    """Return the inverse document frequency ``log10(all_docs / docs_count)``.

    The result is rounded to 6 decimal places.

    Args:
        docs_count: number of documents that contain the term.
        all_docs: total number of documents.
    """
    ratio = all_docs / float(docs_count)
    return round(math.log10(ratio), 6)


def compute_tf_idf(tf, idf):
    """Return the TF-IDF weight ``tf * idf`` rounded to 6 decimal places."""
    weight = tf * idf
    return round(weight, 6)

In [9]:
def get_vector_matrix(index, N):
    """Build a dense document-by-term matrix of TF-IDF weights.

    Args:
        index: dict of ``term -> {"doc_<n>": {"tf-idf": ...}, ...}``.
        N: exclusive upper bound on document numbers. Rows are produced for
            ``doc_1`` .. ``doc_{N-1}``, i.e. ``N - 1`` rows.

    Returns:
        list[list]: row ``k`` holds the weights of document ``k + 1``.
        Columns follow the iteration order of ``index``; a term that is
        absent from a document contributes ``0``.
    """
    rows = []
    for doc_number in range(1, N):
        doc_key = f"doc_{doc_number}"
        rows.append([
            postings[doc_key]["tf-idf"] if doc_key in postings else 0
            for postings in index.values()
        ])
    return rows

def write_vector_matrix(matrix):
    """Save the document matrix to ``matrix.json`` in the working directory.

    The file is written as UTF-8 JSON with 4-space indentation and without
    ASCII escaping.
    """
    with open('matrix.json', 'w', encoding='utf8') as matrix_file:
        json.dump(matrix, matrix_file, indent=4, ensure_ascii=False)

def read_vector_matrix():
    """Load the document matrix previously saved to ``matrix.json``.

    Returns:
        The parsed JSON content of ``matrix.json``.
    """
    with open('matrix.json', 'r', encoding='utf8') as matrix_file:
        stored_matrix = json.load(matrix_file)
    return stored_matrix

In [10]:
def read_urls(path='../task1/index.txt'):
    """Read the document-number -> URL mapping from the crawl index file.

    The URL is taken from the second whitespace-separated field of each line
    (presumably lines look like ``<id> <url>`` - confirm against the crawler
    output). Documents are numbered by line position, starting at 1; the
    first field of the line is not used.

    Args:
        path: location of the index file. Defaults to the path that was
            previously hard-coded, so existing calls are unaffected.

    Returns:
        dict[int, str]: 1-based line number -> URL.
    """
    result = {}
    # Explicit encoding: the default depends on the platform locale, and every
    # other file in this notebook is opened as UTF-8.
    with open(path, encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()
            if len(fields) < 2:
                # Blank or malformed line (e.g. a trailing empty line): there
                # is no URL to record, but the line still counts for numbering.
                continue
            result[line_number] = fields[1]
    return result

In [24]:
def parse_request(request, index):
    """Turn a raw query into the list of its lemmas that exist in the index.

    The query is tokenized with ``get_tokens`` (stop words removed), each
    token is lemmatized with ``get_lemma``, and lemmas that are not keys of
    ``index`` are dropped. Duplicates and the original order are preserved.

    Args:
        request: raw query text.
        index: dict whose keys are the known terms.

    Returns:
        list[str]: lemmas of the query that are present in ``index``.
    """
    stop_words = read_stop_words()
    known_terms = index.keys()
    lemmas = [get_lemma(token) for token in get_tokens(request, stop_words)]
    return [lemma for lemma in lemmas if lemma in known_terms]

def get_request_vector(words, index):
    """Build the dense TF-IDF vector of a query over all index terms.

    Args:
        words: query lemmas, all of which must be keys of ``index``.
        index: dict of ``term -> {"doc_<n>": {"idf": ...}, ...}``.

    Returns:
        list: one weight per index term, in the iteration order of ``index``;
        terms that are absent from the query are ``0``.
    """
    terms = list(index.keys())
    vector = [0] * len(terms)
    total_words = len(words)
    for word in set(words):
        tf = compute_tf(words.count(word), total_words)
        # The idf is read from the first posting stored for the term.
        first_posting = next(iter(index[word].values()))
        vector[terms.index(word)] = compute_tf_idf(tf, first_posting["idf"])
    return vector

In [25]:
def search_request_with_matrix(request, index, matrix):
    """Rank documents by cosine similarity using the dense document matrix.

    Args:
        request: raw query text.
        index: dict of ``term -> postings`` used to parse and weight the query.
        matrix: list of document rows as produced by ``get_vector_matrix``;
            row ``k`` belongs to document ``k + 1`` and its columns follow
            the iteration order of ``index``.

    Returns:
        dict[int, float]: document number -> cosine similarity rounded to 6
        decimal places. Only non-zero similarities are included, ordered from
        most to least similar. Empty when the query contains no indexed terms.
    """
    words = parse_request(request, index)
    vector = get_request_vector(words, index)
    len_vec = math.sqrt(sum(x ** 2 for x in vector))
    if len_vec == 0:
        # No indexed term in the query (or all weights are zero): cosine
        # similarity is undefined, so nothing matches.
        return {}

    sim = {}
    # enumerate(..., start=1) visits every row. The previous
    # range(1, len(matrix)) loop never scored the last document.
    for doc_number, doc_vector in enumerate(matrix, start=1):
        len_doc = math.sqrt(sum(x ** 2 for x in doc_vector))
        if len_doc == 0:
            # An all-zero document vector cannot match and would divide by zero.
            continue
        mult = sum(q * d for q, d in zip(vector, doc_vector))
        score = round(mult / (len_vec * len_doc), 6)
        if score != 0.0:
            sim[doc_number] = score

    # Stable sort: documents with equal scores stay in ascending number order.
    return dict(sorted(sim.items(), key=lambda item: -item[1]))

In [19]:
# Build the dense document matrix from the index and save it to matrix.json.
# NOTE(review): get_vector_matrix iterates range(1, N), so N = 100 yields rows
# for doc_1..doc_99 only - confirm this matches the number of documents.
N = 100
index = read_index()
matrix = get_vector_matrix(index, N)
write_vector_matrix(matrix)

In [32]:
def write_results(results):
    """Save the search results to ``results.json`` in the working directory.

    The file is written as UTF-8 JSON with 4-space indentation and without
    ASCII escaping, so Cyrillic queries stay readable.
    """
    with open('results.json', 'w', encoding='utf8') as results_file:
        json.dump(results, results_file, indent=4, ensure_ascii=False)

In [26]:
%%time
# Load the index, the dense document matrix and the doc-number -> URL mapping.
N = 100
index = read_index()
matrix = read_vector_matrix()
urls = read_urls()
# Queries to run against the dense matrix.
requests = [
    "программисты",
    "блогеры программисты",
    "как повысить зарплату",
    "как получить повышение",
    "как получить повышение на работе",
    "как заработать",
    "как заработать миллион",
    "миллион",
]
# results maps each query string to its ranked list of hits.
results = {}
for i, request in enumerate(requests):
    result = search_request_with_matrix(request, index, matrix)
    result_list = results[request] = []
    for key, item in result.items():
        result_item = {}
        result_item["doc"] = key
        result_item["url"] = urls[key]
        # NOTE: despite the key name, this value is the cosine similarity
        # returned by search_request_with_matrix, not a raw tf-idf weight.
        result_item["tf-idf"] = item
        result_list.append(result_item)
# Print the ranked hits for every query.
for key, result in results.items():
    print(f'{key} = {result}')

# Persist the results to results.json.
write_results(results)

программисты = [{'doc': 88, 'url': 'https://habr.com/ru/post/210142/', 'tf-idf': 0.299229}, {'doc': 11, 'url': 'https://habr.com/ru/post/183674/', 'tf-idf': 0.080794}, {'doc': 42, 'url': 'https://habr.com/ru/post/275841/', 'tf-idf': 0.059777}, {'doc': 69, 'url': 'https://habr.com/ru/post/51544/', 'tf-idf': 0.036804}, {'doc': 97, 'url': 'https://habr.com/ru/post/423889/', 'tf-idf': 0.032214}, {'doc': 77, 'url': 'https://habr.com/ru/post/192604/', 'tf-idf': 0.020493}, {'doc': 59, 'url': 'https://habr.com/ru/post/488592/', 'tf-idf': 0.01934}, {'doc': 36, 'url': 'https://habr.com/ru/post/86394/', 'tf-idf': 0.018285}, {'doc': 53, 'url': 'https://habr.com/ru/post/450266/', 'tf-idf': 0.0178}, {'doc': 43, 'url': 'https://habr.com/ru/post/438514/', 'tf-idf': 0.017367}, {'doc': 90, 'url': 'https://habr.com/ru/post/70330/', 'tf-idf': 0.015807}, {'doc': 20, 'url': 'https://habr.com/ru/post/129640/', 'tf-idf': 0.015038}, {'doc': 12, 'url': 'https://habr.com/ru/post/522524/', 'tf-idf': 0.014262}, {'

In [12]:
def get_docs_dict(index, N):
    """Build a sparse per-document view of the index.

    Args:
        index: dict of ``term -> {"doc_<n>": {"tf-idf": ...}, ...}``.
        N: exclusive upper bound on document numbers. Entries are produced
            for ``doc_1`` .. ``doc_{N-1}``.

    Returns:
        dict: ``"doc_<n>" -> {term: tf-idf}`` holding only the terms whose
        weight in that document is non-zero. Terms keep the iteration order
        of ``index``.
    """
    docs_dict = {}
    for doc_number in range(1, N):
        doc_key = f"doc_{doc_number}"
        term_weights = {}
        for term, postings in index.items():
            if doc_key not in postings:
                continue
            weight = postings[doc_key]["tf-idf"]
            if weight != 0:
                term_weights[term] = weight
        docs_dict[doc_key] = term_weights
    return docs_dict

def write_docs_dict(matrix):
    """Save the per-document term dictionary to ``docs_dict.json``.

    The file is written as UTF-8 JSON with 4-space indentation and without
    ASCII escaping.
    """
    with open('docs_dict.json', 'w', encoding='utf8') as docs_file:
        json.dump(matrix, docs_file, indent=4, ensure_ascii=False)
        
def read_docs_dict():
    """Load the per-document term dictionary saved in ``docs_dict.json``.

    Returns:
        The parsed JSON content of ``docs_dict.json``.
    """
    with open('docs_dict.json', 'r', encoding='utf8') as docs_file:
        stored_docs = json.load(docs_file)
    return stored_docs

In [30]:
def parse_request(request, terms):
    """Turn a raw query into the list of its lemmas that are known terms.

    NOTE: this redefines the earlier ``parse_request(request, index)`` from
    this notebook. Membership is tested with ``in``, so ``terms`` may be any
    container of terms, including the index dict itself.

    Args:
        request: raw query text.
        terms: container of known terms.

    Returns:
        list[str]: lemmas of the query found in ``terms``, in query order,
        duplicates preserved.
    """
    stop_words = read_stop_words()
    lemmas = [get_lemma(token) for token in get_tokens(request, stop_words)]
    return [lemma for lemma in lemmas if lemma in terms]

def get_request_index(words, index):
    """Compute count, tf, idf and tf-idf for every distinct query lemma.

    Args:
        words: query lemmas, all of which must be keys of ``index``.
        index: dict of ``term -> {"doc_<n>": {"idf": ...}, ...}``.

    Returns:
        dict: ``lemma -> {"count", "tf", "idf", "tf-idf"}``.
    """
    total_words = len(words)
    request_index = {}
    for word in set(words):
        count = words.count(word)
        tf = compute_tf(count, total_words)
        # The idf is read from the first posting stored for the term.
        idf = next(iter(index[word].values()))["idf"]
        request_index[word] = {
            "count": count,
            "tf": tf,
            "idf": idf,
            "tf-idf": compute_tf_idf(tf, idf),
        }
    return request_index

def search_request_by_index(request, index, matrix):
    """Rank documents by cosine similarity using the sparse document dictionary.

    Args:
        request: raw query text.
        index: dict of ``term -> postings`` used to parse and weight the query.
        matrix: dict ``"doc_<n>" -> {term: tf-idf}`` as produced by
            ``get_docs_dict`` (in effect a sparse document matrix).

    Returns:
        dict[int, float]: document number -> cosine similarity rounded to 6
        decimal places. Only non-zero similarities are included, ordered from
        most to least similar. Empty when the query contains no indexed terms.
    """
    # Pass the index dict itself: membership tests are then constant-time, and
    # the call works with either parse_request definition in this notebook.
    words = parse_request(request, index)
    request_index = get_request_index(words, index)
    len_vec = math.sqrt(sum(x["tf-idf"] ** 2 for x in request_index.values()))
    if len_vec == 0:
        # No indexed term in the query (or all weights are zero): cosine
        # similarity is undefined, so nothing matches.
        return {}

    sim = {}
    # Keys are "doc_1".."doc_<len>". The upper bound is inclusive; the previous
    # range(1, len(matrix)) loop never scored the last document.
    for i in range(1, len(matrix) + 1):
        doc = matrix.get(f"doc_{i}")
        if not doc:
            # Missing or empty document: zero norm, cannot match.
            continue
        len_doc = math.sqrt(sum(x ** 2 for x in doc.values()))
        if len_doc == 0:
            continue
        # Only the query terms can contribute to the dot product.
        mult = sum(
            value["tf-idf"] * doc.get(key, 0)
            for key, value in request_index.items()
        )
        score = round(mult / (len_vec * len_doc), 6)
        if score != 0.0:
            sim[i] = score

    # Stable sort: documents with equal scores stay in ascending number order.
    return dict(sorted(sim.items(), key=lambda item: -item[1]))

In [27]:
# Build the per-document term dictionary (a sparse document matrix) from the
# index and save it to docs_dict.json.
# NOTE(review): get_docs_dict iterates range(1, N), so N = 100 yields entries
# for doc_1..doc_99 only - confirm this matches the number of documents.
N = 100
index = read_index()
docs_dict = get_docs_dict(index, N)
write_docs_dict(docs_dict)

In [31]:
%%time
# Load the index, the sparse per-document dictionary and the doc-number -> URL
# mapping.
N = 100
index = read_index()
matrix = read_docs_dict()
urls = read_urls()
# Queries to run against the sparse document dictionary.
requests = [
    "программисты",
    "блогеры программисты",
    "как повысить зарплату",
    "как получить повышение",
    "как получить повышение на работе",
    "как заработать",
    "как заработать миллион",
    "миллион",
]
# results maps each query string to its ranked list of hits.
results = {}
for i, request in enumerate(requests):
    result = search_request_by_index(request, index, matrix)
    result_list = results[request] = []
    for key, item in result.items():
        result_item = {}
        result_item["doc"] = key
        result_item["url"] = urls[key]
        # NOTE: despite the key name, this value is the cosine similarity
        # returned by search_request_by_index, not a raw tf-idf weight.
        result_item["tf-idf"] = item
        result_list.append(result_item)
# Print the ranked hits for every query.
for key, result in results.items():
    print(f'{key} = {result}')
    
# Persist the results to results.json (overwrites the file written by the
# dense-matrix run above).
write_results(results)

программисты = [{'doc': 88, 'url': 'https://habr.com/ru/post/210142/', 'tf-idf': 0.299229}, {'doc': 11, 'url': 'https://habr.com/ru/post/183674/', 'tf-idf': 0.080794}, {'doc': 42, 'url': 'https://habr.com/ru/post/275841/', 'tf-idf': 0.059777}, {'doc': 69, 'url': 'https://habr.com/ru/post/51544/', 'tf-idf': 0.036804}, {'doc': 97, 'url': 'https://habr.com/ru/post/423889/', 'tf-idf': 0.032214}, {'doc': 77, 'url': 'https://habr.com/ru/post/192604/', 'tf-idf': 0.020493}, {'doc': 59, 'url': 'https://habr.com/ru/post/488592/', 'tf-idf': 0.01934}, {'doc': 36, 'url': 'https://habr.com/ru/post/86394/', 'tf-idf': 0.018285}, {'doc': 53, 'url': 'https://habr.com/ru/post/450266/', 'tf-idf': 0.0178}, {'doc': 43, 'url': 'https://habr.com/ru/post/438514/', 'tf-idf': 0.017367}, {'doc': 90, 'url': 'https://habr.com/ru/post/70330/', 'tf-idf': 0.015807}, {'doc': 20, 'url': 'https://habr.com/ru/post/129640/', 'tf-idf': 0.015038}, {'doc': 12, 'url': 'https://habr.com/ru/post/522524/', 'tf-idf': 0.014262}, {'