In [None]:
# this part of code for downloading the fasttext modal into the folder "./fasttext-model"

import requests
import certifi
import gzip
import shutil
import os

# URL for the Russian fasttext model
url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz"

# Step 1: Download the file with SSL verification
response = requests.get(url, verify=certifi.where())
with open("cc.ru.300.bin.gz", "wb") as file:
    file.write(response.content)

# Step 2: Unzip the file
with gzip.open("cc.ru.300.bin.gz", "rb") as f_in:
    with open("cc.ru.300.bin", "wb") as f_out:
        shutil.copyfileobj(f_in, f_out)

# Step 3: Create the directory if it doesn't exist
os.makedirs("./fasttext-model", exist_ok=True)

# Step 4: Move the unzipped file to the directory
shutil.move("cc.ru.300.bin", "./fasttext-model/cc.ru.300.bin")

# Step 5: Remove the gzipped file
os.remove("cc.ru.300.bin.gz")

In [None]:
import fasttext
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pymorphy3
import os

In [2]:
# Загрузка необходимых ресурсов
nltk.data.path.append(os.path.abspath("./nltk_data"))
nltk.download("punkt", download_dir="./nltk_data")
nltk.download("stopwords", download_dir="./nltk_data")

# Инициализация pymorphy3
morph = pymorphy3.MorphAnalyzer()

# Получение списка русских стоп-слов
stop_words = set(stopwords.words('russian'))

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1000)>


In [3]:
# Загрузка модели fasttext
model = fasttext.load_model("./fasttext-model/cc.ru.300.bin")



In [4]:
# Открываем файл JSON и загружаем данные
with open('./data-base/dataset.json', 'r', encoding='utf-8') as file:
    dataset_json = json.load(file)

# Извлекаем все значения 'title' в массив
questions = [item['title'] for item in dataset_json['data']]

In [5]:
def lemmatize_text(text):
    text = text.lower()
    words = word_tokenize(text, language="russian")
    words = [word for word in words if word.isalnum() and word not in stop_words]
    lemmatized_words = [morph.parse(word)[0].normal_form for word in words]
    return ' '.join(lemmatized_words)

In [6]:
lemmatized_questions = [{'question': question, 'lemmatized_question': lemmatize_text(question)} for question in questions]
questions = [item["lemmatized_question"] for item in lemmatized_questions]

In [7]:
def sentence_to_vector(sentence, model):
    words = sentence.split()
    word_vectors = [model.get_word_vector(word) for word in words if word in model]
    return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.get_dimension())

In [8]:
# Открываем файл JSON с векторами и загружаем данные
with open('./fasttext-model/dataset-vectors/fasttext_weights.json', 'r', encoding='utf-8') as file:
    vector_data = json.load(file)

question_vectors = np.array(vector_data)

In [9]:
def find_top_similar_questions(new_question, question_vectors, questions, model,):
    new_vector = sentence_to_vector(new_question, model)
    similarities = cosine_similarity([new_vector], question_vectors)[0]
    top_indices = np.argsort(similarities)[::-1]
    top_questions = []
    for idx in top_indices:
        similarity = similarities[idx]
        if similarity >= 0.5:
            if top_questions and (top_questions[0][1] - similarity) > 0.01:
                break
            top_questions.append((questions[idx], similarity))
        if len(top_questions) >= 4:
            break
    return top_questions

In [10]:
def find_original_question(similar_question, dataset):
    for item in dataset['data']:
        if similar_question == lemmatize_text(item['title']):
            return item
    return None

In [11]:
def get_answers_with_details(top_similar_questions, dataset):
    answers_with_details = []
    for question, similarity in top_similar_questions:
        original_question_data = find_original_question(question, dataset)
        if original_question_data:
            answer_detail = {
                "title": original_question_data["title"],
                "description": original_question_data["description"],
                "url": original_question_data["url"]
            }
            answers_with_details.append(answer_detail)
    return answers_with_details

In [20]:
user_question = input()
user_question = lemmatize_text(user_question)
top_similar_questions = find_top_similar_questions(user_question, question_vectors, questions, model)

In [21]:
answers_with_details = get_answers_with_details(top_similar_questions, dataset_json)

prompt = answers_with_details if answers_with_details else [{"answer": "Извините, я не могу найти ответ на ваш вопрос."}]

In [22]:
def save_json(data, file_path):
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, ensure_ascii=False, indent=4)

In [23]:
json_file_path = './querry/prompt.json'
save_json(prompt, json_file_path)