In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import json
import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import string


In [None]:
# Read the dataset and keep only the 'text' field of every record.
with open('sample.json', mode='r', encoding='utf-8') as file:
    sentences = [record['text'] for record in json.load(file)]
sentences

In [None]:
import os

# Folder that holds the synonym dictionaries (*.txt files).
# NOTE(review): absolute, machine-specific path - point it at your own copy.
data_folder = 'c:\\Users\\warpa\\Synonyms'

# Two-level lookup table: first letter -> {word: [synonyms]}.
synonym_tables = {}

for filename in os.listdir(data_folder):
    if not filename.endswith('.txt'):
        continue
    with open(os.path.join(data_folder, filename), 'r', encoding='utf-8') as file:
        for line in file:
            # Split the line into the head word and the bracketed synonym list.
            # partition() never raises, unlike unpacking the result of split('['),
            # so blank or malformed lines can be skipped instead of crashing.
            word, bracket, rest = line.partition('[')
            # Drop surrounding whitespace, a BOM and separator commas so the
            # key is the bare word.
            word = word.strip(' \t\r\n\ufeff,').lower()
            if not bracket or not word:
                continue

            synonyms = [s.strip(" '\t\r\n").lower() for s in rest.split(']')[0].split(',')]
            # An empty list ("[]") or a trailing comma yields empty entries.
            synonyms = [s for s in synonyms if s]

            # Create the per-letter table and the per-word list on demand,
            # then append the synonyms (repeated words accumulate).
            synonym_tables.setdefault(word[0], {}).setdefault(word, []).extend(synonyms)

# Usage example: show the table for one letter.
# The same letter is used for the guard and for the lookup, so the example
# cannot raise KeyError when that letter is missing.
example_letter = 'а'
if example_letter in synonym_tables:
    print(synonym_tables[example_letter])


In [None]:
import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import string

# Module-level pymorphy2 analyzer; the helper functions below read it as a global.
morph = pymorphy2.MorphAnalyzer()

def preprocess_sentence(sentence, word_set, synonym_tables):
    """Run the whole normalisation pipeline on a single sentence.

    The steps are applied in this order: punctuation removal, replacement of
    unknown words, pronoun masking, preposition masking, lemmatization.

    Args:
        sentence: raw sentence text.
        word_set: vocabulary forwarded to ``replace_unknown_words``.
        synonym_tables: synonym lookup forwarded to ``replace_unknown_words``.

    Returns:
        The processed sentence as a single string.
    """
    without_punctuation = remove_punctuation(sentence)
    corrected = replace_unknown_words(without_punctuation, word_set, synonym_tables)
    masked = replace_prepositions(replace_pronouns(corrected))
    return lemmatize_sentence(masked)

def find_synonyms_in_table(sentence, synonym_tables):
    """Collect the known synonyms for every word of a sentence.

    Each lower-cased, whitespace-separated token is reduced to its normal form
    and looked up in the two-level table ``first letter -> {word: synonyms}``.

    The module-level ``morph`` analyzer is reused instead of constructing a new
    ``pymorphy2.MorphAnalyzer`` on every call; this function runs for every
    compared pair of sentences, so a per-call analyzer is pure overhead.

    Args:
        sentence: text to scan.
        synonym_tables: mapping ``letter -> {normal form: list of synonyms}``.

    Returns:
        dict mapping each token (as it appears in the lower-cased sentence) to
        the synonym list stored for its normal form. Tokens without an entry
        are omitted.
    """
    synonyms_dict = {}

    for word in sentence.lower().split():
        normalized_word = morph.parse(word)[0].normal_form
        if not normalized_word:
            # Nothing to index by; skip instead of failing on [0].
            continue

        letter_table = synonym_tables.get(normalized_word[0].lower())
        if letter_table and normalized_word in letter_table:
            synonyms_dict[word] = letter_table[normalized_word]

    return synonyms_dict


def compare_sentences_with_synonyms(sentence1, sentence2, synonym_tables):
    """Rewrite ``sentence1`` so that it uses the synonyms found in ``sentence2``.

    For every word of ``sentence1`` that has an entry in the synonym table, the
    first token of ``sentence2`` that is one of its synonyms replaces that word.
    Replacement is done on whole tokens: the previous ``str.replace`` call also
    rewrote the word wherever it occurred as a substring of a longer word.

    Args:
        sentence1: sentence to rewrite.
        sentence2: sentence that supplies the replacement words.
        synonym_tables: mapping ``letter -> {word: list of synonyms}``.

    Returns:
        Tuple ``(sentence1, sentence2)``. ``sentence2`` is returned unchanged;
        ``sentence1`` is returned unchanged when nothing was replaced, and as
        its tokens joined by single spaces otherwise.
    """
    synonyms_dict1 = find_synonyms_in_table(sentence1, synonym_tables)
    words2 = sentence2.split()

    # word in sentence1 -> synonym from sentence2 that should stand in for it
    replacements = {}
    for word1, synonyms1 in synonyms_dict1.items():
        known_synonyms = set(synonyms1)
        for word2 in words2:
            # An identical word needs no rewriting; only a true synonym does.
            if word2 != word1 and word2 in known_synonyms:
                replacements[word1] = word2
                break

    if replacements:
        sentence1 = ' '.join(replacements.get(token, token) for token in sentence1.split())

    return sentence1, sentence2



def lemmatize_sentence(sentence):
    """Replace every whitespace-separated token with the normal form of its first parse."""
    lemmas = []
    for token in sentence.split():
        first_parse = morph.parse(token)[0]
        lemmas.append(first_parse.normal_form)
    return ' '.join(lemmas)

def _verb_lemmas(sentence):
    """Return the set of normal forms of the tokens tagged VERB or INFN."""
    verbs = set()
    for token in sentence.split():
        parsed = morph.parse(token)[0]  # parse once, use for both tag and lemma
        if parsed.tag.POS in ('VERB', 'INFN'):
            verbs.add(parsed.normal_form)
    return verbs


def compare_sentences(words1, words2, synonym_tables):
    """Score the similarity of two preprocessed sentences.

    The base score is ``|common| / |all|`` over the token sets obtained after
    ``compare_sentences_with_synonyms`` has aligned synonyms, where ``all``
    also contains the table synonyms of every common token. Two penalties of
    0.3 are then applied: one when exactly one of the original sentences
    contains the token 'не', one when the sets of verb lemmas differ. The
    result is not clamped and can be negative.

    Fix: ``synonym_tables`` is keyed by first letter, so the synonyms of a
    common word are now fetched as ``synonym_tables[word[0]][word]``. The old
    ``word in synonym_tables`` test only matched one-letter words and then
    added every dictionary word starting with that letter.
    NOTE(review): synonyms of common words enlarge the denominator and so
    lower the score - kept as written, confirm this is the intended metric.

    Args:
        words1: first preprocessed sentence.
        words2: second preprocessed sentence.
        synonym_tables: mapping ``letter -> {word: list of synonyms}``.

    Returns:
        The similarity score as a number.
    """
    lemmas1 = set(words1.split())
    lemmas2 = set(words2.split())

    # Sentences with synonyms aligned
    sentence1, sentence2 = compare_sentences_with_synonyms(words1, words2, synonym_tables)

    # Token sets of the aligned sentences
    lemmas1_synonyms = set(sentence1.split())
    lemmas2_synonyms = set(sentence2.split())

    common_words = lemmas1_synonyms & lemmas2_synonyms

    common_words_synonyms = set()
    for word in common_words:
        common_words_synonyms.update(synonym_tables.get(word[0], {}).get(word, ()))

    total_words = lemmas1_synonyms | lemmas2_synonyms | common_words_synonyms

    similarity = len(common_words) / len(total_words) if len(total_words) > 0 else 0

    # Negation present in exactly one of the sentences
    if ('не' in lemmas1) != ('не' in lemmas2):
        similarity = similarity - 0.3

    if _verb_lemmas(sentence1) != _verb_lemmas(sentence2):
        similarity = similarity - 0.3

    return similarity

def remove_punctuation(sentence):
    """Delete punctuation characters from a sentence.

    Removes every character of ``string.punctuation`` (as before) and, in
    addition, every character whose Unicode general category is a punctuation
    category (P*). The ASCII-only table left characters such as the quotes
    « », the em dash and the ellipsis in place, where they survived as tokens
    of their own.

    Args:
        sentence: input text.

    Returns:
        The text without punctuation; whitespace is left untouched.
    """
    import unicodedata

    return ''.join(
        char
        for char in sentence
        if char not in string.punctuation
        and not unicodedata.category(char).startswith('P')
    )

def replace_pronouns(sentence):
    """Mask every token whose first parse carries the NPRO grammeme with '<PRONOUN>'."""
    masked = []
    for token in sentence.split():
        tag = morph.parse(token)[0].tag
        masked.append('<PRONOUN>' if 'NPRO' in tag else token)
    return ' '.join(masked)

def replace_prepositions(sentence, replacement_token='PREPOSITION'):
    """Mask every token whose first parse carries the PREP grammeme.

    Args:
        sentence: input text, split on whitespace.
        replacement_token: text substituted for each matching token.

    Returns:
        The tokens joined by single spaces, with matches replaced.
    """
    def mask(token):
        return replacement_token if 'PREP' in morph.parse(token)[0].tag else token

    return ' '.join(map(mask, sentence.split()))

def levenshtein_distance(first_word, second_word):
    """Compute the edit distance between two sequences.

    Counts the minimum number of single-element insertions, deletions and
    substitutions that turn one argument into the other, keeping only two
    rows of the dynamic-programming table at a time.

    Args:
        first_word: first sequence.
        second_word: second sequence.

    Returns:
        The distance as an int.
    """
    # Work with the longer sequence on the outer loop.
    longer, shorter = first_word, second_word
    if len(longer) < len(shorter):
        longer, shorter = shorter, longer

    width = len(shorter)
    if width == 0:
        return len(longer)

    costs = list(range(width + 1))
    for row, char_long in enumerate(longer, start=1):
        new_costs = [row] + [0] * width
        for col in range(1, width + 1):
            new_costs[col] = min(
                costs[col] + 1,                                    # insertion
                new_costs[col - 1] + 1,                            # deletion
                costs[col - 1] + (char_long != shorter[col - 1]),  # substitution
            )
        costs = new_costs

    # The last cell of the final row holds the distance.
    return costs[width]

def find_similar(s1, arr, max_distance=7):
    """Return the word of ``arr`` closest to ``s1`` by Levenshtein distance.

    Every candidate is measured once. The previous version rescanned the
    whole collection for each threshold 0..7, recomputing every distance up
    to eight times. The selected word is the same: the first candidate, in
    iteration order, that has the smallest distance.

    Args:
        s1: word to match.
        arr: iterable of candidate words.
        max_distance: largest distance still accepted (default 7, the limit
            that was previously hard-coded).

    Returns:
        The closest candidate, or None when ``arr`` is empty or no candidate
        lies within ``max_distance``.
    """
    best_word = None
    best_distance = max_distance + 1

    for word in arr:
        distance = levenshtein_distance(s1, word)
        # Strict comparison keeps the first of several equally close words.
        if distance < best_distance:
            best_word, best_distance = word, distance
            if distance == 0:
                break  # cannot get any closer

    return best_word

def replace_unknown_words(sentence, word_set, synonym_tables):
    """Replace words the analyzer could not recognise with the closest known word.

    A token is treated as unknown when the string form of its first parse's
    ``methods_stack`` contains "UnknownPrefixAnalyzer". Such a token is
    replaced by the nearest word from ``word_set`` (see ``find_similar``).

    Fix: the function now returns the corrected sentence itself. It used to
    return debugging text of the form "word -> correction" (or "word -> None"),
    which put the misspelled word, an arrow token and the correction into the
    text passed on to the next preprocessing steps.

    Args:
        sentence: input text, split on whitespace.
        word_set: set of candidate words used for corrections.
        synonym_tables: unused; kept so existing callers keep working.

    Returns:
        The tokens joined by single spaces, with unknown words replaced. A
        token is kept as is when no candidate is close enough.
    """
    corrected_words = []

    for word in sentence.split():
        parsed_word = morph.parse(word)[0]

        if "UnknownPrefixAnalyzer" in str(parsed_word.methods_stack):
            # The misspelled word itself must not be offered as its own correction.
            candidates = word_set - {word}
            corrected_word = find_similar(word, candidates)
            corrected_words.append(word if corrected_word is None else corrected_word)
        else:
            corrected_words.append(word)

    return ' '.join(corrected_words)

# Vocabulary of all words in the dataset; it is the candidate pool for typo
# correction. Punctuation is stripped first because the words that get looked
# up are already punctuation-free: with raw tokens the pool kept entries such
# as "word," and a misspelled word could be "corrected" to itself plus a comma.
all_words_set = {
    word
    for sentence in sentences
    for word in remove_punctuation(sentence).split()
}

# Similarity score for every pair of sentences, keyed by (i, j) with i < j.
similarity_matrix = {}

preprocessed_sentences = [
    preprocess_sentence(sentence, all_words_set, synonym_tables)
    for sentence in sentences
]

# Compare every pair of sentences in the dataset.
for i in tqdm(range(len(preprocessed_sentences))):
    for j in range(i + 1, len(preprocessed_sentences)):
        similarity = compare_sentences(preprocessed_sentences[i], preprocessed_sentences[j], synonym_tables)
        similarity_matrix[(i, j)] = similarity

# Print the similarity of every pair (1-based sentence numbers).
for (i, j), similarity in similarity_matrix.items():
    print(f"Сходство между предложением {i + 1} и предложением {j + 1}: {similarity}")

In [None]:
import pymorphy2
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import string

# Module-level pymorphy2 analyzer, rebound here; functions in this cell read it as a global.
morph = pymorphy2.MorphAnalyzer()

def preprocess_sentence(sentence, word_set, synonym_tables):
    """Normalise one sentence for comparison.

    Punctuation is removed and unknown words are replaced first; the result
    then passes through pronoun masking, preposition masking and
    lemmatization, in that order.

    Args:
        sentence: raw sentence text.
        word_set: vocabulary forwarded to ``replace_unknown_words``.
        synonym_tables: synonym lookup forwarded to ``replace_unknown_words``.

    Returns:
        The processed sentence as a single string.
    """
    processed = replace_unknown_words(remove_punctuation(sentence), word_set, synonym_tables)
    for step in (replace_pronouns, replace_prepositions, lemmatize_sentence):
        processed = step(processed)
    return processed

def find_synonyms_in_table(sentence, synonym_tables):
    """Map the words of a sentence to their synonym lists.

    Tokens are taken from the lower-cased sentence, reduced to their normal
    form and looked up in ``synonym_tables`` (``letter -> {word: synonyms}``).

    The shared module-level ``morph`` analyzer is used; building a new
    ``pymorphy2.MorphAnalyzer`` on every call was unnecessary work for a
    function that is invoked for each compared pair of sentences.

    Args:
        sentence: text to scan.
        synonym_tables: mapping ``letter -> {normal form: list of synonyms}``.

    Returns:
        dict ``token -> synonym list`` for the tokens whose normal form has an
        entry in the table.
    """
    synonyms_dict = {}

    for word in sentence.lower().split():
        normalized_word = morph.parse(word)[0].normal_form
        if not normalized_word:
            # No first letter to index by.
            continue

        letter_table = synonym_tables.get(normalized_word[0].lower())
        if letter_table and normalized_word in letter_table:
            synonyms_dict[word] = letter_table[normalized_word]

    return synonyms_dict

def compare_sentences_with_synonyms(sentence1, sentence2, synonym_tables):
    """Align ``sentence1`` with ``sentence2`` by substituting synonyms.

    Each word of ``sentence1`` that has synonyms in the table is replaced by
    the first token of ``sentence2`` that is one of those synonyms. Whole
    tokens are replaced; the earlier ``str.replace`` call matched substrings
    and could alter longer words that merely contain the word.

    Args:
        sentence1: sentence to rewrite.
        sentence2: sentence that supplies the replacement words.
        synonym_tables: mapping ``letter -> {word: list of synonyms}``.

    Returns:
        Tuple ``(sentence1, sentence2)``; ``sentence2`` is never modified and
        ``sentence1`` is returned untouched when no synonym was substituted.
    """
    synonyms_dict1 = find_synonyms_in_table(sentence1, synonym_tables)
    words2 = sentence2.split()

    # word in sentence1 -> replacement taken from sentence2
    replacements = {}
    for word1, synonyms1 in synonyms_dict1.items():
        known_synonyms = set(synonyms1)
        for word2 in words2:
            # Equal words are already aligned; only a real synonym is substituted.
            if word2 != word1 and word2 in known_synonyms:
                replacements[word1] = word2
                break

    if replacements:
        sentence1 = ' '.join(replacements.get(token, token) for token in sentence1.split())

    return sentence1, sentence2

def lemmatize_sentence(sentence):
    """Return the sentence with each token swapped for the normal form of its first parse."""
    def to_lemma(token):
        return morph.parse(token)[0].normal_form

    return ' '.join(map(to_lemma, sentence.split()))

def _verb_lemmas(sentence):
    """Return the set of normal forms of the tokens tagged VERB or INFN."""
    verbs = set()
    for token in sentence.split():
        parsed = morph.parse(token)[0]  # single parse serves both tag and lemma
        if parsed.tag.POS in ('VERB', 'INFN'):
            verbs.add(parsed.normal_form)
    return verbs


def compare_sentences(words1, words2, synonym_tables):
    """Score the similarity of two preprocessed sentences.

    After ``compare_sentences_with_synonyms`` has aligned synonyms, the base
    score is ``|common| / |all|`` over the two token sets, where ``all`` also
    includes the table synonyms of every common token. 0.3 is subtracted when
    exactly one of the aligned sentences contains 'не', and another 0.3 when
    the sets of verb lemmas differ. The result is not clamped and can be
    negative.

    Fix: the synonyms of a common word are read from
    ``synonym_tables[word[0]][word]``. The table is keyed by first letter, so
    the old ``word in synonym_tables`` test only matched one-letter words and
    then pulled in every dictionary word starting with that letter.
    NOTE(review): synonyms of common words enlarge the denominator and so
    lower the score - kept as written, confirm this is the intended metric.

    Args:
        words1: first preprocessed sentence.
        words2: second preprocessed sentence.
        synonym_tables: mapping ``letter -> {word: list of synonyms}``.

    Returns:
        The similarity score as a number.
    """
    # Sentences with synonyms aligned
    sentence1, sentence2 = compare_sentences_with_synonyms(words1, words2, synonym_tables)

    lemmas1 = set(sentence1.split())
    lemmas2 = set(sentence2.split())

    common_words = lemmas1 & lemmas2

    common_words_synonyms = set()
    for word in common_words:
        common_words_synonyms.update(synonym_tables.get(word[0], {}).get(word, ()))

    total_words = lemmas1 | lemmas2 | common_words_synonyms

    similarity = len(common_words) / len(total_words) if len(total_words) > 0 else 0

    # Negation present in exactly one of the sentences
    if ('не' in lemmas1) != ('не' in lemmas2):
        similarity = similarity - 0.3

    if _verb_lemmas(sentence1) != _verb_lemmas(sentence2):
        similarity = similarity - 0.3

    return similarity

def remove_punctuation(sentence):
    """Strip punctuation from a sentence.

    Besides the ASCII characters of ``string.punctuation`` this also drops
    every character whose Unicode general category is a punctuation category
    (P*), so typographic marks such as « », the em dash and the ellipsis no
    longer remain in the text as separate tokens.

    Args:
        sentence: input text.

    Returns:
        The text without punctuation; whitespace is preserved.
    """
    import unicodedata

    return ''.join(
        char
        for char in sentence
        if char not in string.punctuation
        and not unicodedata.category(char).startswith('P')
    )

# Vocabulary of all words in the dataset, used as the candidate pool for typo
# correction. It is built from punctuation-free text: the words looked up in it
# have already lost their punctuation, so raw tokens such as "word," would stay
# in the pool and could be returned as a "correction" of the same word.
all_words_set = {
    word
    for sentence in sentences
    for word in remove_punctuation(sentence).split()
}

# Similarity score for every pair of sentences, keyed by (i, j) with i < j.
similarity_matrix = {}

preprocessed_sentences = [
    preprocess_sentence(sentence, all_words_set, synonym_tables)
    for sentence in sentences
]

# Compare every pair of sentences in the dataset.
for i in tqdm(range(len(preprocessed_sentences))):
    for j in range(i + 1, len(preprocessed_sentences)):
        similarity = compare_sentences(preprocessed_sentences[i], preprocessed_sentences[j], synonym_tables)
        similarity_matrix[(i, j)] = similarity

# Print the similarity of every pair (1-based sentence numbers).
for (i, j), similarity in similarity_matrix.items():
    print(f"Сходство между предложением {i + 1} и предложением {j + 1}: {similarity}")


In [None]:
# Keep only the pairs whose score lies strictly between 0.4 and 0.5.
sim = [
    (pair[0], pair[1], score)
    for pair, score in similarity_matrix.items()
    if score > 0.4 and score < 0.5
]

In [None]:
# Show the original text of every selected pair, followed by a blank line.
for i, j, similarity in sim:
    sentence1, sentence2 = sentences[i], sentences[j]
    print(
        f"Похожие предложения (сходство {similarity:.2f}):",
        f"{sentence1}",
        f"{sentence2}",
        "",
        sep="\n",
    )