<a href="https://colab.research.google.com/github/Vakhranev/Pushkina/blob/main/%D0%90%D0%BD%D0%B0%D0%BB%D0%B8%D0%B7%20%D1%83%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%D0%BE%D0%B2%20%D0%BF%D0%BE%20%D0%BF%D1%80%D0%B5%D0%B4%D0%BB%D0%BE%D0%B6%D0%B5%D0%BD%D0%B8%D1%8F%D0%BC%20threshold%3D85.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install python-docx nltk

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [2]:
import docx
import nltk
import os
from difflib import SequenceMatcher
import csv
import re

nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize

def extract_text_from_docx(path):
    doc = docx.Document(path)
    full_text = "\n".join(p.text.strip() for p in doc.paragraphs if p.text.strip())
    return full_text

def clean_and_filter_sentences(text, min_words=3, min_chars=7):
    raw_sents = sent_tokenize(text, language='russian')
    filtered = []
    for s in raw_sents:
        s_clean = s.strip()
        # Удалим технические маркеры и слишком короткие предложения
        s_clean = re.sub(r'^\d+[.)]?\s*', '', s_clean)  # убираем "1.", "2)", "3. " и т.п.
        if len(s_clean) < min_chars:
            continue
        if len(s_clean.split()) < min_words:
            continue
        filtered.append(s_clean)
    return filtered

def is_similar(a, b, threshold):
    return SequenceMatcher(None, a, b).ratio() >= threshold

def compare_sentences(base_sents, other_sents, threshold):
    matches = []
    for sent1 in base_sents:
        for sent2 in other_sents:
            if is_similar(sent1, sent2, threshold):
                matches.append((sent1, sent2))
                break  # считаем только первое совпадение
    return matches

def process_files(new_path, old_paths, threshold):
    new_text = extract_text_from_docx(new_path)
    new_sents = clean_and_filter_sentences(new_text)

    summary = []
    for old_path in old_paths:
        old_text = extract_text_from_docx(old_path)
        old_sents = clean_and_filter_sentences(old_text)

        matches = compare_sentences(new_sents, old_sents, threshold=threshold)
        percent = len(matches) / len(new_sents) * 100

        summary.append({
            'file': os.path.basename(old_path),
            'matches': len(matches),
            'total': len(new_sents),
            'percent': round(percent, 2),
            'matched_pairs': matches,
        })

        # save matched pairs
        output_file = f"matches_{os.path.basename(old_path)}.csv"
        with open(output_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Новый учебник', 'Старый учебник'])
            writer.writerows(matches)

    return summary

# === Пример использования ===
new_file = 'new История нашего края. Донбасс и Новороссия. 5 класс.docx'
old_files = [
    'old История нашего края. ДНР. 5 класс.docx',
    'old История нашего края. Запорожская область. 5 класс.docx',
    'old История нашего края. ЛНР. 5 класс.docx',
    'old История нашего края. Херсонская область. 5 класс.docx'
]

threshold = 0.85
results = process_files(new_file, old_files, threshold)

print("Сводка совпадений:")
for r in results:
    print(f"- {r['file']}: {r['percent']}% ({r['matches']} из {r['total']} предложений совпадают)")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Сводка совпадений:
- old История нашего края. ДНР. 5 класс.docx: 0.45% (18 из 3997 предложений совпадают)
- old История нашего края. Запорожская область. 5 класс.docx: 0.18% (7 из 3997 предложений совпадают)
- old История нашего края. ЛНР. 5 класс.docx: 0.25% (10 из 3997 предложений совпадают)
- old История нашего края. Херсонская область. 5 класс.docx: 0.1% (4 из 3997 предложений совпадают)
