<a href="https://colab.research.google.com/github/Vakhranev/Pushkina/blob/main/%D0%A1%D0%B5%D0%BC%D0%B0%D0%BD%D1%82%D0%B8%D1%87%D0%B5%D1%81%D0%BA%D0%BE%D0%B5%20%D1%81%D1%80%D0%B0%D0%B2%D0%BD%D0%B5%D0%BD%D0%B8%D0%B5%20%D1%83%D1%87%D0%B5%D0%B1%D0%BD%D0%B8%D0%BA%D0%BE%D0%B2%2C%206%20%D0%BA%D0%BB%D0%B0%D1%81%D1%81.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install sentence-transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.11.0->sentence-transformers)
 

In [3]:
!pip install python-docx nltk

Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.2.0


In [4]:
import os
import csv
import docx
import re
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Download the NLTK 'punkt' and 'punkt_tab' tokenizer packages
# (presumably required by sent_tokenize, which is used below).
nltk.download('punkt')
nltk.download('punkt_tab')

# === 1. Model ===
# Module-level sentence-embedding model; get_semantic_matches reads this global.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # or ru-sbert-large

# === 2. Text extraction ===
def extract_text_from_docx(path):
    """Return the paragraph text of a .docx file as one newline-joined string.

    Every paragraph is stripped of surrounding whitespace, and paragraphs that
    are empty after stripping are left out. Only ``Document.paragraphs`` is
    read.

    Args:
        path: Location of the .docx file, passed straight to ``docx.Document``.

    Returns:
        The remaining paragraph texts joined with ``"\\n"``.
    """
    document = docx.Document(path)
    stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
    return "\n".join(chunk for chunk in stripped if chunk)

# === 3. Splitting and cleaning ===
def clean_and_filter_sentences(text, min_words=3, min_chars=7):
    """Split ``text`` into sentences and keep only the ones long enough to compare.

    Sentences come from NLTK's ``sent_tokenize`` with ``language='russian'``.
    Each one is stripped, then a leading run of digits (optionally followed by
    ``.`` or ``)``) and the whitespace after it is removed.

    Args:
        text: Raw document text.
        min_words: Minimum number of whitespace-separated tokens to keep a sentence.
        min_chars: Minimum character length (after cleaning) to keep a sentence.

    Returns:
        The cleaned sentences that pass both length checks, in document order.
    """
    leading_number = re.compile(r'^\d+[.)]?\s*')
    candidates = (
        leading_number.sub('', sentence.strip())
        for sentence in sent_tokenize(text, language='russian')
    )
    return [
        candidate
        for candidate in candidates
        if len(candidate) >= min_chars and len(candidate.split()) >= min_words
    ]

# === 4. Semantic comparison ===
def get_semantic_matches(new_sents, old_sents, threshold=0.8):
    """Pair each new sentence with its most similar old sentence.

    Both lists are embedded with the module-level ``model`` and compared with
    ``util.cos_sim``. For every new sentence the old sentence with the highest
    cosine similarity is selected, and the pair is kept when that similarity
    is at least ``threshold``.

    Args:
        new_sents: Sentences from the new document.
        old_sents: Sentences from the old document.
        threshold: Minimum cosine similarity for a pair to count as a match.

    Returns:
        A list of ``(new_sentence, old_sentence, similarity)`` tuples with at
        most one entry per new sentence, in the order of ``new_sents``.
        ``similarity`` is a plain ``float``. An empty list is returned when
        either input is empty.
    """
    # Nothing to compare: avoid encoding an empty batch.
    if not new_sents or not old_sents:
        return []

    new_embeddings = model.encode(new_sents, convert_to_tensor=True)
    old_embeddings = model.encode(old_sents, convert_to_tensor=True)

    # Similarity matrix: one row per new sentence, one column per old sentence.
    cosine_scores = util.cos_sim(new_embeddings, old_embeddings)

    # Row-wise maximum yields the best old sentence for every new sentence,
    # so the reported pair and score are the strongest, not merely the first hit.
    best_scores, best_indices = cosine_scores.max(dim=1)
    # Threshold on the tensor itself so the comparison uses the tensor's dtype.
    is_match = (best_scores >= threshold).tolist()

    return [
        (new_sent, old_sents[old_index], score)
        for new_sent, keep, old_index, score in zip(
            new_sents, is_match, best_indices.tolist(), best_scores.tolist()
        )
        if keep
    ]

# === 5. Main pipeline ===
def process_files_semantic(new_path, old_paths, threshold=0.8):
    """Compare one new document against several old ones and summarise the overlap.

    For every path in ``old_paths`` the function finds semantic matches for
    the new document's sentences and writes them to
    ``semantic_matches_<old file name>.csv`` in the current working directory,
    with a header row followed by one row per matched pair.

    Args:
        new_path: Path of the new .docx document.
        old_paths: Iterable of paths of the old .docx documents.
        threshold: Minimum cosine similarity for two sentences to match.

    Returns:
        A list with one dict per old document, holding the keys ``file``
        (base name), ``matches`` (count), ``total`` (number of new sentences),
        ``percent`` (matches / total * 100, rounded to 2 places; ``0.0`` when
        the new document has no usable sentences) and ``matched_pairs``.
    """
    new_text = extract_text_from_docx(new_path)
    new_sents = clean_and_filter_sentences(new_text)
    total = len(new_sents)

    summary = []
    for old_path in old_paths:
        old_text = extract_text_from_docx(old_path)
        old_sents = clean_and_filter_sentences(old_text)

        # With nothing on one side there can be no matches; skip the encoder.
        if new_sents and old_sents:
            matches = get_semantic_matches(new_sents, old_sents, threshold)
        else:
            matches = []
        # Guard against ZeroDivisionError when the new document is empty.
        percent = len(matches) / total * 100 if total else 0.0

        old_name = os.path.basename(old_path)
        summary.append({
            'file': old_name,
            'matches': len(matches),
            'total': total,
            'percent': round(percent, 2),
            'matched_pairs': matches,
        })

        # Save the matched pairs to CSV (written even when there are none).
        output_file = f"semantic_matches_{old_name}.csv"
        with open(output_file, 'w', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['Новый учебник', 'Старый учебник', 'Сходство'])
            writer.writerows(matches)

    return summary

# === 6. Usage ===
# Input documents are referenced by bare file name, i.e. relative to the
# current working directory.
new_file = 'new История нашего края. Донбасс и Новороссия. 6 класс.docx'
old_files = [
    'old История нашего края. ДНР. 6 класс.docx',
    'old История нашего края. Запорожская область. 6 класс.docx',
    'old История нашего края. ЛНР. 6 класс.docx',
    'old История нашего края. Херсонская область. 6-7 классы.docx',
]

# Minimum cosine similarity for two sentences to count as a match.
threshold = 0.80
results = process_files_semantic(new_file, old_files, threshold)

# Print one summary line per old document: file name, percentage, and counts.
print("Сводка семантических совпадений:")
for r in results:
    print(f"- {r['file']}: {r['percent']}% ({r['matches']} из {r['total']})")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Сводка семантических совпадений:
- old История нашего края. ДНР. 6 класс.docx: 6.44% (136 из 2111)
- old История нашего края. Запорожская область. 6 класс.docx: 5.59% (118 из 2111)
- old История нашего края. ЛНР. 6 класс.docx: 6.06% (128 из 2111)
- old История нашего края. Херсонская область. 6-7 классы.docx: 3.27% (69 из 2111)
