In [1]:
# Импорт библиотек
import json
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import joblib
import os

# Path configuration
marks_dir = "301"  # Folder with the JSON mark files (marks/<id>.json)
combined_results_dir = "combined_results"  # Folder with the texts (<id>_combined.txt)
output_model_dir = "models"  # Folder where the trained model is saved
scored_results_dir = "logreg_scored_results"  # Folder for the scored output files

# Make sure both output folders exist before anything is written to them.
for _output_dir in (output_model_dir, scored_results_dir):
    os.makedirs(_output_dir, exist_ok=True)

In [3]:
# Функции
def parse_mask(mask):
    """Extract the seven per-task scores encoded in a mask string.

    Every occurrence of a single digit immediately followed by a
    parenthesised single digit (for example ``2(3)``) contributes its leading
    digit as one score. The parenthesised digit is ignored here (presumably
    the maximum score for the task -- not confirmed by this file).

    Args:
        mask: The mask string to parse.

    Returns:
        A list of exactly seven ints, in the order they appear in ``mask``.

    Raises:
        ValueError: If the mask does not contain exactly seven such pairs.
    """
    found = re.findall(r'(\d)\(\d\)', mask)
    if len(found) == 7:
        return list(map(int, found))
    raise ValueError(f"Некорректный mask: {mask}, ожидается 7 оценок")

def read_combined_texts(file_path):
    """Read a combined text file and return the joined text of each section.

    The file is processed line by line after stripping surrounding
    whitespace; blank lines are skipped. Any line ending with ``:`` starts a
    new section whose key is the line without the trailing colon. Every other
    line is appended to the current section. Lines that come before the first
    header, or after a header whose key is empty, are dropped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A dict mapping each section key to its lines joined with single
        spaces. Sections with no lines are absent. If anything goes wrong
        while reading, the error is printed and an empty dict is returned.
    """
    fragments = {}
    active_task = None
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for stripped in (raw.strip() for raw in f):
                if not stripped:
                    continue
                if stripped.endswith(':'):
                    # Header line: switch to the section it names.
                    active_task = stripped[:-1]
                    continue
                if active_task:
                    fragments.setdefault(active_task, []).append(stripped)
    except Exception as e:
        print(f"Ошибка чтения файла {file_path}: {e}")
        return {}
    return {task: " ".join(parts) for task, parts in fragments.items()}

# Data preparation
# Number of files (in sorted-by-name order) that go into the training split;
# every remaining file becomes part of the test split.
TRAIN_FILE_COUNT = 370
task_numbers = [str(i) for i in range(22, 29)]


def _collect_samples(files):
    """Build labelled samples from combined text files and their mark files.

    For each file the work id is the file stem without ``_combined``; the
    marks are read from ``<marks_dir>/<work_id>.json``. Files without a marks
    JSON, or whose JSON has no (or an empty) ``mask``, are skipped. Any error
    while processing a file is printed and that file is skipped.

    Args:
        files: Iterable of ``Path`` objects pointing at ``*_combined.txt`` files.

    Returns:
        A tuple ``(sample_texts, sample_labels, sample_mapping)`` of three
        parallel lists: the task texts, their scores, and
        ``(combined_file, task_num, text)`` tuples recording where each
        sample came from.
    """
    sample_texts = []
    sample_labels = []
    sample_mapping = []
    for combined_file in files:
        work_id = combined_file.stem.replace("_combined", "")
        json_file = Path(marks_dir) / f"{work_id}.json"
        if not json_file.exists():
            continue
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            mask = data.get("mask")
            if not mask:
                continue
            scores = parse_mask(mask)
            task_texts = read_combined_texts(combined_file)
            # parse_mask returns exactly seven scores, one per entry of task_numbers.
            for task_num, score in zip(task_numbers, scores):
                text = task_texts.get(task_num, "")
                if text:  # tasks with no text in the file are not used as samples
                    sample_texts.append(text)
                    sample_labels.append(score)
                    sample_mapping.append((combined_file, task_num, text))
        except Exception as e:
            print(f"Ошибка обработки {combined_file}: {e}")
    return sample_texts, sample_labels, sample_mapping


all_files = sorted(Path(combined_results_dir).glob("*_combined.txt"))
if len(all_files) < TRAIN_FILE_COUNT:
    raise ValueError("Недостаточно файлов для обучения")

# Deterministic split: files are ordered by name, not shuffled.
train_files = all_files[:TRAIN_FILE_COUNT]
test_files = all_files[TRAIN_FILE_COUNT:]

# Training set (the file mapping is only needed for the test set)
texts, labels, _ = _collect_samples(train_files)

# Test set
test_texts, test_labels, test_file_mapping = _collect_samples(test_files)

if not texts:
    raise ValueError("Нет данных для обучения")

print(f"Обучающая выборка: {len(texts)} текстов")
print(f"Тестовая выборка: {len(test_texts)} текстов")

Обучающая выборка: 1748 текстов
Тестовая выборка: 235 текстов


In [4]:
# Model training: TF-IDF features feeding a logistic-regression classifier.
vectorizer = TfidfVectorizer(max_features=5000)
classifier = LogisticRegression(multi_class='multinomial', max_iter=1000)
model = make_pipeline(vectorizer, classifier)

model.fit(texts, labels)
joblib.dump(model, Path(output_model_dir) / "logreg_model.joblib")
print("Модель LogisticRegression обучена и сохранена")

# Model evaluation. The test set is predicted once, in a single batch, and the
# same predictions are reused below when the per-file results are written.
if test_texts:
    predictions = model.predict(test_texts)
    accuracy = accuracy_score(test_labels, predictions)
    print(f"Точность на тестовой выборке: {accuracy:.4f}")
else:
    predictions = []
    print("Нет тестовой выборки")

# Writing the scored files.
# test_file_mapping is filled in lockstep with test_texts, so predictions[i]
# is the predicted score for test_file_mapping[i].
test_file_groups = defaultdict(list)
predicted_scores = {}
for (file_path, task_num, text), score in zip(test_file_mapping, predictions):
    test_file_groups[file_path].append((task_num, text))
    predicted_scores[(file_path, task_num)] = score

for combined_file, tasks in test_file_groups.items():
    try:
        # One entry per task: a "<task>: <score>" header line followed by the text.
        output_lines = [
            f"{task_num}: {predicted_scores[(combined_file, task_num)]}\n{text}"
            for task_num, text in sorted(tasks)
        ]
        output_file = Path(scored_results_dir) / combined_file.name
        with open(output_file, 'w', encoding='utf-8') as f:
            for line in output_lines:
                f.write(line + "\n")
        print(f"Оценки сохранены в {output_file}")
    except Exception as e:
        print(f"Ошибка при записи {combined_file}: {e}")


Модель LogisticRegression обучена и сохранена
Точность на тестовой выборке: 0.4766
Оценки сохранены в logreg_scored_results/2820712761_combined.txt
Оценки сохранены в logreg_scored_results/2820714602_combined.txt
Оценки сохранены в logreg_scored_results/2820715308_combined.txt
Оценки сохранены в logreg_scored_results/2820718037_combined.txt
Оценки сохранены в logreg_scored_results/2820718380_combined.txt
Оценки сохранены в logreg_scored_results/2820720395_combined.txt
Оценки сохранены в logreg_scored_results/2820720456_combined.txt
Оценки сохранены в logreg_scored_results/2820720587_combined.txt
Оценки сохранены в logreg_scored_results/2820720668_combined.txt
Оценки сохранены в logreg_scored_results/2820721594_combined.txt
Оценки сохранены в logreg_scored_results/2820722961_combined.txt
Оценки сохранены в logreg_scored_results/2820723699_combined.txt
Оценки сохранены в logreg_scored_results/2820726115_combined.txt
Оценки сохранены в logreg_scored_results/2820726168_combined.txt
Оценки 

