In [6]:
# Импорт библиотек
import json
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import joblib
import os

# Path configuration: input folders and the folders this notebook writes to
marks_dir = "301"
combined_results_dir = "combined_results"
output_model_dir = "models"
scored_results_dir = "decitree_scored_results"

# Create both output folders up front (no error if they already exist)
Path(output_model_dir).mkdir(parents=True, exist_ok=True)
Path(scored_results_dir).mkdir(parents=True, exist_ok=True)

# Report whether the two input folders are present
marks_dir_found = os.path.exists(marks_dir)
combined_dir_found = os.path.exists(combined_results_dir)
print(f"Папка marks существует: {marks_dir_found}")
print(f"Папка combined_results существует: {combined_dir_found}")


Папка marks существует: True
Папка combined_results существует: True


In [7]:
# Функции для обработки данных
def parse_mask(mask):
    """Extract the seven scores encoded in a mask string.

    Every occurrence of a single digit immediately followed by a
    parenthesised single digit contributes its first digit as one score.

    Args:
        mask: The mask string to scan.

    Returns:
        A list of seven ints, in order of appearance.

    Raises:
        ValueError: If the number of matches is not exactly seven.
    """
    digits = [match.group(1) for match in re.finditer(r'(\d)\(\d\)', mask)]
    if len(digits) != 7:
        raise ValueError(f"Некорректный mask: {mask}, ожидается 7 оценок")
    return list(map(int, digits))

def read_combined_texts(file_path):
    """Read a combined-text file and return its text grouped by header.

    A non-empty line ending in ``:`` starts a new section named by the line
    without that trailing colon. Every following non-empty line is stripped
    and added to the current section. Lines that appear before the first
    header (or under an empty header name) are dropped.

    Args:
        file_path: Path of the UTF-8 text file to read.

    Returns:
        A dict mapping each section name that received at least one line to
        its lines joined by single spaces. If anything goes wrong while
        reading, the error is printed and an empty dict is returned.
    """
    sections = defaultdict(list)
    active_task = None
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            for raw_line in handle:
                stripped = raw_line.strip()
                if stripped == "":
                    continue
                if stripped.endswith(':'):
                    # Header line: switch the current section
                    active_task = stripped[:-1]
                    continue
                if active_task:
                    sections[active_task].append(stripped)
    except Exception as e:
        print(f"Ошибка чтения файла {file_path}: {e}")
        return {}

    merged = {}
    for task, fragments in sections.items():
        merged[task] = " ".join(fragments)
    return merged

# Data preparation
task_numbers = [str(i) for i in range(22, 29)]

# How many of the first (sorted) files are used for training; any remaining
# files form the test set.
train_file_count = 370


def _collect_samples(files):
    """Build text/label samples from a sequence of combined-text files.

    For each file, the matching ``<work_id>.json`` inside ``marks_dir`` is
    loaded and its ``"mask"`` value is parsed into scores with
    ``parse_mask``. Each entry of ``task_numbers`` that has non-empty text in
    the combined file yields one sample, labelled with the score at the same
    position. Files with no JSON, no mask, or no readable text are skipped.
    Any exception raised while handling a file is printed and that file is
    skipped.

    Args:
        files: Iterable of ``Path`` objects pointing at ``*_combined.txt``.

    Returns:
        A tuple ``(sample_texts, sample_labels, sample_mapping)``. The three
        lists are aligned; ``sample_mapping`` holds
        ``(combined_file, task_num, text)`` triples.
    """
    sample_texts = []
    sample_labels = []
    sample_mapping = []
    for combined_file in files:
        try:
            work_id = combined_file.stem.replace("_combined", "")
            json_file = Path(marks_dir) / f"{work_id}.json"
            if not json_file.exists():
                continue
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            mask = data.get("mask")
            if not mask:
                continue
            scores = parse_mask(mask)
            task_texts = read_combined_texts(combined_file)
            if not task_texts:
                continue
            for i, task_num in enumerate(task_numbers):
                text = task_texts.get(task_num, "")
                if text:
                    sample_texts.append(text)
                    sample_labels.append(scores[i])
                    sample_mapping.append((combined_file, task_num, text))
        except Exception as e:
            # Best effort: one bad file must not abort the whole run
            print(f"Ошибка: {e}")
    return sample_texts, sample_labels, sample_mapping


all_files = sorted(Path(combined_results_dir).glob("*_combined.txt"))
if len(all_files) < train_file_count:
    print(f"Недостаточно файлов: найдено только {len(all_files)}")
# Slicing covers both cases: with too few files everything goes to training
# and the test list is empty.
train_files = all_files[:train_file_count]
test_files = all_files[train_file_count:]

# Training data (the file mapping is only needed for the test set)
texts, labels, _ = _collect_samples(train_files)

# Test data
test_texts, test_labels, test_file_mapping = _collect_samples(test_files)


In [None]:
# Model training
if not texts:
    raise ValueError("Нет обучающих данных")

# TF-IDF features (at most 5000 terms) feeding a decision tree classifier
model = make_pipeline(
    TfidfVectorizer(max_features=5000),
    DecisionTreeClassifier(random_state=42)
)
model.fit(texts, labels)
joblib.dump(model, Path(output_model_dir) / "decision_tree_model.joblib")
print("Модель Decision Tree обучена.")

# Accuracy check
if test_texts:
    preds = model.predict(test_texts)
    acc = accuracy_score(test_labels, preds)
    print(f"Точность на тесте: {acc:.4f}")
else:
    preds = []
    print("Нет тестовых данных")

# Saving results
# test_file_mapping is filled in lockstep with test_texts, so the batch
# predictions can be reused here instead of running the model once per text.
predicted_scores = {
    (file_path, task_num): score
    for (file_path, task_num, _), score in zip(test_file_mapping, preds)
}

grouped = defaultdict(list)
for file_path, task_num, text in test_file_mapping:
    grouped[file_path].append((task_num, text))

for combined_file, tasks in grouped.items():
    try:
        output_lines = [
            f"{task_num}: {predicted_scores[(combined_file, task_num)]}\n{text}\n"
            for task_num, text in sorted(tasks)
        ]
        output_file = Path(scored_results_dir) / combined_file.name
        with open(output_file, 'w', encoding='utf-8') as f:
            f.writelines(output_lines)
        print(f"Оценки сохранены: {output_file.name}")
    except Exception as e:
        print(f"Ошибка при сохранении файла {combined_file}: {e}")
