In [78]:
import pandas as pd
import numpy as np
import json
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [79]:
with open('train_hackathon_dataset.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

print(f"Загружено: {len(df)} писем, {df['label'].nunique()} классов")

Загружено: 1421 писем, 36 классов


<Strong>1. Предобработка текста </Strong>

In [80]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[^а-яёa-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

df['text_processed'] = df['text'].apply(preprocess_text)

<Strong>2. Разделение на train/test</Strong>

In [81]:
X = df['text_processed']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

df_train = pd.DataFrame({
    'idx': X_train.index,
    'text': X_train,
    'label': y_train
})


<Strong>3. Балансировка данных </Strong>

In [82]:
def balance_data(df_train):
    train_counts = df_train['label'].value_counts()
    balanced_dfs = []
    
    for label in train_counts.index:
        label_data = df_train[df_train['label'] == label]
        n_original = len(label_data)
        
        if n_original <= 5:
            target_samples = 40
        elif n_original <= 10:
            target_samples = 60
        elif n_original <= 20:
            target_samples = 80
        elif n_original <= 50:
            target_samples = 100
        else:
            target_samples = 120
        
        if n_original < target_samples:
            augmented_data = label_data.sample(n=target_samples, replace=True, random_state=42)
            final_data = pd.concat([label_data, augmented_data], ignore_index=True)
        else:
            final_data = label_data.sample(n=target_samples, random_state=42)
        
        balanced_dfs.append(final_data)
    
    df_balanced = pd.concat(balanced_dfs, ignore_index=True)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)
    
    if len(df_balanced) > 2800:
        df_balanced = df_balanced.sample(n=2800, random_state=42)
    
    return df_balanced

df_balanced = balance_data(df_train)
print(f"Сбалансированный датасет: {len(df_balanced)} писем")

Сбалансированный датасет: 2800 писем


<Strong>4. Проверка качества </Strong>

In [83]:
vectorizer = TfidfVectorizer(
    max_features=10000,
    min_df=1,
    max_df=0.9,
    ngram_range=(1, 3),
    sublinear_tf=True
)

X_train_vec = vectorizer.fit_transform(df_balanced['text'])
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(
    random_state=42,
    max_iter=3000,
    class_weight='balanced',
    C=0.5,
    solver='liblinear'
)

model.fit(X_train_vec, df_balanced['label'])
y_pred = model.predict(X_test_vec)

macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
f1_scores = f1_score(y_test, y_pred, average=None, zero_division=0)
variance = f1_scores.var()
final_score = macro_f1 - 0.1 * np.sqrt(variance)

<Strong>5. Проверяем и сохраняем итоговый json</Strong>

In [84]:
baseline_f1 = 0.46
requirements_met = macro_f1 >= baseline_f1 and len(df_balanced) <= 3000

print(f"Macro F1-score: {macro_f1:.4f} | Бейзлайн: {baseline_f1} | Улучшение: {macro_f1 - baseline_f1:+.4f}")
print(f"Размер датасета: {len(df_balanced)} | Лимит: 3000")
print(f"Final score: {final_score:.4f}")
print(f"Дисперсия: {variance:.4f}")

Macro F1-score: 0.4834 | Бейзлайн: 0.46 | Улучшение: +0.0234
Размер датасета: 2800 | Лимит: 3000
Final score: 0.4566
Дисперсия: 0.0718


In [85]:
final_data = []
for _, row in df_balanced.iterrows():
    final_data.append({
        'idx': str(row['idx']),
        'text': row['text'],
        'label': row['label']
    })

with open('train_hackathon_dataset_final.json', 'w', encoding='utf-8') as f:
    json.dump(final_data, f, ensure_ascii=False, indent=2)

print(f"Файл итоговый сохранен")
print(f"Итоговые данные: {len(final_data)} писем, {df_balanced['label'].nunique()} классов")


Файл итоговый сохранен
Итоговые данные: 2800 писем, 36 классов
