# Toxic BiLSTM — 95% Target Pipeline (structured)

Этот ноутбук:
1. Находит пути к `cc.ru.300.bin`, `rus_toxic_full_df.csv`, папке `experiments/`.
2. Подтягивает **лучший прошлый запуск** и тренирует ансамбль поверх него.
3. Загружает FastText через **gensim** (без нативной сборки fasttext, дружит с Windows).
4. Использует **StreamingTensorDataset** (эмбеддинги на лету, без OOM).
5. Делает **TTA + bias-tuning**, сохраняет итог в `experiments/ensemble_95_target_eng.json`,
   обновляет `best_run_eng.json`, если стало лучше.


## 1) Жёсткая привязка путей (исправь BASE, если нужно)

In [1]:

from pathlib import Path
import os

BASE = Path.cwd()  # Можно поставить, например: Path(r"e:\Python_projects\Toxic_class_base")

FASTTEXT_BIN   = (BASE / "cc.en.300.bin")
DATASET_CSV    = (BASE / "english_subset_50.csv")
EXPERIMENTS_DIR = (BASE / "experiments_en")

print("BASE            :", BASE.resolve())
print("cc.en.300.bin   :", FASTTEXT_BIN, "exists:", FASTTEXT_BIN.exists())
print("dataset csv     :", DATASET_CSV, "exists:", DATASET_CSV.exists())
print("experiments dir :", EXPERIMENTS_DIR, "exists:", EXPERIMENTS_DIR.exists())

EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)


BASE            : E:\Python_projects\Toxic_class_base
cc.en.300.bin   : e:\Python_projects\Toxic_class_base\cc.en.300.bin exists: True
dataset csv     : e:\Python_projects\Toxic_class_base\english_subset_50.csv exists: True
experiments dir : e:\Python_projects\Toxic_class_base\experiments_en exists: True


## 2) Импорты, сиды, предупреждения

In [2]:

import re, math, inspect, json, warnings
from typing import List, Dict, Any, Optional, Tuple
from collections import Counter

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.optim.swa_utils import AveragedModel, update_bn
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("[Init] device:", device)

warnings.filterwarnings("ignore", message="RNN module weights are not part of single contiguous chunk")

GLOBAL_RANDOM_SEED = 42
def set_seed(seed: int = 42):
    import random
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    print(f"[Seed] set to {seed}")

set_seed(GLOBAL_RANDOM_SEED)


[Init] device: cuda
[Seed] set to 42


## 3) FastText через gensim (совместимо с версиями с/без `mmap`)

In [3]:

from gensim.models.fasttext import load_facebook_vectors  # pip install gensim

def _hash_vec(word: str, dim: int, seed: int = 1337):
    rs = np.random.RandomState(abs(hash((word, seed))) % (2**32))
    v = rs.normal(0, 1, size=(dim,)).astype(np.float32)
    v /= (np.linalg.norm(v) + 1e-8)
    return v

class GensimFTWrapper:
    def __init__(self, kv):
        self.kv = kv
        self._dim = int(kv.vector_size)
    def get_dimension(self):
        return self._dim
    def get_word_vector(self, word: str):
        try:
            return self.kv.get_vector(word)
        except KeyError:
            return _hash_vec(word, self._dim)

# --- fastText loader: native -> gensim subword -> gensim vectors ---
import os, numpy as np
from functools import lru_cache

def _hash_vec(word: str, dim: int) -> np.ndarray:
    h = (hash(word) & 0xFFFFFFFF)
    rng = np.random.default_rng(h)
    v = rng.normal(0.0, 0.5, size=(dim,)).astype(np.float32)
    n = float(np.linalg.norm(v)) + 1e-9
    return (v / n).astype(np.float32)

class _NativeFTWrapper:
    def __init__(self, m):
        self.m = m
        self._dim = int(m.get_dimension())
    def get_dimension(self): return self._dim
    @lru_cache(maxsize=200_000)
    def get_word_vector(self, w: str) -> np.ndarray:
        return np.asarray(self.m.get_word_vector(w), dtype=np.float32)

class _GensimSubwordWrapper:
    def __init__(self, m):
        self.m = m
        self._dim = int(m.wv.vector_size)
    def get_dimension(self): return self._dim
    @lru_cache(maxsize=200_000)
    def get_word_vector(self, w: str) -> np.ndarray:
        # у gensim FastText (model.wv) OOV собираются по n-gram, как в native
        return self.m.wv.get_vector(w, norm=False).astype(np.float32)

class _GensimVectorsWrapper:
    def __init__(self, kv):
        self.kv = kv
        self._dim = int(kv.vector_size)
    def get_dimension(self): return self._dim
    @lru_cache(maxsize=200_000)
    def get_word_vector(self, w: str) -> np.ndarray:
        try:
            return self.kv.get_vector(w).astype(np.float32)
        except KeyError:
            return _hash_vec(w, self._dim)  # fallback для OOV

def ensure_fasttext_model(model_path: str, download_if_missing: bool = False):
    if not (isinstance(model_path, str) and os.path.isfile(model_path)):
        raise FileNotFoundError(f"FastText model not found: {model_path}")

    # 1) native fasttext (лучший вариант)
    try:
        import fasttext
        m = fasttext.load_model(model_path)
        print("[FT] Using native fasttext (subword).")
        return _NativeFTWrapper(m)
    except Exception as e:
        print(f"[FT] native fasttext unavailable: {e}")

    # 2) gensim subword model
    try:
        from gensim.models.fasttext import load_facebook_model
        m = load_facebook_model(model_path)
        print("[FT] Using gensim FastText MODEL (subword).")
        return _GensimSubwordWrapper(m)
    except Exception as e:
        print(f"[FT] gensim subword model unavailable: {e}")

    # 3) fallback: plain vectors (без subword)
    from gensim.models.fasttext import load_facebook_vectors
    kv = load_facebook_vectors(model_path)
    print("[FT] Using gensim word VECTORS (NO subword) — expect lower quality.")
    return _GensimVectorsWrapper(kv)



## 4) Токенизация и стриминговый датасет (без OOM)

In [4]:

TOKEN_RE = re.compile(r"\w+", flags=re.UNICODE)

def _tokenize(text: str):
    if not isinstance(text, str):
        return []
    return TOKEN_RE.findall(text.lower())

def embed_tokens(tokens: List[str], ft_model, max_tokens: int) -> torch.Tensor:
    toks = tokens[:max_tokens]
    dim = int(ft_model.get_dimension())
    X = np.zeros((max_tokens, dim), dtype=np.float32)
    for j, tok in enumerate(toks):
        X[j, :] = ft_model.get_word_vector(tok)
    return torch.from_numpy(X)

class StreamingTensorDataset(Dataset):
    def __init__(self, texts: List[str], y_list: List[int], ft_model, max_tokens: int):
        self.texts = list(texts)
        self.y = torch.tensor(list(y_list), dtype=torch.long)
        self.ft = ft_model
        self.max_tokens = int(max_tokens)
        self.tensors = (None, self.y)  # for WeightedRandomSampler

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        toks = _tokenize(self.texts[idx])
        x = embed_tokens(toks, self.ft, self.max_tokens)
        y = self.y[idx]
        return x, y


## 5) Модель BiLSTM

In [5]:

class BiLSTM(nn.Module):
    def __init__(self, embed_dim: int, hidden_size: int, num_layers: int, dropout: float, num_classes: int):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            bidirectional=True,
            batch_first=True,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_size * 2, num_classes)

    def forward(self, x):
        if hasattr(self.lstm, "flatten_parameters"):
            self.lstm.flatten_parameters()
        out, _ = self.lstm(x)     # (B,T,2H)
        out = out[:, -1, :]
        out = self.dropout(out)
        return self.fc(out)

def make_bilstm(num_classes: int, embed_dim: int, hidden_size: int, num_layers: int, dropout: float):
    return BiLSTM(embed_dim=embed_dim, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, num_classes=num_classes)


In [6]:
# === BiLSTM_Advanced с простым self-attention ===
import torch, torch.nn as nn, torch.nn.functional as F

class BiLSTM_Advanced(nn.Module):
    def __init__(self, embed_dim: int, hidden_size: int, num_layers: int,
                 dropout: float, num_classes: int, use_advanced_attention: bool = True):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embed_dim, hidden_size=hidden_size, num_layers=num_layers,
            bidirectional=True, dropout=(dropout if num_layers > 1 else 0.0), batch_first=True
        )
        self.use_attn = use_advanced_attention
        if self.use_attn:
            self.attn = nn.Sequential(
                nn.Linear(2*hidden_size, 2*hidden_size),
                nn.Tanh(),
                nn.Linear(2*hidden_size, 1)
            )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2*hidden_size, num_classes)

    def forward(self, x):
        if hasattr(self.lstm, "flatten_parameters"):
            self.lstm.flatten_parameters()
        out, _ = self.lstm(x)                      # (B, T, 2H)
        if self.use_attn:
            e = self.attn(out).squeeze(-1)         # (B, T)
            a = torch.softmax(e, dim=1).unsqueeze(-1)
            h = (out * a).sum(dim=1)               # (B, 2H)
        else:
            h = out[:, -1, :]
        h = self.dropout(h)
        return self.fc(h)

# фабрика модели: выбирает advanced по флагу
def make_model_from_params(model_p: dict, num_classes: int, embed_dim: int):
    if model_p.get("use_advanced_model", False):
        return BiLSTM_Advanced(
            embed_dim=embed_dim,
            hidden_size=model_p.get("hidden_size", 320),
            num_layers=model_p.get("num_layers", 2),
            dropout=model_p.get("dropout", 0.28),
            num_classes=num_classes,
            use_advanced_attention=model_p.get("use_advanced_attention", True),
        )
    # fallback на классический BiLSTM, если нужен
    return BiLSTM(
        embed_dim=embed_dim,
        hidden_size=model_p.get("hidden_size", 320),
        num_layers=model_p.get("num_layers", 2),
        dropout=model_p.get("dropout", 0.28),
        num_classes=num_classes,
    )


In [7]:
# === DEFAULT: auto_advanced_medium (усилен под финал) ===
ADV_DEFAULT = {
    "data": {
        "sample_per_class": 16000, # попробовать 17000
        "max_tokens": 160, # попробовать 170
        "test_size": 0.20,
        "val_size": 0.10,
        "random_state": 42,
        "class_labels": ["neutral","toxic_not_threat","toxic_threat"],
        "dataset_path": str(DATASET_CSV),
    },
    "model": {
        "embed_dim": 300,
        "hidden_size": 320,
        "num_layers": 2,
        "dropout": 0.28,
        "use_advanced_model": True,
        "use_advanced_attention": True,
        "ms_dropout_samples": 6,   # было 5, попробовать 7
    },
    "train": {
        "batch_size": 36,          # поднимай до 40, если VRAM позволяет
        "epochs": 30,              # было 24
        "learning_rate": 8e-4,     # лучший из свипа
        "weight_decay": 5e-5,
        "scheduler": "onecycle",
        "pct_start": 0.05,         # было 0.10 (раньше пик OneCycle)
        "use_ema": True,
        "use_swa": True,           # включаем SWA
        "focal_gamma": 1.6,        # было 1.5
        "label_smoothing": 0.06,   # было 0.05
    }
}

ADV_DEFAULT["data"]["take_fraction"] = 1.0  # при отладке можно 0.5–0.8



## 6) Потери, EMA, семплер, лоадеры

In [None]:

def make_class_alpha(train_ds, num_classes: int) -> torch.Tensor:
    y = train_ds.tensors[1].cpu().tolist()
    cnt = Counter(y)
    # инверсия частот
    w = [1.0 / max(1, cnt.get(i, 1)) for i in range(num_classes)]
    w = torch.tensor(w, dtype=torch.float32)
    # нормализация: среднее = 1.0 (стабильнее для оптимизатора)
    w = w * (num_classes / w.sum().clamp_min(1e-8))
    return w

class FocalLoss(nn.Module):
    def __init__(self, gamma: float = 1.5, alpha: torch.Tensor | None = None, reduction: str = "mean"):
        super().__init__()
        self.gamma = gamma
        # храним alpha как buffer (float32); может быть на CPU — перенесём в forward
        if isinstance(alpha, torch.Tensor):
            self.register_buffer("alpha", alpha.float())
        else:
            self.alpha = None
        self.reduction = reduction

    def forward(self, logits: torch.Tensor, target: torch.Tensor) -> torch.Tensor:
        logp = F.log_softmax(logits, dim=-1)                 
        p    = logp.exp()
        logpt = logp.gather(1, target.unsqueeze(1)).squeeze(1)  
        pt    = p.gather(1, target.unsqueeze(1)).squeeze(1)     

        if self.alpha is not None:
            alpha = self.alpha
            if alpha.device != target.device:                
                alpha = alpha.to(target.device)              
            at = alpha[target]                               
        else:
            at = 1.0

        loss = - at * ((1.0 - pt).clamp_min(1e-8) ** self.gamma) * logpt
        if self.reduction == "mean":
            return loss.mean()
        if self.reduction == "sum":
            return loss.sum()
        return loss


class EMA:
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow, self.backup = {}, {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = p.data.clone()
    def update(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.shadow[n] = (1 - self.decay) * p.data + self.decay * self.shadow[n]
    def apply_shadow(self, model):
        self.backup = {}
        for n, p in model.named_parameters():
            if p.requires_grad:
                self.backup[n] = p.data.clone()
                p.data = self.shadow[n]
    def restore(self, model):
        for n, p in model.named_parameters():
            if p.requires_grad and n in self.backup:
                p.data = self.backup[n]
        self.backup = {}

IS_WIN = os.name == "nt"

def sampler_from_ds(train_ds):
    y = train_ds.tensors[1].cpu().tolist()
    cnt = Counter(y); total = sum(cnt.values())
    class_weight = {c: total/(len(cnt)*n) for c,n in cnt.items()}
    sample_w = [class_weight[int(t)] for t in y]
    return WeightedRandomSampler(sample_w, num_samples=len(sample_w), replacement=True)

def build_loaders(train_ds, val_ds, batch_size=36, num_workers=2):
    # На Windows форсим 0 воркеров, иначе повиснет на сериализации gensim-модели
    if IS_WIN:
        num_workers = 0
    train_loader = DataLoader(
        train_ds,
        batch_size=batch_size,
        sampler=sampler_from_ds(train_ds),
        shuffle=False,
        num_workers=num_workers,
        pin_memory=False,          # важно: False
        persistent_workers=False,  # важно: False
    )
    val_loader = DataLoader(
        val_ds,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers,
        pin_memory=False,
        persistent_workers=False,
    )
    print(f"[Loaders] batch_size={batch_size}, num_workers={num_workers}, pin_memory=False")
    return train_loader, val_loader


## 7) Тренер v3 (warmup CE → Focal/EMA/SWA + OneCycle, MC-dropout на val)

In [9]:
# === train_model_v3 с EarlyStopping (по val_macro_f1) ===

def train_model_v3(model, train_ds, val_ds, *,
                   epochs=24, base_lr=4.8e-4, weight_decay=5e-5, pct_start=0.1,
                   use_onecycle=True, use_ema=True, use_swa=False, swa_start_ratio=0.7, swa_lr=None,
                   loss_type="focal", focal_gamma=1.6, label_smoothing=0.06, warmup_ce_epochs=5,
                   batch_size=32, num_workers=0, ms_dropout_samples=3, class_names=None,
                   early_stop=True, es_patience=5, es_min_delta=1e-3):
    import math, json
    from torch.optim.swa_utils import AveragedModel, update_bn
    from sklearn.metrics import f1_score, classification_report, confusion_matrix
    from torch.utils.data import DataLoader

    model = model.to(device)
    train_loader, val_loader = build_loaders(train_ds, val_ds, batch_size=batch_size, num_workers=num_workers)

    optimizer = torch.optim.AdamW(model.parameters(), lr=base_lr, weight_decay=weight_decay)
    scheduler = (torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=base_lr,
                 steps_per_epoch=len(train_loader), epochs=epochs, pct_start=pct_start)
                 if use_onecycle else torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs))

    ema = EMA(model, decay=0.999) if use_ema else None
    swa_model = AveragedModel(model) if use_swa else None
    swa_start = int(math.ceil(epochs * swa_start_ratio))

    # классовые веса заранее
    alpha_vec = make_class_alpha(train_ds, model.fc.out_features)

    history = []
    best_f1 = -1.0
    best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
    best_ema_shadow = {k: v.detach().cpu().clone() for k, v in ema.shadow.items()} if ema else None
    no_improve = 0

    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0

        if epoch <= warmup_ce_epochs or loss_type == "ce":
            # CE на прогреве: со сглаживанием + класс-весами
            criterion = nn.CrossEntropyLoss(weight=alpha_vec.to(device), label_smoothing=label_smoothing)
        else:
            # Focal без smoothing (как в оригинале фокала), но с alpha по классам
            criterion = FocalLoss(gamma=focal_gamma, alpha=alpha_vec, reduction="mean")

        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad(set_to_none=True)
            logits = model(X)
            loss = criterion(logits, y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            if ema: ema.update(model)
            if use_onecycle: scheduler.step()
            total_loss += loss.item()

        # ===== Validation (MC-dropout)
        model.eval()
        if ema: ema.apply_shadow(model)
        def _enable_dropout(m):
            if isinstance(m, nn.Dropout):
                m.train()
        y_true, y_pred = [], []
        with torch.no_grad():
            for X, y in val_loader:
                X, y = X.to(device), y.to(device)
                probs_accum = None
                model.apply(_enable_dropout)
                for _ in range(ms_dropout_samples):
                    logits = model(X)
                    probs = F.softmax(logits, dim=-1)
                    probs_accum = probs if probs_accum is None else (probs_accum + probs)
                model.eval()
                preds = (probs_accum / ms_dropout_samples).argmax(dim=-1)
                y_pred += preds.cpu().tolist()
                y_true += y.cpu().tolist()
        if ema: ema.restore(model)
        if not use_onecycle:
            scheduler.step()

        macro_f1 = f1_score(y_true, y_pred, average="macro")
        # аккуратно посчитаем loss за эпоху
        train_loss = total_loss / max(1, len(train_loader))
        history.append({"epoch": epoch, "train_loss": train_loss, "val_macro_f1": macro_f1})

        # возьмём текущий LR (OneCycle обновляет его по батчам)
        curr_lr = scheduler.get_last_lr()[0] if hasattr(scheduler, "get_last_lr") else optimizer.param_groups[0]["lr"]

        # определим, какой лосс сейчас используется
        is_ce = (epoch <= warmup_ce_epochs) or (loss_type == "ce")
        print(
            f"[Epoch {epoch}/{epochs}] "
            f"lr={curr_lr:.2e}  "
            f"loss_type={'CE' if is_ce else 'Focal'}  "
            f"train_loss={train_loss:.4f}  "
            f"val_macro_f1={macro_f1:.4f}"
        )

        # SWA обновление (если включено)
        if use_swa and epoch >= swa_start:
            swa_model.update_parameters(model)

        # Early Stopping
        if macro_f1 > best_f1 + es_min_delta:
            best_f1 = macro_f1
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            if ema:
                best_ema_shadow = {k: v.detach().cpu().clone() for k, v in ema.shadow.items()}
            no_improve = 0
        else:
            no_improve += 1
            if early_stop and no_improve >= es_patience:
                print(f"[EarlyStop] нет улучшений {es_patience} эпох. Лучший val_macro_f1={best_f1:.4f}")
                break

    # Откат на лучшие веса
    model.load_state_dict(best_state)
    if ema and best_ema_shadow is not None:
        ema.shadow = best_ema_shadow
    if use_swa:
        update_bn(train_loader, swa_model, device=device)
        model = swa_model

    # Финальный отчёт по валидации
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for X, y in val_loader:
            X, y = X.to(device), y.to(device)
            pr = model(X).argmax(-1)
            y_pred += pr.cpu().tolist()
            y_true += y.cpu().tolist()
    rep = classification_report(y_true, y_pred, digits=4, zero_division=0, output_dict=True)
    cm = confusion_matrix(y_true, y_pred).tolist()
    metrics_blob = {"history": history,
                    "classification_report": rep,
                    "confusion_matrix": cm,
                    "best_val_macro_f1": best_f1}
    (EXPERIMENTS_DIR/"last_val_metrics_eng.json").write_text(json.dumps(metrics_blob, ensure_ascii=False, indent=2), encoding="utf-8")
    print("[trainer] Saved val metrics → experiments/last_val_metrics_eng.json")
    return model, metrics_blob


## 8) TTA и bias tuning

In [10]:
# пути к исходному ENG CSV и к новому датасету
SRC_ENG = Path(r"eng_toxic_full_df.csv")       # <-- исходный ENG CSV
DST_ENG = Path(r"english_subset_50.csv")     # <-- куда сохранить половину

In [11]:
# нужны только эти колонки (остальные не читаем — экономим RAM)
usecols = [
    "comment_text",
    "neutral", "toxic_not_threat", "toxic_threat",
]
dtypes = {
    "neutral": "int8",
    "toxic_not_threat": "int8",
    "toxic_threat": "int8",
}

# нормализация (как в RU, плюс @user)
URL_RE     = re.compile(r'https?://\S+')
EMOJI_RE   = re.compile(r'[\U00010000-\U0010ffff]', flags=re.UNICODE)
TEXT_AT_RE = re.compile(r'@\w+')

def normalize_text(s: str) -> str:
    s = str(s).strip()
    s = URL_RE.sub('<URL>', s)
    s = EMOJI_RE.sub('<EMOJI>', s)
    s = TEXT_AT_RE.sub('<USER>', s)
    s = re.sub(r'\s+', ' ', s)
    return s

frac = 0.5           # РОВНО половину
rng  = np.random.RandomState(42)  # фиксируем сид

first = True
total_in = total_out = 0
for chunk in pd.read_csv(SRC_ENG, usecols=usecols, dtype=dtypes, chunksize=100_000):
    total_in += len(chunk)
    # берём половину строк из этого чанка
    mask = rng.rand(len(chunk)) < frac
    sub  = chunk.loc[mask].copy()

    if not sub.empty:
        sub = sub.rename(columns={"comment_text": "text"})
        sub["text"] = sub["text"].map(normalize_text)
        sub.to_csv(DST_ENG, index=False, mode=("w" if first else "a"), header=first)
        total_out += len(sub)
        first = False

print(f"[Subset] read: {total_in}  -> wrote: {total_out}  ({total_out/max(1,total_in):.1%})")
print("Saved to:", DST_ENG)


def tta_variants(text: str) -> List[str]:
    t = text if isinstance(text, str) else str(text)
    cand = [t, t.lower(), URL_RE.sub("<URL>", t), EMOJI_RE.sub("<EMOJI>", t), re.sub(r"[^\w\s]", " ", t.lower())]
    seen, out = set(), []
    for v in cand:
        if v not in seen:
            out.append(v); seen.add(v)
    return out[:7]

def predict_proba_tta(model, texts: List[str], ft_model, max_tokens: int, batch=256):
    all_probs = []
    for txt in texts:
        vs = tta_variants(txt)
        Xv = [embed_tokens(_tokenize(v), ft_model, max_tokens) for v in vs]
        Xv = torch.stack(Xv, dim=0)
        dl = DataLoader([(x, 0) for x in Xv], batch_size=batch, shuffle=False)
        model.eval()
        probs_acc = None
        with torch.no_grad():
            for X,_ in dl:
                X = X.to(device)
                logits = model(X)
                probs = F.softmax(logits, dim=-1).cpu().numpy()
                probs_acc = probs if probs_acc is None else np.vstack((probs_acc, probs))
        all_probs.append(probs_acc.mean(axis=0))
    return np.vstack(all_probs)

def tune_logit_biases(probs_val: np.ndarray, y_val: List[int], grid=(-0.5, -0.25, 0.0, 0.25, 0.5)):
    from itertools import product
    C = probs_val.shape[1]
    best_f1, best_b = -1.0, np.zeros(C, dtype=np.float32)
    for deltas in product(grid, repeat=C):
        shifted = probs_val + np.array(deltas)[None, :]
        pred = shifted.argmax(1)
        f1 = f1_score(y_val, pred, average="macro")
        if f1 > best_f1:
            best_f1, best_b = f1, np.array(deltas, dtype=np.float32)
    return best_b, best_f1


[Subset] read: 159571  -> wrote: 79611  (49.9%)
Saved to: english_subset_50.csv


## 9) Чтение прошлых результатов

In [12]:

def load_previous_runs(exp_dir: Path):
    runs = []
    for p in [exp_dir/"runs_eng.jsonl", Path("runs_eng.jsonl")]:
        if p.exists():
            with p.open("r", encoding="utf-8") as f:
                for line in f:
                    try:
                        runs.append(json.loads(line))
                    except:
                        pass
    return runs

def load_best_run(exp_dir: Path):
    for p in [exp_dir/"best_run_eng.json", Path("best_run_eng.json")]:
        if p.exists():
            return json.loads(p.read_text(encoding="utf-8"))
    return None

def get_params_for_id(runs: List[Dict[str,Any]], run_id: str) -> Optional[Dict[str,Any]]:
    for r in runs:
        if isinstance(r, dict):
            pp = r.get("parameters")
            if isinstance(pp, dict) and pp.get("id") == run_id:
                return pp
            if r.get("id") == run_id and "parameters" in r:
                return r["parameters"]
    return None


## 10) Оркестратор improve_from_best

In [None]:
# === improve_from_best: форсим базу = ADV_DEFAULT; опц. свип LR; ансамбль по сид-ам ===
from typing import List, Optional
from pathlib import Path
import pandas as pd, numpy as np, json
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score

def improve_from_best(seeds: List[int] = [42,1337,2025],
                      save_name: str = "ensemble_95_target_eng.json",
                      lr_override: Optional[float] = None,
                      lr_grid: Optional[List[float]] = None,
                      lr_grid_epochs: Optional[int] = None):
    # 1) берём пресет как БАЗУ
    data_p  = dict(ADV_DEFAULT["data"])
    model_p = dict(ADV_DEFAULT["model"])
    train_p = dict(ADV_DEFAULT["train"])
    class_labels = data_p["class_labels"]

    # 2) читаем датасет и сплитим
    df = pd.read_csv(data_p["dataset_path"])
    texts_all = df["text"].astype(str).tolist()
    y_all = np.argmax(df[class_labels].values, axis=1).tolist()

    take_fraction = float(data_p.get("take_fraction", 1.0))
    if 0.0 < take_fraction < 1.0:
        idx = np.arange(len(texts_all))
        idx_sub, _ = train_test_split(
            idx, train_size=take_fraction, random_state=rs, stratify=y_all
        )
        texts_all = [texts_all[i] for i in idx_sub]
        y_all     = [y_all[i]     for i in idx_sub]
        print(f"[DATA] Using fraction={take_fraction:.2f} -> {len(texts_all)} rows")

    rs = int(data_p.get("random_state", 42))
    test_size = float(data_p.get("test_size", 0.2))
    val_size  = float(data_p.get("val_size", 0.1))
    x_tmp, x_test, y_tmp, y_test = train_test_split(texts_all, y_all, test_size=test_size, random_state=rs, stratify=y_all)

    val_rel = val_size / (1.0 - test_size) if (1.0 - test_size) > 0 else 0.0
    X_train, X_val, Y_train, Y_val = train_test_split(
        x_tmp, y_tmp,
        test_size=val_rel,
        random_state=rs,
        stratify=y_tmp
    )
    # для единообразия имён:
    X_test, Y_test = x_test, y_test
    print(f"[SPLIT] train={len(X_train)}  val={len(X_val)}  test={len(X_test)}")

    rel_val = val_size / (1.0 - test_size)
    x_train, x_val, y_train, y_val = train_test_split(x_tmp, y_tmp, test_size=rel_val, random_state=rs, stratify=y_tmp)

    # 3) fastText и стриминговые датасеты
    ft = ensure_fasttext_model(str(FASTTEXT_BIN))
    max_tokens = int(data_p["max_tokens"])
    train_ds = StreamingTensorDataset(x_train, y_train, ft, max_tokens)
    val_ds   = StreamingTensorDataset(x_val,   y_val,   ft, max_tokens)
    test_ds  = StreamingTensorDataset(x_test,  y_test,  ft, max_tokens)
    num_classes = len(class_labels)

    # helper — один прогон
    def _train_and_probs(seed: int, base_lr: float, epochs_use: int):
        set_seed(int(seed))
        model = make_model_from_params(model_p, num_classes=num_classes, embed_dim=model_p.get("embed_dim",300))
        model, _ = train_model_v3(
            model, train_ds, val_ds,
            epochs=train_p.get("epochs", 30),           
            base_lr=best_lr,
            weight_decay=train_p.get("weight_decay", 5e-5),
            pct_start=train_p.get("pct_start", 0.05),
            use_onecycle=(train_p.get("scheduler","onecycle") == "onecycle"),
            use_ema=train_p.get("use_ema", True),
            use_swa=train_p.get("use_swa", True),       # ← SWA
            loss_type="focal",
            focal_gamma=train_p.get("focal_gamma", 1.6),
            label_smoothing=train_p.get("label_smoothing", 0.06),
            warmup_ce_epochs=6,                         # ← подольше прогрев CE
            batch_size=min(36, train_p.get("batch_size", 36)),
            num_workers=0,
            ms_dropout_samples=model_p.get("ms_dropout_samples", 6),
            class_names=class_labels,
            early_stop=True, es_patience=6, es_min_delta=5e-4
        )

        pv = predict_proba_tta(model, x_val,  ft, max_tokens)
        pt = predict_proba_tta(model, x_test, ft, max_tokens)
        return pv, pt

    # 4) выбор LR 
    if lr_override is not None:
        best_lr = float(lr_override)
        print(f"[LR] override = {best_lr}")
    elif lr_grid:
        probe_seed = int(seeds[0]) if seeds else 42
        probe_epochs = int(lr_grid_epochs or min(14, train_p["epochs"]))
        print(f"[LR] grid search on seed={probe_seed}, epochs={probe_epochs}: {lr_grid}")
        best_lr, best_f1 = None, -1.0
        for lr in lr_grid:
            pv, _ = _train_and_probs(probe_seed, float(lr), probe_epochs)
            bias,_ = tune_logit_biases(pv, y_val, grid=(-0.25,-0.1,0.0,0.1,0.25))
            f1 = f1_score(y_val, (pv + bias[None,:]).argmax(1), average="macro")
            print(f"  lr={lr:.6f}  val_macro_f1={f1:.4f}")
            if f1 > best_f1:
                best_f1, best_lr = f1, float(lr)
        print(f"[LR] best = {best_lr:.6f} (val_macro_f1={best_f1:.4f})")
    else:
        best_lr = float(train_p["learning_rate"])
        print(f"[LR] from preset = {best_lr}")

    # 5) ансамбль по сид-ам
    probs_val_sum = probs_test_sum = None
    for s in seeds:
        pv, pt = _train_and_probs(int(s), best_lr, epochs_use=min(20, train_p["epochs"]))
        probs_val_sum  = pv if probs_val_sum  is None else (probs_val_sum  + pv)
        probs_test_sum = pt if probs_test_sum is None else (probs_test_sum + pt)

    probs_val_mean  = probs_val_sum  / max(1, len(seeds))
    probs_test_mean = probs_test_sum / max(1, len(seeds))

    bias, _ = tune_logit_biases(probs_val_mean, y_val,
                                grid=(-0.25, -0.15, -0.1, -0.05, 0.0, 0.05, 0.1, 0.15, 0.25))
    val_pred  = (probs_val_mean  + bias[None,:]).argmax(1)
    test_pred = (probs_test_mean + bias[None,:]).argmax(1)

    val_f1   = f1_score(y_val,  val_pred,  average="macro")
    test_f1  = f1_score(y_test, test_pred, average="macro")
    val_acc  = accuracy_score(y_val,  val_pred)
    test_acc = accuracy_score(y_test, test_pred)

    out = {
        "base_id": "auto_advanced_medium",
        "seeds": list(map(int, seeds)),
        "val_macro_f1": float(val_f1),
        "val_accuracy": float(val_acc),
        "test_macro_f1": float(test_f1),
        "test_accuracy": float(test_acc),
        "bias": bias.tolist(),
        "used_params": {"data":data_p, "model":model_p, "train":train_p},
        "chosen_lr": best_lr,
        "notes": f"Advanced preset + Ensemble x{len(seeds)} + TTA + bias (num_workers=0)"
    }
    (EXPERIMENTS_DIR/save_name).write_text(json.dumps(out, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"[RESULT] VAL F1={val_f1:.4f}  ACC={val_acc:.4f} | TEST F1={test_f1:.4f}  ACC={test_acc:.4f} | LR={best_lr}")
    print(f"[SAVED] {EXPERIMENTS_DIR/save_name}")
    return out


## 11) Запуск улучшения

In [14]:
res = improve_from_best(seeds=[42], lr_override=8e-4)
res


[SPLIT] train=55727  val=7961  test=15923
[FT] Using native fasttext (subword).
[LR] override = 0.0008
[Seed] set to 42
[Loaders] batch_size=36, num_workers=0, pin_memory=False
[Epoch 1/30] lr=6.08e-04  loss_type=CE  train_loss=0.1515  val_macro_f1=0.0046
[Epoch 2/30] lr=7.99e-04  loss_type=CE  train_loss=0.0761  val_macro_f1=0.0852
[Epoch 3/30] lr=7.95e-04  loss_type=CE  train_loss=0.0657  val_macro_f1=0.2141
[Epoch 4/30] lr=7.85e-04  loss_type=CE  train_loss=0.0621  val_macro_f1=0.2427
[Epoch 5/30] lr=7.71e-04  loss_type=CE  train_loss=0.0604  val_macro_f1=0.2469
[Epoch 6/30] lr=7.52e-04  loss_type=CE  train_loss=0.0605  val_macro_f1=0.2454
[Epoch 7/30] lr=7.29e-04  loss_type=Focal  train_loss=0.0023  val_macro_f1=0.5961
[Epoch 8/30] lr=7.02e-04  loss_type=Focal  train_loss=0.0016  val_macro_f1=0.6150
[Epoch 9/30] lr=6.71e-04  loss_type=Focal  train_loss=0.0029  val_macro_f1=0.5745
[Epoch 10/30] lr=6.37e-04  loss_type=Focal  train_loss=0.0027  val_macro_f1=0.6031
[Epoch 11/30] lr=6.0

{'base_id': 'auto_advanced_medium',
 'seeds': [42],
 'val_macro_f1': 0.7184151536881306,
 'val_accuracy': 0.9448561738475066,
 'test_macro_f1': 0.7225389873915398,
 'test_accuracy': 0.9471205174904227,
 'bias': [0.25, -0.25, 0.25],
 'used_params': {'data': {'sample_per_class': 16000,
   'max_tokens': 160,
   'test_size': 0.2,
   'val_size': 0.1,
   'random_state': 42,
   'class_labels': ['neutral', 'toxic_not_threat', 'toxic_threat'],
   'dataset_path': 'e:\\Python_projects\\Toxic_class_base\\english_subset_50.csv',
   'take_fraction': 1.0},
  'model': {'embed_dim': 300,
   'hidden_size': 320,
   'num_layers': 2,
   'dropout': 0.28,
   'use_advanced_model': True,
   'use_advanced_attention': True,
   'ms_dropout_samples': 6},
  'train': {'batch_size': 36,
   'epochs': 30,
   'learning_rate': 0.0008,
   'weight_decay': 5e-05,
   'scheduler': 'onecycle',
   'pct_start': 0.05,
   'use_ema': True,
   'use_swa': True,
   'focal_gamma': 1.6,
   'label_smoothing': 0.06}},
 'chosen_lr': 0.0008

In [15]:
# === Итоговые метрики для отчёта ===
import json, os, numpy as np, pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score

EXP = Path("experiments")
res = globals().get("res", None) 


val_f1 = test_f1 = val_acc = test_acc = None
if isinstance(res, dict):
    val_f1  = float(res.get("val_macro_f1", np.nan))
    test_f1 = float(res.get("test_macro_f1", np.nan))
    val_acc = float(res.get("val_accuracy", np.nan))
    test_acc= float(res.get("test_accuracy", np.nan))


if (val_f1 is None or np.isnan(val_f1)) and (EXP/"last_val_metrics_eng.json").exists():
    j = json.loads((EXP/"last_val_metrics_eng.json").read_text(encoding="utf-8"))
    val_f1 = float(j.get("best_val_macro_f1", np.nan))
if (test_f1 is None or np.isnan(test_f1)) and (EXP/"ensemble_95_target_eng.json").exists():
    j = json.loads((EXP/"ensemble_95_target_eng.json").read_text(encoding="utf-8"))
    test_f1  = float(j.get("test_macro_f1", np.nan))
    test_acc = float(j.get("test_accuracy", np.nan))
    val_acc  = float(j.get("val_accuracy", np.nan)) if val_acc is None else val_acc

# Печать короткого сводного блока
print("=== Итог для отчёта ===")
print(f"Test accuracy (главный критерий задания): {test_acc:.4f}" if test_acc is not None else "Test accuracy: n/a")
print(f"Test macro-F1: {test_f1:.4f}" if test_f1 is not None else "Test macro-F1: n/a")
print(f"Val macro-F1 (для подбора модели): {val_f1:.4f}" if val_f1 is not None else "Val macro-F1: n/a")
print(f"Val accuracy: {val_acc:.4f}" if val_acc is not None else "Val accuracy: n/a")

# Если есть подробный classification_report/CM из последнего валидационного прогона
report_path = EXP/"last_val_metrics_eng.json"
if report_path.exists():
    j = json.loads(report_path.read_text(encoding="utf-8"))
    cr = j.get("classification_report")
    cm = j.get("confusion_matrix")
    if cr:
        print("\n— Per-class на валидации (macro-F1 мы использовали для выбора):")
        df_cr = pd.DataFrame(cr).T
        display(df_cr)
    if cm:
        print("— Confusion matrix (валидация):")
        df_cm = pd.DataFrame(cm,
                             index=["true:neutral","true:not_threat","true:threat"],
                             columns=["pred:neutral","pred:not_threat","pred:threat"])
        display(df_cm)

# Сохраним мини-выжимку для вставки в отчёт
REPORT = Path("report"); REPORT.mkdir(exist_ok=True)
summary = {
    "test_accuracy": test_acc,
    "test_macro_f1": test_f1,
    "val_macro_f1": val_f1,
    "val_accuracy": val_acc,
}
(Path(REPORT/"summary_metrics_eng.json")).write_text(json.dumps(summary, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"\n[Saved] report/summary_metrics_eng.json")


=== Итог для отчёта ===
Test accuracy (главный критерий задания): 0.9471
Test macro-F1: 0.7225
Val macro-F1 (для подбора модели): 0.7184
Val accuracy: 0.9449

[Saved] report/summary_metrics_eng.json
