In [None]:
# Ячейка 1: Импорты и базовые настройки

# Стандартная библиотека
import json
from math import radians
from multiprocessing import Pool, cpu_count
import os
import random
import re
import warnings
from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor, as_completed

# Сторонние библиотеки
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
import numpy as np
import pandas as pd
from scipy.spatial.distance import cdist
import swifter
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoConfig,
)
import torch
import torch.nn as nn
import torch.multiprocessing
from torch.utils.data import DataLoader
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,
    classification_report,
    f1_score,
    confusion_matrix,
    precision_recall_curve,
)
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import joblib
import shap
from tqdm import tqdm
from googletrans import Translator
import googletrans

# Локальные настройки
from dotenv import load_dotenv
# Load environment variables (MAIN_DIR is read with os.getenv further down).
load_dotenv()

# Share tensors between processes through the "file_system" strategy.
torch.multiprocessing.set_sharing_strategy("file_system")

print(googletrans.__version__)
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Select the compute device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Current device is: {device}")


In [None]:
# Ячейка 2: Пути к файлам и конфигурация
# Root directory of the project data; it must be provided through the
# MAIN_DIR environment variable (optionally via the .env file).
main_dir = os.getenv("MAIN_DIR")
if not main_dir:
    # Fail early with a clear message instead of the opaque TypeError that
    # os.path.join(None, ...) would raise on the next line.
    raise RuntimeError(
        "Environment variable MAIN_DIR is not set; define it in the environment or in the .env file."
    )

model_dir = os.path.join(main_dir, "model_deberta_v3_xsmall")
data_dir = os.path.join(main_dir, "liar2")
extra_data_dir = os.path.join(main_dir, "liar-twitter")
unreliable_data_dir = os.path.join(main_dir, "News_dataset")
scifact_dir = os.path.join(main_dir, "SciFact")
fever_dir = os.path.join(main_dir, "FEVER")
output_dir = os.path.join(main_dir, "output")
intermediate_dir = os.path.join(main_dir, "saved")

# Both directories are written to later (results and cached clustering
# artefacts), so make sure they exist before anything tries to save there.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(intermediate_dir, exist_ok=True)

class config:
    # Shared settings and hyper-parameters, read elsewhere as config.<NAME>.
    MODEL = model_dir
    MAX_LEN = 128
    BATCH_SIZE_TRAIN = 16
    BATCH_SIZE_VALID = 16
    EPOCHS = 5
    LEARNING_RATE = 2e-5
    LEARNING_RATE_BIN = 4e-6
    SEED = 42
    NUM_CLASSES = 4  # one less, because classes 0 and 1 are merged later on
    NUM_CLASSES_BIN = 1  # for the binary model
    NUM_WORKERS = 4  # if memory is tight, 0 is the better choice
    GRADIENT_ACCUMULATION_STEPS = 4
    WEIGHT_DECAY = 0.01
    WEIGHT_DECAY_BIN = 1e-4

In [None]:
# Ячейка 3: Функция для установки seed
def seed_everything(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and switches cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Only affects processes started after this point (e.g. pool workers);
    # the hash seed of the running interpreter is already fixed.
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats `deterministic = True`; switch it off as well.
    torch.backends.cudnn.benchmark = False

# Seed all generators once, up front, with the notebook-wide seed.
seed_everything(config.SEED)

In [None]:
# Ячейка 4.0: Подготовка к обработке данных
# English stop-word set and WordNet lemmatizer used by clean_text below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Ordered (pattern, replacement) pairs that expand English contractions.
# clean_text applies them one after another with re.IGNORECASE. Each rule has
# two variants: with an apostrophe, and with the apostrophe already replaced
# by a space.
contractions = [
    # Irregular negations must come first: the generic "n't" rule below would
    # otherwise turn "can't" into "ca not" and "won't" into "wo not".
    (r"\bcan['’]t\b", "can not"),
    (r"\bcan t\b", "can not"),
    (r"\bwon['’]t\b", "will not"),
    (r"\bwon t\b", "will not"),
    (r"\bshan['’]t\b", "shall not"),
    (r"\bshan t\b", "shall not"),
    (r"(\w+)n['’]t\b", r"\1 not"),  # couldn't -> could not
    (r"(\w+)n t\b", r"\1 not"),    # couldn t -> could not
    (r"(\w+)['’]ll\b", r"\1 will"),
    (r"(\w+) ll\b", r"\1 will"),
    (r"(\w+)['’]re\b", r"\1 are"),
    (r"(\w+) re\b", r"\1 are"),
    (r"(\w+)['’]ve\b", r"\1 have"),
    (r"(\w+) ve\b", r"\1 have"),
    (r"(\w+)['’]m\b", r"\1 am"),
    (r"(\w+) m\b", r"\1 am"),
    (r"(\w+)['’]d\b", r"\1 would"),
    (r"(\w+) d\b", r"\1 would"),
    # "'s" is only expanded after these words, where it means "is" rather
    # than a possessive.
    (r"\b(he|she|it|that|there|what|who|where)['’]s\b", r"\1 is"),
    (r"\b(he|she|it|that|there|what|who|where) s\b", r"\1 is")
]

# Negation tokens that must survive stop-word removal: dropping them would
# invert the meaning of a statement.
_NEGATION_WORDS = frozenset({'not', 'no', 'nor', 'never', 'none', 'nothing', 'nowhere', 'n\'t'})

# A standalone number: optional sign, optional decimal point, optional "%".
_NUMERIC_TOKEN = re.compile(r'^[-+]?\d*\.?\d+%?$')


def clean_text(text):
    """Normalise a raw statement into a space-separated string of lemmas.

    Steps: expand contractions, strip e-mail addresses and URLs, replace every
    character that is not a word character, whitespace or one of ``. - + % $``
    by a space, tokenize, then lower-case and lemmatize each token and drop
    stop words. Numeric tokens are kept verbatim and negation words are always
    kept.

    Args:
        text: Raw statement. ``None`` and float NaN (how pandas represents a
            missing string) yield an empty string; any other non-string value
            is converted with ``str``.

    Returns:
        The cleaned text; may be empty.
    """
    # Missing values would otherwise crash re.sub with a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

    text = re.sub(r'\S+@\S+', ' ', text)
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
    text = re.sub(r'[^\w\s\.\-\+\%\$]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    cleaned_tokens = []
    for token in nltk.word_tokenize(text):
        # Numbers and percentages carry factual content: keep them untouched.
        if _NUMERIC_TOKEN.match(token):
            cleaned_tokens.append(token)
            continue

        lemma = lemmatizer.lemmatize(token.lower())
        if (lemma not in stop_words) or (lemma in _NEGATION_WORDS):
            cleaned_tokens.append(lemma)

    return ' '.join(cleaned_tokens)

In [None]:
# Ячейка 4.1: Загрузка данных liar2 (https://huggingface.co/datasets/chengxuphd/liar2)
# Read the three liar2 splits, keeping only the label and the statement text.
liar_train, liar_valid, liar_test = (
    pd.read_csv(os.path.join(data_dir, f"{split_name}.csv"))[['label', 'statement']]
    for split_name in ("train", "valid", "test")
)

def liar_redo(df: pd.DataFrame) -> pd.DataFrame:
    """Convert six-way LIAR labels into a binary ``label_bin`` column.

    Labels 0 (pants-on-fire) and 1 (false) are merged into the "false" class:
    both are lies, they differ only in how blatant the lie is, and without
    extra context the model can hardly tell them apart. Label 5 becomes the
    "true" class. Every other label (the intermediate grades 2-4, missing
    values, unknown codes) is dropped.

    Args:
        df: Frame with a numeric ``label`` column. It is NOT modified.

    Returns:
        A new frame in which ``label`` is replaced, in the same column
        position, by an int64 ``label_bin`` (0 = false, 1 = true), with the
        discarded rows removed and the index reset.
    """
    raw = df['label']
    is_false = raw.isin([0, 1])
    is_true = raw == 5
    keep = is_false | is_true

    # .assign works on a copy, so the caller's frame (often a column slice of
    # a freshly read CSV) is never written to.
    return (
        df.loc[keep]
        .assign(label=is_true[keep].to_numpy(dtype='int64'))
        .rename(columns={'label': 'label_bin'})
        .reset_index(drop=True)
    )

# Binarise the labels of every liar2 split.
liar_train = liar_redo(liar_train)
liar_valid = liar_redo(liar_valid)
liar_test = liar_redo(liar_test)

# Per-class views of each split (1 = true, 0 = false).
liar_true_train = liar_train[liar_train['label_bin'] == 1]
liar_false_train = liar_train[liar_train['label_bin'] == 0]

liar_true_valid = liar_valid[liar_valid['label_bin'] == 1]
liar_false_valid = liar_valid[liar_valid['label_bin'] == 0]

liar_true_test = liar_test[liar_test['label_bin'] == 1]
liar_false_test = liar_test[liar_test['label_bin'] == 0]

# Class balance of each split.
print(f"Train liar2: {liar_train['label_bin'].value_counts()}")
print(f"Valid liar2: {liar_valid['label_bin'].value_counts()}")
print(f"Test liar2: {liar_test['label_bin'].value_counts()}")

In [None]:
# Ячейка 4.2: Загрузка данных liar-twitter (https://www.kaggle.com/datasets/muhammadimran112233/liar-twitter-dataset/data)
liar_twitter = pd.read_csv(os.path.join(extra_data_dir, "Liar_Dataset.csv"))

# Map the textual labels onto the numeric scale liar_redo expects.
# Labels missing from the mapping become NaN and are dropped by liar_redo.
label_mapping = {"pants-fire": 0, "FALSE": 1, "barely-true": 2, "half-true": 3, "mostly-true": 4, "TRUE": 5}
liar_twitter['label'] = liar_twitter['label'].map(label_mapping)

liar_twitter = liar_twitter[['label', 'statement']]
liar_twitter = liar_redo(liar_twitter)

# Split by class so that each class can be partitioned separately below.
liar_twitter_true = liar_twitter[liar_twitter['label_bin'] == 1].reset_index(drop=True)
liar_twitter_false = liar_twitter[liar_twitter['label_bin'] == 0].reset_index(drop=True)

def split_train_valid_test(df: pd.DataFrame):
    """Randomly partition ``df`` into train / validation / test parts.

    20% of the rows are held out first and that hold-out is then halved, which
    gives roughly an 80/10/10 split. Both steps use ``config.SEED``.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames.
    """
    seed = config.SEED
    train_part, holdout = train_test_split(df, test_size=0.2, random_state=seed)
    valid_part, test_part = train_test_split(holdout, test_size=0.5, random_state=seed)
    return train_part, valid_part, test_part

# Split each class separately so that every part keeps the class ratio.
liar_twitter_true_train, liar_twitter_true_valid, liar_twitter_true_test = split_train_valid_test(liar_twitter_true)
liar_twitter_false_train, liar_twitter_false_valid, liar_twitter_false_test = split_train_valid_test(liar_twitter_false)

# Re-assemble the per-class parts into the final splits.
liar_twitter_train = pd.concat([liar_twitter_true_train, liar_twitter_false_train]).reset_index(drop=True)
liar_twitter_valid = pd.concat([liar_twitter_true_valid, liar_twitter_false_valid]).reset_index(drop=True)
liar_twitter_test  = pd.concat([liar_twitter_true_test, liar_twitter_false_test]).reset_index(drop=True)

# Class balance of each split.
print(f"Train liar-twitter: {liar_twitter_train['label_bin'].value_counts()}")
print(f"Valid liar-twitter: {liar_twitter_valid['label_bin'].value_counts()}")
print(f"Test liar-twitter: {liar_twitter_test['label_bin'].value_counts()}")

In [None]:
# Ячейка 4.3: Загрузка данных Fake News (https://www.kaggle.com/datasets/emineyetm/fake-news-detection-datasets/data)

# unreliable_false = pd.read_csv(os.path.join(unreliable_data_dir, "Fake.csv"))
# unreliable_true = pd.read_csv(os.path.join(unreliable_data_dir, "True.csv"))

# unreliable_false['statement'] = unreliable_false['text']
# unreliable_true['statement']  = unreliable_true['text']

# unreliable_false = unreliable_false[['statement']]
# unreliable_true = unreliable_true[['statement']]

# unreliable_false['label_bin'] = 0
# unreliable_true['label_bin'] = 1

# def clean_false(text_series):
#     return text_series.str.replace(r'((\s?[a-zA-Z,.-]+){1,4}\s+?\(@.+?\)\s[a-zA-Z]+\s[0-3]?[0-9],?\s?20[0-9][0-9].+$)|(((https://)|(([a-zA-Z]+?\.){1,3}[a-zA-Z]+?/))[a-zA-Z./0-9]+\s)|(((Featured image via)|(Photo by)|(Image via)).+$)', '', regex=True)

# def clean_truths(text_series):
#     return text_series.str.replace(r'(^.*?-\s*)|(\s*-\sSource link.*$)|(((bit\.ly.+?)|(via\s+@[a-zA-Z0-9]+))?\s*\[.+?\]\s*-)', '', regex=True)

# def parallelize_series(series, func, n_cores=None):
#     if n_cores is None:
#         n_cores = cpu_count()
#     chunk_size = int(np.ceil(len(series) / n_cores))
#     chunks = [series[i*chunk_size:(i+1)*chunk_size] for i in range(n_cores)]

#     with Pool(n_cores) as pool:
#         results = pool.map(func, chunks)
#     return pd.concat(results)


# for text in unreliable_false['statement'][0:1]:
#   print(text)

# unreliable_false['statement'] = parallelize_series(unreliable_false['statement'], clean_false, 10)
# unreliable_false['statement'] = unreliable_false['statement'].apply(lambda x: clean_text(x))
# unreliable_false['statement'] = unreliable_false['statement'].fillna("").astype(str)
# unreliable_false = unreliable_false[unreliable_false['statement'].str.strip() != ""].reset_index(drop=True)

# for text in unreliable_false['statement'][0:1]:
#   print(text)

# for text in unreliable_true['statement'][0:1]:
#   print(text)

# unreliable_true['statement'] = parallelize_series(unreliable_true['statement'], clean_truths, 10)
# unreliable_true['statement'] = unreliable_true['statement'].apply(lambda x: clean_text(x))
# unreliable_true['statement'] = unreliable_true['statement'].fillna("").astype(str)
# unreliable_true = unreliable_true[unreliable_true['statement'].str.strip() != ""].reset_index(drop=True)

# for text in unreliable_true['statement'][0:1]:
#   print(text)

# # Сохраняем обработанные датафреймы в CSV
# unreliable_false.to_csv(os.path.join(unreliable_data_dir, "unreliable_false_cleaned.csv"), index=False)
# unreliable_true.to_csv(os.path.join(unreliable_data_dir, "unreliable_true_cleaned.csv"), index=False)

# Read the pre-cleaned data frames back from CSV (written by the
# commented-out preprocessing code above).
unreliable_true = pd.read_csv(os.path.join(unreliable_data_dir, "unreliable_true_cleaned.csv"))
unreliable_false = pd.read_csv(os.path.join(unreliable_data_dir, "unreliable_false_cleaned.csv"))

# Split each class separately so that every part keeps the class ratio.
unreliable_true_train, unreliable_true_valid, unreliable_true_test = split_train_valid_test(unreliable_true)
unreliable_false_train,unreliable_false_valid, unreliable_false_test = split_train_valid_test(unreliable_false)

# Re-assemble the per-class parts into the final splits.
unreliable_train = pd.concat([unreliable_true_train, unreliable_false_train]).reset_index(drop=True)
unreliable_valid = pd.concat([unreliable_true_valid, unreliable_false_valid]).reset_index(drop=True)
unreliable_test  = pd.concat([unreliable_true_test, unreliable_false_test]).reset_index(drop=True)

# Class balance of each split.
print(f"Train fake-news: {unreliable_train['label_bin'].value_counts()}")
print(f"Valid fake-news: {unreliable_valid['label_bin'].value_counts()}")
print(f"Test fake-news: {unreliable_test['label_bin'].value_counts()}")

In [None]:
# Ячейка 4.4: Загрузка данных SCiFact (https://www.kaggle.com/datasets/thedevastator/unlock-insight-into-scientific-claims-with-scifa)

scifact_train = pd.read_csv(os.path.join(scifact_dir, "claims_train.csv"))
scifact_valid = pd.read_csv(os.path.join(scifact_dir, "claims_validation.csv"))

# The shipped train/validation files are pooled and re-split below.
scifact_all = pd.concat([scifact_train, scifact_valid])

scifact_all = scifact_all.rename(columns={'claim': 'statement'})
scifact_all = scifact_all.rename(columns={'evidence_label': 'label_bin'})
scifact_all = scifact_all.drop(columns=['id', 'evidence_doc_id', 'evidence_sentences', 'cited_doc_ids'])
# Rows with a missing value in any remaining column are discarded.
scifact_all = scifact_all.dropna()
# "SUPPORT" -> 1 (true); every other remaining label -> 0 (false).
scifact_all['label_bin'] = scifact_all['label_bin'].apply(lambda x: 1 if x == "SUPPORT" else 0)

scifact_true = scifact_all[scifact_all['label_bin'] == 1].reset_index(drop=True)
scifact_false = scifact_all[scifact_all['label_bin'] == 0].reset_index(drop=True)

# Split each class separately so that every part keeps the class ratio.
scifact_true_train, scifact_true_valid, scifact_true_test = split_train_valid_test(scifact_true)
scifact_false_train, scifact_false_valid, scifact_false_test = split_train_valid_test(scifact_false)

# NOTE: scifact_train / scifact_valid are rebound here from the raw CSV
# frames to the final splits.
scifact_train = pd.concat([scifact_true_train, scifact_false_train]).reset_index(drop=True)
scifact_valid = pd.concat([scifact_true_valid, scifact_false_valid]).reset_index(drop=True)
scifact_test  = pd.concat([scifact_true_test, scifact_false_test]).reset_index(drop=True)

# Class balance of each split.
print(f"scifact_train:\n{scifact_train['label_bin'].value_counts()}")
print(f"scifact_valid:\n{scifact_valid['label_bin'].value_counts()}")
print(f"scifact_test:\n{scifact_test['label_bin'].value_counts()}")

In [None]:
# Ячейка 4.5: Загрузка данных FEVER (https://fever.ai/dataset/fever.html)

fever = pd.read_json(os.path.join(fever_dir, "train.jsonl"), lines=True)

# Keep only verifiable claims with a definite verdict.
fever = fever[fever['verifiable'] == "VERIFIABLE"].drop(columns=['verifiable', 'id', 'evidence'])
fever = fever[fever['label'] != "NOT ENOUGH INFO"].rename(columns={'label': 'label_bin', 'claim': 'statement'})
fever = fever.dropna().reset_index(drop=True)
# "SUPPORTS" -> 1 (true); every other remaining label -> 0 (false).
fever['label_bin'] = fever['label_bin'].apply(lambda x: 1 if x == "SUPPORTS" else 0)

fever_true = fever[fever['label_bin'] == 1]
fever_false = fever[fever['label_bin'] == 0]

# Split each class separately so that every part keeps the class ratio.
fever_true_train, fever_true_valid, fever_true_test = split_train_valid_test(fever_true)
fever_false_train, fever_false_valid, fever_false_test = split_train_valid_test(fever_false)

# Re-assemble the per-class parts into the final splits.
fever_train = pd.concat([fever_true_train, fever_false_train]).reset_index(drop=True)
fever_valid = pd.concat([fever_true_valid, fever_false_valid]).reset_index(drop=True)
fever_test  = pd.concat([fever_true_test, fever_false_test]).reset_index(drop=True)

# Class balance of each split.
print(f"fever_train:\n{fever_train['label_bin'].value_counts()}")
print(f"fever_valid:\n{fever_valid['label_bin'].value_counts()}")
print(f"fever_test:\n{fever_test['label_bin'].value_counts()}")

In [None]:
# Ячейка 5: Объединение.
# Name prefixes of the per-dataset frames defined in the cells above; the
# frames are looked up by name as "<dataset>_<split>".
datasets = ['liar', 'liar_twitter', 'unreliable', 'scifact', 'fever']
splits = ['train', 'valid', 'test']

# Tag every row with its source dataset (adds a column to the frames in place).
for ds in datasets:
    for split in splits:
        df = globals()[f"{ds}_{split}"]
        df['dataset_name'] = ds

# Concatenate the datasets split by split.
train_df = pd.concat([globals()[f"{ds}_train"] for ds in datasets]).reset_index(drop=True)
valid_df = pd.concat([globals()[f"{ds}_valid"] for ds in datasets]).reset_index(drop=True)
test_df = pd.concat([globals()[f"{ds}_test"] for ds in datasets]).reset_index(drop=True)

In [None]:
# Ячейка 6: Форматируем данные в датасетах.
# Normalise the statement text of every split.
train_df['statement'] = train_df['statement'].swifter.apply(clean_text)
valid_df['statement'] = valid_df['statement'].swifter.apply(clean_text)
test_df['statement']  = test_df['statement'].swifter.apply(clean_text)

# Remove duplicates inside each split.
train_df = train_df.drop_duplicates(subset=['statement'], keep='first').reset_index(drop=True)
valid_df = valid_df.drop_duplicates(subset=['statement'], keep='first').reset_index(drop=True)
test_df  = test_df.drop_duplicates(subset=['statement'], keep='first').reset_index(drop=True)

# Remove leakage between splits. The datasets are split independently, so the
# same statement can land in train via one source and in valid/test via
# another, which would inflate the evaluation metrics.
# train_df is deliberately left untouched: the cached clustering artefacts
# loaded further down are aligned with its rows.
valid_df = valid_df[~valid_df['statement'].isin(train_df['statement'])].reset_index(drop=True)
seen_statements = pd.concat([train_df['statement'], valid_df['statement']])
test_df = test_df[~test_df['statement'].isin(seen_statements)].reset_index(drop=True)

In [None]:
# Ячейка 7.0: Подготовка к редукции датасета.
def compute_embeddings_and_labels(texts, 
                                  num_clusters=500, 
                                  svd_components=50, 
                                  tsne_perplexity=40, 
                                  tsne_iter=300, 
                                  random_seed=config.SEED,
                                  verbose=True):
    """Vectorise, reduce, cluster and project a collection of texts.

    Pipeline: TF-IDF (uni- and bigrams, at most 10000 features) ->
    TruncatedSVD -> MiniBatchKMeans on the SVD output -> 2-D t-SNE of the SVD
    output. A scatter plot of the t-SNE projection coloured by cluster is
    always shown.

    Args:
        texts: Sequence of strings.
        num_clusters: Number of k-means clusters.
        svd_components: Dimensionality after TruncatedSVD.
        tsne_perplexity: t-SNE perplexity.
        tsne_iter: Number of t-SNE optimisation iterations.
        random_seed: Seed shared by SVD, k-means and t-SNE.
        verbose: When true, print the number of processed texts.

    Returns:
        Tuple ``(X_reduced, X_embedded, labels, kmeans)``: the SVD features,
        the 2-D t-SNE coordinates, the cluster label of every text and the
        fitted k-means model.
    """
    # Local import: only needed for the scikit-learn compatibility shim below.
    import inspect

    vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
    X = vectorizer.fit_transform(texts)
    
    svd = TruncatedSVD(n_components=svd_components, random_state=random_seed)
    X_reduced = svd.fit_transform(X)
    
    kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=random_seed)
    kmeans.fit(X_reduced)
    labels = kmeans.labels_
    
    # scikit-learn renamed TSNE's `n_iter` to `max_iter` and later removed the
    # old name; use whichever keyword the installed version understands.
    tsne_signature = inspect.signature(TSNE.__init__).parameters
    iter_keyword = 'max_iter' if 'max_iter' in tsne_signature else 'n_iter'
    tsne = TSNE(
        n_components=2,
        perplexity=tsne_perplexity,
        random_state=random_seed,
        **{iter_keyword: tsne_iter},
    )
    X_embedded = tsne.fit_transform(X_reduced)
    
    if verbose:
        print(f"Вычислено эмбеддингов: {len(texts)}")

    # t-SNE visualisation.
    fig, ax = plt.subplots(figsize=(12, 8))
    scatter = ax.scatter(X_embedded[:, 0], X_embedded[:, 1], c=labels, cmap='tab20', s=5, alpha=0.7)
    plt.colorbar(scatter, ticks=range(len(np.unique(labels))))
    ax.set_title("Кластеризация текстов (без эллипсов)")
    plt.xlabel("t-SNE компонента 1")
    plt.ylabel("t-SNE компонента 2")
    plt.show()
    
    return X_reduced, X_embedded, labels, kmeans

def get_top_n_cluster_examples(texts, X_reduced, kmeans, n=5):
    """Pick, for every cluster centre, the ``n`` nearest points.

    Distances are Euclidean and measured against ALL rows of ``X_reduced``,
    not only the members of the cluster.

    Args:
        texts: Sequence indexable by row number of ``X_reduced``.
        X_reduced: Feature matrix, one row per text.
        kmeans: Fitted model exposing ``cluster_centers_``.
        n: Number of examples per cluster.

    Returns:
        ``(cluster_examples, cluster_example_indices)``: per cluster, the list
        of nearest texts and the array of their row indices, closest first.
    """
    centre_to_point = cdist(kmeans.cluster_centers_, X_reduced, metric='euclidean')
    nearest_rows = np.argsort(centre_to_point, axis=1)[:, :n]

    cluster_example_indices = list(nearest_rows)
    cluster_examples = [[texts[row] for row in rows] for rows in nearest_rows]
    return cluster_examples, cluster_example_indices

def points_in_ellipse(X, center, axes, angle_rad):
    """Test which 2-D points lie inside (or on) a rotated ellipse.

    Args:
        X: Array of shape (n, 2) with point coordinates.
        center: ``(x, y)`` of the ellipse centre.
        axes: ``(a, b)`` semi-axes along the ellipse's own x and y directions.
        angle_rad: Rotation of the ellipse in radians.

    Returns:
        Boolean array of length n, True where the point satisfies
        ``(x'/a)**2 + (y'/b)**2 <= 1`` in the ellipse's own frame.
    """
    cx, cy = center
    semi_a, semi_b = axes
    cos_t, sin_t = np.cos(angle_rad), np.sin(angle_rad)

    offset = X - np.array([cx, cy])
    dx, dy = offset[:, 0], offset[:, 1]

    # Rotate the offsets into the ellipse's own coordinate frame.
    along = dx*cos_t + dy*sin_t
    across = -dx*sin_t + dy*cos_t

    return (along/semi_a)**2 + (across/semi_b)**2 <= 1

def plot_ellipses(X_embedded, labels, ellipses_params, show=True):
    """Draw the cluster scatter plot with ellipses and collect the points inside them.

    Args:
        X_embedded: Array of 2-D coordinates, one row per point.
        labels: Cluster label of every point (used for colouring).
        ellipses_params: Iterable of dicts with keys ``center``, ``width``,
            ``height`` (full extents) and ``angle_rad``.
        show: When true, add colour bar, title and axis labels and display
            the figure.

    Returns:
        List with one array of point indices per ellipse.
    """
    fig, ax = plt.subplots(figsize=(12,8))
    points = ax.scatter(X_embedded[:,0], X_embedded[:,1], c=labels, cmap='tab20', s=5, alpha=0.7)

    indices_list = []
    for spec in ellipses_params:
        centre = spec['center']
        width, height = spec['width'], spec['height']
        theta = spec['angle_rad']

        ax.add_patch(Ellipse(
            xy=centre,
            width=width,
            height=height,
            angle=np.degrees(theta),
            edgecolor='red', facecolor='none', linewidth=2
        ))

        # Ellipse takes full extents, points_in_ellipse takes semi-axes.
        inside = points_in_ellipse(X_embedded, centre, (width/2, height/2), theta)
        hits = np.where(inside)[0]
        indices_list.append(hits)

        print(f"Эллипс в {centre}: {len(hits)} точек найдено")

    if not show:
        return indices_list

    plt.colorbar(points, ticks=range(len(np.unique(labels))))
    ax.set_title("Кластеризация с эллипсами")
    plt.xlabel("Компонента 1")
    plt.ylabel("Компонента 2")
    plt.show()

    return indices_list

def reduce_texts_by_indices(texts, labels, indices_list, keep_fraction=0.2, random_seed=config.SEED):
    """Thin out clusters that lie (almost) entirely inside the given regions.

    For every region (array of point indices, e.g. the points inside an
    ellipse) a cluster is selected when at least 80% of ALL its members fall
    inside that region. From the selected members of each such cluster only a
    random ``keep_fraction`` (at least one) is kept.

    Args:
        texts: Sequence of texts, indexable by position.
        labels: Cluster label of every text (array-like).
        indices_list: List of index arrays, one per region.
        keep_fraction: Share of a selected cluster's members to keep.
        random_seed: Seed for the per-cluster sampling.

    Returns:
        New list: all untouched texts in their original order, followed by the
        sampled survivors of the reduced clusters. Note that the result is
        therefore NOT in the original order.
    """
    labels = np.asarray(labels)
    cluster_sizes = pd.Series(labels).value_counts()

    removed_positions = set()  # positions withdrawn from `texts`
    sampled_texts = []         # survivors of the reduced clusters

    for indices_in_ellipse in indices_list:
        indices_in_ellipse = np.asarray(indices_in_ellipse, dtype=int)
        labels_in_ellipse = labels[indices_in_ellipse]
        sizes_in_ellipse = pd.Series(labels_in_ellipse).value_counts()

        # Clusters with at least 80% of their members inside this region.
        to_reduce = [
            cluster_id
            for cluster_id, inside_count in sizes_in_ellipse.items()
            if cluster_sizes[cluster_id] * 0.8 <= inside_count
        ]

        filtered_indices = indices_in_ellipse[np.isin(labels_in_ellipse, to_reduce)]

        # Regions may overlap: a point already handled by an earlier region
        # must not be sampled a second time, or its text would be duplicated.
        if removed_positions:
            already = np.fromiter(removed_positions, dtype=int, count=len(removed_positions))
            filtered_indices = filtered_indices[~np.isin(filtered_indices, already)]

        selection = pd.DataFrame({
            'index': filtered_indices,
            'label': labels[filtered_indices],
            'text': [texts[i] for i in filtered_indices]
        })

        # Explicit loop instead of groupby.apply: same groups in the same
        # (sorted) order and the same seeded sample, but also well defined
        # when nothing was selected.
        n_reduced = 0
        for _, group in selection.groupby('label'):
            n_keep = max(1, int(len(group)*keep_fraction))
            survivors = group.sample(n=n_keep, random_state=random_seed)
            sampled_texts.extend(survivors['text'].tolist())
            n_reduced += len(survivors)

        removed_positions.update(filtered_indices.tolist())

        print(f"Область эллипса: {len(indices_in_ellipse)} исходных, редуцировано до {n_reduced}")

    # Final list: drop everything that was selected, then append the survivors.
    untouched_texts = [text for i, text in enumerate(texts) if i not in removed_positions]
    combined_texts = untouched_texts + sampled_texts

    print(f"Общий размер после редукции: {len(combined_texts)}")
    return combined_texts

In [None]:
# Cell 7.1: Collect the texts to cluster.
# `texts` is positionally aligned with the rows of train_df.
texts = train_df['statement'].to_list()

In [None]:
# Cell 7.2: Flag that controls whether the clusters are recomputed.
# If the data has not changed it is better left alone, because the
# computation is expensive. False = load cached artefacts from
# intermediate_dir, True = recompute and overwrite them.
run_flag = False

In [None]:
# Ячейка 7.3: Первая кластеризация
if not run_flag:
    # Load the cached artefacts of the first clustering pass.
    # NOTE: joblib.load unpickles the file; only load caches produced by this notebook.
    X_reduced = np.load(os.path.join(intermediate_dir, 'X_reduced.npy'))
    X_embedded = np.load(os.path.join(intermediate_dir, 'X_embedded.npy'))
    labels = np.load(os.path.join(intermediate_dir, 'labels.npy'))
    kmeans = joblib.load(os.path.join(intermediate_dir, 'kmeans_model.joblib'))
    # A cache built from different data would be silently misaligned with
    # `texts`, so refuse to continue with it.
    if not (len(X_reduced) == len(X_embedded) == len(labels) == len(texts)):
        raise ValueError(
            f"Cached clustering artefacts ({len(labels)} rows) do not match the current "
            f"texts ({len(texts)}); set run_flag = True to recompute them."
        )
    print(f"Загружены данные из {intermediate_dir}")
    # plot_ellipses(X_embedded, labels, ellipses_params=[])
else:
    X_reduced, X_embedded, labels, kmeans = compute_embeddings_and_labels(texts)

    # Persist the artefacts for later runs.
    os.makedirs(intermediate_dir, exist_ok=True)
    np.save(os.path.join(intermediate_dir, 'X_reduced.npy'), X_reduced)
    np.save(os.path.join(intermediate_dir, 'X_embedded.npy'), X_embedded)
    np.save(os.path.join(intermediate_dir, 'labels.npy'), labels)
    joblib.dump(kmeans, os.path.join(intermediate_dir, 'kmeans_model.joblib'))

In [None]:
# Cell 7.4: First selection of regions to reduce.
# Each ellipse is given in the coordinates of X_embedded: centre, full width
# and height, and rotation in radians.
# NOTE(review): the values look hand-picked from the plot of the cached
# embedding; they presumably need re-tuning if the embedding is recomputed.
ellipse_params = [{
    'center': (1, 10.5),
    'width': 4.5,
    'height': 8,
    'angle_rad': radians(-10)
}, {
    'center': (8.5, 2),
    'width': 14,
    'height': 16,
    'angle_rad': radians(0)
}, {
    'center': (-0.75, -4.75),
    'width': 6,
    'height': 7,
    'angle_rad': radians(0)
}]

# One array of point indices per ellipse.
indices_list = plot_ellipses(X_embedded, labels, ellipse_params)

In [None]:
# Cell 7.5: First reduction pass.
# The result is a plain list of texts; reduced clusters' survivors are
# appended at the end, so the order differs from `texts`.
reduced_texts = reduce_texts_by_indices(texts, labels, indices_list)

In [None]:
if not run_flag:
    # Load the cached artefacts of the second clustering pass.
    # NOTE: joblib.load unpickles the file; only load caches produced by this notebook.
    X_reduced2 = np.load(os.path.join(intermediate_dir, 'X_reduced2.npy'))
    X_embedded2 = np.load(os.path.join(intermediate_dir, 'X_embedded2.npy'))
    labels2 = np.load(os.path.join(intermediate_dir, 'labels2.npy'))
    kmeans2 = joblib.load(os.path.join(intermediate_dir, 'kmeans2_model.joblib'))
    # A cache built from different data would be silently misaligned with
    # `reduced_texts`, so refuse to continue with it.
    if not (len(X_reduced2) == len(X_embedded2) == len(labels2) == len(reduced_texts)):
        raise ValueError(
            f"Cached clustering artefacts ({len(labels2)} rows) do not match the current "
            f"reduced_texts ({len(reduced_texts)}); set run_flag = True to recompute them."
        )
    print(f"Загружены данные из {intermediate_dir}")
    # plot_ellipses(X_embedded2, labels2, ellipses_params=[])
else:
    X_reduced2, X_embedded2, labels2, kmeans2 = compute_embeddings_and_labels(reduced_texts)

    # Persist the artefacts for later runs.
    os.makedirs(intermediate_dir, exist_ok=True)
    np.save(os.path.join(intermediate_dir, 'X_reduced2.npy'), X_reduced2)
    np.save(os.path.join(intermediate_dir, 'X_embedded2.npy'), X_embedded2)
    np.save(os.path.join(intermediate_dir, 'labels2.npy'), labels2)
    joblib.dump(kmeans2, os.path.join(intermediate_dir, 'kmeans2_model.joblib'))

In [None]:
# Second selection of regions to reduce, in the coordinates of X_embedded2:
# centre, full width and height, and rotation in radians.
# NOTE(review): the values look hand-picked from the plot of the cached
# embedding; they presumably need re-tuning if the embedding is recomputed.
ellipse_params2 = [{
    'center': (2.5, -6),
    'width': 6,
    'height': 7,
    'angle_rad': radians(0)
}, {
    'center': (-5.75, -2.25),
    'width': 12,
    'height': 7,
    'angle_rad': radians(-35)
}, {
    'center': (-5, -12),
    'width': 7,
    'height': 5,
    'angle_rad': radians(45)
}]

# One array of point indices per ellipse.
indices_list2 = plot_ellipses(X_embedded2, labels2, ellipse_params2)

In [None]:
# Second reduction pass, applied to the output of the first one.
reduced_texts2 = reduce_texts_by_indices(reduced_texts, labels2, indices_list2)

In [None]:
if not run_flag:
    # Load the cached artefacts of the third clustering pass.
    # NOTE: joblib.load unpickles the file; only load caches produced by this notebook.
    X_reduced3 = np.load(os.path.join(intermediate_dir, 'X_reduced3.npy'))
    X_embedded3 = np.load(os.path.join(intermediate_dir, 'X_embedded3.npy'))
    labels3 = np.load(os.path.join(intermediate_dir, 'labels3.npy'))
    kmeans3 = joblib.load(os.path.join(intermediate_dir, 'kmeans3_model.joblib'))
    # A cache built from different data would be silently misaligned with
    # `reduced_texts2`, so refuse to continue with it.
    if not (len(X_reduced3) == len(X_embedded3) == len(labels3) == len(reduced_texts2)):
        raise ValueError(
            f"Cached clustering artefacts ({len(labels3)} rows) do not match the current "
            f"reduced_texts2 ({len(reduced_texts2)}); set run_flag = True to recompute them."
        )
    print(f"Загружены данные из {intermediate_dir}")
    # plot_ellipses(X_embedded3, labels3, ellipses_params=[])
else:
    X_reduced3, X_embedded3, labels3, kmeans3 = compute_embeddings_and_labels(reduced_texts2)
    # Persist the artefacts for later runs.
    os.makedirs(intermediate_dir, exist_ok=True)
    np.save(os.path.join(intermediate_dir, 'X_reduced3.npy'), X_reduced3)
    np.save(os.path.join(intermediate_dir, 'X_embedded3.npy'), X_embedded3)
    np.save(os.path.join(intermediate_dir, 'labels3.npy'), labels3)
    joblib.dump(kmeans3, os.path.join(intermediate_dir, 'kmeans3_model.joblib'))

In [None]:
# Cell 8.1: Select the rows of the original training frame that survived the reduction.
# Rows are matched by statement text and come out in train_df order, which is
# not necessarily the order of reduced_texts2.
reduced_set = set(reduced_texts2)
reduced_train_df = train_df[pd.Index(texts).isin(reduced_set)].reset_index(drop=True)

In [None]:
# Ячейка 8.2: Вычисление выбросов среди редуцированного набора и их удаление.

# Создаём DataFrame признаков из эмбеддингов
# X_reduced3 has one row per entry of reduced_texts2, in that order. That
# order differs from reduced_train_df: reduce_texts_by_indices appends the
# sampled survivors at the end, while reduced_train_df keeps train_df order.
# Pairing the two positionally would attach dataset names (and, later, the
# outlier flags) to the wrong rows, so look every statement's row up by text.
if len(X_reduced3) != len(reduced_texts2):
    raise ValueError(
        f"X_reduced3 has {len(X_reduced3)} rows but reduced_texts2 has {len(reduced_texts2)} "
        "texts; set run_flag = True to recompute the cached artefacts."
    )
feature_row_of_text = {statement: row for row, statement in enumerate(reduced_texts2)}
feature_rows = [feature_row_of_text[statement] for statement in reduced_train_df['statement']]

# Feature frame aligned row by row with reduced_train_df.
# NOTE: this changes which rows are flagged as outliers, so cached artefacts
# of later stages (X_reduced4 etc.) must be regenerated with run_flag = True.
df_features = pd.DataFrame(X_reduced3[feature_rows])

# Attach the source dataset of every row.
df_features['dataset_name'] = reduced_train_df['dataset_name'].values

# Применяем Isolation Forest по группам

def detect_outliers_by_group(df_feat, group_col='dataset_name', contamination=0.05):
    """Run a separate Isolation Forest per group and collect the outlier labels.

    Args:
        df_feat: Feature frame; every column except ``group_col`` is a feature.
        group_col: Column that defines the groups.
        contamination: Expected outlier share passed to ``IsolationForest``.

    Returns:
        List of index labels of ``df_feat`` that were predicted as outliers.
    """
    flagged = []
    for group_name in df_feat[group_col].unique():
        members = df_feat[df_feat[group_col] == group_name]
        forest = IsolationForest(contamination=contamination, random_state=config.SEED)
        verdicts = forest.fit_predict(members.drop(columns=[group_col]))
        # fit_predict marks outliers with -1.
        group_hits = members.index[verdicts == -1].tolist()
        flagged.extend(group_hits)
        print(f"Группа '{group_name}': найдено выбросов {len(group_hits)}")
    return flagged

outliers_indices = detect_outliers_by_group(df_features)

# Flag the outliers in reduced_train_df (both frames share the same 0..n-1 index).
reduced_train_df['is_outlier'] = reduced_train_df.index.isin(outliers_indices)

print(f"Найдено выбросов в reduced_train_df: {reduced_train_df['is_outlier'].sum()} из {len(reduced_train_df)}")

# Keep the inliers only; the (now all-False) flag column stays in the frame.
reduced_train_df = reduced_train_df.loc[~reduced_train_df['is_outlier']].reset_index(drop=True)
reduced_texts3 = reduced_train_df['statement'].to_list()

In [None]:
# Cell 8.3: Report how much each dataset shrank, in absolute numbers.
# For every dataset: number of true / false rows before -> after the reduction.
for ds in datasets:
    filter_df = train_df[train_df['dataset_name'] == ds]
    true_df = filter_df[filter_df['label_bin'] == 1]
    reduced_filter_df = reduced_train_df[reduced_train_df['dataset_name'] == ds]
    reduced_true_df = reduced_filter_df[reduced_filter_df['label_bin'] == 1]
    
    print(f"{ds}:\nTrue: {len(true_df)} -> {len(reduced_true_df)}\nFalse: {len(filter_df) - len(true_df)} -> {len(reduced_filter_df) - len(reduced_true_df)}")

In [None]:
# Ячейка 9.0: Выполняем кластеризацию для выделения центральных примеров из кластеров.
if not run_flag:
    # Load the cached artefacts of the fourth clustering pass.
    # NOTE: joblib.load unpickles the file; only load caches produced by this notebook.
    X_reduced4 = np.load(os.path.join(intermediate_dir, 'X_reduced4.npy'))
    X_embedded4 = np.load(os.path.join(intermediate_dir, 'X_embedded4.npy'))
    labels4 = np.load(os.path.join(intermediate_dir, 'labels4.npy'))
    kmeans4 = joblib.load(os.path.join(intermediate_dir, 'kmeans4_model.joblib'))
    # A cache built from different data would be silently misaligned with
    # `reduced_texts3`, so refuse to continue with it.
    if not (len(X_reduced4) == len(X_embedded4) == len(labels4) == len(reduced_texts3)):
        raise ValueError(
            f"Cached clustering artefacts ({len(labels4)} rows) do not match the current "
            f"reduced_texts3 ({len(reduced_texts3)}); set run_flag = True to recompute them."
        )
    print(f"Загружены данные из {intermediate_dir}")
    # plot_ellipses(X_embedded4, labels4, ellipses_params=[])
else:
    X_reduced4, X_embedded4, labels4, kmeans4 = compute_embeddings_and_labels(reduced_texts3)
    # Persist the artefacts for later runs.
    os.makedirs(intermediate_dir, exist_ok=True)
    np.save(os.path.join(intermediate_dir, 'X_reduced4.npy'), X_reduced4)
    np.save(os.path.join(intermediate_dir, 'X_embedded4.npy'), X_embedded4)
    np.save(os.path.join(intermediate_dir, 'labels4.npy'), labels4)
    joblib.dump(kmeans4, os.path.join(intermediate_dir, 'kmeans4_model.joblib'))

In [None]:
# Ячейка 9.1: Выделение центральных примеров из кластеров.
# The 20 texts closest to every cluster centre.
center_texts, center_indices = get_top_n_cluster_examples(reduced_texts3, X_reduced4, kmeans4, n=20)

# Preview of the first ten clusters.
for i, text in enumerate(center_texts[:10]):
    print(f"Кластер {i} ключевое предложение:\n{text}\n")

# Flatten the per-cluster lists into one list of texts.
all_center_texts = [example for examples in center_texts for example in examples]

In [None]:
# Cell 9.2: Build a data frame from the central examples.
# Rows are matched by statement text; the result is an independent frame, so
# columns added to reduced_train_df afterwards do not appear in it.
center_texts_set = set(all_center_texts)
core_train_df = reduced_train_df[reduced_train_df['statement'].isin(center_texts_set)].reset_index(drop=True)

In [None]:
# Ячейка 10.0: Функция расчёта весов классов каждого датасета по отношению к размеру общего.
def compute_sample_weights(df, label_col='label_bin', dataset_col='dataset_name'):
    """Compute a per-row weight that balances (dataset, class) groups.

    Every (dataset, class) group receives the raw weight ``len(df) / group
    size``; the raw weights are then divided by their sum over all groups.
    As a result every group carries the same total weight.

    Args:
        df: Frame with a label column and a dataset-name column.
        label_col: Label column; 1 means "true", 0 means "false".
        dataset_col: Column naming the source dataset.

    Returns:
        Float Series indexed like ``df``. Rows whose label is not 1 use the
        "false" weight of their dataset; a row whose dataset has no weight
        for its class gets 0. An empty frame yields an empty Series.
    """
    total_len = len(df)
    true_counts = df[df[label_col] == 1][dataset_col].value_counts()
    false_counts = df[df[label_col] == 0][dataset_col].value_counts()

    w_true = total_len / true_counts
    w_false = total_len / false_counts
    total_weights_sum = w_true.sum() + w_false.sum()

    w_true = w_true / total_weights_sum
    w_false = w_false / total_weights_sum

    # Vectorised lookup instead of a row-wise apply: map every row's dataset
    # to both candidate weights, then pick by label. Missing entries are NaN.
    dataset_names = df[dataset_col]
    true_weight = dataset_names.map(w_true.to_dict()).to_numpy(dtype='float64')
    false_weight = dataset_names.map(w_false.to_dict()).to_numpy(dtype='float64')
    is_true = (df[label_col] == 1).to_numpy()

    weights = pd.Series(
        np.where(is_true, true_weight, false_weight),
        index=df.index,
        dtype='float64',
    )
    return weights.fillna(0.0)


In [None]:
# Ячейка 10.1: Вычисление весов
reduced_train_df['weight'] = compute_sample_weights(reduced_train_df)
# core_train_df was sliced out of reduced_train_df before the 'weight' column
# existed, so it needs its own weights; without them the chunking step
# (weight_column='weight') fails with KeyError on a fresh run. The weights are
# computed over the core subset itself, as for every other frame here.
core_train_df['weight'] = compute_sample_weights(core_train_df)
valid_df['weight'] = compute_sample_weights(valid_df)
test_df['weight'] = compute_sample_weights(test_df)

In [None]:
# Cell 11: Tokenizer
# Loaded from the local model directory configured in config.MODEL.
tokenizer = AutoTokenizer.from_pretrained(config.MODEL)

In [None]:
# Ячейка 12.1: Функция чанкования, класс датасета и функция сбора.
def _encode_chunk(chunk_tokens, max_chunk_len):
    """Encode one window of tokens into padded ``input_ids`` / ``attention_mask`` tensors."""
    # NOTE(review): `chunk_tokens` come from tokenizer.tokenize(), i.e. they are
    # already sub-word pieces, yet is_split_into_words=True asks the tokenizer
    # to tokenize each of them again. Confirm that the resulting ids match a
    # direct encoding of the text.
    # NOTE(review): presumably special tokens count towards max_length, so the
    # tail of a full window may be truncated; confirm.
    enc = tokenizer.encode_plus(
        chunk_tokens,
        is_split_into_words=True,
        max_length=max_chunk_len,
        padding='max_length',
        truncation=True,
        return_tensors="pt"
    )
    return enc['input_ids'].squeeze(0), enc['attention_mask'].squeeze(0)


def process_one_doc(args):
    """Split one document into overlapping token windows and encode each of them.

    Args:
        args: Tuple ``(doc_id, text, label, sample_weight, max_chunk_len,
            stride)``. Packed into a single argument so the function can be
            submitted to an executor as is.

    Returns:
        List of chunk dicts with keys ``doc_id``, ``num_chunks``,
        ``input_ids``, ``attention_mask``, ``label`` and ``sample_weight``.
        A text of at most ``max_chunk_len`` tokens (including an empty one)
        yields exactly one chunk. ``num_chunks`` always equals the length of
        the returned list.

    Raises:
        ValueError: If the text needs several chunks and ``stride`` is not
            positive.
    """
    doc_id, text, label, sample_weight, max_chunk_len, stride = args
    tokens = tokenizer.tokenize(text)

    if len(tokens) > max_chunk_len and stride <= 0:
        raise ValueError("stride must be positive to split a text into several chunks")

    # Window start offsets: advance by `stride` until a window reaches the end
    # of the text. Windows beyond that point would only repeat tokens already
    # covered by the previous window.
    starts = [0]
    while starts[-1] + max_chunk_len < len(tokens):
        starts.append(starts[-1] + stride)

    # Stored on every chunk; derived from the windows actually produced so the
    # two can never disagree.
    num_chunks = len(starts)

    chunks = []
    for start in starts:
        input_ids, attention_mask = _encode_chunk(tokens[start:start+max_chunk_len], max_chunk_len)
        chunks.append({
            'doc_id': doc_id,
            'num_chunks': num_chunks,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': label,
            'sample_weight': sample_weight
        })
    return chunks

def process_chunking(df, max_chunk_len, stride, text_column, label_column, weight_column, num_workers=4):
    """Chunk every row of ``df`` in parallel and attach the result as a 'chunks' column.

    Args:
        df: source DataFrame; its index labels become the chunks' ``doc_id``.
        max_chunk_len: window length passed to ``process_one_doc``.
        stride: window step passed to ``process_one_doc``.
        text_column: name of the column holding the text.
        label_column: name of the column holding the label.
        weight_column: name of the column holding the sample weight.
        num_workers: number of worker processes for the pool.

    Returns:
        A copy of ``df`` with an extra 'chunks' column; row ``i`` holds the list
        returned by ``process_one_doc`` for row ``i``.
    """
    # Read the three columns directly instead of materialising a Series per row.
    args = [
        (doc_id, text, label, weight, max_chunk_len, stride)
        for doc_id, text, label, weight in zip(
            df.index, df[text_column], df[label_column], df[weight_column]
        )
    ]

    # executor.map keeps input order, so no index bookkeeping is needed, and
    # chunksize ships documents to the workers in batches rather than one
    # task per document.
    chunksize = max(1, len(args) // ((num_workers or 1) * 8))
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        chunks_list = list(tqdm(
            executor.map(process_one_doc, args, chunksize=chunksize),
            total=len(args),
            desc="Chunking parallel",
        ))

    new_df = df.copy()
    new_df['chunks'] = chunks_list
    return new_df


class ChunkedTextDataset(torch.utils.data.Dataset):
    """Map-style dataset over a flat sequence of pre-built chunk objects."""

    def __init__(self, chunks):
        # Stored by reference; items are returned exactly as given.
        self.chunks = chunks

    def __getitem__(self, idx):
        return self.chunks[idx]

    def __len__(self):
        return len(self.chunks)

def collate_fn(batch):
    """Collate a list of chunk dicts into a dict of batched tensors.

    ``input_ids`` and ``attention_mask`` are stacked along a new first axis;
    ``label``, ``doc_id`` and ``num_chunks`` become long tensors and
    ``sample_weight`` a float tensor, returned under the plural keys
    ``labels``, ``doc_ids``, ``num_chunks`` and ``sample_weights``.
    """
    def _stacked(key):
        return torch.stack([sample[key] for sample in batch])

    def _scalars(key, dtype):
        return torch.tensor([sample[key] for sample in batch], dtype=dtype)

    return {
        'input_ids': _stacked('input_ids'),
        'attention_mask': _stacked('attention_mask'),
        'labels': _scalars('label', torch.long),
        'doc_ids': _scalars('doc_id', torch.long),
        'num_chunks': _scalars('num_chunks', torch.long),
        'sample_weights': _scalars('sample_weight', torch.float),
    }


In [None]:
# Cell 12.2: Build the chunked dataframes.
# Every split is chunked with the same settings; only the source frame differs.
_chunking_kwargs = dict(
    max_chunk_len=config.MAX_LEN,
    stride=config.MAX_LEN//2,
    text_column='statement',
    label_column='label_bin',
    weight_column='weight',
)
chunked_core_train_df = process_chunking(df=core_train_df.drop(columns=['dataset_name']), **_chunking_kwargs)
chunked_train_df = process_chunking(df=reduced_train_df.drop(columns=['dataset_name']), **_chunking_kwargs)
chunked_valid_df = process_chunking(df=valid_df.drop(columns=['dataset_name']), **_chunking_kwargs)
chunked_test_df = process_chunking(df=test_df.drop(columns=['dataset_name']), **_chunking_kwargs)

In [None]:
# Quick look at the 'chunks' column of the first training rows.
print(chunked_train_df['chunks'].head())

In [None]:
def chunks_to_list(df: pd.DataFrame):
    """Flatten the per-row 'chunks' lists of ``df`` into a single list, in row order."""
    return [chunk for row_chunks in df['chunks'] for chunk in row_chunks]

In [None]:
# Cell 13: Build the data loaders.
def _make_eval_loader(chunked_df):
    """Wrap the flattened chunks of ``chunked_df`` in an unshuffled DataLoader."""
    return DataLoader(
        ChunkedTextDataset(chunks_to_list(chunked_df)),
        batch_size=config.BATCH_SIZE_VALID,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        collate_fn=collate_fn,
    )


core_train_loader_bin = _make_eval_loader(chunked_core_train_df)
valid_loader_bin = _make_eval_loader(chunked_valid_df)
test_loader_bin = _make_eval_loader(chunked_test_df)

In [None]:
# Ячейка 14: Модель с выходом для любого количества классов
class CustomModel(nn.Module):
    """Pretrained encoder followed by dropout and a linear head on the first token.

    Args:
        model_name: name or path handed to ``AutoConfig`` / ``AutoModel``.
        num_classes: output width of the linear head.
        dropout_rate: dropout probability applied before the head.
    """

    def __init__(self, model_name, num_classes, dropout_rate):
        super().__init__()
        encoder_config = AutoConfig.from_pretrained(model_name)
        encoder_config.output_hidden_states = False
        self.config = encoder_config
        self.model = AutoModel.from_pretrained(model_name, config=encoder_config)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(encoder_config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the head's logits computed from the first position of the last hidden state."""
        encoded = self.model(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoded.last_hidden_state[:, 0]  # CLS token
        return self.classifier(self.dropout(first_token))


In [None]:
# Ячейка 15: EarlyStopping
class EarlyStopping:
    """Stop-signal tracker for a score where higher is better.

    Args:
        patience: number of consecutive non-improving calls that triggers a stop.
        min_delta: a score must reach ``best_score + min_delta`` to count as an
            improvement.

    Attributes (set in ``__init__``):
        counter: consecutive non-improving calls seen so far.
        best_score: best score recorded, ``None`` before the first call.
        early_stop: becomes ``True`` once the stop condition has been hit.
    """

    def __init__(self, patience=3, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        """Record ``score`` and return ``True`` when training should stop, else ``False``."""
        if self.best_score is None:
            self.best_score = score
            return False

        if score < self.best_score + self.min_delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                return True
            # Previously this path fell through and returned None; always
            # return a real bool so the result is consistent for callers.
            return False

        self.best_score = score
        self.counter = 0
        return False

In [None]:
# Cell 16.1: Training loop, binary True/False classification.
# Ctrl+F8 - run all cells above

# Model, optimizer, LR scheduler (mode='max': expects a score to maximise),
# element-wise BCE loss (reduction='none' so per-sample weights can be applied) and AMP grad scaler.
model_bin = CustomModel(config.MODEL, config.NUM_CLASSES_BIN, 0.3).to(device)
optimizer = torch.optim.AdamW(model_bin.parameters(), lr=config.LEARNING_RATE, weight_decay=config.WEIGHT_DECAY_BIN)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2)
criterion = nn.BCEWithLogitsLoss(reduction='none')
scaler = torch.amp.GradScaler()

# Initial decision threshold and early-stopping tracker.
truth_threshold = 0.5
early_stopping = EarlyStopping(patience=3, min_delta=1e-4)

# Checkpoint path, best score/threshold seen so far, and the threshold grid (0.10 .. 0.90, 81 points).
model_save_path = os.path.join(output_dir, "deberta_v3_xsmall_binary_chunked_extended.pth")
best_macro_f1 = 0
best_threshold = truth_threshold
thresholds_to_try = np.linspace(0.1, 0.9, 81)

def find_best_threshold(targets, preds_probs, thresholds):
    """Scan ``thresholds`` and return ``(threshold, macro_f1)`` with the highest macro F1.

    Predictions are ``preds_probs > threshold``. Ties keep the earliest
    candidate; if no candidate scores above 0, ``(thresholds[0], 0)`` is returned.
    """
    top_threshold, top_score = thresholds[0], 0
    for candidate in thresholds:
        score = f1_score(targets, (preds_probs > candidate).astype(int), average='macro')
        if score > top_score:
            top_threshold, top_score = candidate, score
    return top_threshold, top_score

def aggregate_predictions(model, data_loader):
    """Run ``model`` over ``data_loader`` and average chunk logits per document.

    Puts the model in eval mode, collects the logits of every chunk, groups
    them by ``doc_ids`` and takes the mean per document. Documents appear in
    order of first occurrence in the loader.

    Returns:
        ``(targets, logits)`` as numpy arrays: the first label seen for each
        document (kept as a one-element list per document) and the mean logit
        per document.
    """
    model.eval()
    logit_batches, doc_id_batches = [], []
    first_label_by_doc = {}

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            batch_doc_ids = batch['doc_ids']
            batch_labels = batch['labels'].to(device).float().unsqueeze(1)
            logits = model(batch['input_ids'].to(device), batch['attention_mask'].to(device))

            logit_batches.append(logits.detach().cpu())
            doc_id_batches.append(batch_doc_ids)
            # Only the first label encountered for a document is kept.
            for doc_id, label in zip(batch_doc_ids.tolist(), batch_labels.detach().cpu().tolist()):
                first_label_by_doc.setdefault(doc_id, label)

    all_logits = torch.cat(logit_batches, dim=0)
    all_doc_ids = torch.cat(doc_id_batches, dim=0)

    logits_by_doc = defaultdict(list)
    for doc_id, logit in zip(all_doc_ids.tolist(), all_logits):
        logits_by_doc[doc_id].append(logit)

    doc_targets = [first_label_by_doc[doc_id] for doc_id in logits_by_doc]
    doc_logits = [
        torch.stack(chunk_logits).mean(dim=0).item()
        for chunk_logits in logits_by_doc.values()
    ]
    return np.array(doc_targets), np.array(doc_logits)

def chunks_to_list(df: pd.DataFrame):
    """Flatten the per-row 'chunks' lists of ``df`` into a single list, in row order.

    NOTE(review): a function with this name and behavior is also defined in an
    earlier cell; this definition shadows it when the cell runs.
    """
    return [chunk for row_chunks in df['chunks'] for chunk in row_chunks]

# # Первая тренировка на core_train_loader_bin
# print("Initial training on core_train_loader_bin...")
# model_bin.train()
# for epoch in range(config.EPOCHS):
#     optimizer.zero_grad(set_to_none=True)
#     for batch in tqdm(core_train_loader_bin, desc=f"Initial Training Epoch {epoch+1}"):
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         labels = batch['labels'].to(device).float().unsqueeze(1)
#         sample_weights = batch['sample_weights'].to(device).unsqueeze(1)
#         with torch.amp.autocast(device_type="cuda"):
#             outputs = model_bin(input_ids, attention_mask)
#             losses = criterion(outputs, labels)
#             weighted_losses = losses * sample_weights
#             loss = weighted_losses.mean() / config.GRADIENT_ACCUMULATION_STEPS
#         scaler.scale(loss).backward()
#         scaler.step(optimizer)
#         scaler.update()
#         optimizer.zero_grad(set_to_none=True)

iteration = 0

# A document counts as "hard" when |label - predicted probability| exceeds this margin.
HARD_EXAMPLE_MARGIN = 0.2

# The set of training chunks does not change between iterations, so the
# evaluation loader over the full training set is built once.
train_loader_bin = DataLoader(
    ChunkedTextDataset(chunks_to_list(chunked_train_df)),
    batch_size=config.BATCH_SIZE_VALID,
    shuffle=False,
    num_workers=config.NUM_WORKERS,
    collate_fn=collate_fn
)

# Iterative hard-example training: score the full training set, keep the best
# checkpoint, then retrain on the documents the model is still unsure about.
while True:
    print(f"Iteration {iteration + 1}: Validation on full training set")

    val_targets, val_preds = aggregate_predictions(model_bin, train_loader_bin)

    # Rows are selected positionally below, which is only valid when every
    # dataframe row maps to exactly one aggregated document (unique doc ids).
    if len(val_preds) != len(chunked_train_df):
        raise ValueError(
            f"Aggregated {len(val_preds)} documents for {len(chunked_train_df)} rows; "
            "document ids (dataframe index) must be unique."
        )

    # aggregate_predictions returns mean logits; convert to probabilities.
    probs = 1 / (1 + np.exp(-val_preds))
    current_best_threshold, current_macro_f1 = find_best_threshold(val_targets, probs, thresholds_to_try)

    print(f"Best threshold={current_best_threshold:.3f}, Macro F1={current_macro_f1:.4f}")

    if current_macro_f1 > best_macro_f1:
        best_macro_f1 = current_macro_f1
        best_threshold = current_best_threshold
        torch.save({
            'model_state_dict': model_bin.state_dict(),
            'best_threshold': best_threshold
        }, model_save_path)
        print(f"Saved new best model")

    # The scheduler and the early-stopping tracker were created for this loop
    # but never driven; feed them the monitored score so the learning rate can
    # decay and the loop is guaranteed to terminate.
    scheduler.step(current_macro_f1)
    if early_stopping(current_macro_f1):
        print("Early stopping: macro F1 stopped improving.")
        break

    # Compare labels with probabilities, not raw logits: a logit is unbounded,
    # so |label - logit| marks almost every document as hard regardless of
    # how confident the model is.
    abs_diff = np.abs(val_targets.squeeze() - probs)
    retrain_mask = abs_diff > HARD_EXAMPLE_MARGIN
    retrain_df = chunked_train_df.iloc[retrain_mask.nonzero()[0]].copy()

    print(f"Examples selected for retraining: {len(retrain_df)}")

    if len(retrain_df) == 0:
        print("No hard examples left, stopping training.")
        break

    # Refresh doc_id on the selected chunks from the dataframe index.
    for idx, row in retrain_df.iterrows():
        for chunk in row['chunks']:
            chunk['doc_id'] = idx

    train_loader_retrain = DataLoader(
        ChunkedTextDataset(chunks_to_list(retrain_df)),
        batch_size=config.BATCH_SIZE_VALID,
        shuffle=True,
        num_workers=config.NUM_WORKERS,
        collate_fn=collate_fn
    )

    model_bin.train()
    optimizer.zero_grad(set_to_none=True)
    for step, batch in enumerate(tqdm(train_loader_retrain, desc=f"Retraining iteration {iteration + 1}")):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float().unsqueeze(1)
        sample_weights = batch['sample_weights'].to(device).unsqueeze(1)
        # Follow the active device instead of hard-coding "cuda"; mixed
        # precision stays enabled only when running on CUDA.
        with torch.amp.autocast(device_type=device.type, enabled=(device.type == "cuda")):
            outputs = model_bin(input_ids, attention_mask)
            losses = criterion(outputs, labels)
            weighted_losses = losses * sample_weights
            loss = weighted_losses.mean() / config.GRADIENT_ACCUMULATION_STEPS
        scaler.scale(loss).backward()
        # Step every GRADIENT_ACCUMULATION_STEPS batches and on the final batch.
        if (step + 1) % config.GRADIENT_ACCUMULATION_STEPS == 0 or (step + 1) == len(train_loader_retrain):
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad(set_to_none=True)

    iteration += 1

print("Training complete.")


In [None]:
# Cell 16.2: Calibration of the model probabilities.

model_bin = CustomModel(config.MODEL, config.NUM_CLASSES_BIN, 0.3).to(device)

# Load the best model and its threshold.
# NOTE(review): weights_only=False unpickles arbitrary objects - only load checkpoints you created yourself.
checkpoint = torch.load(os.path.join(output_dir, "deberta_v3_xsmall_binary_chunked_extended.pth"), weights_only=False)
model_bin.load_state_dict(checkpoint['model_state_dict'])
best_threshold = checkpoint['best_threshold']
print(f"Загрузили модель с порогом {best_threshold:.3f}")

# Validation: collect chunk logits, their document ids and the first label seen per document.
model_bin.eval()
val_chunk_logits = []
val_chunk_doc_ids = []
val_labels_dict = {}

with torch.no_grad():
    for batch in tqdm(valid_loader_bin, desc="Validation"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device).float().unsqueeze(1)
        doc_ids = batch['doc_ids']

        outputs = model_bin(input_ids, attention_mask)
        val_chunk_logits.append(outputs.detach().cpu())
        val_chunk_doc_ids.append(doc_ids)

        for doc_id, label in zip(doc_ids.tolist(), labels.detach().cpu().tolist()):
            if doc_id not in val_labels_dict:
                val_labels_dict[doc_id] = label

# Group chunk logits by document id.
chunk_logits_tensor = torch.cat(val_chunk_logits, dim=0)
chunk_doc_ids_total = torch.cat(val_chunk_doc_ids, dim=0)
doc_logits_dict = defaultdict(list)

for logit, doc_id in zip(chunk_logits_tensor, chunk_doc_ids_total):
    doc_logits_dict[doc_id.item()].append(logit)

val_preds = []
val_targets = []

# One prediction per document: the mean of its chunk logits.
for doc_id, logits_list in doc_logits_dict.items():
    logits_stack = torch.stack(logits_list)
    agg_logit = logits_stack.mean(dim=0).item()
    val_preds.append(agg_logit)
    val_targets.append(val_labels_dict[doc_id])

# Sigmoid turns the mean logits into uncalibrated probabilities.
val_preds = np.array(val_preds)
val_targets = np.array(val_targets)
val_probs = 1 / (1 + np.exp(-val_preds))

# Work on flat 1-D arrays from here on.
val_probs = np.array(val_probs).flatten()
val_targets = np.array(val_targets).flatten()

# Pick the F1-optimal threshold for the uncalibrated probabilities.
precision, recall, thresholds = precision_recall_curve(val_targets, val_probs)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
best_idx = np.argmax(f1_scores)
# precision_recall_curve evaluates `score >= threshold`, while this notebook
# classifies with a strict `prob > threshold`. Step one float below so the
# strict comparison keeps the boundary samples and reproduces best_f1.
best_threshold = np.nextafter(thresholds[best_idx], -np.inf)
best_f1 = f1_scores[best_idx]

# The value is the F1 of the positive class taken from the PR curve, not a macro average.
print(f"Uncalibrated best threshold: {best_threshold:.4f}, F1 (positive class): {best_f1:.4f}")

# Fit the calibrators on the uncalibrated validation probabilities.
platt = LogisticRegression()
platt.fit(val_probs.reshape(-1, 1), val_targets)

iso_reg = IsotonicRegression(out_of_bounds='clip')
iso_reg.fit(val_probs, val_targets)

# Apply the calibration.
val_probs_platt = platt.predict_proba(val_probs.reshape(-1, 1))[:, 1]
val_probs_iso = iso_reg.predict(val_probs)

def find_best_threshold(probs, targets):
    """Return ``(threshold, f1)`` maximising the positive-class F1 on the PR curve.

    ``precision_recall_curve`` reports each threshold for the rule
    ``score >= threshold``, but the threshold returned here is consumed with a
    strict ``prob > threshold``. The returned value is therefore moved one
    representable float below the curve threshold, so the strict comparison
    selects exactly the samples the reported F1 was computed for. This matters
    most for isotonic outputs, which are piecewise constant and often sit
    exactly on the threshold.
    """
    precision, recall, thresholds = precision_recall_curve(targets, probs)
    f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
    best_idx = np.argmax(f1_scores)
    strict_threshold = np.nextafter(thresholds[best_idx], -np.inf)
    return strict_threshold, f1_scores[best_idx]

best_thresh_platt, best_f1_platt = find_best_threshold(val_probs_platt, val_targets)
best_thresh_iso, best_f1_iso = find_best_threshold(val_probs_iso, val_targets)

# find_best_threshold derives F1 from the precision/recall curve, so the value
# is the F1 of the positive class rather than a macro average.
print(f"Platt scaling: best threshold = {best_thresh_platt:.4f}, best F1 (positive class) = {best_f1_platt:.4f}")
print(f"Isotonic regression: best threshold = {best_thresh_iso:.4f}, best F1 (positive class) = {best_f1_iso:.4f}")

def classification_report_with_threshold(probs, targets, threshold):
    """Print the classification report and confusion matrix for ``probs > threshold``."""
    hard_labels = (probs > threshold).astype(int)
    report = classification_report(targets, hard_labels, digits=4, zero_division=0)
    matrix = confusion_matrix(targets, hard_labels)
    print(report)
    print("Confusion matrix:")
    print(matrix)

# Reports for the three probability variants, each at its own selected threshold.
print("\nUncalibrated classification report:")
classification_report_with_threshold(val_probs, val_targets, best_threshold)

print("\nPlatt scaling classification report:")
classification_report_with_threshold(val_probs_platt, val_targets, best_thresh_platt)

print("\nIsotonic regression classification report:")
classification_report_with_threshold(val_probs_iso, val_targets, best_thresh_iso)

# Save the calibrators for the test cells.
joblib.dump(platt, os.path.join(output_dir, "platt_scaler_chunked_extended.pkl"))
joblib.dump(iso_reg, os.path.join(output_dir, "isotonic_regressor_chunked_extended.pkl"))

# Also save the thresholds selected for each calibrator.

thresholds_dict = {
    "platt_threshold": float(best_thresh_platt),
    "isotonic_threshold": float(best_thresh_iso)
}

with open(os.path.join(output_dir, "calibration_thresholds_chunked_extended.json"), "w") as f:
    json.dump(thresholds_dict, f)

In [None]:
# Cell 17.1: Evaluate the result (test split of the dataset)

# Load the model and its threshold.
# NOTE(review): weights_only=False unpickles arbitrary objects - only load checkpoints you created yourself.
model_bin = CustomModel(config.MODEL, config.NUM_CLASSES_BIN, 0.3).to(device)
checkpoint = torch.load(os.path.join(output_dir, "deberta_v3_xsmall_binary_chunked_extended.pth"), weights_only=False)
model_bin.load_state_dict(checkpoint['model_state_dict'])
truth_threshold = checkpoint['best_threshold']
print(f"Загрузили модель с порогом {truth_threshold:.3f}")

model_bin.eval()


# Load the calibrators.
platt = joblib.load(os.path.join(output_dir, "platt_scaler_chunked_extended.pkl"))
iso_reg = joblib.load(os.path.join(output_dir, "isotonic_regressor_chunked_extended.pkl"))

# Load the thresholds.
with open(os.path.join(output_dir, "calibration_thresholds_chunked_extended.json"), "r") as f:
    thresholds = json.load(f)

platt_threshold = thresholds["platt_threshold"]
isotonic_threshold = thresholds["isotonic_threshold"]


# calibrator - platt or iso_reg, truth_threshold - platt_*/isotonic_*
# The checkpoint threshold read above is replaced here by the calibrator's own threshold.
calibrator = platt
truth_threshold = platt_threshold

all_chunk_logits = []
all_chunk_doc_ids = []
all_labels_dict = {}

# Collect chunk logits, document ids and (when present) the first label per document.
with torch.no_grad():
    for batch in tqdm(test_loader_bin, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        doc_ids = batch['doc_ids']
        labels = batch.get('labels')

        outputs = model_bin(input_ids, attention_mask)
        all_chunk_logits.append(outputs.detach().cpu())
        all_chunk_doc_ids.append(doc_ids)

        if labels is not None:
            # Store plain scalars so the target array ends up 1-D.
            for doc_id, label in zip(doc_ids.tolist(), labels.float().tolist()):
                all_labels_dict.setdefault(doc_id, label)

chunk_logits_tensor = torch.cat(all_chunk_logits, dim=0)
chunk_doc_ids_total = torch.cat(all_chunk_doc_ids, dim=0)

doc_logits_dict = defaultdict(list)
for logit, doc_id in zip(chunk_logits_tensor, chunk_doc_ids_total):
    doc_logits_dict[doc_id.item()].append(logit)

# One mean logit per document, in order of first appearance in the loader.
doc_ids_in_order = list(doc_logits_dict)
agg_logits = np.array([
    torch.stack(logits_list).mean(dim=0).item()
    for logits_list in doc_logits_dict.values()
])
probs_raw = 1 / (1 + np.exp(-agg_logits))

# Calibrate all documents in one call; both branches yield a flat 1-D array.
# (Per-document isotonic predictions used to be stored as 1-element arrays,
# which broke the scalar formatting of the probabilities further down.)
if isinstance(calibrator, LogisticRegression):
    test_probs = calibrator.predict_proba(probs_raw.reshape(-1, 1))[:, 1]
else:
    test_probs = np.asarray(calibrator.predict(probs_raw)).ravel()

test_preds = (test_probs > truth_threshold).astype(int)

if all_labels_dict:
    test_targets = np.array([all_labels_dict[doc_id] for doc_id in doc_ids_in_order])
else:
    test_targets = np.array([])

if len(test_targets) > 0:
    print("Test Classification Report:")
    print(classification_report(test_targets, test_preds, digits=4, zero_division=0))
    print("Macro F1-score:", f1_score(test_targets, test_preds, average='macro'))
    print("Confusion Matrix:")
    print(confusion_matrix(test_targets, test_preds))

    # Error indices. Both arrays are 1-D here; with (N, 1) targets the
    # comparison used to broadcast to an N x N matrix and return wrong rows.
    false_positives_idx = np.where((test_preds == 1) & (test_targets == 0))[0]
    false_negatives_idx = np.where((test_preds == 0) & (test_targets == 1))[0]

    # Probabilities of the misclassified documents.
    fp_confidence = test_probs[false_positives_idx]
    fn_confidence = test_probs[false_negatives_idx]

    # Most confident mistakes first: highest probability for false positives,
    # lowest probability for false negatives.
    fp_sorted_idx = false_positives_idx[np.argsort(-fp_confidence)]
    fn_sorted_idx = false_negatives_idx[np.argsort(fn_confidence)]

    # NOTE(review): test_df.iloc[i] assumes the i-th aggregated document is the
    # i-th row of test_df (unshuffled loader, unique document ids) - confirm.
    print("False Positives (ложь -> правда), отсортированные по уверенности ошибки:")
    for i in fp_sorted_idx[:5]:
        print(f"Текст: {test_df.iloc[i]['statement']}")
        print(f"Вероятность (ошибки): {test_probs[i]:.4f}")
        print("---")

    print("False Negatives (правда -> ложь), отсортированные по уверенности ошибки:")
    for i in fn_sorted_idx[:5]:
        print(f"Текст: {test_df.iloc[i]['statement']}")
        print(f"Вероятность (ошибки): {test_probs[i]:.4f}")
        print("---")
else:
    print("Метки отсутствуют, выведены только предсказания.")

In [None]:
# Cell 17.2: Evaluate the result (manual input)
test_device = torch.device('cpu')

tokenizer = AutoTokenizer.from_pretrained(config.MODEL)

# Load the model and its threshold.
# map_location keeps the checkpoint tensors on test_device, so the cell also
# works when the checkpoint was saved from a GPU and no GPU is available now.
# NOTE(review): weights_only=False unpickles arbitrary objects - only load checkpoints you created yourself.
model_bin = CustomModel(config.MODEL, config.NUM_CLASSES_BIN, 0.3).to(test_device)
checkpoint = torch.load(
    os.path.join(output_dir, "deberta_v3_xsmall_binary_chunked_extended.pth"),
    map_location=test_device,
    weights_only=False,
)
model_bin.load_state_dict(checkpoint['model_state_dict'])
truth_threshold = checkpoint['best_threshold']


# Load the calibrators.
platt = joblib.load(os.path.join(output_dir, "platt_scaler_chunked_extended.pkl"))
iso_reg = joblib.load(os.path.join(output_dir, "isotonic_regressor_chunked_extended.pkl"))

# Load the thresholds.
with open(os.path.join(output_dir, "calibration_thresholds_chunked_extended.json"), "r") as f:
    thresholds = json.load(f)

platt_threshold = thresholds["platt_threshold"]
isotonic_threshold = thresholds["isotonic_threshold"]


# calibrator - platt/iso_reg, truth_threshold - platt_*/isotonic_*
# The checkpoint threshold read above is replaced here by the calibrator's own threshold.
calibrator = platt
truth_threshold = platt_threshold



# Split a text into encoded chunks
def chunk_text(text, tokenizer, max_len=None, stride=None):
    """Tokenize ``text`` and encode it as one or more fixed-length chunks.

    Mirrors the chunking used to build the training data: a text of at most
    ``max_len`` tokens becomes exactly one chunk (this includes empty text);
    longer texts get one chunk per window start in ``range(0, len(tokens), stride)``.

    Args:
        text: input string.
        tokenizer: object providing ``tokenize`` and ``encode_plus``.
        max_len: window length; defaults to ``config.MAX_LEN``.
        stride: window step; defaults to ``config.MAX_LEN // 2``.

    Returns:
        List of the ``encode_plus`` results, one per chunk.

    Raises:
        ValueError: if a long text has to be windowed with ``stride <= 0``.
    """
    # Defaults are resolved at call time so the function does not need
    # ``config`` to exist when it is defined.
    if max_len is None:
        max_len = config.MAX_LEN
    if stride is None:
        stride = config.MAX_LEN // 2

    tokens = tokenizer.tokenize(text)

    if len(tokens) <= max_len:
        # Short text: a single chunk, without the extra overlapping window the
        # sliding loop would add, and never an empty chunk list.
        windows = [tokens]
    else:
        if stride <= 0:
            raise ValueError(f"stride must be positive, got {stride}")
        windows = [tokens[start:start + max_len] for start in range(0, len(tokens), stride)]

    return [
        tokenizer.encode_plus(
            window,
            is_split_into_words=True,
            max_length=max_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        for window in windows
    ]

# Calibrated prediction for one text, averaging over its chunks
def predict_text_chunked(text, model, tokenizer, calibrator, threshold, device):
    """Classify ``text`` and return ``(label, calibrated_probability)``.

    The text is chunked with ``chunk_text``, all chunks are scored in one
    forward pass, the chunk logits are averaged, passed through a sigmoid and
    then through ``calibrator``.

    Args:
        text: input string.
        model: callable as ``model(input_ids, attention_mask)`` returning logits.
        tokenizer: tokenizer forwarded to ``chunk_text``.
        calibrator: a ``LogisticRegression`` (uses ``predict_proba``) or any
            object with ``predict`` taking a 1-D array.
        threshold: the label is "Правда" when the calibrated probability is
            strictly greater than this value, otherwise "Ложь".
        device: device the input tensors are moved to.

    Returns:
        ``(label, probability)`` with the probability as a Python float.

    Raises:
        ValueError: if the text produces no chunks.
    """
    model.eval()
    chunks = chunk_text(text, tokenizer)
    if not chunks:
        # torch.cat on an empty list fails with an opaque error; be explicit.
        raise ValueError("Text produced no chunks to classify.")

    input_ids = torch.cat([chunk['input_ids'] for chunk in chunks], dim=0).to(device)
    attention_mask = torch.cat([chunk['attention_mask'] for chunk in chunks], dim=0).to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask)
        logits = logits.squeeze(-1).cpu()

    agg_logit = logits.mean().item()
    prob_raw = 1 / (1 + np.exp(-agg_logit))

    if isinstance(calibrator, LogisticRegression):
        prob_calibrated = calibrator.predict_proba(np.array([[prob_raw]]))[0, 1]
    else:
        # predict returns a 1-element array; take the scalar so callers can
        # format the probability and stack it like the Platt result.
        prob_calibrated = calibrator.predict(np.array([prob_raw]))[0]
    prob_calibrated = float(prob_calibrated)

    label = "Правда" if prob_calibrated > threshold else "Ложь"
    return label, prob_calibrated

# Prediction function for SHAP
def predict_proba_shap(texts):
    """Return an array with one ``[1 - p, p]`` row per text.

    ``p`` is the probability returned by ``predict_text_chunked`` using the
    module-level ``model_bin``, ``tokenizer``, ``calibrator``,
    ``truth_threshold`` and ``test_device``.
    """
    positive = np.array([
        predict_text_chunked(text, model_bin, tokenizer, calibrator, truth_threshold, test_device)[1]
        for text in texts
    ])
    return np.stack([1 - positive, positive], axis=1)

# SHAP explainer over predict_proba_shap with a text masker built from the tokenizer.
shap_explainer = shap.Explainer(predict_proba_shap, masker=shap.maskers.Text(tokenizer))

print("Введите текст для классификации (пустая строка для выхода):")

translator = Translator()

# Built-in Russian phrases that the loop below can feed through the model before reading user input.
standart_phrases = ["Вчерашний шторм был самым слабым за последние годы, он нанес катастрофический ущерб всему городу.",
                    "Этот абсолютно безопасный метод лечения имеет смертельно опасные побочные эффекты.",
                    "Компания одновременно побила все рекорды по прибыли и обанкротилась в этом квартале.",
                    "Ученые обнаружили кота, который одновременно и жив, и мертв.",
                    "Новое устройство производит больше энергии, чем потребляет, создавая вечный двигатель.",
                    "В результате аварии пять человек погибли, и еще трое получили серьезные ранения, но жертв удалось избежать.",
                    "Президент скоропостижно скончался.",
                    "Глава государства внезапно умер.",
                    "Лидер нации нашел свой конец.",
                    "Земля вращается вокруг Солнца.",
                    "Солнце вращается вокруг Земли.",
                    "Этот ресторан подает самую отвратительную еду в городе.",
                    "Если бы мы все перешли на электромобили, загрязнение воздуха исчезло бы за один день.",
                    "Правда ли, что правительство скрывает инопланетян?",
                    "Новый завод обеспечит работу для 200% безработных в регионе.",
                    "Рождаемость в городе выросла на 150% за одну ночь.",
                    "Цена на хлеб упала до -5 долларов.",
                    "Комитет единогласно одобрил законопроект.",
                    "Комитет единогласно отклонил законопроект.",
                    "Это общеизвестныйфакт.",
                    "Вакцины вызывают @втизм.",
                    "Чтобы приготовить яичницу, нужно сначала почистить апельсин.",
                    "Рыбы вышли на берег прогуляться.",
                    "Солнце сделано из жидкого шоколада."]

# Index of the next built-in phrase to run. Starting at len(standart_phrases)
# skips the built-in phrases entirely; start from 0 to run them first.
standart_number = len(standart_phrases)
while True:
    if (standart_number < len(standart_phrases)):
        raw_input = standart_phrases[standart_number]
        standart_number = standart_number + 1
    else:
        # Interactive mode: an empty line ends the loop.
        raw_input = input(">>> ").strip()
        if raw_input == "":
            print("Выход.")
            break

    print(f"Оригинальный текст: {raw_input}")

    # Translate the text to English; on failure report the error and move on to the next input.
    try:
        translated_text = translator.translate(raw_input, dest='en').text
    except Exception as e:
        print(f"Ошибка перевода: {e}")
        continue

    print(f"Перевод: {translated_text}")

    cleaned_text = clean_text(translated_text)

    print(f"Очищенный перевод: {cleaned_text}")

    label, prob = predict_text_chunked(cleaned_text, model_bin, tokenizer, calibrator, truth_threshold, test_device)
    print(f"Предсказание: {label} (вероятность: {prob:.4f})")

    # print("Объяснение SHAP:")
    # try:
    #     shap_values = shap_explainer([cleaned_text])
    #     shap.plots.text(shap_values)
    # except Exception as e:
    #     print(f"Ошибка SHAP: {e}")

    print("\n")