In [None]:
!pip install catboost

In [2]:
import spacy

In [3]:
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report

In [4]:
# Load the small English spaCy pipeline once and reuse it for every comment.
# The notebook only reads lemmas and lexical flags (stop word / punctuation /
# whitespace / alphabetic), so the dependency parser and the named-entity
# recognizer are disabled to cut per-document processing time.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [5]:
def preprocess_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Tokens flagged as stop words, punctuation or whitespace, and tokens that
    are not purely alphabetic, are dropped. The lemma of every remaining token
    is lower-cased and stripped, and the lemmas are joined with single spaces.

    Returns:
        str: the space-joined lemmas (empty string when nothing is kept).
    """
    lemmas = []
    for tok in nlp(text):
        # Skip tokens that carry no lexical content.
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        # Keep alphabetic tokens only (drops numbers, mixed tokens, symbols).
        if not tok.is_alpha:
            continue
        lemmas.append(tok.lemma_.lower().strip())
    return " ".join(lemmas)

In [6]:
def truncate_text(text, max_len=400):
    """Return the leading ``max_len`` items of ``text`` (characters for a str).

    Input shorter than ``max_len`` is returned unchanged in content.
    """
    head = slice(None, max_len)
    return text[head]

In [7]:
# Load the dataset from the CSV file (preparation happens in later cells).
df = pd.read_csv("youtube_toxic_comments.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [9]:
# Character length of every comment. The vectorised .str.len() accessor yields
# NaN for missing or non-string cells, whereas apply(len) would raise on them.
df['text_length'] = df['Text'].str.len()

In [None]:
# Summary statistics of the comment-length column.
print(df['text_length'].describe())

In [None]:
# Histogram (with a KDE overlay) of comment lengths.
# One explicit axes is used so the plot fills the whole figure; plt.show()
# renders the figure without echoing the repr of the last call.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df['text_length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution comment length')
ax.set_xlabel('Comment length')
ax.set_ylabel('Count')
plt.show()

In [12]:
# Cast the label to int, then lemmatize every comment and cap its length at
# truncate_text's default limit. Overwrites df['Text'] in place.
df['IsToxic'] = df['IsToxic'].astype(int)
df['Text'] = (
    df['Text']
    .astype(str)
    .apply(preprocess_text)
    .astype(str)
    .apply(truncate_text)
)

In [None]:
# Preview the frame again after the text column was preprocessed.
df.head()

In [None]:
# Number of rows per label value, to check class balance.
df["IsToxic"].value_counts()

In [None]:
# Bar chart of the label distribution; title and axis labels are set on the
# axes returned by seaborn.
# NOTE(review): recent seaborn releases may warn when `palette` is passed
# without `hue` — confirm against the installed version.
ax = sns.countplot(data=df, x="IsToxic", palette=["skyblue", "salmon"])
ax.set_title("Распределение токсичных и нетоксичных комментариев")
ax.set_xlabel("Токсичность (0 = нет, 1 = да)")
ax.set_ylabel("Количество")
plt.show()


In [16]:
# Features are the preprocessed comment text; the target is the toxicity label.
X, y = df['Text'], df['IsToxic']

In [17]:
# Split the data: 10% is held out for testing, stratified on the label, with a
# fixed random_state so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

In [None]:
# Size of the training split.
X_train.shape

In [None]:
# Size of the held-out test split.
X_test.shape

In [25]:
# TF-IDF features (unigrams and bigrams) feeding a CatBoost classifier.
# min_df=3 drops terms that occur in fewer than 3 documents; max_df=0.9 drops
# terms that occur in more than 90% of documents.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9)),
    ("catboost", CatBoostClassifier(
        iterations=2000,
        depth=6,
        random_seed=42,
        task_type="GPU",  # training runs on the GPU; a GPU must be available
        verbose=200,      # report progress every 200 iterations, not every one
    )),
])

In [None]:
# Fit the vectorizer and the classifier on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split and print per-class precision/recall/F1
# with four decimal places.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

## Логистическая регрессия + TF-IDF

In [29]:
# TF-IDF features (unigrams and bigrams) feeding a logistic regression.
# NOTE: this rebinds `pipeline`, replacing the pipeline object built earlier.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), min_df=3, max_df=0.9)),
    ("logreg", LogisticRegression(
        C=0.9,                     # inverse of regularization strength
        penalty="l2",             # regularization type
        solver="liblinear",       # compatible with both 'l1' and 'l2'
        max_iter=1000,
        random_state=42
    ))
])

In [None]:
# Fit the vectorizer and the logistic regression on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split and print per-class precision/recall/F1
# with four decimal places.
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred, digits=4))

## BERT Classification

In [138]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [139]:
def compute_metrics(eval_pred):
    """Compute classification metrics for one evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; ``logits`` must support
            ``.argmax(axis=1)``, which selects the predicted class per row.

    Returns:
        dict: scores under the keys "accuracy", "f1", "precision", "recall".
    """
    scores, targets = eval_pred
    predicted = scores.argmax(axis=1)

    scorers = (
        ("accuracy", accuracy_score),
        ("f1", f1_score),
        ("precision", precision_score),
        ("recall", recall_score),
    )
    return {name: scorer(targets, predicted) for name, scorer in scorers}

In [140]:
# Reload the CSV from disk; this rebinds `df`, discarding the in-place text
# preprocessing applied to the earlier frame.
df = pd.read_csv("youtube_toxic_comments.csv")

In [141]:
# Cast the label to int and cap each comment at truncate_text's default length.
# astype(str) guards against non-string cells (e.g. NaN from empty CSV
# fields), which cannot be sliced by truncate_text; it also matches the cast
# applied before preprocessing for the TF-IDF models.
df['IsToxic'] = df['IsToxic'].astype(int)
df["Text"] = df["Text"].astype(str).apply(truncate_text)

In [142]:
# Stratified 90/10 split of raw texts and labels. random_state is fixed so the
# split is reproducible across runs, as for the TF-IDF models' split.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["Text"].tolist(),
    df["IsToxic"].tolist(),
    test_size=0.1,
    stratify=df["IsToxic"],
    random_state=42,
)

In [143]:
# Load the tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [144]:
def tokenize(batch):
    """Tokenize the "text" field of ``batch`` with the module-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 128.
    """
    encode_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(batch["text"], **encode_options)

In [None]:
# Wrap each split as a Dataset with "text" and "label" columns, then tokenize
# it in batches.
train_dataset = Dataset.from_dict(
    {"text": train_texts, "label": train_labels}
).map(tokenize, batched=True)

test_dataset = Dataset.from_dict(
    {"text": test_texts, "label": test_labels}
).map(tokenize, batched=True)

In [None]:
# Load "bert-base-uncased" with a sequence-classification head sized for two labels.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [151]:
# Training configuration: evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    report_to="tensorboard",
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    # Make the "best model" criterion explicit: pick the checkpoint with the
    # highest F1 as reported by compute_metrics (the Trainer exposes it as
    # "eval_f1"), rather than relying on the implicit default.
    metric_for_best_model="f1",
    greater_is_better=True,
    # Saving every epoch would otherwise leave ten full checkpoints on disk;
    # keep only a couple (the best one is always retained).
    save_total_limit=2,
)


In [152]:
# Assemble the Trainer from the model, its configuration, the tokenized
# datasets and the metric callback.
# NOTE(review): eval_dataset is the test split and training_args sets
# load_best_model_at_end=True, so the restored checkpoint is selected on test
# data — consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Run fine-tuning; evaluation, logging and checkpoint cadence come from training_args.
trainer.train()

![Result trainer.train](trainer_result.png)

TrainOutput(global_step=570, training_loss=0.14070430493407082, metrics={'train_runtime': 309.3423, 'train_samples_per_second': 29.094, 'train_steps_per_second': 1.843, 'total_flos': 591999874560000.0, 'train_loss': 0.14070430493407082, 'epoch': 10.0})