<a href="https://colab.research.google.com/github/Radzon/Toxic_comments_detection/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import lightgbm as lgb
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
nltk.download('punkt')
nltk.download('stopwords')
from sklearn.metrics import precision_score, recall_score, precision_recall_curve, classification_report
from matplotlib import pyplot as plt

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
def load_and_process_data(filepath):
    data = []
    count = 0
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ', 1)
            if len(parts) != 2:
                continue
            labels, text = parts
            labels = labels.split(',')
            if '__label__NORMAL' in labels:
                data.append((text, 0))  # Нейтральный комментарий
            else:
                data.append((text, 1))  # Негативный комментарий
            count += 1
    print(count)
    return pd.DataFrame(data, columns=['text', 'label'])

In [14]:
filepath = './dataset.txt'
df = load_and_process_data(filepath)

248290


In [15]:
# Разделение данных на классы
df_majority = df[df.label == 0]
df_minority = df[df.label == 1]

# Андерсемплирование большинства класса
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # Замена не нужна
                                   n_samples=len(df_minority),  # Чтобы количество совпадало с меньшинством
                                   random_state=42)  # Для воспроизводимости

# Объединение андерсемплированного большинства и меньшинства
df_balanced = pd.concat([df_majority_downsampled, df_minority])

In [24]:
df_balanced['label'].value_counts()

label
0    44605
1    44605
Name: count, dtype: int64

In [25]:
X_train, X_test, y_train, y_test = train_test_split(df_balanced['text'],
                                                    df_balanced['label'],
                                                    test_size=0.2,
                                                    stratify=df_balanced['label'])

In [28]:
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [29]:
train_data = lgb.Dataset(X_train_tfidf, label=y_train)
test_data = lgb.Dataset(X_test_tfidf, label=y_test, reference=train_data)

In [30]:
num_negatives = (y_train == 0).sum()
num_positives = (y_train == 1).sum()
scale_pos_weight = num_negatives / num_positives

In [31]:
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'scale_pos_weight': scale_pos_weight
}

In [32]:
model = lgb.train(
    params,
    train_data,
    num_boost_round=100,
    valid_sets=[train_data, test_data],
    callbacks=[
        lgb.early_stopping(stopping_rounds=10),
    ]
)

Training until validation scores don't improve for 10 rounds
Did not meet early stopping. Best iteration is:
[100]	training's binary_logloss: 0.350545	valid_1's binary_logloss: 0.359851


In [33]:
y_pred_proba = model.predict(X_test_tfidf)

In [34]:
optimal_threshold = 0.5
for threshold in [i * 0.01 for i in range(100)]:
    y_pred_custom = (y_pred_proba >= threshold).astype(int)
    precision = precision_score(y_test, y_pred_custom)
    if precision >= 0.95:
        optimal_threshold = threshold
        break

In [35]:
y_pred_custom = (y_pred_proba >= optimal_threshold).astype(int)
recall = recall_score(y_test, y_pred_custom)
precision = precision_score(y_test, y_pred_custom)

In [36]:
print(f"Optimal Threshold: {optimal_threshold}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(classification_report(y_test, y_pred_custom))

Optimal Threshold: 0.52
Precision: 0.9510400452808829
Recall: 0.7533908754623921
              precision    recall  f1-score   support

           0       0.80      0.96      0.87      8921
           1       0.95      0.75      0.84      8921

    accuracy                           0.86     17842
   macro avg       0.87      0.86      0.86     17842
weighted avg       0.87      0.86      0.86     17842

