In [None]:
!pip install transformers datasets torch scikit-learn pandas numpy matplotlib seaborn emoji



In [None]:
import pandas as pd
import numpy as np
import re
import emoji
import torch

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_score

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments)

import matplotlib.pyplot as plt

In [None]:
# Load the labelled comments; the path is relative to the notebook's
# working directory.
TRAIN_CSV = "train.csv"
data = pd.read_csv(TRAIN_CSV)

# Preview the first rows to confirm the expected columns are present.
data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [None]:
def clean_text(text):
    """Normalise a raw comment to lowercase ASCII letters and whitespace.

    Steps, in order:
      1. lowercase the text,
      2. drop anything that looks like a URL (``http...`` / ``www...``),
      3. drop every character that is not ``a-z`` or whitespace.

    Step 3 already removes digits, punctuation and every emoji code point,
    so no separate emoji-stripping pass is needed afterwards.

    Args:
        text: The raw comment. ``None`` and float NaN (what pandas uses for
            missing cells) are treated as an empty comment; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string. Whitespace is preserved as-is, so runs of spaces
        may remain where characters were removed.
    """
    # Missing values would otherwise raise AttributeError on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    # After lowercasing, no ASCII uppercase letters can remain, so `a-z`
    # is sufficient here.
    text = re.sub(r"[^a-z\s]", "", text)
    return text

In [None]:
# Replace each raw comment with its cleaned form (element-wise clean_text).
data['comment_text'] = data['comment_text'].map(clean_text)

In [None]:
# Names of the label columns, in the order used for every label vector
# built from them below.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# One list per row holding that row's values from the label columns.
data['labels'] = data[labels].to_numpy().tolist()

In [None]:
# Pull texts and label vectors out as plain Python lists, then hold out 20%
# of the rows; the fixed random_state keeps the split reproducible.
all_texts = data['comment_text'].tolist()
all_labels = data['labels'].tolist()

train_texts, test_texts, train_labels, test_labels = train_test_split(
    all_texts,
    all_labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Tokenizer matching the DistilBERT checkpoint that is fine-tuned below.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Encode both splits with identical settings (train first, then test).
train_encodings, test_encodings = (
    tokenizer(texts, truncation=True, padding=True)
    for texts in (train_texts, test_texts)
)

In [None]:
class ToxicDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label vectors.

    Args:
        encodings: Mapping from field name to a per-sample indexable
            collection (``encodings[field][i]`` is sample ``i``'s values).
        labels: Indexable collection of per-sample label vectors; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings, self.labels = encodings, labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, plus a float ``labels`` tensor."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        # Labels are explicitly cast to float.
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample

In [None]:
# Wrap each split's encodings and label vectors in a ToxicDataset.
train_dataset = ToxicDataset(encodings=train_encodings, labels=train_labels)
test_dataset = ToxicDataset(encodings=test_encodings, labels=test_labels)

In [None]:
# DistilBERT with a classification head configured for multi-label output.
# The head size is derived from `labels` so it cannot drift out of sync with
# the label columns used to build the targets.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(labels),
    problem_type="multi_label_classification")

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Schedule and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint on the same "epoch" schedule, cap the number
    # of retained checkpoints, and reload the best one when training ends.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
# NOTE(review): the held-out test split doubles as the evaluation set here,
# and training_args asks for the best checkpoint to be reloaded, so the
# final test metrics are not fully independent of model selection.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

# Fine-tune the model.
trainer.train()

In [None]:
# Run the trained model over the held-out split.
predictions = trainer.predict(test_dataset)

# Sigmoid each output independently, then threshold at 0.5 to get a 0/1
# decision per label.
logits = torch.tensor(predictions.predictions)
preds = torch.sigmoid(logits).numpy()
binary_preds = (preds > 0.5).astype(int)

# Per-label precision / recall / F1.
report = classification_report(test_labels, binary_preds, target_names=labels)
print(report)

In [None]:
def toxicity_score(pred):
    """Return the sum of ``pred`` as a built-in ``int``.

    When ``pred`` is a boolean/0-1 vector of per-label decisions, the result
    is the number of labels that fired.
    """
    total = np.sum(pred)
    return int(total)

In [None]:
def moderation_action(score):
    """Translate a severity score into a moderation action.

    Mapping:
        score == 0  -> "Allow"
        score <= 2  -> "Warning"
        score <= 4  -> "Mute"
        otherwise   -> "Auto-Ban"
    """
    if score == 0:
        return "Allow"

    # Tiers are checked in ascending order; the first ceiling that the score
    # does not exceed decides the action.
    for ceiling, action in ((2, "Warning"), (4, "Mute")):
        if score <= ceiling:
            return action

    return "Auto-Ban"

In [None]:
# Example chat messages fed to analyze_chat for the demo below.
chat_samples = [
    "you are stupid",
    "nice gameplay bro",
    "go kill yourself",
    "this is trash game",
    "i hate your community",
]

In [None]:
def analyze_chat(messages, threshold=0.5):
    """Score chat messages with the fine-tuned model and pick an action for each.

    Args:
        messages: A list of message strings (a single string is treated as a
            one-element list).
        threshold: Sigmoid probability above which a label counts as
            detected. Defaults to 0.5, the same cut-off used for evaluation.

    Returns:
        A DataFrame with one row per message and the columns
        ``Message`` (the original, uncleaned text), ``Detected Labels``
        (names from ``labels`` whose probability exceeds ``threshold``),
        ``Severity`` (number of detected labels) and ``Action``
        (result of ``moderation_action``). Empty input yields an empty
        DataFrame with the same columns.
    """
    columns = ["Message", "Detected Labels", "Severity", "Action"]

    if isinstance(messages, str):
        messages = [messages]
    messages = list(messages)
    if not messages:
        return pd.DataFrame(columns=columns)

    # Apply the same preprocessing the training comments went through so the
    # model sees inputs from the distribution it was trained on.
    cleaned = [clean_text(msg) for msg in messages]

    # Training may have moved the model to an accelerator; the inputs must
    # live on the same device as the weights.
    device = next(model.parameters()).device
    enc = tokenizer(cleaned, truncation=True, padding=True, return_tensors="pt")
    enc = {name: tensor.to(device) for name, tensor in enc.items()}

    model.eval()  # make sure dropout is disabled for inference
    with torch.no_grad():
        outputs = model(**enc)
    # .cpu() is required before .numpy() when the logits are on an accelerator.
    probs = torch.sigmoid(outputs.logits).cpu().numpy()

    logs = []
    for msg, p in zip(messages, probs):
        flagged = p > threshold
        labels_detected = [name for name, hit in zip(labels, flagged) if hit]
        score = toxicity_score(flagged)
        action = moderation_action(score)
        logs.append([msg, labels_detected, score, action])
    return pd.DataFrame(logs, columns=columns)

In [None]:
# Run the moderation pipeline on the sample messages and show the result.
log_df = analyze_chat(messages=chat_samples)
log_df

In [None]:
# Bar chart: number of messages at each severity score.
severity_counts = log_df['Severity'].value_counts()
severity_counts.plot.bar()
plt.title("Detected Toxicity Severity Levels")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of message counts per severity score.
# The column created by analyze_chat is "Severity" (capital S); the
# lowercase key raised KeyError. Bars are ordered by severity value rather
# than by frequency so the x-axis reads low -> high.
fig, ax = plt.subplots()
log_df['Severity'].value_counts().sort_index().plot(kind='bar', ax=ax)
ax.set_title('Detected Toxicity Levels')
ax.set_xlabel('Severity (number of labels detected)')
ax.set_ylabel('Messages')
plt.show()