# Fine-Tuning DistilBERT for Offensive Content Detection

This notebook fine-tunes a DistilBERT model for classifying Discord messages as Hate Speech (0), Offensive (1), or Neither (2). It uses the same datasets and preprocessing as `lstm_finetune_clean.ipynb` but applies transfer learning with DistilBERT. The model is converted to TFLite for real-time inference in a Discord bot.

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, DistilBertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from tensorflow.keras.utils import to_categorical
from tensorflow.compat.v1.train import AdamOptimizer
from sklearn.utils import resample
import matplotlib.pyplot as plt
import re
import emoji
import hashlib
import os
from tqdm import tqdm

# Fix: Explicitly import callbacks from tf.keras (not standalone keras)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint





  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Konfigurasi
max_length = 30
batch_size = 16

# Slang normalization (sama dengan lstm_finetune_clean.ipynb)
slang_dict = {
    'wtf': 'what the fuck', 'lol': 'laughing out loud', 'fr': 'for real', 'tbh': 'to be honest',
    'fucking': 'fuckin', 'fuckinng': 'fuckin', 'ur': 'your', 'r': 'are',
    'omg': 'oh my god', 'dope': 'great', 'lit': 'great', 'nigga': 'nigga',
    'pussi': 'pussy', 'hoe': 'ho', 'fam': 'friends', 'dawg': 'friend',
    'stfu': 'shut up', 'yo': 'hey', 'vibin': 'vibing', 'chill': 'relax',
    'slaps': 'great', 'cap': 'lie', 'bet': 'okay'
}

In [3]:
# Clean text
def clean_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ''
    text = text.lower()
    text = emoji.demojize(text, delimiters=(' ', ' '))
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s!?]', '', text)
    for slang, full in slang_dict.items():
        text = re.sub(r'\b' + slang + r'\b', full, text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def hash_text(text):
    return hashlib.md5(str(text).encode()).hexdigest().lower()

def load_and_clean(df, text_col, label_col=None, rename=True):
    if rename and label_col:
        df = df.rename(columns={text_col: 'text', label_col: 'label'})
    elif rename:
        df = df.rename(columns={text_col: 'text'})
    df = df.dropna(subset=['text'])
    df['text'] = df['text'].apply(clean_text)
    df = df[df['text'].str.len() > 0]
    df['hash'] = df['text'].apply(hash_text)
    return df

In [4]:
# Load datasets
df = pd.read_csv('data/labeled_data_clean.csv')
df = load_and_clean(df, 'clean_tweet', 'class')

In [5]:
toxic_data = pd.read_csv('data/toxic-chat_annotation_all.csv')
toxic_data['label'] = toxic_data['toxicity'].apply(lambda x: 1 if x == 1 else 2)
toxic_data = load_and_clean(toxic_data, 'user_input', 'label')

In [6]:
oig_data = pd.read_csv('data/OIG_safety_v0.2.csv')
offensive_keywords = ['abuse', 'hate', 'offensive', 'bastard', 'fuck', 'bitch', 'nigga', 'asshole']
oig_data['label'] = oig_data['text'].apply(
    lambda x: 1 if any(word in str(x).lower() for word in offensive_keywords) else 2
)
oig_data = load_and_clean(oig_data, 'text', 'label', rename=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(clean_text)


In [7]:
discord_data = pd.read_csv('data/Example-Data-Guild_966767749118443530.csv')
discord_data['label'] = discord_data['content'].apply(
    lambda x: 1 if any(word in str(x).lower() for word in ['bastard', 'fuck', 'bitch']) else 2
)
discord_data = load_and_clean(discord_data, 'content', 'label')

In [8]:
real_world_data = pd.DataFrame({
    'text': ['yo dawg this is lit sunglasses', 'omg so cool fr', 'fam we vibin', 'stfu you jerk', 'bastard you suck',
             'this game slaps fr', 'chill vibes only', 'wtf this is dope', 'cap you lyin fam'],
    'label': [2, 2, 2, 1, 1, 2, 2, 2, 1]
})
real_world_data['text'] = real_world_data['text'].apply(clean_text)
real_world_data = real_world_data[real_world_data['text'].str.len() > 0]
real_world_data['hash'] = real_world_data['text'].apply(hash_text)

In [9]:
# Merge datasets
df = pd.concat([df[['text', 'label', 'hash']], toxic_data[['text', 'label', 'hash']],
                oig_data[['text', 'label', 'hash']], discord_data[['text', 'label', 'hash']],
                real_world_data[['text', 'label', 'hash']]], ignore_index=True)
df = df.drop_duplicates(subset=['hash']).drop(columns=['hash'])

In [10]:
# Oversampling
df_hate = df[df.label == 0]
df_offensive = df[df.label == 1]
df_neither = df[df.label == 2]
df_hate_upsampled = resample(df_hate, replace=True, n_samples=25000, random_state=42)
df_offensive_upsampled = resample(df_offensive, replace=True, n_samples=25000, random_state=42)
df_neither_upsampled = resample(df_neither, replace=True, n_samples=25000, random_state=42)
df_balanced = pd.concat([df_hate_upsampled, df_offensive_upsampled, df_neither_upsampled])
df_balanced = df_balanced.sample(frac=1, random_state=42)

In [11]:
# augmented data
augmented_df = pd.read_csv('data/augmented_data.csv')
df_balanced = pd.concat([df_balanced, augmented_df], ignore_index=True)

In [12]:
# Preprocessing for BERT (clean and convert to strings)
texts = [str(text).strip() if text is not None else "" for text in df_balanced['text']]
labels = df_balanced['label'].tolist()

In [13]:
print(type(texts))
print(texts[:3])  # preview the first few entries

<class 'list'>
['phone drier hey bitch scalp', 'peac fag rememb im best lux support na drop lustboy pick kakao tsm', 'lesbian get nip pierc ew']


In [14]:
# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_length, return_tensors='tf')



In [15]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    encodings['input_ids'].numpy(), to_categorical(labels, num_classes=3),
    test_size=0.15, random_state=42, stratify=labels
)
attention_mask_train, attention_mask_test = train_test_split(
    encodings['attention_mask'].numpy(), test_size=0.15, random_state=42, stratify=labels
)


In [16]:
# Create dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_train, 'attention_mask': attention_mask_train},
    y_train
)).batch(batch_size).prefetch(tf.data.AUTOTUNE)
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': X_test, 'attention_mask': attention_mask_test},
    y_test
)).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [None]:
# Load model
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Compile with TF-compatible optimizer
optimizer = AdamOptimizer(learning_rate=2e-5)
model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])  # or sparse_categorical_crossentropy

# Callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True),
    ModelCheckpoint('model/lstm/bert_model.keras', monitor='val_loss', save_best_only=True)
]

# Train
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,
    callbacks=callbacks,
    verbose=1
)






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 


Epoch 1/3



In [None]:
# Evaluate
test_pred = model.predict(test_dataset)
test_pred_classes = np.argmax(test_pred.logits, axis=1)
test_true_classes = np.argmax(y_test, axis=1)
print("\nClassification Report (Test - BERT):")
print(classification_report(test_true_classes, test_pred_classes, target_names=['Hate', 'Offensive', 'Neither']))



In [None]:
# Confusion Matrix
cm = confusion_matrix(test_true_classes, test_pred_classes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Hate', 'Offensive', 'Neither'])
plt.figure(figsize=(6, 6))
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix - BERT (Test Set)')
plt.show()

# Training History
plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Accuracy over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.subplot(1, 2, 2)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Save model
model.save_pretrained('model/bert/bert_model')
tokenizer.save_pretrained('model/bert/bert_tokenizer')

# Convert to TFLite
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
converter._experimental_lower_tensor_list_ops = False
converter.experimental_new_converter = True
converter.input_shape = (None, max_length)  # Batch size dinamis
tflite_model = converter.convert()
with open('model/lstm/tflite_bert_model.tflite', 'wb') as f:
    f.write(tflite_model)


In [None]:
# Test inference
inference_tests = [
    ('omg beautiful', 2), ('amazing', 2), ('holy', 2), ('omg', 2), ('bastard', 1),
    ('hello guys im here', 2), ('great bro', 2), ('nigger', 0), ('youre a pathetic loser', 1),
    ('lets go', 2), ('are you serious', 2), ('wtf', 2), ('lets fuckin go', 2),
    ('omg how this grok so idiot', 1), ('not bad', 2), ('well done', 2),
    ('had a long day need a nap', 2), ('this is fuckin awesome', 2),
    ('yo this game is dope sunglasses', 2), ('what a bitch always complaining', 1),
    ('this app is a ripoff pure garbage', 1), ('muslims are terrorists ban them all', 0),
    ('black people are just lazy leeches', 0), ('beautiful weather for a walk today', 2),
    ('i love you', 2), ('i mean thats alright', 2), ('just stop doing that', 2),
    ('why are you doing that', 2), ('why you are so dumb', 1), ('dawg do you even had a brain', 1),
    ('indian nigga', 0), ('indian jerk bastard', 1), ('stfu indian', 0),
    ('fr this game slaps bet', 2), ('vibin with the fam yo', 2)
]

test_texts = [clean_text(text) for text, _ in inference_tests]
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length, return_tensors='tf')
test_dataset = tf.data.Dataset.from_tensor_slices((
    {'input_ids': test_encodings['input_ids'], 'attention_mask': test_encodings['attention_mask']}
)).batch(1)

test_preds = model.predict(test_dataset)
test_pred_classes = np.argmax(test_preds.logits, axis=1)

for (text, true_label), pred_class, probs, cleaned in zip(inference_tests, test_pred_classes, test_preds.logits, test_texts):
    class_names = ['Hate', 'Offensive', 'Neither']
    print(f'Input: {text} (Cleaned: {cleaned})')
    print(f'True Class: {class_names[true_label]}')
    print(f'Predicted Class: {class_names[pred_class]}, Confidence: {tf.nn.softmax(probs).numpy()[pred_class]:.4f}')
    print(f'Scores: Hate={tf.nn.softmax(probs).numpy()[0]:.4f}, Offensive={tf.nn.softmax(probs).numpy()[1]:.4f}, Neither={tf.nn.softmax(probs).numpy()[2]:.4f}\n')

correct = sum(p == t for p, (_, t) in zip(test_pred_classes, inference_tests))
accuracy = correct / len(inference_tests)
print(f'Accuracy: {accuracy:.2%}')

In [None]:
# Dokumentasi
print('Transfer Learning: Fine-tuned DistilBERT untuk klasifikasi Hate Speech/Offensive/Neither.')
print('Dataset: OIG_safety_v0.2.csv, labeled_data_clean.csv, toxic-chat_annotation_all.csv, Discord Unveiled, real-world data.')
print('Model disimpan di model/lstm/bert_model dan tflite_bert_model.tflite.')
print('Memenuhi aturan: Utilize TensorFlow, transfer learning allowed, no TensorFlow Hub.')
print('LSTM model retained as backup for comparison in inference.py.')