# Bangla Cyberbullying Classification with LSTM

This notebook implements an LSTM model for multi-class classification of Bangla cyberbullying text.


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_curve,
    auc,
    precision_recall_curve,
)
from sklearn.preprocessing import label_binarize
from bnlp import SentencepieceTokenizer
from bnlp.embedding.fasttext import BengaliFasttext
from sklearn.preprocessing import LabelEncoder
import time
from itertools import cycle
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings

# Import deep learning libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical

# Silence every Python warning for the rest of the session.
# NOTE(review): presumably done to keep cell output free of library
# deprecation notices; be aware it also hides warnings that could matter
# while debugging.
warnings.filterwarnings("ignore")

# Set plot style. The calls run in this order: matplotlib's "ggplot" style,
# then seaborn's defaults with a 1.2x font scale, then seaborn's "whitegrid"
# style.
# NOTE(review): the two seaborn calls presumably override most of what the
# "ggplot" style set just before them - confirm whether that line is needed.
plt.style.use("ggplot")
sns.set(font_scale=1.2)
sns.set_style("whitegrid")

## 1. Loading and Exploring Datasets


In [None]:
# 1. Read the cleaned comment corpus into a DataFrame and report its size
print("Loading dataset...")
original_cleaned_df = pd.read_csv(
    filepath_or_buffer="../../dataset/cleaned/original_cleaned.csv"
)

print(f"Original dataset shape: {original_cleaned_df.shape}")

In [None]:
# Preview the first five rows of the loaded dataset
original_cleaned_df.head(n=5)

## 2. Label Mapping and Visualization


In [None]:
# Assign each class name an integer id (its position in this tuple) and add
# the encoded id as a new column next to the textual label.
label_mapping = {
    name: class_id
    for class_id, name in enumerate(
        ("not bully", "troll", "sexual", "religious", "threat")
    )
}

encoded_labels = original_cleaned_df["label"].map(label_mapping)
original_cleaned_df["label_encoded"] = encoded_labels

In [None]:
# Bar chart of how many comments carry each label in the full dataset
plt.figure(figsize=(12, 5))
label_axes = sns.countplot(data=original_cleaned_df, x="label", palette="viridis")
label_axes.set_title("Distribution of Labels in Original Dataset")
plt.xticks(rotation=45)  # tilt the class names on the x axis
plt.tight_layout()
plt.show()

## 3. Data Splitting


In [None]:
# Hold out 20% of the comments for testing. The split is stratified on the
# encoded label and seeded so it is reproducible.
X, y = original_cleaned_df["comment"], original_cleaned_df["label_encoded"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# One-hot encode the integer targets (5 classes) for the LSTM model
y_train_cat, y_test_cat = (
    to_categorical(labels, num_classes=5) for labels in (y_train, y_test)
)

print(f"Training set size: {len(X_train)}")
print(f"Test set size: {len(X_test)}")

In [None]:
# Bar chart of encoded class ids in the training split
plt.figure(figsize=(10, 6))

train_axes = sns.countplot(x=y_train, palette="viridis")
train_axes.set(
    title="Class Distribution in Training Set",
    xlabel="Class Label",
    ylabel="Count",
)

plt.tight_layout()
plt.show()

## 4. Text Tokenization and Processing for LSTM


In [None]:
# Turn raw comments into fixed-length integer sequences for the LSTM
print("Tokenizing and preparing sequences...")

# Tokenization parameters
MAX_FEATURES = 50000  # keep at most this many words, ranked by frequency
MAX_SEQUENCE_LENGTH = 200  # every comment is padded or truncated to this length

# Learn the vocabulary from the training split only
tokenizer = Tokenizer(oov_token="<OOV>", num_words=MAX_FEATURES)
tokenizer.fit_on_texts(X_train)

# The extra slot accounts for the padding token
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary Size: {vocab_size}")

# Map each comment to its list of word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad/truncate at the end of each sequence so all rows share one length
X_train_pad, X_test_pad = (
    pad_sequences(
        sequences,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )
    for sequences in (X_train_seq, X_test_seq)
)

print(f"Training data shape after padding: {X_train_pad.shape}")
print(f"Testing data shape after padding: {X_test_pad.shape}")

In [None]:
# Build the pre-trained embedding matrix from BengaliFasttext.
# The model-building cell reads both `embedding_matrix` and `EMBEDDING_DIM`,
# so this cell must run before it.
print("Loading FastText word vectors...")
fasttext_model = BengaliFasttext()
# NOTE(review): 300 is assumed to match the dimensionality of the vectors
# returned by the loaded model - confirm against the BNLP model card.
EMBEDDING_DIM = 300

# Row i holds the vector for the word whose tokenizer index is i. Rows that
# are never filled (index 0, skipped indices, failed lookups) stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
failed_lookups = 0
first_failure = None
for word, i in tokenizer.word_index.items():
    if i >= MAX_FEATURES:
        # Outside the num_words cap the tokenizer was configured with
        continue
    try:
        embedding_vector = fasttext_model.get_word_vector(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    except Exception as exc:
        # Stay best-effort (the word keeps its zero vector) but record the
        # failure instead of hiding it. A bare `except:` here would also mask
        # a vector/EMBEDDING_DIM shape mismatch on every word and would trap
        # KeyboardInterrupt.
        failed_lookups += 1
        if first_failure is None:
            first_failure = f"{word!r}: {exc!r}"

if failed_lookups:
    print(
        f"Warning: {failed_lookups} words could not be embedded and keep "
        f"zero vectors (first failure: {first_failure})"
    )

print(f"Created embedding matrix of shape: {embedding_matrix.shape}")

## 5. Building and Training LSTM Model


In [None]:
# Assemble the stacked-LSTM classifier
print("Building LSTM model...")

model = Sequential([
    # Embedding initialised from the pre-trained matrix and kept frozen
    Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        input_length=MAX_SEQUENCE_LENGTH,
        weights=[embedding_matrix],
        trainable=False,
    ),
    # Spatial dropout on the embedded sequence to limit overfitting
    SpatialDropout1D(0.3),
    # First LSTM returns the full sequence so the second LSTM can consume it
    LSTM(units=128, dropout=0.3, recurrent_dropout=0.3, return_sequences=True),
    LSTM(units=64, dropout=0.3, recurrent_dropout=0.3),
    # Softmax over the 5 classes
    Dense(5, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Print the layer-by-layer overview
model.summary()

In [None]:
# Callbacks: keep the best checkpoint (by validation loss) on disk, and stop
# after 3 epochs without validation-loss improvement, restoring best weights.
model_checkpoint = ModelCheckpoint(
    filepath='../../models/lstm/best_lstm_model.h5',
    save_best_only=True,
    monitor='val_loss',
)

early_stopping = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

training_callbacks = [early_stopping, model_checkpoint]

# Fit for up to 10 epochs, holding out 10% of the training data for validation
print("Training LSTM model...")
start_time = time.time()

history = model.fit(
    x=X_train_pad,
    y=y_train_cat,
    batch_size=32,
    epochs=10,
    validation_split=0.1,
    callbacks=training_callbacks,
    verbose=1,
)

training_time = time.time() - start_time
print(f"Model trained in {training_time:.2f} seconds")

In [None]:
# Side-by-side training curves: accuracy on the left, loss on the right
plt.figure(figsize=(12, 5))

# (train key, validation key, panel title, y-axis label)
history_panels = (
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Model Loss', 'Loss'),
)

for panel_no, (train_key, val_key, panel_title, y_label) in enumerate(
    history_panels, start=1
):
    plt.subplot(1, 2, panel_no)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc='upper left')

plt.tight_layout()
plt.show()

## 6. Model Evaluation - Basic Metrics


In [None]:
# Predict on the test set and compute headline metrics
print("Evaluating model...")
y_pred_probs = model.predict(X_test_pad)
# Collapse probability rows and one-hot rows back to integer class ids
y_pred = y_pred_probs.argmax(axis=1)
y_test_labels = y_test_cat.argmax(axis=1)

# Accuracy plus support-weighted precision, recall and F1
accuracy = accuracy_score(y_test_labels, y_pred)
precision, recall, f1 = (
    scorer(y_test_labels, y_pred, average="weighted")
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

In [None]:
# Per-class precision/recall/F1 table, rows named after the label mapping keys
print("Classification Report:")
report_text = classification_report(
    y_test_labels,
    y_pred,
    target_names=list(label_mapping),
)
print(report_text)

## 7. Confusion Matrix Visualization


In [None]:
# Create and display a row-normalized confusion matrix (each row sums to 1
# over the predictions made for that true class).
class_ids = list(label_mapping.values())
class_labels = list(label_mapping.keys())

plt.figure(figsize=(10, 8))
# Passing labels= pins the matrix to one row/column per known class, in
# label_mapping order, so it always lines up with the tick labels below even
# if some class is absent from both y_test_labels and y_pred.
cm = confusion_matrix(y_test_labels, y_pred, labels=class_ids)
row_totals = cm.sum(axis=1, keepdims=True)
# Safe division: a class with no true samples gets a zero row instead of NaN
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
sns.heatmap(
    cm_norm,
    annot=True,
    fmt=".2f",
    cmap="Blues",
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Normalized Confusion Matrix")
plt.tight_layout()
plt.show()

## 8. Analyzing Model Predictions


In [None]:
# Function to analyze incorrect predictions
def analyze_misclassifications(X_test, y_true, y_pred, tokenizer, n_samples=10):
    """Print a random sample of test texts whose predicted label is wrong.

    Args:
        X_test: Texts, accessed by position through ``.iloc``.
        y_true: True class ids, compared element-wise with ``y_pred``.
        y_pred: Predicted class ids.
        tokenizer: Accepted for interface compatibility; not used here.
        n_samples: Maximum number of misclassified examples to print.

    Returns:
        None. All output is printed. Class names are resolved through the
        module-level ``label_mapping``.
    """
    # Positions where the prediction disagrees with the ground truth
    wrong_positions = np.where(y_true != y_pred)[0]

    if wrong_positions.size == 0:
        print("No misclassifications found.")
        return

    # Draw without replacement, never asking for more than are available
    n_to_show = min(n_samples, len(wrong_positions))
    chosen_positions = np.random.choice(wrong_positions, n_to_show, replace=False)

    # Invert label_mapping: class id -> class name
    idx_to_label = {class_id: name for name, class_id in label_mapping.items()}

    print("\nSample of misclassified texts:")
    for i, position in enumerate(chosen_positions):
        text = X_test.iloc[position]
        true_label = idx_to_label[y_true[position]]
        pred_label = idx_to_label[y_pred[position]]
        print(f"\nExample {i+1}:")
        # Long texts are cut to their first 100 characters
        if len(text) > 100:
            print(f"Text: {text[:100]}...")
        else:
            print(f"Text: {text}")
        print(f"True Label: {true_label}")
        print(f"Predicted Label: {pred_label}")

# Print five randomly chosen misclassified test comments
analyze_misclassifications(
    X_test=X_test,
    y_true=y_test_labels,
    y_pred=y_pred,
    tokenizer=tokenizer,
    n_samples=5,
)

## 9. ROC Curve Analysis


In [None]:
# One ROC curve (with its AUC) per class
# The targets are already one-hot, so they serve directly as binarized labels
y_test_bin = y_test_cat
y_score = y_pred_probs

# Per-class false-positive rates, true-positive rates and areas under the curve
fpr, tpr, roc_auc = {}, {}, {}

plt.figure(figsize=(12, 10))
colors = cycle(["blue", "red", "green", "purple", "orange"])
class_names = list(label_mapping.keys())

# class_names drives the loop length; one colour is drawn per class
for i, (class_name, color) in enumerate(zip(class_names, colors)):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    curve_label = f"ROC curve of {class_name} (area = {roc_auc[i]:.2f})"
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

# Dashed diagonal as a visual reference line
plt.plot([0, 1], [0, 1], "k--", lw=2)
plt.gca().set(
    xlim=(0.0, 1.0),
    ylim=(0.0, 1.05),
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="Multi-class ROC Curve",
)
plt.legend(loc="lower right")
plt.tight_layout()
plt.show()

## 10. Precision-Recall Curve Analysis


In [None]:
# One precision-recall curve per class (no average-precision value is computed).
# NOTE: `precision` and `recall` are rebound to per-class dicts here, replacing
# the scalar metrics computed in the basic-metrics cell.
precision, recall = {}, {}
plt.figure(figsize=(12, 10))

# Reuses `colors`, `class_names`, `y_test_bin` and `y_score` from the ROC cell
for i, (class_name, color) in enumerate(zip(class_names, colors)):
    precision[i], recall[i], _ = precision_recall_curve(
        y_test_bin[:, i], y_score[:, i]
    )
    curve_label = f"Precision-Recall curve of {class_name}"
    plt.plot(recall[i], precision[i], color=color, lw=2, label=curve_label)

plt.gca().set(
    xlabel="Recall",
    ylabel="Precision",
    title="Multi-class Precision-Recall Curve",
)
plt.legend(loc="lower left")
plt.tight_layout()
plt.show()

## 11. Model Summary and Saving


In [None]:
# Recap of dataset sizes, preprocessing settings and test accuracy
summary_lines = [
    "LSTM Classification Model Summary:",
    f"Total samples in training set: {len(X_train)}",
    f"Total samples in test set: {len(X_test)}",
    f"Vocabulary size: {vocab_size}",
    f"Sequence length: {MAX_SEQUENCE_LENGTH}",
    f"Embedding dimension: {EMBEDDING_DIM}",
    f"Final model accuracy on test set: {accuracy:.4f}",
]
print("\n".join(summary_lines))

In [None]:
# Save the trained model
model.save("../../models/lstm/multiclass_no_augment_lstm.h5")

# Save the fitted tokenizer next to the model so the same word-to-index
# mapping can be reloaded for inference.
# NOTE: pickle files must only be loaded back from a trusted location.
import pickle

# The original `with` statement lacked its trailing colon (SyntaxError)
with open("../../models/lstm/tokenizer.pickle", "wb") as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved successfully!")