In [15]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, GRU
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os


In [2]:
df = pd.read_csv('/kaggle/input/cyberbullying-classification/cyberbullying_tweets.csv')

In [6]:
print(df.head())

                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47692 entries, 0 to 47691
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   tweet_text          47692 non-null  object
 1   cyberbullying_type  47692 non-null  object
dtypes: object(2)
memory usage: 745.3+ KB


In [5]:
print(df['cyberbullying_type'].value_counts())

cyberbullying_type
religion               7998
age                    7992
gender                 7973
ethnicity              7961
not_cyberbullying      7945
other_cyberbullying    7823
Name: count, dtype: int64


In [8]:
# Text Preprocessing function
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text) # remove numbers
    text = re.sub(r'[^\w\s]', '', text) # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip() # remove extra spaces
    return text

In [9]:
df['cleaned_text'] = df['tweet_text'].apply(clean_text)

In [10]:
# Tokenization and Padding
max_vocab_size = 20000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size)
tokenizer.fit_on_texts(df['cleaned_text'].values)

X = tokenizer.texts_to_sequences(df['cleaned_text'].values)
X = pad_sequences(X, maxlen=max_sequence_length)

In [13]:
# Label Encoding
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['cyberbullying_type'].values)
y = to_categorical(y)  # One-hot encode the labels for multi-class classification

In [16]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Build the RNN model
embedding_dim = 128

model = Sequential()

# Embedding layer
model.add(Embedding(input_dim=max_vocab_size, output_dim=embedding_dim, input_length=max_sequence_length))

# RNN layer (You can switch to LSTM or GRU if you want)
model.add(LSTM(units=64, return_sequences=False))

# Dense layer with dropout
model.add(Dropout(0.5))
model.add(Dense(units=64, activation='relu'))

# Output layer (use softmax for multi-class classification)
model.add(Dense(units=y.shape[1], activation='softmax'))


In [None]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [24]:
# Model Summary
print(model.summary())

None


In [20]:
# Train the model
history = model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 78ms/step - accuracy: 0.5677 - loss: 1.0102 - val_accuracy: 0.8180 - val_loss: 0.4545
Epoch 2/5
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 77ms/step - accuracy: 0.8670 - loss: 0.3396 - val_accuracy: 0.8283 - val_loss: 0.4374
Epoch 3/5
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 77ms/step - accuracy: 0.9048 - loss: 0.2470 - val_accuracy: 0.8227 - val_loss: 0.4826
Epoch 4/5
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 78ms/step - accuracy: 0.9235 - loss: 0.1979 - val_accuracy: 0.8175 - val_loss: 0.5708
Epoch 5/5
[1m597/597[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 78ms/step - accuracy: 0.9306 - loss: 0.1778 - val_accuracy: 0.8152 - val_loss: 0.5888


In [23]:
# Evaluate the model
score = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {score[1]}")

[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 15ms/step - accuracy: 0.8166 - loss: 0.5831
Test Accuracy: 0.8151797652244568
