In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Conv1D, GRU, Dense, Dropout, Bidirectional, Attention, GlobalAveragePooling1D

# Download NLTK stopwords and lemmatizer resources
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Load CSV file
csv_file = 'HateSpeechDatasetBalanced.csv'  # Replace with the path to your file
df = pd.read_csv(csv_file)

# Preprocess text
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

# Apply text preprocessing
df['cleaned_text'] = df['Content'].apply(clean_text)

# Split the dataset
X = df['cleaned_text'].values
y = df['Label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Tokenize text
max_vocab_size = 10000
max_sequence_length = 150

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

# Define CNN-GRU model
def build_cnn_gru_model(input_length, vocab_size):
    inputs = tf.keras.Input(shape=(input_length,))
    
    # Embedding Layer
    x = Embedding(vocab_size, 128)(inputs)
    
    # CNN Layer
    x = Conv1D(128, 5, activation='relu')(x)
    
    # GRU Layer
    x = Bidirectional(GRU(128, return_sequences=True))(x)
    
    # Attention Mechanism
    attention = Attention()([x, x])
    
    # Global Pooling Layer
    x = GlobalAveragePooling1D()(attention)
    
    # Fully Connected Layer
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    
    # Output Layer
    outputs = Dense(1, activation='sigmoid')(x)
    
    return tf.keras.Model(inputs, outputs)

# Build and compile model
input_length = max_sequence_length
model = build_cnn_gru_model(input_length, max_vocab_size)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(train_padded, y_train, epochs=5, batch_size=32, validation_data=(test_padded, y_test))

# Save the model
model.save('cyberbullying_cnn_gru_model.h5')

# Predict on test data
test_predictions = model.predict(test_padded)
test_predictions = np.round(test_predictions).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_test, test_predictions)
precision = precision_score(y_test, test_predictions)
recall = recall_score(y_test, test_predictions)
f1 = f1_score(y_test, test_predictions)

# Print evaluation metrics
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

[nltk_data] Downloading package stopwords to C:\Users\Sai
[nltk_data]     keerthan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Sai
[nltk_data]     keerthan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5
[1m18153/18153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6720s[0m 370ms/step - accuracy: 0.8006 - loss: 0.4240 - val_accuracy: 0.8404 - val_loss: 0.3492
Epoch 2/5
[1m18153/18153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5072s[0m 279ms/step - accuracy: 0.8562 - loss: 0.3229 - val_accuracy: 0.8492 - val_loss: 0.3390
Epoch 3/5
[1m18153/18153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5745s[0m 317ms/step - accuracy: 0.8789 - loss: 0.2767 - val_accuracy: 0.8526 - val_loss: 0.3345
Epoch 4/5
[1m18153/18153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5111s[0m 282ms/step - accuracy: 0.8983 - loss: 0.2375 - val_accuracy: 0.8551 - val_loss: 0.3336
Epoch 5/5
[1m18153/18153[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4354s[0m 240ms/step - accuracy: 0.9125 - loss: 0.2082 - val_accuracy: 0.8534 - val_loss: 0.3584




[1m4539/4539[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m419s[0m 92ms/step
Accuracy: 0.8534195449787914
Precision: 0.8475913621262459
Recall: 0.864582336945382
F1 Score: 0.8560025434792903
