In [1]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt

# NLTK for text processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

# TensorFlow and Keras for building and training neural network models
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Embedding, LSTM, Dense, Dropout)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.metrics import Precision, Recall
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Sklearn for preprocessing and model evaluation
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

# Read the labelled tweets from the CSV file into a DataFrame
dataset_path = 'cyberbullying_tweets.csv'
df = pd.read_csv(dataset_path)

# Fetch the NLTK resources needed for tokenizing and stop-word filtering,
# then create the stemmer used by the text-cleaning function
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)
stemmer = PorterStemmer()

# Lazily-built set of English stop words shared by every call, so the stop-word
# list is built once instead of once per token.
_english_stopwords_cache = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _english_stopwords_cache
    if _english_stopwords_cache is None:
        _english_stopwords_cache = frozenset(stopwords.words('english'))
    return _english_stopwords_cache


def clean_and_preprocess_text(text):
    """Normalize a raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: strip URLs, strip @mentions and '#' characters, drop
    every character that is not an ASCII letter or whitespace, lowercase,
    tokenize, remove English stop words, and stem each remaining token.

    Args:
        text: The raw tweet text. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are treated as an
            empty tweet.

    Returns:
        The cleaned tokens joined by single spaces; "" when nothing is left
        or when ``text`` is not a string.
    """
    # re.sub raises TypeError on non-string input, which would abort a whole
    # DataFrame.apply over a single missing cell.
    if not isinstance(text, str):
        return ""

    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()

    # Set membership is O(1); the original called stopwords.words('english')
    # for every token, rebuilding the list and scanning it linearly each time.
    stop_words = _english_stopwords()

    processed_text = []
    # NOTE: punctuation has already been removed above, so sent_tokenize will
    # normally see the tweet as a single sentence; the call is kept so the
    # output stays identical to the previous pipeline.
    for sentence in sent_tokenize(text):
        processed_text.extend(
            stemmer.stem(token)
            for token in word_tokenize(sentence)
            if token not in stop_words
        )
    return " ".join(processed_text)

# Store the cleaned version of every tweet in its own column
df['cleaned_tweet_text'] = df['tweet_text'].apply(clean_and_preprocess_text)
cleaned_tweets = df['cleaned_tweet_text']

# Binarize the class labels (fit first, then transform the same column)
encoder = LabelBinarizer()
encoder.fit(df['cyberbullying_type'])
y_encoded = encoder.transform(df['cyberbullying_type'])

# Build the vocabulary, map each tweet to a sequence of word ids and pad
# every sequence to the same length
maxlen = 200
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_tweets)
vocab_size = 1 + len(tokenizer.word_index)
X = tokenizer.texts_to_sequences(cleaned_tweets)
X_padded = pad_sequences(X, maxlen=maxlen)

# Hold out 20% of the samples for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.20,
    random_state=42,
)

# Load GloVe embeddings
embedding_dim = 200  # must match the vector size of the GloVe file opened below
embeddings_dictionary = {}
with open('glove.6B.200d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        # A usable row is the word followed by exactly `embedding_dim` values.
        # Rejecting everything else before parsing skips blank lines (which
        # would raise IndexError on records[0]) and rows with the wrong field
        # count (which may also fail the float conversion).
        if len(records) != embedding_dim + 1:
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Build the embedding weight matrix: row i holds the GloVe vector of the word
# whose tokenizer id is i. Rows for words missing from GloVe, and row 0 (never
# assigned here), stay all-zero.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    # No bounds check is needed: vocab_size is len(word_index) + 1, so every
    # id in word_index is a valid row.
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Define the model with LSTM
model = Sequential([
    # Embedding layer initialised from the pre-trained matrix and frozen
    Embedding(vocab_size, 200, weights=[embedding_matrix], input_length=maxlen, trainable=False),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.5),
    # One output unit per column of the binarized labels
    Dense(y_encoded.shape[1], activation='softmax')
])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='categorical_crossentropy',
              metrics=['accuracy', Precision(name='precision'), Recall(name='recall')])

# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min', restore_best_weights=True)
# Both callbacks watch val_loss, so the LR scheduler's patience has to be
# shorter than EarlyStopping's. With the previous value of 5, training was
# stopped after 3 stagnant epochs and the learning rate was effectively never
# reduced. With 2, the rate drops first and the model gets another epoch at
# the lower rate before early stopping can trigger.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.00001)

# Train the model with callbacks; 20% of the training split is held out for
# the validation metrics the callbacks monitor
history = model.fit(X_train, y_train, validation_split=0.2, epochs=30, batch_size=128,
                    callbacks=[early_stopping, reduce_lr], verbose=1)

# Save the model in the .h5 format
model_save_path_h5 = 'Cyber_Bullying_model_lstm_withcallbacks.h5'
model.save(model_save_path_h5)
print(f"Model saved to {model_save_path_h5}")



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinnu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinnu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


KeyboardInterrupt: 