# Cyberbullying tweet classification using a combined LSTM + CNN model

In [1]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform

## Preprocessing the dataset

In [3]:
# Shared preprocessing objects: the WordNet lemmatizer used by clean_text and
# the set of tokens to discard (English stop words plus every single
# punctuation character).
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english')) | set(string.punctuation)

In [4]:
# Load the labelled tweets from cyberbullying_tweets.csv (path is relative to
# the working directory) and preview the first rows.
df = pd.read_csv("cyberbullying_tweets.csv")
df.head()

Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


In [5]:
# messages: the raw tweet text column; y: the class label string for each tweet.
messages = df['tweet_text']
y = df['cyberbullying_type']

In [6]:
# Quick look at the raw tweet text before any cleaning.
df['tweet_text']

0        In other words #katandandre, your food was cra...
1        Why is #aussietv so white? #MKR #theblock #ImA...
2        @XochitlSuckkks a classy whore? Or more red ve...
3        @Jason_Gio meh. :P  thanks for the heads up, b...
4        @RudhoeEnglish This is an ISIS account pretend...
                               ...                        
47687    Black ppl aren't expected to do anything, depe...
47688    Turner did not withhold his disappointment. Tu...
47689    I swear to God. This dumb nigger bitch. I have...
47690    Yea fuck you RT @therealexel: IF YOURE A NIGGE...
47691    Bro. U gotta chill RT @CHILLShrammy: Dog FUCK ...
Name: tweet_text, Length: 47692, dtype: object

In [7]:
def get_simple_pos(tag):
    """Translate a POS tag string into the WordNet POS constant for the lemmatizer.

    The decision is made on the first character of ``tag``:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.
    Any other tag (including an empty one) falls back to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)

def clean_text(review):
    """Tokenise, filter, lemmatise and lowercase a single text.

    Steps:
      1. Split ``review`` into tokens with ``word_tokenize``.
      2. POS-tag the full token sequence in one call.
      3. Drop tokens whose lowercase form is in ``stop_words`` (stop words
         and punctuation).
      4. Lemmatise each remaining token using its POS tag and lowercase it.

    Args:
        review: The raw text to clean.

    Returns:
        The kept, lemmatised, lowercased tokens joined by single spaces.

    Side effects:
        Raises the module-level ``max_len`` to the number of tokens kept for
        this text when that exceeds the current value. ``max_len`` must exist
        before the first call. Because this also happens for texts cleaned
        after the model has been built, a later text with more kept tokens
        than any training tweet will change ``max_len``.
    """
    global max_len
    words = word_tokenize(review)
    # Tag the whole sentence at once: the tagger can then use neighbouring
    # tokens as context, and it is invoked once per text instead of once per
    # token.
    tagged_words = pos_tag(words)
    output_words = [
        lemmatizer.lemmatize(word, pos=get_simple_pos(tag)).lower()
        for word, tag in tagged_words
        if word.lower() not in stop_words
    ]
    max_len = max(max_len, len(output_words))
    return " ".join(output_words)

In [8]:
# Running maximum of tokens kept per cleaned text. clean_text() updates it as a
# side effect, so it has to be defined before clean_text() is first called; it
# is later used as the padded sequence length and the model's input width.
max_len = 0

In [9]:
# Always clean from the raw column rather than from `messages` itself, so that
# re-running this cell does not feed already-cleaned text through clean_text a
# second time.
raw_messages = df['tweet_text']
print(raw_messages.iloc[0])
messages = [clean_text(message) for message in raw_messages]
print(messages[0])

In other words #katandandre, your food was crapilicious! #mkr
word katandandre food crapilicious mkr


In [10]:
def read_glove_vecs(glove_file):
    """Load word embeddings from a whitespace-separated text file.

    Every non-blank line must contain a word followed by the components of its
    vector. Words are numbered from 0 in the order they appear in the file.

    Args:
        glove_file: Path of the embeddings file; it is read as UTF-8.

    Returns:
        A tuple ``(word_to_index, index_to_word, word_to_vec_map)`` where
        ``word_to_index`` maps word -> int index, ``index_to_word`` is the
        reverse mapping, and ``word_to_vec_map`` maps word -> 1-D
        ``np.float64`` array.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    word_to_index = {}
    index_to_word = {}
    index = 0
    with open(glove_file, 'r', encoding="utf8") as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only lines (e.g. a trailing newline at
                # the end of the file) carry no word; skip them rather than
                # failing on fields[0], and do not consume an index for them.
                continue
            curr_word = fields[0]
            word_to_index[curr_word] = index
            index_to_word[index] = curr_word
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)
            index += 1
    return word_to_index, index_to_word, word_to_vec_map

In [11]:
# Load the pre-trained GloVe vectors from glove.6B.50d.txt (path is relative to
# the working directory). The model cell later assumes 50-dimensional vectors.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs('glove.6B.50d.txt')

In [12]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert sentences into a zero-padded matrix of vocabulary indices.

    Each sentence is split on whitespace and lowercased word by word. The
    word at position ``j`` of sentence ``i`` is written to ``[i, j]`` as its
    index in ``word_to_index``; words missing from the vocabulary leave a 0
    at their position. Sentences with more than ``max_len`` words are
    truncated to their first ``max_len`` words instead of raising
    ``IndexError``.

    Args:
        X: Sized iterable of sentence strings (list, tuple, pandas Series, ...).
        word_to_index: Mapping from lowercase word to integer index.
        max_len: Number of columns of the result.

    Returns:
        A float ``np.ndarray`` of shape ``(len(X), max_len)``.
    """
    X_indices = np.zeros((len(X), max_len))
    # Iterate over the values rather than indexing X[i], so inputs that are
    # not positionally indexable by 0..m-1 are handled as well.
    for row, sentence in enumerate(X):
        sentence_words = [w.lower() for w in sentence.split()]
        for col, word in enumerate(sentence_words[:max_len]):
            if word in word_to_index:
                X_indices[row, col] = word_to_index[word]
    return X_indices

## The LSTM and CNN model

In [13]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, concatenate, Dense, Dropout
from tensorflow.keras.utils import to_categorical

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Encode target labels
# LabelEncoder turns each cyberbullying_type string into an integer id, and
# to_categorical one-hot encodes those ids for the softmax output layer.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_cat = to_categorical(y_encoded)

# Split data
# Each cleaned tweet becomes a row of max_len vocabulary indices (0 where the
# position is padding or the word is not in word_to_index).
X = sentences_to_indices(messages, word_to_index, max_len)
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)

# Embedding layer preparation
vocab_len = len(word_to_index)
emb_dim = 50  # has to match the dimensionality of the loaded GloVe vectors

# Row i of the matrix holds the pre-trained vector of the word with index i.
# NOTE(review): index 0 is a real vocabulary entry as well as the value used
# for padding / unknown words, so those positions receive that word's vector.
embedding_matrix = np.zeros((vocab_len, emb_dim))
for word, index in word_to_index.items():
    embedding_vector = word_to_vec_map.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# The pre-trained vectors are frozen (trainable=False).
embedding_layer = Embedding(input_dim=vocab_len,
                            output_dim=emb_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)

# Define input
input_layer = Input(shape=(max_len,))

# Embedding
embedded_sequences = embedding_layer(input_layer)

# LSTM Branch
# return_sequences=True keeps one output per time step so that the global max
# pool can reduce over the sequence dimension.
lstm_branch = LSTM(64, return_sequences=True)(embedded_sequences)
lstm_branch = GlobalMaxPooling1D()(lstm_branch)

# CNN Branch
cnn_branch = Conv1D(filters=128, kernel_size=5, activation='relu')(embedded_sequences)
cnn_branch = MaxPooling1D(pool_size=2)(cnn_branch)
cnn_branch = GlobalMaxPooling1D()(cnn_branch)

# Concatenate both branches
merged = concatenate([lstm_branch, cnn_branch])
merged = Dense(128, activation='relu')(merged)
merged = Dropout(0.5)(merged)
# One softmax unit per class found in the labels.
output_layer = Dense(y_cat.shape[1], activation='softmax')(merged)

# Define and compile model
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Summary
model.summary()

# Train
# Validate on a slice of the training data (validation_split) instead of on
# X_test / y_test, so the test split stays unseen until it is evaluated.
# Keeping the returned History object makes the per-epoch metrics available
# for later inspection.
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 505)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 505, 50)      20000000    ['input_1[0][0]']                
                                                                                                  
 conv1d (Conv1D)                (None, 501, 128)     32128       ['embedding[0][0]']              
                                                                                                  
 lstm (LSTM)                    (None, 505, 64)      29440       ['embedding[0][0]']              
                                                                                              

<keras.callbacks.History at 0x222c756ffa0>

In [14]:
# Score the trained model on the test split. The model was compiled with a
# single 'accuracy' metric, so evaluate() yields the loss and the accuracy.
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy * 100:.2f}%")


Test Accuracy: 80.72%


## Accuracy of LSTM: 92.25% (note: the run above reports 80.72% test accuracy for the LSTM + CNN model)

In [None]:
# Persist the trained model to model.h5 in the working directory.
model.save("model.h5")

## Predictions

In [None]:
# Run a sample text through the same cleaning used for the training tweets.
# It is wrapped in a list because sentences_to_indices expects a collection of
# sentences.
# NOTE(review): clean_text also updates the global max_len as a side effect,
# and `text` is rebound from a str to a list here.
text = "suck it"
text = [clean_text(text)]
text

In [None]:
# Turn the cleaned text into a (1, max_len) row of vocabulary indices.
# NOTE(review): `text` is rebound to the index array, so re-running this cell
# without re-running the previous one passes the array back in as input.
text = sentences_to_indices(text, word_to_index, max_len)

In [None]:
# Inspect the index row that will be fed to the model.
text

In [None]:
# The model ends in a softmax over all classes, so the first output value is
# only the probability of class index 0. Report the most likely class (mapped
# back to its label string by the fitted LabelEncoder) together with its
# probability instead.
class_probabilities = model.predict(text)[0]
predicted_index = int(np.argmax(class_probabilities))
label_encoder.classes_[predicted_index], float(class_probabilities[predicted_index])

## Extras

In [None]:
import pickle

In [None]:
# Save the vocabulary mapping to word_to_index.pkl. The context manager
# guarantees the file is flushed and closed once the dump has finished.
with open('word_to_index.pkl', 'wb') as pickle_file:
    pickle.dump(word_to_index, pickle_file)