In [1]:
import spacy
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Dense, GlobalMaxPooling1D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Bidirectional, LSTM
from tensorflow.keras.initializers import GlorotNormal




In [2]:
text_folder="data\Text file"

In [3]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [4]:
# Define a custom spaCy tokenizer
def spacy_tokenizer(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return tokens

In [5]:
# Function to read text from files based on file_id
def read_text_from_file(file_id):
    file_path=os.path.join(text_folder,f'{file_id}.txt')
    try:
        with open(file_path,'r',encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return ''

In [6]:
data=pd.read_csv('data\Annotations_Metadata.csv')

In [7]:
# Preprocess and tokenize the text
data['actual_text'] = data['file_id'].apply(lambda file_id: read_text_from_file(file_id))
data['preprocessed_text'] = data['actual_text'].apply(lambda text: ' '.join(spacy_tokenizer(text)))

In [8]:
# Encode labels using LabelEncoder for multi-class classification
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

In [9]:
data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,actual_text,preprocessed_text,label_encoded
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...",March booklet download time counting,2
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,order help increase booklet download great sto...,2
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,simply copy paste following text YouTube video...,2
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,click free download colorfully illustrate page...,0
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,click DOWNLOAD mb green banner link,2


In [10]:
# Use BERT tokenizer
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')

In [11]:
# Tokenize and pad the sequences
tokenizer.fit_on_texts(data['preprocessed_text'])
sequences = tokenizer.texts_to_sequences(data['preprocessed_text'])
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data['label_encoded'], test_size=0.2, random_state=42)

In [13]:
# Build a model using BERT with functional API
input_layer = Input(shape=(100,), dtype='int32')
embedding_layer = tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=100)(input_layer)
bi_lstm = Bidirectional(LSTM(100, return_sequences=True))(embedding_layer)
pooling_layer = GlobalMaxPooling1D()(bi_lstm)
output_layer = Dense(256, activation='relu')(pooling_layer)
output_layer = Dropout(0.5)(output_layer)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(output_layer)
model = Model(inputs=input_layer, outputs=output_layer)




In [14]:
# Compile the model
model.compile(optimizer=Adam(learning_rate=5e-5), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [15]:
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)


In [16]:
# Train the model
history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


In [17]:
# Evaluate model performance on the test set
_, accuracy = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test).argmax(axis=1)
report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:\n', report)

Accuracy: 0.8739150166511536
Classification Report:
               precision    recall  f1-score   support

           0       0.49      0.27      0.35       225
           1       0.00      0.00      0.00        16
           2       0.90      0.97      0.93      1915
           3       0.00      0.00      0.00        33

    accuracy                           0.87      2189
   macro avg       0.35      0.31      0.32      2189
weighted avg       0.83      0.87      0.85      2189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [24]:
# Save the model
model.save('hate_speech_detection_model.h5')

  saving_api.save_model(


In [19]:
# Save the label encoder
import joblib
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']

False Positive/Negative Analysis:

In [20]:
# False Positive/Negative Analysis
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

In [21]:
# Find indices of false positives and false negatives
false_positive_indices = [i for i in range(len(y_test)) if y_test_labels[i] == 'noHate' and y_pred_labels[i] == 'hate']
false_negative_indices = [i for i in range(len(y_test)) if y_test_labels[i] == 'hate' and y_pred_labels[i] == 'noHate']

In [22]:
# Display false positives
print("\nFalse Positives:")
# Display the shape of the output
print("Shape of False Positives Output:", len(false_positive_indices))
for idx in false_positive_indices:
    print(f"Actual: {y_test_labels[idx]}, Predicted: {y_pred_labels[idx]}, Text: {data['preprocessed_text'][idx]}")



False Positives:
Shape of False Positives Output: 62
Actual: noHate, Predicted: hate, Text: guess stick homeschoole kid
Actual: noHate, Predicted: hate, Text: hear home school kid beat public school kid spelling bee
Actual: noHate, Predicted: hate, Text: unfortunately poor white family parent work
Actual: noHate, Predicted: hate, Text: not like talk fear create division
Actual: noHate, Predicted: hate, Text: honestly well idea private school people average parent poor teacher
Actual: noHate, Predicted: hate, Text: long love Tchaikovsky ballet mussorgsky opera enjoy russian traditional music
Actual: noHate, Predicted: hate, Text: white look somewhat average white swedish girl think
Actual: noHate, Predicted: hate, Text: good news Norge
Actual: noHate, Predicted: hate, Text: thank like put leaflet noticeboard get lot people talk
Actual: noHate, Predicted: hate, Text: see thing country
Actual: noHate, Predicted: hate, Text: Bulgarians slavic time rest seek new cos ethnonym dig
Actual: no

In [23]:
# Display false negatives
print("\nFalse Negatives:")
# Display the shape of the output
print("Shape of False Negatives Output:", len(false_negative_indices))
for idx in false_negative_indices:
    print(f"Actual: {y_test_labels[idx]}, Predicted: {y_pred_labels[idx]}, Text: {data['preprocessed_text'][idx]}")



False Negatives:
Shape of False Negatives Output: 165
Actual: hate, Predicted: noHate, Text: download
Actual: hate, Predicted: noHate, Text: ve consider teach profession like neighbourhood like kid like disgust
Actual: hate, Predicted: noHate, Text: com watch v watch min Obama AIPAC youtube
Actual: hate, Predicted: noHate, Text: improve site
Actual: hate, Predicted: noHate, Text: welcome line
Actual: hate, Predicted: noHate, Text: protester Clash Military Parade Barking
Actual: hate, Predicted: noHate, Text: parent teach weekday teach Saturday
Actual: hate, Predicted: noHate, Text: Obama membership Prince Hall black lodge president accord guy youtube v nlfrsregii Zagami
Actual: hate, Predicted: noHate, Text: reason large white family encourage day build population white Germans ravage jewish attack hyperinflation
Actual: hate, Predicted: noHate, Text: hope day proudly American country
Actual: hate, Predicted: noHate, Text: able finish watch video
Actual: hate, Predicted: noHate, Text: