In [1]:
import spacy
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras.layers import Input,Dense, GlobalMaxPooling1D
import seaborn as sns 
import matplotlib.pyplot as plt





In [2]:
text_folder="data\Text file"

In [3]:
# Load spaCy model
nlp = spacy.load("en_core_web_sm")

In [4]:
# Define a custom spaCy tokenizer
def spacy_tokenizer(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
    return tokens

In [5]:
# Function to read text from files based on file_id
def read_text_from_file(file_id):
    file_path=os.path.join(text_folder,f'{file_id}.txt')
    try:
        with open(file_path,'r',encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        return ''

In [6]:
data=pd.read_csv('data\Annotations_Metadata.csv')

In [7]:
data['actual_text']=data['file_id'].apply(lambda file_id: read_text_from_file(file_id))

In [8]:
# preprocess and tokenize the text
data['preprocessed_text']=data['actual_text'].apply(lambda text:' '.join(spacy_tokenizer(text)))

In [9]:
# Encode labels using LabelEncoder for multi-class classification
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['label'])

In [10]:
data.head()

Unnamed: 0,file_id,user_id,subforum_id,num_contexts,label,actual_text,preprocessed_text,label_encoded
0,12834217_1,572066,1346,0,noHate,"As of March 13th , 2014 , the booklet had been...",March booklet download time counting,2
1,12834217_2,572066,1346,0,noHate,In order to help increase the booklets downloa...,order help increase booklet download great sto...,2
2,12834217_3,572066,1346,0,noHate,( Simply copy and paste the following text int...,simply copy paste following text YouTube video...,2
3,12834217_4,572066,1346,0,hate,Click below for a FREE download of a colorfull...,click free download colorfully illustrate page...,0
4,12834217_5,572066,1346,0,noHate,Click on the `` DOWNLOAD ( 7.42 MB ) '' green ...,click DOWNLOAD mb green banner link,2


In [11]:
# Use DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize and pad the sequences
sequences = data['preprocessed_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=100, truncation=True))
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

In [12]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data['label_encoded'], test_size=0.2, random_state=42)

In [13]:
# Build a model using DistilBERT with functional API
input_layer = Input(shape=(100,), dtype='int32')
distilbert_layer = TFDistilBertModel.from_pretrained('distilbert-base-uncased', trainable=False)(input_layer)
pooling_layer = GlobalMaxPooling1D()(distilbert_layer.last_hidden_state)
output_layer = Dense(len(label_encoder.classes_), activation='softmax')(pooling_layer)

model = keras.Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.





In [14]:
print(X_train.shape)
print(X_test.shape)

(8755, 100)
(2189, 100)


In [15]:
# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x1bbfa827450>

In [16]:
# Evaluate model performance on the test set
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', report)

Accuracy: 0.8780264961169484
Classification Report:
               precision    recall  f1-score   support

           0       0.59      0.06      0.11       225
           1       1.00      0.19      0.32        16
           2       0.88      1.00      0.93      1915
           3       0.00      0.00      0.00        33

    accuracy                           0.88      2189
   macro avg       0.62      0.31      0.34      2189
weighted avg       0.84      0.88      0.83      2189



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


False Positive/Negative Analysis:

In [17]:
# False Positive/Negative Analysis
y_test_labels = label_encoder.inverse_transform(y_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

In [18]:
# Find indices of false positives and false negatives
false_positive_indices = [i for i in range(len(y_test)) if y_test_labels[i] == 'noHate' and y_pred_labels[i] == 'hate']
false_negative_indices = [i for i in range(len(y_test)) if y_test_labels[i] == 'hate' and y_pred_labels[i] == 'noHate']

In [19]:
# Display false positives
print("\nFalse Positives:")
for idx in false_positive_indices:
    print(f"Actual: {y_test_labels[idx]}, Predicted: {y_pred_labels[idx]}, Text: {data['preprocessed_text'][idx]}")



False Positives:
Actual: noHate, Predicted: hate, Text: not like talk fear create division
Actual: noHate, Predicted: hate, Text: GLORY british Activist Nationalist good Poles help
Actual: noHate, Predicted: hate, Text: like French
Actual: noHate, Predicted: hate, Text: hi join forum look forward join discussion fellow white nationalist have interesting debate good friend
Actual: noHate, Predicted: hate, Text: shoot message
Actual: noHate, Predicted: hate, Text: White Princess
Actual: noHate, Predicted: hate, Text: look internet onyl find Hatewatch anti site
Actual: noHate, Predicted: hate, Text: teach direction life plus christian youth intermix race religion
Actual: noHate, Predicted: hate, Text: college allow jews homosexual


In [20]:
# Display false negatives
print("\nFalse Negatives:")
for idx in false_negative_indices:
    print(f"Actual: {y_test_labels[idx]}, Predicted: {y_pred_labels[idx]}, Text: {data['preprocessed_text'][idx]}")



False Negatives:
Actual: hate, Predicted: noHate, Text: download
Actual: hate, Predicted: noHate, Text: ve consider teach profession like neighbourhood like kid like disgust
Actual: hate, Predicted: noHate, Text: Free North Carolina Peterson particular vikinginme
Actual: hate, Predicted: noHate, Text: com watch v watch min Obama AIPAC youtube
Actual: hate, Predicted: noHate, Text: improve site
Actual: hate, Predicted: noHate, Text: welcome line
Actual: hate, Predicted: noHate, Text: protester Clash Military Parade Barking
Actual: hate, Predicted: noHate, Text: parent teach weekday teach Saturday
Actual: hate, Predicted: noHate, Text: Obama membership Prince Hall black lodge president accord guy youtube v nlfrsregii Zagami
Actual: hate, Predicted: noHate, Text: white person send single penny lift single finger help
Actual: hate, Predicted: noHate, Text: reason large white family encourage day build population white Germans ravage jewish attack hyperinflation
Actual: hate, Predicted: no