In [1]:
import spacy
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow.keras.layers import Input,Dense, GlobalMaxPooling1D
import seaborn as sns 
import matplotlib.pyplot as plt





In [2]:
# Folder holding one "<file_id>.txt" file per annotated sample.
# Forward slashes are used because "\T" is not a valid string escape and a
# backslash-separated path only resolves on Windows; "/" works on every OS.
text_folder = "data/Text file"

In [3]:
# Load the small English spaCy pipeline once; the tokenizer helper reuses it
# for every document.
nlp = spacy.load(name="en_core_web_sm")

In [4]:
# Custom spaCy-based tokenizer
def spacy_tokenizer(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is run through the notebook-level ``nlp`` pipeline; tokens that
    are not purely alphabetic or that spaCy flags as stop words are dropped.
    """
    lemmas = []
    for tok in nlp(text):
        if not tok.is_alpha or tok.is_stop:
            continue
        lemmas.append(tok.lemma_)
    return lemmas

In [5]:
# Function to read text from files based on file_id
def read_text_from_file(file_id, folder=None):
    """Return the contents of ``<folder>/<file_id>.txt``, or '' if it is missing.

    Args:
        file_id: Identifier used as the base name of the text file.
        folder: Directory to read from. When omitted, the notebook-level
            ``text_folder`` is used, so existing one-argument calls behave
            exactly as before.

    Returns:
        The file's text decoded as UTF-8, or an empty string when no file with
        that name exists.
    """
    if folder is None:
        # Resolved at call time so a later change to text_folder is honoured.
        folder = text_folder
    file_path = os.path.join(folder, f'{file_id}.txt')
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()
    except FileNotFoundError:
        # A missing file yields an empty document instead of aborting the load.
        return ''

In [6]:
# Load the annotation metadata (one row per text file).
# A forward slash is used because "\A" is not a valid string escape and a
# backslash-separated path only resolves on Windows.
data = pd.read_csv('data/Annotations_Metadata.csv')

In [7]:
# Attach the raw text of each sample, looked up on disk by its file_id.
data['actual_text'] = data['file_id'].apply(read_text_from_file)

In [None]:
# Lemmatize each document with the spaCy tokenizer and re-join the kept
# lemmas into a single space-separated string.
data['preprocessed_text'] = data['actual_text'].map(
    lambda raw_text: ' '.join(spacy_tokenizer(raw_text))
)

In [None]:
# Map the string labels to integer class ids (multi-class classification).
# The fitted encoder is kept so predictions can be decoded back to label names.
label_encoder = LabelEncoder().fit(data['label'])
data['label_encoded'] = label_encoder.transform(data['label'])

In [None]:
# Preview the first five rows, including the newly added columns.
data.head(n=5)

In [None]:
# WordPiece tokenizer that matches the pretrained DistilBERT checkpoint
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Encode every document to token ids (special tokens included, capped at 100
# ids), then right-pad / right-truncate to a fixed width of 100.
# NOTE(review): no attention mask is produced here, so padded positions are
# presumably attended to like real tokens downstream; confirm this is intended.
sequences = data['preprocessed_text'].map(
    lambda text: tokenizer.encode(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=100,
    )
)
padded_sequences = pad_sequences(
    sequences,
    maxlen=100,
    truncating='post',
    padding='post',
)

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences,
    data['label_encoded'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Assemble the classifier with the Keras functional API:
# token ids -> DistilBERT (requested as non-trainable) -> max-pool over the
# sequence axis -> softmax over the encoded classes.
input_layer = Input(shape=(100,), dtype='int32')
distilbert_layer = TFDistilBertModel.from_pretrained(
    'distilbert-base-uncased',
    trainable=False,
)(input_layer)
pooling_layer = GlobalMaxPooling1D()(distilbert_layer.last_hidden_state)
output_layer = Dense(
    units=len(label_encoder.classes_),
    activation='softmax',
)(pooling_layer)

model = keras.Model(inputs=input_layer, outputs=output_layer)
# Sparse categorical cross-entropy because the targets are integer class ids.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Sanity-check the split sizes: train shape on the first line, test on the second.
print(X_train.shape, X_test.shape, sep='\n')

In [None]:
# Fit for 5 epochs in batches of 64, holding back 10% of the training data
# for per-epoch validation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=5,
    validation_split=0.1,
)

In [None]:
# Evaluate model performance on the test set.
# predict() yields one row of class probabilities per sample; argmax picks the
# most probable encoded class id.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

# Pin the class order to the encoder's ids so the report and the matrix always
# cover every class (even one missing from this split) and show readable label
# names instead of bare integer codes.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, labels=class_ids, target_names=class_names)
conf_mat = confusion_matrix(y_test, y_pred, labels=class_ids)

print(f'Accuracy: {accuracy}')
print('Classification Report:\n', report)
# The matrix used to be computed but never shown.
print(f'Confusion Matrix (rows = actual, columns = predicted; order = {class_names}):\n', conf_mat)

False Positive/Negative Analysis:

In [None]:
# False Positive/Negative Analysis
# Decode the integer class ids of the true and predicted labels back to their
# original label names.
y_test_labels, y_pred_labels = (
    label_encoder.inverse_transform(encoded)
    for encoded in (y_test, y_pred)
)

In [None]:
# Collect positions (within the test split) of the two error types:
#   false positive = actual 'noHate' predicted as 'hate'
#   false negative = actual 'hate' predicted as 'noHate'
false_positive_indices = [
    pos
    for pos, (actual, predicted) in enumerate(zip(y_test_labels, y_pred_labels))
    if actual == 'noHate' and predicted == 'hate'
]
false_negative_indices = [
    pos
    for pos, (actual, predicted) in enumerate(zip(y_test_labels, y_pred_labels))
    if actual == 'hate' and predicted == 'noHate'
]

In [None]:
# Display false positives
print("\nFalse Positives:")
for idx in false_positive_indices:
    # idx is a position inside the shuffled test split, not a row label of
    # `data`. Map it through y_test's index (assumes y_test is still the pandas
    # Series returned by train_test_split) so the text shown belongs to the
    # sample that was actually misclassified.
    row_label = y_test.index[idx]
    print(f"Actual: {y_test_labels[idx]}, Predicted: {y_pred_labels[idx]}, Text: {data.loc[row_label, 'preprocessed_text']}")


In [None]:
# Display false negatives
print("\nFalse Negatives:")
for idx in false_negative_indices:
    # idx is a position inside the shuffled test split, not a row label of
    # `data`. Map it through y_test's index (assumes y_test is still the pandas
    # Series returned by train_test_split) so the text shown belongs to the
    # sample that was actually misclassified.
    row_label = y_test.index[idx]
    print(f"Actual: {y_test_labels[idx]}, Predicted: {y_pred_labels[idx]}, Text: {data.loc[row_label, 'preprocessed_text']}")
