In [8]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # For lemmatizer



def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)         # Remove URLs
    text = re.sub(r"@\w+", "", text)            # Remove mentions
    text = re.sub(r"[^a-zA-Z\s]", "", text)     # Remove everything except letters and spaces
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = [WordNetLemmatizer().lemmatize(word) for word in tokens]
    return " ".join(tokens)


if __name__ == "__main__":
    df = pd.read_csv(r"C:\Users\remas alhazmi\OneDrive - University of Prince Mugrin\Desktop\NLP\labeled_data.csv")
    print(df.head())
    print(df.columns)
    df['cleaned_text'] = df['tweet'].apply(clean_text)
    df.to_csv(r"C:\Users\remas alhazmi\OneDrive - University of Prince Mugrin\Documents\processed", index=False)
    print("[INFO] Preprocessing complete. Saved to data/processed/preprocessed_data.csv")


    df_cleaned = pd.read_csv(r"C:\Users\remas alhazmi\OneDrive - University of Prince Mugrin\Documents\processed")
    for idx in range(5):
        print("Original:", df_cleaned.loc[idx, 'tweet'])
        print("Cleaned:", df_cleaned.loc[idx, 'cleaned_text'])
        print("-" * 50)

[nltk_data] Downloading package stopwords to C:\Users\remas
[nltk_data]     alhazmi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\remas
[nltk_data]     alhazmi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\remas
[nltk_data]     alhazmi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


   Unnamed: 0  count  hate_speech  offensive_language  neither  class  \
0           0      3            0                   0        3      2   
1           1      3            0                   3        0      1   
2           2      3            0                   3        0      1   
3           3      3            0                   2        1      1   
4           4      6            0                   6        0      1   

                                               tweet  
0  !!! RT @mayasolovely: As a woman you shouldn't...  
1  !!!!! RT @mleew17: boy dats cold...tyga dwn ba...  
2  !!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...  
3  !!!!!!!!! RT @C_G_Anderson: @viva_based she lo...  
4  !!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...  
Index(['Unnamed: 0', 'count', 'hate_speech', 'offensive_language', 'neither',
       'class', 'tweet'],
      dtype='object')


PermissionError: [Errno 13] Permission denied: 'C:\\Users\\remas alhazmi\\OneDrive - University of Prince Mugrin\\Documents\\processed'

In [2]:
# 📄 src/feature_engineering.py

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
import os

# Paths
CLEANED_DATA_PATH = (r'C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_data.csv')
FEATURES_SAVE_PATH = (r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed")

def load_cleaned_data():
    df = pd.read_csv(CLEANED_DATA_PATH)
    # Drop any rows with missing or empty cleaned_text
    df = df.dropna(subset=['cleaned_text'])
    df = df[df['cleaned_text'].str.strip() != '']  # Also remove rows where cleaned_text is empty after stripping spaces
    return df

def generate_bow_features(texts):
    vectorizer = CountVectorizer(max_features=5000)  # You can adjust max_features
    X_bow = vectorizer.fit_transform(texts)
    return X_bow, vectorizer

def generate_tfidf_features(texts):
    vectorizer = TfidfVectorizer(max_features=5000)
    X_tfidf = vectorizer.fit_transform(texts)
    return X_tfidf, vectorizer

def save_features(X, vectorizer, prefix):
    os.makedirs(FEATURES_SAVE_PATH, exist_ok=True)

    # Save feature matrix
    with open(os.path.join(FEATURES_SAVE_PATH, f"{prefix}_features.pkl"), "wb") as f:
        pickle.dump(X, f)

    # Save vectorizer
    with open(os.path.join(FEATURES_SAVE_PATH, f"{prefix}_vectorizer.pkl"), "wb") as f:
        pickle.dump(vectorizer, f)

if __name__ == "__main__":
    df = load_cleaned_data()

    print("[INFO] Generating BoW features...")
    X_bow, bow_vectorizer = generate_bow_features(df['cleaned_text'])
    save_features(X_bow, bow_vectorizer, "bow")
    print("[INFO] Bag of Words features saved!")

    print("[INFO] Generating TF-IDF features...")
    X_tfidf, tfidf_vectorizer = generate_tfidf_features(df['cleaned_text'])
    save_features(X_tfidf, tfidf_vectorizer, "tfidf")
    print("[INFO] TF-IDF features saved!")

[INFO] Generating BoW features...
[INFO] Bag of Words features saved!
[INFO] Generating TF-IDF features...
[INFO] TF-IDF features saved!


In [3]:
# 📄 src/pos_feature_engineering.py

import pandas as pd
from nltk import ngrams
import os

# Define paths
POS_TAGGED_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos.csv"
FEATURE_SAVE_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos_features.csv"

# Load the POS-tagged dataset
df = pd.read_csv(POS_TAGGED_PATH)

# Function to safely convert strings back to lists if needed
import ast
df['POS_tags_nltk'] = df['POS_tags_nltk'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Create POS unigrams
df['POS_unigrams'] = df['POS_tags_nltk']

# Create POS bigrams
df['POS_bigrams'] = df['POS_tags_nltk'].apply(lambda tags: list(ngrams(tags, 2)))

# Create POS trigrams
df['POS_trigrams'] = df['POS_tags_nltk'].apply(lambda tags: list(ngrams(tags, 3)))

# Create full POS sequence string
df['POS_sequence_string'] = df['POS_tags_nltk'].apply(lambda tags: ' '.join(tags))

# Save the dataframe with POS n-gram features
os.makedirs(os.path.dirname(FEATURE_SAVE_PATH), exist_ok=True)
df.to_csv(FEATURE_SAVE_PATH, index=False)
print(f"[INFO] POS n-gram features complete. Saved at: {FEATURE_SAVE_PATH}")

# Show sample
print(df[['cleaned_text', 'POS_unigrams', 'POS_bigrams', 'POS_trigrams', 'POS_sequence_string']].head())

[INFO] POS n-gram features complete. Saved at: C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos_features.csv
                                        cleaned_text  \
0  rt woman shouldnt complain cleaning house amp ...   
1  rt boy dat coldtyga dwn bad cuffin dat hoe st ...   
2  rt dawg rt ever fuck bitch start cry confused ...   
3                                rt look like tranny   
4  rt shit hear might true might faker bitch told ya   

                                        POS_unigrams  \
0  [., ., ., NNP, NNP, RB, :, IN, DT, NN, PRP, MD...   
1  [., ., ., ., ., NNP, JJ, NN, :, NN, NNS, VBP, ...   
2  [., ., ., ., ., ., ., NNP, NNP, NNP, NNP, ., ....   
3  [., ., ., ., ., ., ., ., ., NNP, NNP, NNP, :, ...   
4  [., ., ., ., ., ., ., ., ., ., ., ., ., NNP, J...   

                                         POS_bigrams  \
0  [(., .), (., .), (., NNP), (NNP, NNP), (NNP, R...   
1  [(., .), (., .), (., .), (., .), (., NNP), (NN...   
2  [(., .), (.,

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

POS_FEATURES_SAVE_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed"
df = pd.read_csv(r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos_features.csv")

# Initialize TF-IDF Vectorizer
tfidf_pos_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform on POS sequences
X_pos_tfidf = tfidf_pos_vectorizer.fit_transform(df['POS_sequence_string'].fillna(''))

print(f"[INFO] POS-TFIDF feature shape: {X_pos_tfidf.shape}")

# Save feature matrix
with open(os.path.join(POS_FEATURES_SAVE_PATH, "pos_tfidf_features.pkl"), "wb") as f:
    pickle.dump(X_pos_tfidf, f)

# Save the TF-IDF vectorizer
with open(os.path.join(POS_FEATURES_SAVE_PATH, "pos_tfidf_vectorizer.pkl"), "wb") as f:
    pickle.dump(tfidf_pos_vectorizer, f)

print("[INFO] POS TF-IDF features and vectorizer saved successfully.")

[INFO] POS-TFIDF feature shape: (24783, 34)
[INFO] POS TF-IDF features and vectorizer saved successfully.


In [5]:
# 📄 src/glove_embeddings.py

import pandas as pd
import numpy as np
import os
import pickle

# Paths
DATA_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos_features.csv"
GLOVE_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\glove.6B\glove.6B.300d.txt"  # Path to GloVe file
OUTPUT_EMBEDDINGS_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\tweet_glove_embeddings_300d.pkl"

# Load preprocessed tweets
df = pd.read_csv(DATA_PATH)

# Function to load GloVe embeddings
def load_glove_embeddings(glove_path):
    embeddings_index = {}
    with open(glove_path, encoding="utf8") as f:
        for line in f:
            values = line.split()
            word = values[0]
            coeffs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coeffs
    print(f"[INFO] Loaded {len(embeddings_index)} word vectors from GloVe.")
    return embeddings_index

# Load GloVe vectors
embeddings_index = load_glove_embeddings(GLOVE_PATH)

# Function to get average GloVe vector for a tweet
def get_average_glove_vector(text, embeddings_index, embedding_dim=300):
    words = str(text).split()  # Split tweet into words
    valid_vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    if valid_vectors:
        return np.mean(valid_vectors, axis=0)
    else:
        return np.zeros(embedding_dim)

# Apply function to every tweet
tweet_embeddings = df['cleaned_text'].apply(lambda x: get_average_glove_vector(x, embeddings_index))

# Convert to numpy array
X_glove_embeddings = np.stack(tweet_embeddings.values)

print(f"[INFO] GloVe embeddings shape: {X_glove_embeddings.shape}")

# Save embeddings
with open(OUTPUT_EMBEDDINGS_PATH, "wb") as f:
    pickle.dump(X_glove_embeddings, f)

print(f"[INFO] Tweet GloVe Embeddings saved at: {OUTPUT_EMBEDDINGS_PATH}")

[INFO] Loaded 400000 word vectors from GloVe.
[INFO] GloVe embeddings shape: (24783, 300)
[INFO] Tweet GloVe Embeddings saved at: C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\tweet_glove_embeddings_300d.pkl


In [6]:
# 📄 src/train_gru_model.py

import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, Input
from tensorflow.keras.utils import to_categorical
import os

# Paths
LABELS_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos_features.csv"
EMBEDDINGS_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\tweet_glove_embeddings_300d.pkl"
MODEL_SAVE_PATH = r"C:\Users\Sana\Downloads\Hate_Speech_Detection\models\gru_model.h5"

# Load labels
df = pd.read_csv(LABELS_PATH)
y = df['class'].values
y_cat = to_categorical(y, num_classes=3)

# Load GloVe embeddings
with open(EMBEDDINGS_PATH, "rb") as f:
    X = pickle.load(f)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_cat, test_size=0.2, random_state=42)

# Build GRU model
model = Sequential()
model.add(Input(shape=(X.shape[1],)))  # Input is already averaged (300d)
model.add(tf.keras.layers.Reshape((1, X.shape[1])))  # Reshape to feed into GRU
model.add(GRU(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
print("[INFO] Training GRU model...")
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=1)

# Evaluate
print("[INFO] Evaluating on test set...")
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

# Save model
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)
model.save(MODEL_SAVE_PATH)
print(f"[INFO] GRU model saved at: {MODEL_SAVE_PATH}")

[INFO] Training GRU model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[INFO] Evaluating on test set...
[[  83  172   35]
 [  78 3600  154]
 [  12  146  677]]
              precision    recall  f1-score   support

           0       0.48      0.29      0.36       290
           1       0.92      0.94      0.93      3832
           2       0.78      0.81      0.80       835

    accuracy                           0.88      4957
   macro avg       0.73      0.68      0.69      4957
weighted avg       0.87      0.88      0.87      4957

[INFO] GRU model saved at: C:\Users\Sana\Downloads\Hate_Speech_Detection\models\gru_model.h5


In [None]:
# Import Required Modules
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import os

# Load Preprocessed Dataset
df = pd.read_csv(r"C:\Users\Sana\Downloads\Hate_Speech_Detection\data\processed\preprocessed_with_pos_features.csv")

# Only keep the tweet and class label columns
df = df[['tweet', 'class']]
df = df.dropna().reset_index(drop=True)

# Convert to Huggingface Dataset
dataset = Dataset.from_pandas(df)

# Load BERT Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Tokenize the Dataset
def tokenize(example):
    return tokenizer(example['tweet'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize, batched=True)

# Train-Test Split
train_test = tokenized_dataset.train_test_split(test_size=0.2)

# Load Pre-trained BERT Model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)

# Training Arguments
training_args = TrainingArguments(
    output_dir="bert_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="bert_logs",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss"
)

# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_test['train'],
    eval_dataset=train_test['test'],
    tokenizer=tokenizer
)

# Train the Model
trainer.train()

# Evaluate the Model
predictions = trainer.predict(train_test['test'])
y_pred = np.argmax(predictions.predictions, axis=1)
y_true = train_test['test']['class']

print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

# Save the Fine-Tuned Model
trainer.save_model("bert_model")
tokenizer.save_pretrained("bert_model")
print("[INFO] BERT model and tokenizer saved to ./bert_model")

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map: 100%|██████████████████████████████████████████████████████████████| 24783/24783 [00:08<00:00, 2797.50 examples/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Transformer Fine-Tuning

In [None]:
pip install transformers datasets


In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

# Load the dataset (make sure it's in a suitable format for BERT)
dataset = load_dataset("your_dataset_name")  # You may need to load a custom dataset

# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

# Apply the tokenizer to the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch"
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test']
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model
model.save_pretrained('./fine_tuned_bert')
tokenizer.save_pretrained('./fine_tuned_bert')


In [None]:
pip install spacy
python -m spacy download en_core_web_sm


In [None]:
import spacy

# Load the spaCy model for English
nlp = spacy.load('en_core_web_sm')

def extract_syntax_features(text):
    doc = nlp(text)
    # Extract POS tags, dependencies, and noun chunks
    pos_tags = [token.pos_ for token in doc]
    dependencies = [token.dep_ for token in doc]
    noun_chunks = [chunk.text for chunk in doc.noun_chunks]
    
    return {
        "pos_tags": pos_tags,
        "dependencies": dependencies,
        "noun_chunks": noun_chunks
    }

# Apply syntax-aware features to the dataset
df['syntax_features'] = df['cleaned_text'].apply(extract_syntax_features)


# Evaluation Phase for BERT

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Make predictions on the test set
predictions, labels, _ = trainer.predict(tokenized_datasets['test'])

# Convert predictions to label indices
preds = predictions.argmax(axis=1)

# Calculate evaluation metrics
accuracy = accuracy_score(labels, preds)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')

# Display the results
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")


# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Calculate confusion matrix
cm = confusion_matrix(labels, preds)

# Plot confusion matrix
plt.figure(figsize=(5, 5))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.title('Confusion Matrix for BERT')
plt.show()
