## Generating Feature Vectors with BERT

In [1]:
from transformers import BertTokenizer, BertModel #Hugging Face Transformers
import torch

### Load the Dataset

In [2]:
import pandas as pd
from src.preprocessing.hatespeech_dataset_querying import prepare_hatespeech_v2_dataset, load_hatespeech_v2_dataset

In [3]:
df = load_hatespeech_v2_dataset("../data/hatespeech_v2/prepared_hatespeech_v2.csv")
df

Unnamed: 0,tweet_id,text,label,topic
0,1344794359233998850,You know maybe doing a “challenge” where I dri...,0.0,1.0
1,1344794162625916935,RT @thehill: Black transgender woman found dea...,0.0,1.0
2,1344794094837637121,2021 Goals: Playtest and release Rumrunners. R...,0.0,1.0
3,1344790842117140483,Guest Co Host: Men Like Us Podcast #StopTheHat...,0.0,1.0
4,1344788907360190465,👏 Congratulations @AyodejiOsowobi @StandtoEndR...,0.0,1.0
...,...,...,...,...
68592,1277310569700196352,Fuck you @Google @GooglePlayDev @Android With ...,1.0,4.0
68593,1277310293467713536,Being an Arsenal fan is tough. Even people tha...,1.0,4.0
68594,1277309147697106945,No subs yet? Fuck off man we aren't playing in...,1.0,4.0
68595,1277309020198633475,Not Manchester United again damn it 🤣 I don't ...,2.0,4.0


In [4]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Set model to evaluation mode
model.eval()

# Function to process text and obtain BERT embeddings
def get_bert_embeddings(text):
    # Tokenize input text
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=128,           # Truncate/pad input sequences to 128 tokens
        padding='max_length',
        return_attention_mask=True,  # Generate attention mask
        return_tensors='pt'       # Return PyTorch tensors
    )

    # Get token IDs and attention mask
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    # Forward pass through the model
    with torch.no_grad():
        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)

    # Get BERT embeddings (output of [CLS] token)
    bert_embeddings = outputs[0][:, 0, :].squeeze().tolist()  # Extract embeddings of [CLS] token
    return bert_embeddings


### Generating BERT Embeddings

In [5]:
#sample of the rows
#df = df.sample(5000)

df['bert_embeddings'] = df['text'].apply(get_bert_embeddings)

# BERT embeddings:
df.head()

Unnamed: 0,tweet_id,text,label,topic,bert_embeddings
0,1344794359233998850,You know maybe doing a “challenge” where I dri...,0.0,1.0,"[0.16847427189350128, 0.038471419364213943, 0...."
1,1344794162625916935,RT @thehill: Black transgender woman found dea...,0.0,1.0,"[-0.17179889976978302, -0.3453545570373535, -0..."
2,1344794094837637121,2021 Goals: Playtest and release Rumrunners. R...,0.0,1.0,"[0.2647630572319031, -0.13153664767742157, 0.2..."
3,1344790842117140483,Guest Co Host: Men Like Us Podcast #StopTheHat...,0.0,1.0,"[-0.2707246243953705, 0.10960787534713745, -0...."
4,1344788907360190465,👏 Congratulations @AyodejiOsowobi @StandtoEndR...,0.0,1.0,"[0.06990789622068405, -0.16728679835796356, -0..."


#### Classification Model

### SVM Classifier

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['bert_embeddings'].tolist(), df['label'], test_size=0.2, random_state=42)

# Initialize and train the SVM model
svm_model = SVC(kernel='linear') #with linear kernel
svm_model.fit(X_train, y_train)

# Predict labels
svm_y_pred = svm_model.predict(X_test)

# Evaluate
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred)

print(f"Accuracy: {svm_accuracy * 100:.2f}%")  # Improve formatting to two decimal places
print("Classification report:\n", svm_report)  # Remove unnecessary f-string

Accuracy: 88.24%
Classification report:
               precision    recall  f1-score   support

         0.0       0.91      0.96      0.93     10839
         1.0       0.75      0.65      0.70      2566
         2.0       0.54      0.19      0.29       315

    accuracy                           0.88     13720
   macro avg       0.73      0.60      0.64     13720
weighted avg       0.87      0.88      0.87     13720



### Logistic Regression Classifier

In [8]:
from sklearn.linear_model import LogisticRegression

In [9]:
# Initialize and train the Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)

# Predict labels
lr_y_pred = lr_model.predict(X_test)

# Evaluate model performance
lr_accuracy = accuracy_score(y_test, lr_y_pred)
lr_report = classification_report(y_test, lr_y_pred)

print(f"Accuracy: {lr_accuracy * 100:.2f}%")  # Improve formatting to two decimal places
print("Classification report:\n", lr_report)  # Remove unnecessary f-string

Accuracy: 88.27%
Classification report:
               precision    recall  f1-score   support

         0.0       0.91      0.96      0.93     10839
         1.0       0.75      0.65      0.70      2566
         2.0       0.53      0.27      0.36       315

    accuracy                           0.88     13720
   macro avg       0.73      0.62      0.66     13720
weighted avg       0.87      0.88      0.88     13720



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Random Forest Classifier

In [10]:
from sklearn.ensemble import RandomForestClassifier

In [11]:
# Initialize and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict labels
rf_y_pred = rf_model.predict(X_test)

# Evaluate model performance
rf_accuracy = accuracy_score(y_test, rf_y_pred)
rf_report = classification_report(y_test, rf_y_pred)

print(f"Accuracy: {rf_accuracy * 100:.2f}%")  # Improve formatting to two decimal places
print("Classification report:\n", rf_report)  # Remove unnecessary f-string

Accuracy: 84.18%
Classification report:
               precision    recall  f1-score   support

         0.0       0.85      0.98      0.91     10839
         1.0       0.77      0.35      0.48      2566
         2.0       0.00      0.00      0.00       315

    accuracy                           0.84     13720
   macro avg       0.54      0.44      0.46     13720
weighted avg       0.81      0.84      0.81     13720



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### RNN Classifier

In [12]:
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# import numpy as np

In [13]:
# # Define the RNN model
# class RNNClassifier(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_layers=1):
#         super(RNNClassifier, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)
#         self.sigmoid = nn.Sigmoid()
# 
#     def forward(self, x):
#         batch_size = x.size(0)
#         h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
#         out, _ = self.rnn(x, h0)
#         out = self.fc(out[:, -1, :])
#         out = self.sigmoid(out)
#         return out
# 
# # Define a custom dataset for BERT embeddings
# class BERTDataset(Dataset):
#     def __init__(self, embeddings, labels):
#         self.embeddings = embeddings
#         self.labels = labels
# 
#     def __len__(self):
#         return len(self.labels)
# 
#     def __getitem__(self, idx):
#         return torch.FloatTensor(self.embeddings[idx]), self.labels[idx]
# 
# # Hyperparameters
# input_size = 768  # Size of BERT embeddings
# hidden_size = 128  # Size of hidden layer
# output_size = 1  # Number of output classes (binary classification)
# num_layers = 1  # Number of RNN layers
# learning_rate = 0.001
# num_epochs = 10
# batch_size = 32
# 
# 
# # Convert BERT embeddings and labels to numpy arrays
# X = np.array(df['bert_embeddings'].tolist())
# y = np.array(df['label'])
# 
# # Split the data into training and testing sets (80% training, 20% testing)
# rnn_X_train, rnn_X_test, rnn_y_train, rnn_y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 
# # Define a DataLoader for training
# train_dataset = BERTDataset(rnn_X_train, rnn_y_train)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# 
# # Initialize RNN model, loss function, and optimizer
# model = RNNClassifier(input_size, hidden_size, output_size, num_layers)
# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# 
# # Train the model
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for inputs, labels in train_loader:
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs.squeeze(), labels.float())
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")
# 
# # Evaluate the model on testing data
# test_dataset = BERTDataset(rnn_X_test, rnn_y_test)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# 
# model.eval()  # Set the model to evaluation mode
# rnn_y_true = []
# rnn_y_true = []
# with torch.no_grad():
#     for inputs, labels in test_loader:
#         outputs = model(inputs)
#         predictions = (outputs.squeeze() > 0.5).int()  # Convert probabilities to binary predictions
#         rnn_y_true.extend(labels.tolist())
#         rnn_y_true.extend(predictions.tolist())
# 
# # Generate classification report
# rnn_report = classification_report(rnn_y_true, rnn_y_true)
# print("Classification Report:")
# print(rnn_report)