## Generating Feature Vectors with BERT

In [6]:
from transformers import BertTokenizer, BertModel #Hugging Face Transformers
import torch

### Load the Dataset

In [7]:
import pandas as pd
from src.preprocessing.hatespeech_dataset_querying import prepare_hatespeech_v2_dataset, load_hatespeech_v2_dataset

In [8]:
# Read the prepared hate speech v2 CSV through the project's loader helper.
# NOTE(review): the path is relative, so this presumably expects the kernel's
# working directory to be one level below the repository root — confirm.
df = load_hatespeech_v2_dataset("../data/hatespeech_v2/prepared_hatespeech_v2.csv")
# Bare last expression so the notebook renders the frame (pandas elides the middle rows).
df

Unnamed: 0,tweet_id,text,label,topic
0,1344794359233998850,You know maybe doing a “challenge” where I dri...,0,1
1,1344794162625916935,RT @thehill: Black transgender woman found dea...,0,1
2,1344794094837637121,2021 Goals: Playtest and release Rumrunners. R...,0,1
3,1344790842117140483,Guest Co Host: Men Like Us Podcast #StopTheHat...,0,1
4,1344788907360190465,👏 Congratulations @AyodejiOsowobi @StandtoEndR...,0,1
...,...,...,...,...
68592,1277310569700196352,Fuck you @Google @GooglePlayDev @Android With ...,1,4
68593,1277310293467713536,Being an Arsenal fan is tough. Even people tha...,1,4
68594,1277309147697106945,No subs yet? Fuck off man we aren't playing in...,1,4
68595,1277309020198633475,Not Manchester United again damn it 🤣 I don't ...,2,4


In [9]:
# Tokenizer matching the pre-trained 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encoder weights from the same checkpoint. nn.Module.eval() returns the module
# itself, so loading and switching to evaluation mode (dropout off) are chained.
model = BertModel.from_pretrained('bert-base-uncased').eval()

# Function to process text and obtain BERT embeddings
def get_bert_embeddings(text):
    """Return the final-layer [CLS] embedding of ``text`` as a flat list of floats.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Parameters
    ----------
    text : str
        A single input string (one tweet).

    Returns
    -------
    list of float
        The hidden state at position 0 (the [CLS] token) of the model's first
        output, squeezed to one dimension and converted to a Python list.
    """
    # Tokenize input text to a fixed length of 128 tokens
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
        max_length=128,           # Target sequence length
        truncation=True,          # Without this, max_length does not cut long inputs:
                                  # they would exceed 128 tokens, and inputs beyond
                                  # BERT's 512-position limit would fail in the forward pass
        padding='max_length',     # Pad shorter inputs up to max_length
        return_attention_mask=True,  # Mask so padding tokens are ignored by attention
        return_tensors='pt'       # Return PyTorch tensors
    )

    # Get token IDs and attention mask
    input_ids = tokens['input_ids']
    attention_mask = tokens['attention_mask']

    # Inference only: skip gradient tracking to save memory and time
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    # outputs[0] holds the per-token hidden states; index 0 on the sequence axis
    # is the [CLS] position. squeeze() drops the batch axis of size 1.
    bert_embeddings = outputs[0][:, 0, :].squeeze().tolist()
    return bert_embeddings


### Generating BERT Embeddings

In [10]:
# Optional: uncomment to embed only a random sample of 5000 rows for a quick run.
# Note that it rebinds `df`, so later cells would then see only the sample.
#df = df.sample(5000)

# Embed every tweet, one forward pass per row (get_bert_embeddings handles a
# single string per call). This is the slow step of the notebook.
# The new column is added to `df` in place; each entry is a Python list of floats.
df['bert_embeddings'] = df['text'].apply(get_bert_embeddings)

# Preview the first rows, including the new bert_embeddings column
df.head()

Unnamed: 0,tweet_id,text,label,topic,bert_embeddings
0,1344794359233998850,You know maybe doing a “challenge” where I dri...,0,1,"[0.16847427189350128, 0.038471419364213943, 0...."
1,1344794162625916935,RT @thehill: Black transgender woman found dea...,0,1,"[-0.17179889976978302, -0.3453545570373535, -0..."
2,1344794094837637121,2021 Goals: Playtest and release Rumrunners. R...,0,1,"[0.2647630572319031, -0.13153664767742157, 0.2..."
3,1344790842117140483,Guest Co Host: Men Like Us Podcast #StopTheHat...,0,1,"[-0.2707246243953705, 0.10960787534713745, -0...."
4,1344788907360190465,👏 Congratulations @AyodejiOsowobi @StandtoEndR...,0,1,"[0.06990789622068405, -0.16728679835796356, -0..."


## Classification Models

### SVM Classifier

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['bert_embeddings'].tolist(),
    df['label'],
    test_size=0.2,
    random_state=42,
)

# Linear-kernel SVM on the embedding vectors (fit() returns the estimator itself)
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out split
svm_y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
svm_report = classification_report(y_test, svm_y_pred, digits=4)

# Accuracy as a percentage with two decimals, followed by the per-class table
print(f"Accuracy: {svm_accuracy * 100:.2f}%")
print("Classification report:\n", svm_report)

Accuracy: 88.24%
Classification report:
               precision    recall  f1-score   support

           0     0.9126    0.9571    0.9343     10839
           1     0.7463    0.6512    0.6955      2566
           2     0.5398    0.1937    0.2850       315

    accuracy                         0.8824     13720
   macro avg     0.7329    0.6007    0.6383     13720
weighted avg     0.8729    0.8824    0.8747     13720



### Random Forest Classifier

In [15]:
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Fit a 100-tree random forest on the same train split; the fixed seed makes
# the fitted forest reproducible
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict labels for the held-out split
rf_y_pred = rf_model.predict(X_test)

# Evaluate model performance
rf_accuracy = accuracy_score(y_test, rf_y_pred)
# zero_division=0: when a class is never predicted, its precision is undefined.
# Report it as 0.0 explicitly instead of relying on the default, which yields
# the same 0.0 but emits an UndefinedMetricWarning for each affected metric.
rf_report = classification_report(y_test, rf_y_pred, digits=4, zero_division=0)

# Accuracy as a percentage with two decimals, followed by the per-class table
print(f"Accuracy: {rf_accuracy * 100:.2f}%")
print("Classification report:\n", rf_report)

Accuracy: 84.18%
Classification report:
               precision    recall  f1-score   support

           0     0.8487    0.9822    0.9106     10839
           1     0.7679    0.3519    0.4826      2566
           2     0.0000    0.0000    0.0000       315

    accuracy                         0.8418     13720
   macro avg     0.5388    0.4447    0.4644     13720
weighted avg     0.8141    0.8418    0.8096     13720



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
