Generating Feature Vectors with BERT

In [51]:
from transformers import BertTokenizer, BertModel #Hugging Face Transformers
import torch

### Data Preprocessing

##### Toxigen Dataset

In [52]:
import pandas as pd
# Load the annotated ToxiGen training split.
df = pd.read_csv('Dataset/Toxigen-data/annotated_train.csv')

# Keep only the two columns used downstream. `.assign` builds a new frame, so
# the text clean-up never writes into a slice of the loaded frame (the original
# column assignment on `df[['text','label']]` can raise SettingWithCopyWarning).
# NOTE(review): `.str[2:-2]` drops the first two and the last two characters of
# every text -- presumably wrapper characters left by the export; confirm
# against the raw CSV that no real characters are being cut.
df = df[['text', 'label']].assign(text=lambda frame: frame['text'].str[2:-2])
df.head()

Unnamed: 0,text,label
0,asians are trying to take over the world with ...,hate
1,i have never seen a hotter lineup in my life t...,neutral
2,muslims deseve to di,hate
3,people who are deaf or hard of hearing should ...,neutral
4,Jews were all bad people because they're in th...,hate


In [53]:
# Tokenizer and encoder are both taken from the same pre-trained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the encoder and switch it to evaluation mode in one step
# (eval() returns the module itself)
model = BertModel.from_pretrained('bert-base-uncased').eval()

# Function to process text and obtain BERT embeddings
def get_bert_embeddings(text):
    """Return the final-layer '[CLS]' embedding of ``text`` as a list of floats.

    Uses the module-level ``tokenizer`` and ``model`` objects.

    Args:
        text: The input string to encode.

    Returns:
        A flat Python list of floats: the model's first output (the last
        hidden state) taken at batch item 0, token position 0 ('[CLS]').
    """
    # Tokenize input text
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
        max_length=128,              # Target sequence length, in tokens
        truncation=True,             # Required: max_length alone does not cut longer inputs
        padding='max_length',        # Pad shorter inputs up to max_length
        return_attention_mask=True,  # Generate attention mask
        return_tensors='pt'          # Return PyTorch tensors
    )

    # Inference only, so skip gradient tracking
    with torch.no_grad():
        outputs = model(tokens['input_ids'], attention_mask=tokens['attention_mask'])

    # A single text is encoded per call, so index batch item 0 directly and
    # take position 0, where '[CLS]' was inserted by add_special_tokens.
    return outputs[0][0, 0, :].tolist()


### Generating BERT Embeddings

In [54]:
# Sample a subset of the rows. The fixed seed makes the subset -- and every
# metric computed from it -- reproducible across runs; 42 matches the
# random_state used for the train/test split. Capping n at len(df) keeps the
# cell from raising when the frame holds fewer than 5000 rows.
df = df.sample(n=min(5000, len(df)), random_state=42)

# One BERT forward pass per row; each entry of the new column is a list of floats
df['bert_embeddings'] = df['text'].apply(get_bert_embeddings)

# BERT embeddings:
df.head()

Unnamed: 0,text,label,bert_embeddings
4560,there is no such thing as an illegal immigrant...,neutral,"[0.41316166520118713, 0.026114080101251602, -0..."
3599,"why do you have to be so loud, i cant hear mys...",hate,"[-0.22732821106910706, 0.428143173456192, -0.0..."
7812,I spent the last year or so in Mexico and love...,neutral,"[0.1076439917087555, 0.24130432307720184, -0.0..."
6831,i think it's really important to talk about th...,neutral,"[0.10297361016273499, 0.013329430483281612, -0..."
699,i can sing tha,hate,"[0.19419747591018677, 0.17297789454460144, 0.0..."


#### Classification Model

### SVM Classifier

In [63]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [65]:
# Hold out 20% of the samples for evaluation (seeded so the split is repeatable)
features = df['bert_embeddings'].tolist()
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, targets, test_size=0.2, random_state=42
)

# Fit a linear-kernel SVM; fit() hands back the fitted estimator
svm_model = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out samples and print the per-class metrics
svm_y_pred = svm_model.predict(X_test)
svm_report = classification_report(y_test, svm_y_pred)
print("Classification Report:", svm_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

        hate       0.74      0.74      0.74       487
     neutral       0.75      0.75      0.75       513

    accuracy                           0.74      1000
   macro avg       0.74      0.74      0.74      1000
weighted avg       0.75      0.74      0.75      1000

1000


### Logistic Regression Classifier

In [57]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [58]:
# Fit logistic regression (max_iter=1000) on the existing train split;
# fit() hands back the fitted estimator
lr_model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the held-out samples and print the per-class metrics
lr_y_pred = lr_model.predict(X_test)
lr_report = classification_report(y_test, lr_y_pred)
print("Classification Report:", lr_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

        hate       0.75      0.74      0.74       487
     neutral       0.76      0.76      0.76       513

    accuracy                           0.75      1000
   macro avg       0.75      0.75      0.75      1000
weighted avg       0.75      0.75      0.75      1000



### Random Forest Classifier

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [60]:
# Fit a seeded 100-tree random forest on the existing train split;
# fit() hands back the fitted estimator
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out samples and print the per-class metrics
rf_y_pred = rf_model.predict(X_test)
rf_report = classification_report(y_test, rf_y_pred)
print("Classification Report:", rf_report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

        hate       0.76      0.77      0.76       487
     neutral       0.78      0.77      0.77       513

    accuracy                           0.77      1000
   macro avg       0.77      0.77      0.77      1000
weighted avg       0.77      0.77      0.77      1000



### RNN Classifier

In [61]:
# import torch.nn as nn
# import torch.optim as optim
# from torch.utils.data import Dataset, DataLoader
# import numpy as np

In [62]:
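# # NOTE(review): this draft is disabled and does not look runnable as written:
# #   - df['label'] holds the strings 'hate'/'neutral'; they need to be encoded as 0/1
# #     before labels.float() and BCELoss can be applied.
# #   - each sample is a single embedding vector, while forward() indexes out[:, -1, :],
# #     i.e. it expects (batch, seq, feature) input.
# #   - `model` is reassigned below, which would replace the BERT `model` that
# #     get_bert_embeddings uses; pick a different name before re-enabling.
# # Define the RNN model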
# class RNNClassifier(nn.Module):
#     def __init__(self, input_size, hidden_size, output_size, num_layers=1):
#         super(RNNClassifier, self).__init__()
#         self.hidden_size = hidden_size
#         self.num_layers = num_layers
#         self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
#         self.fc = nn.Linear(hidden_size, output_size)
#         self.sigmoid = nn.Sigmoid()
# 
#     def forward(self, x):
#         batch_size = x.size(0)
#         h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
#         out, _ = self.rnn(x, h0)
#         out = self.fc(out[:, -1, :])
#         out = self.sigmoid(out)
#         return out
# 
# # Define a custom dataset for BERT embeddings
# class BERTDataset(Dataset):
#     def __init__(self, embeddings, labels):
#         self.embeddings = embeddings
#         self.labels = labels
# 
#     def __len__(self):
#         return len(self.labels)
# 
#     def __getitem__(self, idx):
#         return torch.FloatTensor(self.embeddings[idx]), self.labels[idx]
# 
# # Hyperparameters
# input_size = 768  # Size of BERT embeddings
# hidden_size = 128  # Size of hidden layer
# output_size = 1  # Number of output classes (binary classification)
# num_layers = 1  # Number of RNN layers
# learning_rate = 0.001
# num_epochs = 10
# batch_size = 32
# 
# 
# # Convert BERT embeddings and labels to numpy arrays
# X = np.array(df['bert_embeddings'].tolist())
# y = np.array(df['label'])
# 
# # Split the data into training and testing sets (80% training, 20% testing)
# rnn_X_train, rnn_X_test, rnn_y_train, rnn_y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 
# # Define a DataLoader for training
# train_dataset = BERTDataset(rnn_X_train, rnn_y_train)
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# 
# # Initialize RNN model, loss function, and optimizer
# model = RNNClassifier(input_size, hidden_size, output_size, num_layers)
# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# 
# # Train the model
# for epoch in range(num_epochs):
#     running_loss = 0.0
#     for inputs, labels in train_loader:
#         optimizer.zero_grad()
#         outputs = model(inputs)
#         loss = criterion(outputs.squeeze(), labels.float())
#         loss.backward()
#         optimizer.step()
#         running_loss += loss.item()
#     print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(train_loader)}")
# 
# # Evaluate the model on testing data
# test_dataset = BERTDataset(rnn_X_test, rnn_y_test)
# test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
# 
# model.eval()  # Set the model to evaluation mode
# rnn_y_true = []
# rnn_y_pred = []
# with torch.no_grad():
#     for inputs, labels in test_loader:
#         outputs = model(inputs)
#         predictions = (outputs.squeeze() > 0.5).int()  # Convert probabilities to binary predictions
#         rnn_y_true.extend(labels.tolist())
#         rnn_y_pred.extend(predictions.tolist())
# 
# # Generate classification report
# rnn_report = classification_report(rnn_y_true, rnn_y_pred)
# print("Classification Report:")
# print(rnn_report)