In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from datasets import load_from_disk

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification

# Directory handed to `from_pretrained` for both the tokenizer and the model.
model_path = "../../data/bert_finetuned"
tokenizer = BertTokenizer.from_pretrained(model_path)

try:
    model = BertForSequenceClassification.from_pretrained(model_path)
except Exception as e:
    # Report the failure, then re-raise. Swallowing it here would either crash
    # on the `model.to(device)` line below with an unrelated NameError, or --
    # worse -- silently keep using a stale `model` left in the kernel by an
    # earlier run.
    print("Error loading model:", e)
    raise

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [3]:
folder_path = '../../data/seq2seq'
# Load the dataset that was previously saved to `folder_path` with the
# `datasets` library.
seq2seq_dataset = load_from_disk(folder_path)

# Keep only the 'test' split for evaluation.
# NOTE: the tokenization cell further down rebinds `test_dataset` to a
# TensorDataset, so after that cell runs this name no longer refers to the
# split loaded here.
test_dataset = seq2seq_dataset['test']

def create_bert_dataset(dataset):
    """Flatten source/target pairs into parallel text and label lists.

    Every item of ``dataset`` must expose a ``'source'`` and a ``'target'``
    entry. Both are emitted, source first, so the output is interleaved:
    source, target, source, target, ...

    Args:
        dataset: Iterable of mappings with ``'source'`` and ``'target'`` keys.

    Returns:
        A ``(texts, labels)`` tuple of equally long lists. Each source text is
        labelled 1 (assumed subjective) and each target text 0 (assumed
        neutralized).
    """
    texts, labels = [], []

    for pair in dataset:
        # Keep the source/target order: later cells rely on the interleaving.
        texts.extend((pair['source'], pair['target']))
        labels.extend((1, 0))

    return texts, labels

# Flatten the test split into parallel lists: texts alternate source, target,
# and labels alternate 1, 0 in the same order.
test_texts, test_labels = create_bert_dataset(test_dataset)

In [6]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

def tokenize_data(texts, labels, max_length=128):
    """Encode texts with the notebook-level ``tokenizer`` and tensorize labels.

    Each text is encoded on its own, truncated or padded to ``max_length``,
    and the per-text tensors are concatenated along dimension 0.

    Args:
        texts: Iterable of texts to encode (presumably strings; each element
            is passed straight to ``tokenizer.encode_plus``).
        labels: Labels aligned with ``texts``. May be a plain sequence or an
            existing ``torch.Tensor``; the latter happens when the calling
            cell is re-run after it has already rebound its label variable
            to this function's tensor output.
        max_length: Length every encoded sequence is truncated/padded to.

    Returns:
        A tuple ``(input_ids, attention_masks, labels)`` of tensors. The label
        tensor is always a fresh copy, never an alias of the input.
    """
    input_ids = []
    attention_masks = []

    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # torch.tensor() on an existing tensor emits a copy-construct UserWarning;
    # clone().detach() is the recommended way to copy a tensor.
    if isinstance(labels, torch.Tensor):
        labels = labels.clone().detach()
    else:
        labels = torch.tensor(labels)

    return input_ids, attention_masks, labels

# Encode the test texts and bundle ids, masks and labels into one dataset.
# NOTE: `test_labels` and `test_dataset` are rebound here -- from a Python list
# and the loaded test split, respectively, to tensor-based objects.
test_inputs, test_masks, test_labels = tokenize_data(test_texts, test_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

# Batches are served in dataset order (no shuffling), which later cells rely
# on to line predictions up with `test_texts`.
batch_size = 32
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

  labels = torch.tensor(labels)


In [7]:
from sklearn.metrics import accuracy_score, classification_report

# Put the model in evaluation mode before scoring the test set.
model.eval()
predictions, true_labels = [], []

# Inference only: one no_grad scope covers every batch.
with torch.no_grad():
    for batch in test_dataloader:
        # Each batch holds (input ids, attention mask, labels), in that order.
        outputs = model(
            batch[0].to(device),
            token_type_ids=None,
            attention_mask=batch[1].to(device),
        )

        # Move results to host memory and keep the highest-scoring class.
        batch_logits = outputs.logits.cpu().numpy()
        predictions.extend(batch_logits.argmax(axis=1))
        true_labels.extend(batch[2].cpu().numpy())

# Performance metrics
print(f"Test Accuracy: {accuracy_score(true_labels, predictions)}")
print(classification_report(true_labels, predictions))

Test Accuracy: 0.7476635514018691
              precision    recall  f1-score   support

           0       0.73      0.78      0.76       963
           1       0.77      0.71      0.74       963

    accuracy                           0.75      1926
   macro avg       0.75      0.75      0.75      1926
weighted avg       0.75      0.75      0.75      1926



In [14]:
#Generate Predictions and Probabilities

import torch.nn.functional as F

# Put the model in evaluation mode and collect one probability row per example.
model.eval()
probabilities = []

# Inference only: one no_grad scope covers every batch.
with torch.no_grad():
    for batch in test_dataloader:
        outputs = model(
            batch[0].to(device),
            token_type_ids=None,
            attention_mask=batch[1].to(device),
        )

        # Softmax over dim 1 (the class dimension) turns logits into probabilities.
        batch_probs = F.softmax(outputs.logits, dim=1)
        probabilities.extend(batch_probs.cpu().numpy())


In [15]:
import pandas as pd

# `test_texts` and `probabilities` are interleaved source, target, source, ...
# so even positions belong to sources and odd positions to targets.
source_probs = probabilities[0::2]
target_probs = probabilities[1::2]

data = {
    "source_text": test_texts[0::2],
    "source_label_0": [row[0] for row in source_probs],  # P(label 0) for each source
    "source_label_1": [row[1] for row in source_probs],  # P(label 1) for each source
    "target_text": test_texts[1::2],
    "target_label_0": [row[0] for row in target_probs],  # P(label 0) for each target
    "target_label_1": [row[1] for row in target_probs],  # P(label 1) for each target
    # Add columns for predicted text and its probabilities if available
}

# One row per source/target pair.
df = pd.DataFrame(data)


In [18]:
# Leave the frame as the cell's final expression so Jupyter renders it as a
# table instead of the line-wrapped plain-text dump that print() produces.
df.head()

                                         source_text  source_label_0  \
0  in april 2009 a brazilian human rights group, ...        0.917522   
1  the 51 day standoff and ensuing murder of 76 m...        0.727432   
2  mark oaten (born 8 march 1964, watford) is a d...        0.001391   
3  another infamous period of colonisation in anc...        0.001802   
4  photo sequence of astonishing 2005 chicagoland...        0.001345   

   source_label_1                                        target_text  \
0        0.082478  in april 2009 a brazilian human rights group, ...   
1        0.272568  the 51 day standoff and ensuing deaths of 76 m...   
2        0.998609  mark oaten (born 8 march 1964, watford) is a l...   
3        0.998198  another period of colonisation in ancient time...   
4        0.998655  photo sequence of 2005 chicagoland crash with ...   

   target_label_0  target_label_1  
0        0.819904        0.180096  
1        0.876027        0.123973  
2        0.961041        0