# Evaluation of Baseline Model

This notebook uses the fine-tuned BERT classifier to compute class probabilities for the source texts, the target texts, and the texts predicted by the baseline model.

In [1]:
import torch
import torch.nn.functional as F
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd
import numpy as np
from datasets import load_from_disk

In [2]:
# Device, tokenizer and fine-tuned classifier
model_path = "bert_finetuned/"  # Update if the path is different
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
model.to(device)

# CSV file holding the baseline model's outputs
new_file_path = "baseline_predictions.csv"
new_dataset = pd.read_csv(new_file_path)

# The CSV is expected to provide the columns 'source', 'target' and 'predicted';
# each one is pulled out as a plain Python list, in that order.
source_texts, target_texts, predicted_texts = (
    new_dataset[column].tolist() for column in ('source', 'target', 'predicted')
)

# Tokenize Data Function
def tokenize_data(texts, max_length=128):
    """Tokenize texts into fixed-length input tensors for the classifier.

    Uses the module-level ``tokenizer``. Every text is truncated or padded
    to exactly ``max_length`` tokens (special tokens included).

    Args:
        texts: Iterable of texts. Entries that are not strings are tolerated:
            ``None`` and float NaN (what ``pd.read_csv`` yields for an empty
            cell) are treated as the empty string, anything else is
            converted with ``str()``.
        max_length: Fixed sequence length of the returned tensors.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of integer tensors, each of
        shape ``(number_of_texts, max_length)``. For empty input both
        tensors have shape ``(0, max_length)``.
    """
    cleaned = []
    for text in texts:
        if isinstance(text, str):
            cleaned.append(text)
        elif text is None or (isinstance(text, float) and text != text):
            # NaN is the only float that is not equal to itself.
            cleaned.append("")
        else:
            cleaned.append(str(text))

    if not cleaned:
        # Nothing to encode: return correctly shaped empty tensors instead of
        # failing, so downstream TensorDataset/DataLoader code still works.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-text encode_plus loop and yields the
    # same (N, max_length) tensors without the intermediate torch.cat.
    encoded = tokenizer(
        cleaned,
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Number of examples fed to the model per forward pass
batch_size = 32

# Source texts: token ids + attention masks, wrapped in a sequential DataLoader
source_inputs, source_masks = tokenize_data(source_texts)
source_dataset = TensorDataset(source_inputs, source_masks)
source_dataloader = DataLoader(source_dataset, batch_size=batch_size)

# Target texts
target_inputs, target_masks = tokenize_data(target_texts)
target_dataset = TensorDataset(target_inputs, target_masks)
target_dataloader = DataLoader(target_dataset, batch_size=batch_size)

# Texts predicted by the baseline model
predicted_inputs, predicted_masks = tokenize_data(predicted_texts)
predicted_dataset = TensorDataset(predicted_inputs, predicted_masks)
predicted_dataloader = DataLoader(predicted_dataset, batch_size=batch_size)

# Function to generate probabilities
def generate_probabilities(dataloader):
    """Run the module-level ``model`` over a dataloader and collect softmax outputs.

    Args:
        dataloader: Iterable of batches where ``batch[0]`` holds the input ids
            and ``batch[1]`` the attention mask.

    Returns:
        A list with one NumPy array per example; each array is the softmax
        of that example's logits taken across dimension 1.
    """
    model.eval()  # put the model in evaluation mode before inference
    collected = []
    for batch in dataloader:
        ids = batch[0].to(device)
        mask = batch[1].to(device)
        # No gradients are needed for inference.
        with torch.no_grad():
            result = model(ids, token_type_ids=None, attention_mask=mask)
        batch_probs = F.softmax(result.logits, dim=1)
        # Move to CPU and append row by row.
        for row in batch_probs.detach().cpu().numpy():
            collected.append(row)
    return collected

# Score the three text sets with the classifier (source, then target, then predicted)
source_probs, target_probs, predicted_probs = (
    generate_probabilities(loader)
    for loader in (source_dataloader, target_dataloader, predicted_dataloader)
)


def _label_column(probs, label):
    """Return the probability at index ``label`` from every row of ``probs``."""
    return [row[label] for row in probs]


# One row per example: each text next to its label-0 / label-1 probabilities
new_data = {
    "source_text": source_texts,
    "source_label_0": _label_column(source_probs, 0),
    "source_label_1": _label_column(source_probs, 1),
    "target_text": target_texts,
    "target_label_0": _label_column(target_probs, 0),
    "target_label_1": _label_column(target_probs, 1),
    "predicted_text": predicted_texts,
    "predicted_label_0": _label_column(predicted_probs, 0),
    "predicted_label_1": _label_column(predicted_probs, 1),
}

# Write the table to a new CSV file without the index column
new_df = pd.DataFrame(new_data)
new_df.to_csv('baseline_evaluated.csv', index=False)
