In [1]:
# Binary Authorship Attribution G.K. Chesterton using BERT

# --------------------------------------------
# Import Libraries
# --------------------------------------------

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import time
import datetime


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --------------------------------------------
# Load the Dataset
# --------------------------------------------

# Load the dataset
df = pd.read_csv('text_to_authorship.csv')

# Display first few rows
print("First 5 entries:")
print(df.head())

# Check for null values
print("\nNull values in each column:")
print(df.isnull().sum())

First 5 entries:
                                                text  label
0  \n      We have had some dramatic entrances an...      0
1  \n\nThe little village of Bohun Beacon was per...      1
2  \nAt three o’clock in the morning, there were ...      0
3  \n      Shortly after my marriage I had bought...      0
4  \n“It is really remarkable, Velmont, what a cl...      0

Null values in each column:
text     0
label    0
dtype: int64


In [4]:
# Remove missing values
df = df.dropna()

# Encode labels
label_encoder = LabelEncoder()
df['author_encoded'] = label_encoder.fit_transform(df['label'])

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['author_encoded'],
    test_size=0.2,
    random_state=42
)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

NameError: name 'LabelEncoder' is not defined

In [None]:
# --------------------------------------------
# Data Preprocessing
# --------------------------------------------

# Remove missing values
df = df.dropna()

# Create binary labels
# 1: Text written by G.K. Chesterton
# 0: Text written by other authors
df['label'] = df['author'].apply(lambda x: 1 if x == 'G.K. Chesterton' else 0)

# Split into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42
)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# --------------------------------------------
# Prepare Dataset and DataLoaders
# --------------------------------------------

from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
            
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
            
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

KeyError: 'author'

In [None]:
# --------------------------------------------
# Model Setup
# --------------------------------------------

# Load pre-trained BERT model for binary classification
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,  # Binary classification
    output_attentions=False,
    output_hidden_states=False
)

# Use GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

In [None]:
# --------------------------------------------
# Training the Model
# --------------------------------------------

# Set up optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_loss = 0
        
    for batch in data_loader:
        optimizer.zero_grad()
            
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
            
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
            
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
    avg_loss = total_loss / len(data_loader)
    return avg_loss

# --- 6.3 Training Loop ---
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f'Training loss: {train_loss:.4f}')

In [None]:
# --------------------------------------------
# Evaluating the Model
# --------------------------------------------

def eval_model(model, data_loader, device):
    model.eval()
    predictions, true_labels = [], []
        
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
                
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
        
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=['Other Authors', 'G.K. Chesterton'])
    cm = confusion_matrix(true_labels, predictions)
    return accuracy, report, cm

# Evaluate the model
accuracy, report, cm = eval_model(model, val_loader, device)
print(f'Validation Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(cm)

In [None]:
# --------------------------------------------
# Saving the Model
# --------------------------------------------

# Save the fine-tuned model
model.save_pretrained('bert-authorship-attribution-binary')
tokenizer.save_pretrained('bert-authorship-attribution-binary')