In [None]:
# CS524 NLP - Project 2:
# Binary Authorship Attribution of G.K. Chesterton using BERT
# Team 7: Zack Malkmus, Tyler Nitzsche, Andrew Meuller, Gabriel Laboy
#
# TO RUN:
#   1. Install Jupyter Notebook
#   2. Run 'pip install -r requirements.txt'
#   3. Run the cells of this notebook in order, from top to bottom

In [None]:
# --------------------------------------------
# Import Libraries
# --------------------------------------------

import os
import csv

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier


In [None]:
# --------------------------------------------
# Reading Dataset
# --------------------------------------------

# Load the corpus from the CSV that sits next to the notebook.
df = pd.read_csv('text_to_authorship.csv')

# Quick sanity checks: a peek at the first rows, then the per-column count
# of missing values.
print("First 5 entries:", df.head(), sep="\n")
print("\nNull values in each column:", df.isnull().sum(), sep="\n")

In [None]:
# --------------------------------------------
# Data Preprocessing
# --------------------------------------------

# Drop every row that has a missing value in any column.
df = df.dropna()

# Hold out 20% of the rows for validation with a fixed seed. Stratifying on
# the label keeps the class proportions the same in the training and
# validation splits, so the per-class metrics computed later are not skewed
# by an unlucky random split.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Tokenizer matching the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# --------------------------------------------
# Dataset and DataLoaders
# --------------------------------------------

class TextDataset(Dataset):
    """Torch dataset that tokenizes one text per item.

    Args:
        texts: Texts to encode. Anything exposing ``tolist()`` (pandas
            Series, numpy array) or any plain iterable of strings.
        labels: Labels aligned with ``texts``; each one must be convertible
            to an integer tensor.
        tokenizer: Callable tokenizer that accepts the keyword arguments
            used in ``__getitem__`` and returns a mapping with
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_len: Length every encoded sequence is truncated/padded to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=256):
        # Series/arrays are converted through ``tolist``; other iterables
        # (e.g. plain lists) are copied with ``list``.
        self.texts = texts.tolist() if hasattr(texts, 'tolist') else list(texts)
        self.labels = labels.tolist() if hasattr(labels, 'tolist') else list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f'texts and labels must have the same length, '
                f'got {len(self.texts)} and {len(self.labels)}'
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the encoded text and label at position ``idx`` as a dict of tensors."""
        # Call the tokenizer directly: ``encode_plus`` is deprecated in
        # transformers in favour of ``__call__``, which takes the same
        # keyword arguments.
        encoding = self.tokenizer(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tokenizer output is flattened to 1-D so a DataLoader can stack
        # items into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split in a TextDataset (default max_len) so it can be batched
# by a DataLoader.
train_dataset = TextDataset(texts=train_texts, labels=train_labels, tokenizer=tokenizer)
val_dataset = TextDataset(texts=val_texts, labels=val_labels, tokenizer=tokenizer)

In [None]:
# --------------------------------------------
# Model Setup
# --------------------------------------------

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained 'bert-base-uncased' encoder without hidden-state or
# attention outputs, and move it onto the chosen device.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_attentions=False,
    output_hidden_states=False,
).to(device)

def get_embeddings(dataset, model, device, batch_size=16):
    """Run ``model`` over ``dataset`` and collect its pooled outputs.

    Args:
        dataset: Dataset whose items are dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose result exposes ``pooler_output``. It is switched to eval
            mode and left that way.
        device: Device the input tensors are moved to before the forward pass.
        batch_size: Number of items per forward pass (default 16).

    Returns:
        Tuple ``(embeddings, labels)`` of numpy arrays with one entry per
        dataset item, in dataset order. Both arrays are empty when the
        dataset yields no batches.
    """
    loader = DataLoader(dataset, batch_size=batch_size)
    embedding_batches = []
    label_batches = []
    model.eval()

    # Inference only: no gradients are needed.
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Keep whole batches and join them once at the end instead of
            # extending a Python list row by row.
            embedding_batches.append(outputs.pooler_output.cpu().numpy())
            label_batches.append(batch['labels'].cpu().numpy())

    if not embedding_batches:
        # np.concatenate rejects an empty list; return empty arrays instead.
        return np.array([]), np.array([])

    return np.concatenate(embedding_batches), np.concatenate(label_batches)

# Encode both splits with BERT. Note that train_labels / val_labels are
# rebound here to the label arrays returned by get_embeddings.
train_embeddings, train_labels = get_embeddings(dataset=train_dataset, model=model, device=device)
val_embeddings, val_labels = get_embeddings(dataset=val_dataset, model=model, device=device)

In [None]:
# --------------------------------------------
# Training Random Forest Classifier
# --------------------------------------------

# 100-tree random forest with a fixed seed, fitted on the training embeddings.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X=train_embeddings, y=train_labels)

In [None]:
# --------------------------------------------
# Evaluation
# --------------------------------------------

# Predict on the held-out embeddings with the fitted random forest.
val_preds = rf_model.predict(val_embeddings)

# Display names for the two classes, in the order classification_report
# assigns them (sorted label order).
# NOTE(review): presumably the lower label is "other authors" and the higher
# one is Chesterton - confirm against the dataset.
target_names = ['Other Authors', 'G.K. Chesterton']

accuracy = accuracy_score(val_labels, val_preds)
# Keep the dict form in `report`: the results-saving cell indexes into it.
report = classification_report(val_labels, val_preds, target_names=target_names, output_dict=True)
cm = confusion_matrix(val_labels, val_preds)

print(f'Validation Accuracy: {accuracy:.4f}')
print('Classification Report:')
# Print the formatted text table instead of the raw nested dict, which is
# hard to read.
print(classification_report(val_labels, val_preds, target_names=target_names))
print('Confusion Matrix:')
print(cm)

In [None]:
# --------------------------------------------
# Save Evaluation Results
# --------------------------------------------

# Flatten the confusion matrix row by row. For a 2x2 matrix from sklearn's
# confusion_matrix (rows = true class, columns = predicted class) this gives
# the values in the TN, FP, FN, TP order used by the header below.
cm_flat = cm.flatten()

# Class display names; they are used as keys into `report`.
class_names = ['Other Authors', 'G.K. Chesterton']
# (CSV column prefix, key inside report[class_name]) pairs, in column order.
metric_columns = [
    ('Precision', 'precision'),
    ('Recall', 'recall'),
    ('F1-Score', 'f1-score'),
    ('Support', 'support'),
]

# Header and row are built from the same two lists so they cannot drift apart.
csv_header = (
    ['Name', 'Accuracy']
    + [f'{prefix}_{cls}' for cls in class_names for prefix, _ in metric_columns]
    + ['Confusion_Matrix_TN', 'Confusion_Matrix_FP', 'Confusion_Matrix_FN', 'Confusion_Matrix_TP']
)

name = 'Random Forest'

csv_row = (
    [name, accuracy]
    + [report[cls][key] for cls in class_names for _, key in metric_columns]
    + list(cm_flat)
)

csv_file = 'evaluation_results.csv'
file_exists = os.path.isfile(csv_file)
# A file that exists but is empty still needs the header row.
needs_header = not file_exists or os.path.getsize(csv_file) == 0

# Append so that results from several runs/models accumulate in one file.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(csv_header)
    writer.writerow(csv_row)