In [1]:
# CS524 NLP - Project 2: 
# Binary Authorship Attribution G.K. Chesterton using BERT
# Team 7: Zack Malkmus, Tyler Nitzsche, Andrew Meuller, Gabriel Laboy
#
# TO RUN:
#   1. Install jupyter notebooks
#   2. 'pip install -r requirements.txt'
#   3. Run the code in the jupyter notebook

In [2]:
# --------------------------------------------
# Import Libraries
# --------------------------------------------

import os
import csv

import pandas as pd

import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW, get_linear_schedule_with_warmup

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# --------------------------------------------
# Reading Dataset
# --------------------------------------------

df = pd.read_csv('text_to_authorship.csv')

print("First 5 entries:")
print(df.head())

print("\nNull values in each column:")
print(df.isnull().sum())

First 5 entries:
                                                text  label
0  \n“But why Turkish?” asked Mr. Sherlock Holmes...      0
1  \nOf all the problems which have been submitte...      0
2  Valentin, Chief of the Paris Police, was late ...      1
3  \nOn glancing over my notes of the seventy odd...      0
4  \n    "Monsieur Arsène Lupin has the honour to...      0

Null values in each column:
text     0
label    0
dtype: int64


In [4]:
# --------------------------------------------
# Data Preprocessing
# --------------------------------------------

df = df.dropna()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42
)

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [5]:
# --------------------------------------------
# Dataset and DataLoaders
# --------------------------------------------

class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts.tolist()
        self.labels = labels.tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len
            
    def __len__(self):
        return len(self.labels)
        
    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )
            
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets
train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [6]:
# --------------------------------------------
# Model Setup
# --------------------------------------------

# Load pre-trained DistilBERT model for binary classification
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [7]:
# --------------------------------------------
# Training
# --------------------------------------------

optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

epochs = 3
total_steps = len(train_loader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

def train_epoch(model, data_loader, optimizer, device, scheduler):
    model.train()
    total_loss = 0
        
    for batch in data_loader:
        optimizer.zero_grad()
            
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
            
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
            
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()
        
    avg_loss = total_loss / len(data_loader)
    return avg_loss

for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    train_loss = train_epoch(model, train_loader, optimizer, device, scheduler)
    print(f'Training loss: {train_loss:.4f}')



Epoch 1/3
Training loss: 0.6595
Epoch 2/3
Training loss: 0.5412
Epoch 3/3
Training loss: 0.4172


In [10]:
# --------------------------------------------
# Evaluation
# --------------------------------------------

def eval_model(model, data_loader, device):
    model.eval()
    predictions, true_labels = [], []
        
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
                
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(labels.cpu().numpy())
        
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions, target_names=['Other Authors', 'G.K. Chesterton'], output_dict=True)
    cm = confusion_matrix(true_labels, predictions)
    return accuracy, report, cm

accuracy, report, cm = eval_model(model, val_loader, device)
print(f'Validation Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(cm)

Validation Accuracy: 0.9024
Classification Report:
{'Other Authors': {'precision': 1.0, 'recall': 0.8333333333333334, 'f1-score': 0.9090909090909091, 'support': 24.0}, 'G.K. Chesterton': {'precision': 0.8095238095238095, 'recall': 1.0, 'f1-score': 0.8947368421052632, 'support': 17.0}, 'accuracy': 0.9024390243902439, 'macro avg': {'precision': 0.9047619047619048, 'recall': 0.9166666666666667, 'f1-score': 0.9019138755980861, 'support': 41.0}, 'weighted avg': {'precision': 0.9210220673635308, 'recall': 0.9024390243902439, 'f1-score': 0.9031392227797876, 'support': 41.0}}
Confusion Matrix:
[[20  4]
 [ 0 17]]


In [11]:
# --------------------------------------------
# Save Evaluation Results
# --------------------------------------------

cm_flat = cm.flatten()

csv_header = [
    'Name',
    'Accuracy',
    'Precision_Other Authors', 'Recall_Other Authors', 'F1-Score_Other Authors', 'Support_Other Authors',
    'Precision_G.K. Chesterton', 'Recall_G.K. Chesterton', 'F1-Score_G.K. Chesterton', 'Support_G.K. Chesterton',
    'Confusion_Matrix_TN', 'Confusion_Matrix_FP', 'Confusion_Matrix_FN', 'Confusion_Matrix_TP'
]

name = 'DistilBERT'

csv_row = [
    name,
    accuracy,
    report['Other Authors']['precision'], report['Other Authors']['recall'], report['Other Authors']['f1-score'], report['Other Authors']['support'],
    report['G.K. Chesterton']['precision'], report['G.K. Chesterton']['recall'], report['G.K. Chesterton']['f1-score'], report['G.K. Chesterton']['support'],
    *cm_flat
]

csv_file = 'evaluation_results.csv'
file_exists = os.path.isfile(csv_file)

with open(csv_file, mode='a', newline='') as file:
    writer = csv.writer(file)
    if not file_exists:
        writer.writerow(csv_header)
    writer.writerow(csv_row)