In [None]:
# CS524 NLP - Project 2: 
# Binary Authorship Attribution G.K. Chesterton using BERT
# Team 7: Zack Malkmus, Tyler Nitzsche, Andrew Meuller, Gabriel Laboy
#
# TO RUN:
#   1. Install jupyter notebooks
#   2. 'pip install -r requirements.txt'
#   3. Run the code in the jupyter notebook

In [None]:
# --------------------------------------------
# Import Libraries
# --------------------------------------------

import os
import csv

import pandas as pd

from transformers import BertTokenizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# --------------------------------------------
# Reading Dataset
# --------------------------------------------

# Load the labelled corpus from the CSV file in the working directory.
df = pd.read_csv('text_to_authorship.csv')

# Show a quick preview of the rows, then the per-column count of missing values.
for heading, summary in (
    ("First 5 entries:", df.head()),
    ("\nNull values in each column:", df.isnull().sum()),
):
    print(heading)
    print(summary)

In [None]:
# --------------------------------------------
# Data Preprocessing
# --------------------------------------------

# Drop every row that has a missing value in any column.
df = df.dropna()

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42
)

# Tokenizer loaded from the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(text):
    """Split a single document into tokens with the BERT tokenizer.

    Args:
        text: The document to tokenize, as a string.

    Returns:
        The list of token strings returned by ``tokenizer.tokenize``.
    """
    return tokenizer.tokenize(text)

# Count token occurrences per document, using the BERT tokenizer in place of
# CountVectorizer's built-in regex tokenization.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn emit a
# "token_pattern will not be used" UserWarning on every fit.
vectorizer = CountVectorizer(tokenizer=tokenize_function, token_pattern=None)

# Learn the vocabulary on the training texts only, then reuse it for validation.
train_features = vectorizer.fit_transform(train_texts)
val_features = vectorizer.transform(val_texts)

In [None]:
# --------------------------------------------
# Model Setup
# --------------------------------------------

# Multinomial Naive Bayes classifier with scikit-learn's default hyperparameters;
# it is fitted on the token-count features in the Training cell.
model = MultinomialNB()

In [None]:
# --------------------------------------------
# Training
# --------------------------------------------

# Fit the classifier on the training token-count matrix and its labels.
model.fit(train_features, train_labels)

In [None]:
# --------------------------------------------
# Evaluation
# --------------------------------------------

# Predict a label for every validation document.
predictions = model.predict(val_features)

# Pin the label order to the classes the model was trained on. Without this,
# scikit-learn infers the label set from the validation data alone, so a split
# that happens to miss one class would yield a 1x1 confusion matrix and a
# target_names length mismatch.
class_labels = model.classes_

accuracy = accuracy_score(val_labels, predictions)
# NOTE(review): target_names are applied in sorted-label order, so this assumes
# the first sorted label means "Other Authors" — confirm against the dataset.
report = classification_report(
    val_labels,
    predictions,
    labels=class_labels,
    target_names=['Other Authors', 'G.K. Chesterton'],
    output_dict=True
)
cm = confusion_matrix(val_labels, predictions, labels=class_labels)

print(f'Validation Accuracy: {accuracy:.4f}')
print('Classification Report:')
print(report)
print('Confusion Matrix:')
print(cm)

In [None]:
# --------------------------------------------
# Save Evaluation Results
# --------------------------------------------

# Flatten the confusion matrix row by row. For a 2x2 matrix in scikit-learn's
# layout (rows = true labels, columns = predictions) this gives TN, FP, FN, TP.
cm_flat = cm.flatten()

# The header has exactly four confusion-matrix columns; refuse to write a row
# that would be silently misaligned with it.
if len(cm_flat) != 4:
    raise ValueError(
        f'Expected a 2x2 confusion matrix (4 cells), got {len(cm_flat)} cells'
    )

csv_header = [
    'Name',
    'Accuracy',
    'Precision_Other Authors', 'Recall_Other Authors', 'F1-Score_Other Authors', 'Support_Other Authors',
    'Precision_G.K. Chesterton', 'Recall_G.K. Chesterton', 'F1-Score_G.K. Chesterton', 'Support_G.K. Chesterton',
    'Confusion_Matrix_TN', 'Confusion_Matrix_FP', 'Confusion_Matrix_FN', 'Confusion_Matrix_TP'
]

# Identifier for this model's row in the shared results file.
name = 'Bayesian'

csv_row = [
    name,
    accuracy,
    report['Other Authors']['precision'], report['Other Authors']['recall'], report['Other Authors']['f1-score'], report['Other Authors']['support'],
    report['G.K. Chesterton']['precision'], report['G.K. Chesterton']['recall'], report['G.K. Chesterton']['f1-score'], report['G.K. Chesterton']['support'],
    *cm_flat
]

csv_file = 'evaluation_results.csv'
file_exists = os.path.isfile(csv_file)
# A file that exists but is empty still needs the header row.
needs_header = not file_exists or os.path.getsize(csv_file) == 0

# Append so earlier runs are kept. newline='' lets the csv module control line
# endings; the explicit encoding avoids depending on the platform default.
with open(csv_file, mode='a', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    if needs_header:
        writer.writerow(csv_header)
    writer.writerow(csv_row)
