# Bag of Words Model

This bag of words model for authorship attribution of Elizabethan plays is implemented after Fox et al. (2014), though with a Naive Bayes classifier. It only takes word frequencies into account.

In [1]:
#Packages needed
import sklearn 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy
import os 
# Change directory to the EL folder with the data.
# Guarded so that re-running this cell while the kernel is already inside EL
# does not fail with FileNotFoundError.
if os.path.normcase(os.path.basename(os.getcwd())) != os.path.normcase("EL"):
    os.chdir("EL")
from sklearn.model_selection import LeaveOneOut

## Data Processing

In [2]:
# Author labels used as the prediction classes.
# Order matters: the list is indexed by position alongside the per-author
# text lists during cross validation.

all_authors = [
    'Early Shakespeare',
    'Late Shakespeare',
    'Marlowe',
    'Middleton',
    'Jonson',
    'Chapman',
]


In [3]:
# Creates, per author, a list of all text file names in the current directory
# that belong to that author (identified by the file-name suffix).
#
# os.listdir() returns entries in arbitrary order, so the listing is sorted
# once up front: the per-author lists, and therefore the order of the
# cross-validation report, are then the same on every run and machine.

tok_files = sorted(os.listdir())

e_shakespeare_texts = [name for name in tok_files if name.endswith("E-Shakespeare.tok")]
l_shakespeare_texts = [name for name in tok_files if name.endswith("L-Shakespeare.tok")]
marlowe_texts = [name for name in tok_files if name.endswith("Marlowe.tok")]
middleton_texts = [name for name in tok_files if name.endswith("Middleton.tok")]
jonson_texts = [name for name in tok_files if name.endswith("Jonson.tok")]
chapman_texts = [name for name in tok_files if name.endswith("Chapman.tok")]

# Same author order as the all_authors label list.
all_author_files = [e_shakespeare_texts, l_shakespeare_texts,
                   marlowe_texts, middleton_texts, jonson_texts, chapman_texts]

In [4]:
def read_files_author(file_paths, encoding=None):
    """
    Reads in texts of one author.
    
    Input: 1) List of file names/paths of texts by same author
    2) Optional text encoding handed to open(). The default None keeps the
    platform's default encoding; pass e.g. 'utf-8' to decode the files the
    same way on every machine.
    
    Output: List of texts as strings, in the same order as file_paths
    (empty list for empty input)
    """
    author_corpus = []
    for path in file_paths:
        # The with-block closes each file as soon as it has been read.
        with open(path, encoding=encoding) as open_text:
            author_corpus.append(open_text.read())
    return author_corpus

In [5]:
# Load the texts of every author, one list of strings per author, in the
# same author order as all_author_files.

all_author_texts = [read_files_author(author_files) for author_files in all_author_files]

# Per-author names for the same lists held in all_author_texts.
(e_shkp_data, l_shkp_data, marlowe_data,
 middleton_data, jonson_data, chapman_data) = all_author_texts

In [6]:
# Total number of texts over all authors (denominator of the accuracy).

number_texts = sum(len(texts_of_author) for texts_of_author in all_author_texts)

In [7]:
def merge_corpora(all_author_texts, separator='\n'):
    """
    Merges texts of one author into one document, for several authors given in list.
    
    Input: 1) List of lists of texts per author
    2) String placed between consecutive texts of the same author
    (default: newline). Pass '' for plain concatenation.
    
    Output: List of document per author as string; an author without texts
    yields the empty string
    """
    # Without a separator the last token of one text fuses with the first
    # token of the next whenever a file does not end in whitespace, which
    # would create a bogus word in the counts. str.join also avoids building
    # the document through repeated string concatenation.
    return [separator.join(author_texts) for author_texts in all_author_texts]


## Bag of Words as Feature

In [8]:
def GV_feature_vector(corpus):
    """
    Turns a corpus into bag-of-words count vectors.
    
    Input: List of documents (strings) making up the corpus
    
    Output: Dense array of word counts, one feature vector per document
    """
    # A fresh CountVectorizer is fitted on every call, so the vocabulary is
    # taken from exactly the documents passed in.
    return CountVectorizer().fit_transform(corpus).toarray()
    

## Naive Bayes Classifier

In [9]:
def build_classifier(word_count_array, all_authors, alpha=0.02):
    """
    Builds a classifier from provided feature vectors.
    
    Input: 1) Array of feature vectors for each author and the test data;
    the test sample must be the last row
    2) List of all authors considered, in same order as their feature vectors
    3) Smoothing parameter handed to MultinomialNB (default 0.02)
    
    Output: Instance of a multinomial Naive Bayes classifier trained 
    with the data excluding the test data.
    """
    
    # Drop the last row: it is the held-out test sample, not training data.
    train_word_count = word_count_array[:-1]
    
    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(train_word_count, all_authors)
    
    return classifier

In [10]:
def test_bagofwords(corpus, all_authors):
    """
    Runs the whole pipeline: vectorise the corpus, train on the author
    documents and classify the held-out text.
    
    Input: 1) Corpus of all authors considered including the test text as last sample.
    2) List of all authors considered, in same order as their feature vectors
    
    Output: Result of the classifier's predict() for the test text; the
    predicted author label is its single element
    """
    counts = GV_feature_vector(corpus)

    # The test text is the final row; reshape it to a 1 x n_features matrix
    # because predict() is called with a 2-D array of samples.
    held_out_row = counts[-1].reshape(1, -1)

    model = build_classifier(counts, all_authors)
    return model.predict(held_out_row)

## Test Classifier with Leave One Out (LOO) Cross Validation

In [11]:
# Leave-one-out cross validation.
# Run all cells above first. Every text of every author is withheld once, the
# classifier is trained on the remaining texts, and the withheld text is
# classified. The predictions and the overall accuracy are written to the
# file 'BagOfWords_LOO'.

# Get out of the EL folder. Only step up when the kernel really is inside EL,
# so that re-running this cell does not climb further up the directory tree
# (slicing two characters off the path would corrupt it on a second run).
new_dir = os.getcwd()
if os.path.normcase(os.path.basename(new_dir)) == os.path.normcase("EL"):
    new_dir = os.path.dirname(new_dir)
    os.chdir(new_dir)

leaveOO = LeaveOneOut()
correct = 0 

with open('BagOfWords_LOO', 'w') as bow_file:
    
    for author_position, author_texts in enumerate(all_author_texts):
        
        current_author = all_authors[author_position]
        bow_file.write(current_author + '\n')
        
        for train_index, test_index in leaveOO.split(author_texts):
            train_author_texts = [author_texts[train] for train in train_index]
            test_data = [author_texts[test] for test in test_index]
            # Replace the current author's texts by position, not by value:
            # comparing the lists for equality would also replace any other
            # author whose list happens to be equal.
            new_author_texts = [train_author_texts if position == author_position
                                else texts
                                for position, texts in enumerate(all_author_texts)]

            # One merged document per author, with the withheld text appended
            # as the last sample (the position test_bagofwords expects).
            corpus = merge_corpora(new_author_texts)
            corpus.extend(test_data)
            
            predicted_author = test_bagofwords(corpus, all_authors)[0]
            
            bow_file.write('\t' + all_author_files[author_position][test_index[0]] + 
                           '\n\t' + predicted_author + '\n\n')
            
            if predicted_author == current_author:
                correct += 1
    
    accuracy = correct/number_texts
    bow_file.write('Accuracy: {}/{} = {}'.format(correct, number_texts, accuracy))

print("File is created. Accuracy: {}".format(accuracy))

File is created. Accuracy: 0.8311688311688312
