In [17]:
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import numpy

In [18]:
# Bag of words model with Naive Bayes classifier

In [19]:
def one_author_corpus(file_paths, encoding=None):
    """
    Merges several texts of one author into one big string.

    Input: 1) List of file names/paths of texts to be merged (by same author)
    2) Optional text encoding passed to open(); None keeps the platform
    default

    Output: String of merged texts, with a newline between consecutive
    texts (empty string if no paths are given)
    """
    texts = []
    for path in file_paths:
        with open(path, encoding=encoding) as open_text:
            texts.append(open_text.read())
    # Join with a newline so the last word of one text and the first word
    # of the next are not glued together into a single bogus token.
    return '\n'.join(texts)

In [22]:
def GV_feature_vector(corpus):
    """
    Turns every document of a corpus into a vector of word counts.

    Input: List of documents (strings) making up the corpus

    Output: Dense array of word counts, one row per document, in the
    same order as the documents were given
    """
    # fit_transform builds the vocabulary from the corpus and counts the
    # words in one step; toarray() converts the result to a dense array.
    counts = CountVectorizer().fit_transform(corpus)
    return counts.toarray()
    

In [23]:
def build_classifier(word_count_array, all_authors, alpha=0.02):
    """
    Builds a classifier from provided feature vectors.

    Input: 1) Array of feature vectors for each author + the test data vector
    as the last row
    2) List of all authors considered, in same order as their feature vectors
    3) Smoothing parameter handed to MultinomialNB (defaults to 0.02)

    Output: Instance of a multinomial Naive Bayes classifier trained
    with the data excluding the test data
    """
    # The last row belongs to the test text, so it is left out of training.
    train_word_count = word_count_array[:-1]

    classifier = MultinomialNB(alpha=alpha)
    classifier.fit(train_word_count, all_authors)

    return classifier

In [36]:
def test_bagofwords(corpus, all_authors):
    """
    Pipeline putting all functions together.
    
    Input: 1) Corpus of all authors considered including the test text as last sample
    2) List of all authors considered, in same order as their feature vectors
    
    Output: Predicted class as String
    """
    
    word_count_array = GV_feature_vector(corpus)

    test_word_count = word_count_array[-1]
    test_word_count = test_word_count.reshape(1,-1)

    cla = build_classifier(word_count_array, all_authors)

    return cla.predict(test_word_count)

In [37]:
# Tiny example: two candidate authors, Macbeth as the text to attribute.
late_shkp_data = one_author_corpus(['kinglearLS.txt', 'winterstaleLS.txt'])
marlowe_data = one_author_corpus(['parisM.txt', 'edwardM.txt'])
test_data = one_author_corpus(['macbeth.txt'])

# Corpus of all considered authors; the test text has to come last,
# and the labels follow the order of the training texts.
corpus = [late_shkp_data, marlowe_data, test_data]
all_authors = ['Late Shakespeare', 'Marlowe']

# Run the tiny example.
# Expected: Late Shakespeare
print(test_bagofwords(corpus, all_authors))

['Late Shakespeare']


In [38]:
# Three candidate classes, Hamlet as the text to attribute.
# Expected: Early Shakespeare
# (the recorded run of this cell printed 'Late Shakespeare' instead)
late_shkp_data = one_author_corpus(
    ['kinglearLS.txt', 'winterstaleLS.txt', 'macbeth.txt']
)
early_shkp_data = one_author_corpus(
    ['thetamingoftheshrew.txt', 'romeoandjuliet.txt']
)
marlowe_data = one_author_corpus(['parisM.txt', 'edwardM.txt'])
test_data = one_author_corpus(['hamlet.txt'])

# Training texts first, test text last; labels in the same order.
corpus = [late_shkp_data, early_shkp_data, marlowe_data, test_data]
all_authors = ['Late Shakespeare', 'Early Shakespeare', 'Marlowe']

print(test_bagofwords(corpus, all_authors))

['Late Shakespeare']


In [41]:
# Three candidate classes, The Jew of Malta as the text to attribute.
# Expected: Marlowe
# (the recorded run of this cell printed 'Early Shakespeare' instead)
late_shkp_data = one_author_corpus(
    ['kinglearLS.txt', 'winterstaleLS.txt', 'macbeth.txt']
)
early_shkp_data = one_author_corpus(
    ['thetamingoftheshrew.txt', 'romeoandjuliet.txt']
)
marlowe_data = one_author_corpus(['parisM.txt', 'edwardM.txt'])
test_data = one_author_corpus(['thejewofmalta.txt'])

# Training texts first, test text last; labels in the same order.
corpus = [late_shkp_data, early_shkp_data, marlowe_data, test_data]
all_authors = ['Late Shakespeare', 'Early Shakespeare', 'Marlowe']

print(test_bagofwords(corpus, all_authors))

['Early Shakespeare']
