## N-Gram-Tracing (implemented after Grieve et al. 2018)

In [18]:
import os
import re
import random

In [19]:
#Creates a list of all text file names corresponding to one author 

e_shakespeare_texts = []
l_shakespeare_texts = []
marlowe_texts = []
middleton_texts = []
jonson_texts = []
chapman_texts = []
file_count = 0

for filename in os.listdir("EL"):
    file_count += 1
    if filename.endswith("L-Shakespeare.tok"): 
        l_shakespeare_texts.append(filename)
    if filename.endswith("E-Shakespeare.tok"): 
        e_shakespeare_texts.append(filename)
    if filename.endswith("Marlowe.tok"): 
        marlowe_texts.append(filename)
    if filename.endswith("Middleton.tok"): 
        middleton_texts.append(filename)        
    if filename.endswith("Jonson.tok"): 
        jonson_texts.append(filename)
    if filename.endswith("Chapman.tok"): 
        chapman_texts.append(filename)

        
all_author_files = [e_shakespeare_texts, l_shakespeare_texts,
                   marlowe_texts, middleton_texts, jonson_texts, chapman_texts]
all_author_names = ["E-Shakespeare.tok", "L-Shakespeare.tok", "Marlowe.tok", "Middleton.tok", "Jonson.tok", "Chapman.tok"]

In [20]:
def generate_char_list(filelist, tested_file):
    """
    Generates overlapping character ngram of a given order for a given document.
    Input: List with document names, name of file to be tested
    Output: List with characters
    """
    
    rawlist = []
    
    for document in filelist:
        if document != tested_file:
            with open(os.path.join("EL/" + document), "r", encoding="utf-8") as fh:
                for line in fh:
                    raw_words = re.split(r'(\s+)', line)
                    for word in raw_words:
                        stripped = word.lower()
                        for character in stripped:
                            rawlist.append(character)
            
    return rawlist

In [21]:
def generate_word_list(filelist, tested_file):
    """
    Generates overlapping words ngram of a given order for a given document.
    Input: List with document names, name of the file to be tested
    Output: List with words
    """
    
    rawlist = []

    for document in filelist:
        if document != tested_file:
            with open(os.path.join("EL/" + document), "r", encoding="utf-8") as fh:
                for line in fh:
                    raw_words = line.split()
                    for word in raw_words:
                        stripped = word.lower()
                        rawlist.append(stripped)
            
    return rawlist

In [22]:
stopwords = {"."}

def generate_ngram(rawlist, ngram_order):
    ngrams = zip(*(rawlist[i:] for i in range(ngram_order)))
    return (ngram for ngram in ngrams if not any(w in stopwords for w in ngram[:-1]))

In [23]:
def overlapp_coefficient(set_of_ngrams1,set_of_ngrams2):
    """
    Input: Two sets (usually the ngrams of the file to be tested and of an author sample)
    Output: Overlapp Co-Efficient as described by Grieve et al. 2017 
    """

    return len(set(set_of_ngrams1).intersection(set(set_of_ngrams2))) / min(len(set(set_of_ngrams1)),len(set(set_of_ngrams2)))

In [24]:
def calculating_co_efficient(testfile, words_or_chars, ngram_order):
    """
    Takes the length of the file to be tested and takes a random sample of lines from each play 
    of an author based on that, such that their sizes are equal to another.
    Input: Name of the file to be tested, whether the co-efficient should be over words or
        characters, ngram-order
    Output: The co-efficient between the tested file and a randomized author sample
    """

    punctuation = """ !"',;:.-?)([]<>*#\n\t\r """
    all_co_efficients = []
    with open(os.path.join("EL/" + testfile), "r", encoding="utf-8") as fh:
        length = len(fh.read())
        
        for author in all_author_files:
            rawsamples = []
            all_author_words = []
            all_author_chars = []
            list_of_tuples = []
            chars_per_author_file = int(length / len(author))
        
            for file in author:
                if file != testfile:
                    with open(os.path.join("EL/" + file), "r", encoding="utf-8") as play:
                        all_lines = []
                        sum_of_chars = 0
                
                        for line in play:
                            all_lines.append(str(line.strip(punctuation)))
                            sum_of_chars += len(line)
                    
                        sum_of_lines = len(all_lines)
                        line_length_average = int(sum_of_chars / sum_of_lines)
                        amount_of_lines = int(chars_per_author_file / line_length_average)
                        rawsamples += random.sample(all_lines, amount_of_lines)
                
                        for string in rawsamples:
                            all_author_words += string.split()
                            
                        if words_or_chars == "chars":
                            for word in all_author_words:
                                all_author_chars += list(word)                       
            
            if words_or_chars == "chars":
                set1 = {*generate_ngram(all_author_chars,ngram_order)}
                set2 = {*generate_ngram(generate_char_list([testfile],"empty"),ngram_order)}
            elif words_or_chars == "words":
                set1 = {*generate_ngram(all_author_words,ngram_order)}
                set2 = {*generate_ngram(generate_word_list([testfile],"empty"),ngram_order)}
                
            co_efficient = overlapp_coefficient(set1,set2)
            all_co_efficients.append(co_efficient)
 
    most_likely_author = all_author_names[all_co_efficients.index(max(all_co_efficients))]
    return testfile, most_likely_author


In [12]:
predicted_right = 0
for play in os.listdir("EL"):
    testfile, most_likely_author = calculating_co_efficient(play,"chars",7)
    if testfile.endswith(most_likely_author):
        predicted_right += 1
accuracy = predicted_right / file_count

Accuracy for 7-character ngrams

In [13]:
print(accuracy)

0.38461538461538464


In [14]:
predicted_right = 0
for play in os.listdir("EL"):
    testfile, most_likely_author = calculating_co_efficient(play,"chars",8)
    if testfile.endswith(most_likely_author):
        predicted_right += 1
accuracy = predicted_right / file_count

Accuracy for 8-character ngrams

In [15]:
print(accuracy)

0.6025641025641025


In [16]:
predicted_right = 0
for play in os.listdir("EL"):
    testfile, most_likely_author = calculating_co_efficient(play,"words",2)
    if testfile.endswith(most_likely_author):
        predicted_right += 1
accuracy = predicted_right / file_count

Accuracy for 2-word ngrams

In [17]:
print(accuracy)

0.2948717948717949


Testing the n-gram-tracing on "Second Maiden's Tragedy"

In [25]:
testfile, most_likely_author = calculating_co_efficient("the second maiden.txt.Middleton.tok","chars",7)
print("Second Maiden's Tragedy was written by ", most_likely_author," according to 7-character-ngram-tracing")
testfile, most_likely_author = calculating_co_efficient("the second maiden.txt.Middleton.tok","chars",8)
print("Second Maiden's Tragedy was written by ", most_likely_author," according to 8-character-ngram-tracing")
testfile, most_likely_author = calculating_co_efficient("the second maiden.txt.Middleton.tok","words",2)
print("Second Maiden's Tragedy was written by ", most_likely_author," according to 2-word-ngram-tracing")

Second Maiden's Tragedy was written by  L-Shakespeare.tok  according to 7-character-ngram-tracing
Second Maiden's Tragedy was written by  Middleton.tok  according to 8-character-ngram-tracing
Second Maiden's Tragedy was written by  Middleton.tok  according to 2-word-ngram-tracing
