In [1]:
import math
import re
import sys
from collections import Counter
from os import listdir

from nltk import ngrams, sent_tokenize, FreqDist

In [2]:
# Raw strings keep regex escapes such as \w from being parsed as (invalid)
# Python string escapes, which newer interpreters warn about.

# Runs of characters that are not word characters, '.', space or newline.
rmv = re.compile(r'(?:[^\w. \n]+)')
# Two dots with any mix of dots/spaces between them, e.g. "..." or ". . .".
rep = re.compile(r'(?:[.][. ]*[.])')
# Runs of newlines and spaces.
trs = re.compile(r'[\n ]+')

def CleanString(inString):
    """Normalize raw text before sentence tokenization.

    Steps, in order: delete every character matched by ``rmv``, replace each
    multi-dot run matched by ``rep`` with a single space, collapse every run
    of spaces/newlines into one space, then strip the ends and lowercase.

    Args:
        inString: the text to clean.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    res = rmv.sub('', inString)
    res = rep.sub(' ', res)
    res = trs.sub(' ', res)
    return res.strip().lower()


In [3]:
class LangModel:
    """Add-one-smoothed trigram language model for a single author.

    Attributes:
        auth: author label passed to the constructor.
        tokTxt: sentences of the training text, as returned by sent_tokenize.
        trigram: every trigram occurrence (3-tuples of words) in the padded
            training sentences.
        bigram: every bigram occurrence (2-tuples of words) in the padded
            training sentences.
        triCount: dict mapping each distinct trigram to its occurrence count.
        biCount: dict mapping each distinct bigram to its occurrence count.
    """

    def __init__(self, authName, datText):
        """Build the n-gram count tables from the training text.

        Args:
            authName: label stored in ``self.auth``.
            datText: training text; it is split into sentences and each
                sentence is wrapped in ``<s>`` / ``</s>`` markers.

        Side effects:
            Prints the author and the number of distinct trigrams/bigrams.
        """
        self.auth = authName
        self.tokTxt = self._Sentences(datText)
        self.trigram, self.bigram = self._NGrams(self.tokTxt)
        self.triCount = dict(Counter(self.trigram))
        self.biCount = dict(Counter(self.bigram))

        print("Auth: {}, Trigram Size: {}, Bigram Size: {}\n".format(self.auth, len(self.triCount), len(self.biCount)))

    @staticmethod
    def _Sentences(text):
        """Split text into a list of sentences using nltk's sent_tokenize."""
        return sent_tokenize(text)

    @staticmethod
    def _NGrams(sentences):
        """Return (trigrams, bigrams) over all <s> ... </s>-padded sentences."""
        trigrams = []
        bigrams = []
        for sentence in sentences:
            words = ("<s> " + sentence + " </s>").split()
            # zip over shifted views yields every window of consecutive words.
            trigrams.extend(zip(words, words[1:], words[2:]))
            bigrams.extend(zip(words, words[1:]))
        return trigrams, bigrams

    def Predict(self, testText):
        """Score how well this model explains ``testText``.

        Every trigram (w1, w2, w3) of the padded test sentences contributes
        ``(count(w1 w2 w3) + 1) / (count(w1 w2) + number of distinct bigrams)``.
        The contributions are accumulated in log space and the result is
        their geometric mean. Multiplying them directly underflows to 0.0 on
        any realistically long text, which makes every model tie.

        For a fixed ``testText`` the geometric mean orders models exactly as
        the joint probability would, so picking the model with the highest
        score is unaffected.

        Args:
            testText: text to score (same preprocessing as the training text
                is expected of the caller).

        Returns:
            A float in (0, 1]; 1.0 when the text contains no trigram.

        Raises:
            ZeroDivisionError: if the model holds no bigrams at all and the
                text contains at least one trigram.
        """
        testTriG, _ = self._NGrams(self._Sentences(testText))
        if not testTriG:
            return 1.0

        # NOTE(review): the smoothing denominator uses the number of distinct
        # bigrams; textbook add-one smoothing uses the vocabulary size.
        # Left unchanged here -- confirm which one is intended.
        smoothing = len(self.biCount)

        logProb = 0.0
        for testTup in testTriG:
            triGCount = self.triCount.get(testTup, 0)
            biGCount = self.biCount.get(testTup[:2], 0)
            logProb += math.log((triGCount + 1) / (biGCount + smoothing))

        return math.exp(logProb / len(testTriG))


In [4]:
# Dataset root, with the training and test sub-folders derived from it.
foldPath = 'author_dataset'
trainPath, testPath = (
    template.format(foldPath)
    for template in ('./{}/training/', './{}/test/')
)

In [5]:
# Training file names in sorted order.
training = sorted(listdir(trainPath))

In [6]:
AuthorName = ['Abraham Lincoln', 'Andrew Lang', 'Jack London', 'Jane Austen', 'Sir Arthur Conan Doyle']
# The sorted training listing is consumed in consecutive groups of this size,
# one group per entry of AuthorName.
# NOTE(review): this assumes sorting the file names groups them by author in
# the same order as AuthorName -- confirm against the dataset folder.
FILES_PER_AUTHOR = 5

Models = []
for i, author in enumerate(AuthorName):
    cleanedFiles = []
    for j in range(i * FILES_PER_AUTHOR, (i + 1) * FILES_PER_AUTHOR):
        # Context manager so each file handle is closed right after reading.
        with open(trainPath + training[j], 'r') as trainFile:
            cleanedFiles.append(CleanString(trainFile.read()))
    # CleanString strips trailing whitespace, so join with an explicit space;
    # otherwise the last word of one file fuses with the first word of the next.
    authorText = ' '.join(cleanedFiles)
    Models.append(LangModel(author, authorText))

Auth: Abraham Lincoln, Trigram Size: 94792, Bigram Size: 57126

Auth: Andrew Lang, Trigram Size: 253858, Bigram Size: 150512

Auth: Jack London, Trigram Size: 252228, Bigram Size: 144334

Auth: Jane Austen, Trigram Size: 377381, Bigram Size: 172531

Auth: Sir Arthur Conan Doyle, Trigram Size: 167946, Bigram Size: 95627



In [7]:
# Test file names in sorted order.
test = sorted(listdir(testPath))

In [8]:
# Classify every test file with the best-scoring model and report accuracy.
cnt = 0
for tester in test:
    # The expected author is the part of the file name before '__'.
    realAuth = tester.split('__')[0]
    # Context manager so the file handle is closed right after reading.
    with open(testPath + tester, 'r') as testFile:
        testTxt = CleanString(testFile.read())

    # Start below any possible score so the first model always becomes the
    # initial best, whatever range Predict returns.
    mProb = float('-inf')
    predAuth = ""
    for model in Models:
        pred = model.Predict(testTxt)
        if pred > mProb:
            mProb = pred
            predAuth = model.auth

    print("Target: {}, Prediction: {}\n".format(realAuth, predAuth))
    if realAuth == predAuth:
        cnt += 1

# Percentage over the actual number of test files (guarding an empty folder)
# rather than assuming exactly five files.
accuracy = 100.0 * cnt / len(test) if test else 0.0
print("Accuracy = {:g}%".format(accuracy))

Target: Abraham Lincoln, Prediction: Abraham Lincoln

Target: Andrew Lang, Prediction: Abraham Lincoln

Target: Jack London, Prediction: Abraham Lincoln

Target: Jane Austen, Prediction: Abraham Lincoln

Target: Sir Arthur Conan Doyle, Prediction: Abraham Lincoln

Accuracy = 20%
