In [2]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [4]:
# One text file per class; a file's position in this list is used as its integer label.
input_files = ['edgar_allan_poe.txt', 'robert_frost.txt']

In [8]:
# collect every non-blank line of every input file, together with its class label
input_texts = []
labels = []

# build the punctuation-stripping table once instead of once per line
punctuation_table = str.maketrans('', '', string.punctuation)

for label, f in enumerate(input_files):
    print(f'{f} corresponds to label {label}')

    # the context manager closes the file handle even if reading fails
    with open(f'../Datasets/{f}') as fh:
        for line in fh:
            line = line.rstrip().lower()
            if line:
                # remove punctuation
                line = line.translate(punctuation_table)

                input_texts.append(line)
                labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [10]:
# Fix the seed so the split (and every number computed from it) is reproducible
# across kernel restarts.
train_text, test_text, Ytrain, Ytest = train_test_split(
    input_texts, labels, random_state=42
)

In [12]:
len(Ytrain), len(Ytest)

(1618, 540)

In [18]:
# next free integer id to hand out; id 0 is already taken by '<unk>'
idx = 1
# maps token -> integer id; '<unk>' (id 0) is the fallback for tokens missing from the vocabulary
word2idx = {'<unk>': 0}

In [20]:
# grow the vocabulary from the training lines only: every token not seen
# before receives the next free integer id
for sentence in train_text:
    for word in sentence.split():
        if word in word2idx:
            continue
        word2idx[word] = idx
        idx += 1

In [22]:
word2idx

{'<unk>': 0,
 'he': 1,
 'guessed': 2,
 'theyd': 3,
 'know': 4,
 'what': 5,
 'had': 6,
 'to': 7,
 'put': 8,
 'up': 9,
 'with': 10,
 'a': 11,
 'swamp': 12,
 'of': 13,
 'cedar': 14,
 'choked': 15,
 'oil': 16,
 'take': 17,
 'it': 18,
 'year': 19,
 'in': 20,
 'out': 21,
 'doesnt': 22,
 'make': 23,
 'much': 24,
 'who': 25,
 'otherwise': 26,
 'would': 27,
 'fall': 28,
 'from': 29,
 'life': 30,
 'and': 31,
 'heaven': 32,
 'when': 33,
 'i': 34,
 'come': 35,
 'the': 36,
 'garden': 37,
 'ground': 38,
 'lift': 39,
 'themselves': 40,
 'off': 41,
 'was': 42,
 'half': 43,
 'boring': 44,
 'through': 45,
 'climbing': 46,
 'thence': 47,
 'they': 48,
 'were': 49,
 'sprung': 50,
 'so': 51,
 'numerous': 52,
 'tribe': 53,
 'blew': 54,
 'him': 55,
 'on': 56,
 'icy': 57,
 'crust': 58,
 'every': 59,
 'way': 60,
 'you': 61,
 'may': 62,
 'please': 63,
 'yourself': 64,
 'theyll': 65,
 'find': 66,
 'theyve': 67,
 'got': 68,
 'whole': 69,
 'thing': 70,
 'do': 71,
 'over': 72,
 'like': 73,
 'some': 74,
 'men': 75,
 

In [24]:
len(word2idx)

2533

In [28]:
# encode every line as a list of integer token ids

# every training token is in the vocabulary, so a direct lookup is safe
train_text_int = [
    [word2idx[word] for word in sentence.split()]
    for sentence in train_text
]

# test tokens missing from the vocabulary fall back to id 0 ('<unk>')
test_text_int = [
    [word2idx.get(word, 0) for word in sentence.split()]
    for sentence in test_text
]

In [30]:
# Per-class count tables for the Markov models:
#   A  - (V, V) transition counts, pi - (V,) first-token counts.
# Every cell starts at 1 rather than 0, so transitions never observed in
# training still get a non-zero probability once the tables are normalized.

V = len(word2idx)

A0, pi0 = np.ones((V, V)), np.ones(V)
A1, pi1 = np.ones((V, V)), np.ones(V)

In [32]:
def compute_counts(text_as_int, A, pi):
    """Accumulate first-token and transition counts into ``pi`` and ``A`` in place.

    Parameters
    ----------
    text_as_int : iterable of iterables of int
        One sequence of token ids per line of text.
    A : 2-D array indexed as ``A[previous_id, current_id]``
        Incremented once for every adjacent token pair.
    pi : 1-D array indexed by token id
        Incremented once for the first token of every non-empty sequence.

    Returns
    -------
    None; both arrays are modified in place. Empty sequences contribute nothing.
    """
    for sentence in text_as_int:
        remaining = iter(sentence)
        try:
            previous = next(remaining)
        except StopIteration:
            # empty line: nothing to count
            continue

        # the first token of a sentence feeds the initial-state counts
        pi[previous] += 1

        # every later token is counted as a transition from its predecessor
        for current in remaining:
            A[previous, current] += 1
            previous = current

In [34]:
# split the encoded training lines by class label, then fill each class's tables
class0_lines = [t for t, y in zip(train_text_int, Ytrain) if y == 0]
class1_lines = [t for t, y in zip(train_text_int, Ytrain) if y == 1]

compute_counts(class0_lines, A0, pi0)
compute_counts(class1_lines, A1, pi1)

In [36]:
# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown below

# each row of A is divided by its own total, so every row sums to 1
row_totals0 = A0.sum(axis=1, keepdims=True)
A0 = A0 / row_totals0
pi0 = pi0 / pi0.sum()

row_totals1 = A1.sum(axis=1, keepdims=True)
A1 = A1 / row_totals1
pi1 = pi1 / pi1.sum()

In [38]:
# work in log space from here on: only the log-probabilities are needed
logA0, logpi0 = np.log(A0), np.log(pi0)
logA1, logpi1 = np.log(A1), np.log(pi1)

In [40]:
# class priors: the fraction of training lines that belong to each class
total = len(Ytrain)
count0 = sum(1 for y in Ytrain if y == 0)
count1 = sum(1 for y in Ytrain if y == 1)

p0, p1 = count0 / total, count1 / total
logp0, logp1 = np.log(p0), np.log(p1)
p0, p1

(0.32818294190358466, 0.6718170580964153)

In [54]:
# build a classifier
class Classifier:
    """Picks the class whose Markov model plus log prior scores a sequence highest.

    Parameters
    ----------
    logAs : sequence of 2-D arrays
        Per-class log transition tables, indexed ``[previous_id, current_id]``.
    logpis : sequence of 1-D arrays
        Per-class log initial-token tables, indexed by token id.
    logpriors : sequence of float
        Per-class log prior; its length defines the number of classes.

    All three sequences are indexed by class id, so they must share one order.
    """

    def __init__(self, logAs, logpis, logpriors):
        self.logAs = logAs
        self.logpis = logpis
        self.logpriors = logpriors
        self.K = len(logpriors)  # number of classes

    def _compute_log_likelihood(self, input_, class_):
        """Return the log-likelihood of the token-id sequence ``input_`` under ``class_``.

        The first token is scored with the class's initial-token table and each
        later token with the transition table. An empty sequence scores 0.
        """
        transitions = self.logAs[class_]
        initial = self.logpis[class_]

        score = 0
        previous = None
        for position, current in enumerate(input_):
            if position == 0:
                # first token: no predecessor, use the initial distribution
                score += initial[current]
            else:
                score += transitions[previous, current]
            previous = current
        return score

    def predict(self, inputs):
        """Return a float array holding the predicted class id for each sequence."""
        predictions = np.zeros(len(inputs))
        for row, sequence in enumerate(inputs):
            # log-likelihood plus log prior, one score per class
            scores = []
            for klass in range(self.K):
                scores.append(
                    self._compute_log_likelihood(sequence, klass) + self.logpriors[klass]
                )
            predictions[row] = np.argmax(scores)
        return predictions

In [56]:
# the classifier indexes each list by class id, so all three must be in class order
clf = Classifier(
    logAs=[logA0, logA1],
    logpis=[logpi0, logpi1],
    logpriors=[logp0, logp1],
)

In [58]:
Ptrain = clf.predict(train_text_int)

In [60]:
print(f"Train acc: {np.mean(Ptrain == Ytrain)}")

Train acc: 0.9944375772558715


In [62]:
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == Ytest)}")

Test acc: 0.8240740740740741


In [82]:
from sklearn.metrics import confusion_matrix, f1_score

In [84]:
cm_train = confusion_matrix(Ytrain, Ptrain)
cm_train

array([[ 522,    9],
       [   0, 1087]], dtype=int64)

In [86]:
cm_test = confusion_matrix(Ytest, Ptest)
cm_test

array([[105,  86],
       [  9, 340]], dtype=int64)

In [90]:
f1_score(Ytrain, Ptrain)

0.9958772331653688

In [92]:
f1_score(Ytest, Ptest)

0.8774193548387097