In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

In [2]:
input_files = [
  'edgar_allan_poe.txt',
  'robert_frost.txt',
]

In [3]:
with open('edgar_allan_poe.txt', 'r') as file:
    print("First 10 lines of edgar_allan_poe.txt:")
    for _ in range(10):
        print(file.readline().strip()) #reads the next line from the file, removes any leading and trailing whitespace (including the newline character), and prints the cleaned line to the console.

First 10 lines of edgar_allan_poe.txt:
LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
â€‰
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [4]:
with open('robert_frost.txt', 'r') as file:
    print("\nFirst 10 lines of robert_frost.txt:")
    for _ in range(10):
        print(file.readline().strip())


First 10 lines of robert_frost.txt:
Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth;

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [5]:
# Collect data into lists ; we will consider each line as an individual data source and assign it a label
input_texts = []
labels = []

for label, f in enumerate(input_files):
    print(f"{f} corresponds to label {label}")
    
    with open(f, 'r') as file:
        for line in file:
            line = line.rstrip().lower() #rstrip() removes the newline charectors
            if line:
                # Remove punctuation
                line = line.translate(str.maketrans('', '', string.punctuation))
                #str.maketrans('', '', string.punctuation): Creates a translation table that maps each punctuation character to None.
                #line.translate(...): Removes all punctuation characters from the line using the translation table.
                input_texts.append(line)
                labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [6]:
labels

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [7]:
input_texts

['lo death hath reard himself a throne',
 'in a strange city all alone',
 'far down within the dim west',
 'where the good and the bad and the worst and the best',
 'have gone to their eternal rest',
 'â€‰',
 'there shrines and palaces and towers',
 'are not like any thing of ours',
 'oh no o no ours never loom',
 'to heaven with that ungodly gloom',
 'timeeaten towers that tremble not',
 'resemble nothing that is ours',
 'around by lifting winds forgot',
 'resignedly beneath the sky',
 'the melancholy waters lie',
 'â€‰',
 'no holy rays from heaven come down',
 'on the long nighttime of that town',
 'but light from out the lurid sea',
 'streams up the turrets silently',
 'up thrones up longforgotten bowers',
 'of sculturd ivy and stone flowers',
 'up domes up spires up kingly halls',
 'up fanes up babylonlike walls',
 'up many a melancholy shrine',
 'whose entablatures intertwine',
 'the mask the viol and the vine',
 'â€‰',
 'there open temples open graves',
 'are on a level with the 

In [8]:
train_text, test_text, Ytrain, Ytest = train_test_split(input_texts, labels)

In [9]:
len(Ytrain), len(Ytest)

(1618, 540)

In [10]:
train_text[:5] #First 5 elements of the list


['save but the soul in thine uplifted eyes',
 'yes whats it all about when did she go',
 'you poor dear great great great great granny',
 'estelles to take me when shes settled down',
 'with a strange sound as of a harpstring broken']

In [21]:
Ytrain[:5]

[0, 1, 1, 1, 0]

In [23]:
idx = 1
word2idx = {'<unk>': 0} #for the words that do not appear in the training set

In [25]:
# populate word2idx
for text in train_text:
    tokens = text.split()
    for token in tokens:
      if token not in word2idx:
        word2idx[token] = idx
        idx += 1

In [27]:
word2idx

{'<unk>': 0,
 'save': 1,
 'but': 2,
 'the': 3,
 'soul': 4,
 'in': 5,
 'thine': 6,
 'uplifted': 7,
 'eyes': 8,
 'yes': 9,
 'whats': 10,
 'it': 11,
 'all': 12,
 'about': 13,
 'when': 14,
 'did': 15,
 'she': 16,
 'go': 17,
 'you': 18,
 'poor': 19,
 'dear': 20,
 'great': 21,
 'granny': 22,
 'estelles': 23,
 'to': 24,
 'take': 25,
 'me': 26,
 'shes': 27,
 'settled': 28,
 'down': 29,
 'with': 30,
 'a': 31,
 'strange': 32,
 'sound': 33,
 'as': 34,
 'of': 35,
 'harpstring': 36,
 'broken': 37,
 'an': 38,
 'hour': 39,
 'winter': 40,
 'day': 41,
 'might': 42,
 'seem': 43,
 'too': 44,
 'short': 45,
 'said': 46,
 'sadly': 47,
 'this': 48,
 'star': 49,
 'i': 50,
 'mistrust': 51,
 'before': 52,
 'got': 53,
 'up': 54,
 'do': 55,
 'anything': 56,
 'hope': 57,
 'if': 58,
 'he': 59,
 'is': 60,
 'where': 61,
 'sees': 62,
 'now': 63,
 'and': 64,
 'youre': 65,
 'lost': 66,
 'enough': 67,
 'find': 68,
 'yourself': 69,
 'among': 70,
 'unearthed': 71,
 'potatoes': 72,
 'standing': 73,
 'still': 74,
 'trembling

In [29]:
len(word2idx)

2525

In [31]:
# convert data into integer format(replaces words with their corresponding indices)
train_text_int = []
test_text_int = []

for text in train_text:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens = text.split()
  line_as_int = [word2idx.get(token, 0) for token in tokens] # If a word doesn't appear in the training set then automatically assign it 0)
  test_text_int.append(line_as_int)

In [33]:
train_text_int

[[1, 2, 3, 4, 5, 6, 7, 8],
 [9, 10, 11, 12, 13, 14, 15, 16, 17],
 [18, 19, 20, 21, 21, 21, 21, 22],
 [23, 24, 25, 26, 14, 27, 28, 29],
 [30, 31, 32, 33, 34, 35, 31, 36, 37],
 [38, 39, 35, 40, 41, 42, 43, 44, 45],
 [46, 47, 48, 49, 50, 51],
 [52, 50, 53, 54, 24, 55, 56],
 [50, 57, 58, 59, 60, 61, 59, 62, 26, 63],
 [64, 58, 65, 66, 67, 24, 68, 69],
 [70, 71, 72, 73, 74],
 [3, 75, 76, 77],
 [78, 79, 24, 56, 80, 81],
 [34, 82, 34, 59, 83, 5, 3, 84, 35, 3, 85],
 [86, 87, 86, 88, 89, 90],
 [91, 92, 24, 93, 12, 94, 95],
 [96, 97, 13, 11],
 [3, 98, 99, 64, 3, 100, 101],
 [102, 3, 103, 35, 86, 66, 104],
 [105, 106],
 [107, 108, 109, 63, 110, 111],
 [5, 40, 112, 113, 114],
 [18, 115, 116, 42, 117, 118, 64, 74, 118, 119],
 [120, 121, 122, 123, 124, 125, 126],
 [127, 3, 128, 35, 3, 129, 130],
 [131, 132, 133, 134, 135, 136],
 [137, 24, 25, 138, 139, 50, 140, 141],
 [142, 143, 144, 143, 145, 146],
 [3, 147, 148, 149, 150, 151, 152, 153],
 [24, 154, 131, 155, 30, 156, 157, 8],
 [158, 159, 160, 3, 16

In [35]:
# initialize A and pi matrices - for both classes (For Markov Model)
V = len(word2idx)
#Since we have 2 Classes we need 2 A's and 2 pi's
A0 = np.ones((V, V)) #"Add 1 smoothing"
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [37]:
# compute counts for A and pi
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None #For first word
    for idx in tokens:
      if last_idx is None:
        # it's the first word in a sentence
        pi[idx] += 1
      else:
        # the last word exists, so count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx


compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 0], A0, pi0) #Edgar Allen
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 1], A1, pi1) #Robert Frost

In [39]:
A0

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 2., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [41]:
pi0

array([ 1.,  4., 13., ...,  2.,  1.,  1.])

In [43]:
# normalize A and pi so they are valid probability matrices
# convince yourself that this is equivalent to the formulas shown before
A0 /= A0.sum(axis=1, keepdims=True) #keepdims=True ensures the sum is 2-D
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [45]:
# log A and pi since we don't need the actual probs
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [47]:
# compute priors (How much data of which poet)
count0 = sum(y == 0 for y in Ytrain)
count1 = sum(y == 1 for y in Ytrain)
total = len(Ytrain)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.3331273176761434, 0.6668726823238567)

In [49]:
# build a classifier
class Classifier:
  def __init__(self, logAs, logpis, logpriors): #Constructor
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None #Initializing for the first word
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions

In [51]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [53]:
Ptrain = clf.predict(train_text_int)
print(f"Train acc: {np.mean(Ptrain == Ytrain)}")

Train acc: 0.9962917181705809


In [55]:
Ptest = clf.predict(test_text_int)
print(f"Test acc: {np.mean(Ptest == Ytest)}")

Test acc: 0.8111111111111111


In [57]:
from sklearn.metrics import confusion_matrix, f1_score

# read about F-score: https://en.wikipedia.org/wiki/F-score

In [59]:
cm = confusion_matrix(Ytrain, Ptrain)
cm

array([[ 533,    6],
       [   0, 1079]], dtype=int64)

In [61]:
cm_test = confusion_matrix(Ytest, Ptest)
cm_test

array([[ 91,  92],
       [ 10, 347]], dtype=int64)

In [63]:
f1_score(Ytrain, Ptrain)

0.9972273567467653

In [65]:
f1_score(Ytest, Ptest)

0.8718592964824121