In [83]:
import requests

In [84]:
input_files = ['edgar_allan_poe.txt',
  'robert_frost.txt']

In [86]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split

In [87]:
input_texts = []
labels = []

for label, f in enumerate(input_files):
  print(f"{f} corresponds to label {label}")

  for line in open(f):
    line = line.rstrip().lower()
    if line:
      # remove punctuation
      line = line.translate(str.maketrans('', '', string.punctuation))

      input_texts.append(line)
      labels.append(label)

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [88]:
train_text, test_text, Ytrain, Ytest = train_test_split(input_texts, labels)


In [89]:
len(Y_train), len(Y_test)

(1618, 540)

In [90]:
train_text[0:5]

['yes stark and you',
 'its as you throw a picture on a screen',
 'a broken drinking goblet like the grail',
 'the pen falls powerless from my shivering hand',
 'is this some trance you are withdrawing into']

In [91]:
Y_train[0:5]

[1, 1, 0, 1, 0]

In [92]:
idx = 1
word_to_idx = {'<unk>': 0}


for line in train_text:
    tokens = line.split()
    
    for token in tokens:
        if token not in word_to_idx:
            word_to_idx[token] = idx
            idx += 1
    



In [93]:
word_to_idx

{'<unk>': 0,
 'yes': 1,
 'stark': 2,
 'and': 3,
 'you': 4,
 'its': 5,
 'as': 6,
 'throw': 7,
 'a': 8,
 'picture': 9,
 'on': 10,
 'screen': 11,
 'broken': 12,
 'drinking': 13,
 'goblet': 14,
 'like': 15,
 'the': 16,
 'grail': 17,
 'pen': 18,
 'falls': 19,
 'powerless': 20,
 'from': 21,
 'my': 22,
 'shivering': 23,
 'hand': 24,
 'is': 25,
 'this': 26,
 'some': 27,
 'trance': 28,
 'are': 29,
 'withdrawing': 30,
 'into': 31,
 'but': 32,
 'get': 33,
 'color': 34,
 'music': 35,
 'out': 36,
 'of': 37,
 'life': 38,
 'old': 39,
 'davis': 40,
 'owned': 41,
 'solid': 42,
 'mica': 43,
 'mountain': 44,
 'all': 45,
 'lost': 46,
 'himself': 47,
 'tongue': 48,
 'fire': 49,
 'give': 50,
 'headshake': 51,
 'she': 52,
 'should': 53,
 'shouldnt': 54,
 'youre': 55,
 'so': 56,
 'many': 57,
 'times': 58,
 'by': 59,
 'picking': 60,
 'faded': 61,
 'blue': 62,
 'to': 63,
 'eye': 64,
 'man': 65,
 'shall': 66,
 'his': 67,
 'undivided': 68,
 'time': 69,
 'ill': 70,
 'tell': 71,
 'what': 72,
 'voices': 73,
 'really

In [94]:
len(word_to_idx)

2545

In [111]:
train_text_int = []
test_text_int = []

for text in train_text:
  tokens = text.split()
  line_as_int = [word_to_idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens = text.split()
  line_as_int = [word_to_idx.get(token, 0) for token in tokens]
  test_text_int.append(line_as_int)

In [112]:
train_text_int[100:105]

[[382, 5, 383, 260, 384, 6, 8, 385],
 [3, 386, 387, 130, 287, 388, 389],
 [45, 221, 37, 91, 63, 390, 72, 199, 106, 391],
 [52, 392, 157, 16, 393, 344, 8, 264, 37, 125],
 [123, 394, 29, 395, 394]]

In [113]:
V = len(word_to_idx)

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [114]:
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        # it's the first word in a sentence
        pi[idx] += 1
      else:
        # the last word exists, so count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx


compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 0], A0, pi0)
compute_counts([t for t, y in zip(train_text_int, Ytrain) if y == 1], A1, pi1)

In [115]:
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [116]:
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [117]:
count0 = sum(y == 0 for y in Ytrain)
count1 = sum(y == 1 for y in Ytrain)
total = len(Ytrain)
p0 = count0 / total
p1 = count1 / total
logp0 = np.log(p0)
logp1 = np.log(p1)
p0, p1

(0.33868974042027195, 0.6613102595797281)

In [118]:
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) # number of classes

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions

In [119]:
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [121]:
def mean_accuracy(predicted, actual):
    correct = sum(p == a for p, a in zip(predicted, actual))
    total = len(predicted)
    return correct / total

In [122]:
Ptrain = clf.predict(train_text_int)

train_acc = mean_accuracy(Ptrain, Y_train)
print(f"Train acc: {train_acc}")

Train acc: 0.5673671199011124


In [126]:
len(Ptest), len(Y_test)

(540, 540)

In [127]:
Ptest = clf.predict(test_text_int)


test_acc = mean_accuracy(Ptest, Y_test)
print(f"Test acc: {test_acc}")

Test acc: 0.6


In [128]:
from sklearn.metrics import confusion_matrix, f1_score

In [129]:
cm = confusion_matrix(Ytrain, Ptrain)
cm

array([[ 545,    3],
       [   0, 1070]], dtype=int64)

In [130]:
cm_test = confusion_matrix(Ytest, Ptest)
cm_test

array([[ 90,  84],
       [ 17, 349]], dtype=int64)

In [131]:
f1_score(Ytrain, Ptrain)

0.9986000933271115

In [134]:
f1_score(Ytest, Ptest)

0.8735919899874842