In [1]:
import requests

In [2]:
input_files = ['edgar_allan_poe.txt',
  'robert_frost.txt']

In [3]:
import numpy as np
import pandas as pd
import string
from sklearn.model_selection import train_test_split

In [4]:
input_texts = []
input_labels = []

for label, f in enumerate(input_files):
    print(f"{f} corresponds to label {label}")

    for line in open(f):
        line = line.rstrip().lower()
        if line:
            line = line.translate(str.maketrans('', '', string.punctuation))
            
            input_texts.append(line)
            input_labels.append(label)
    

edgar_allan_poe.txt corresponds to label 0
robert_frost.txt corresponds to label 1


In [5]:
train_text, test_text, Y_train, Y_test = train_test_split(input_texts, input_labels)

In [6]:
len(Y_train), len(Y_test)

(1618, 540)

In [7]:
train_text[0:5]

['from their throats',
 'you let me say it on consideration',
 'as if by eye pairs out of forty firkins',
 'sancta maria turn thine eyes ',
 'of cypress with psyche my soul']

In [8]:
Y_train[0:5]

[0, 1, 1, 0, 0]

In [9]:
word_to_idx = {'<unk>': 0}
idx = 1

for line in train_text:
    tokens = line.split()
    
    for token in tokens:
        if token not in word_to_idx:
            word_to_idx[token] = idx
            idx += 1
    



In [10]:
word_to_idx

{'<unk>': 0,
 'from': 1,
 'their': 2,
 'throats': 3,
 'you': 4,
 'let': 5,
 'me': 6,
 'say': 7,
 'it': 8,
 'on': 9,
 'consideration': 10,
 'as': 11,
 'if': 12,
 'by': 13,
 'eye': 14,
 'pairs': 15,
 'out': 16,
 'of': 17,
 'forty': 18,
 'firkins': 19,
 'sancta': 20,
 'maria': 21,
 'turn': 22,
 'thine': 23,
 'eyes': 24,
 'cypress': 25,
 'with': 26,
 'psyche': 27,
 'my': 28,
 'soul': 29,
 'would': 30,
 'have': 31,
 'been': 32,
 'starks': 33,
 'and': 34,
 'doubtless': 35,
 'here': 36,
 'today': 37,
 'i': 38,
 'wonder': 39,
 'why': 40,
 'he': 41,
 'doesnt': 42,
 'marry': 43,
 'her': 44,
 'can': 45,
 'come': 46,
 'down': 47,
 'everything': 48,
 'to': 49,
 'nothing': 50,
 'what': 51,
 'is': 52,
 'lest': 53,
 'should': 54,
 'truant': 55,
 'be': 56,
 'like': 57,
 'no': 58,
 'she': 59,
 'aint': 60,
 'back': 61,
 'kiting': 62,
 'yet': 63,
 'who': 64,
 'alterest': 65,
 'all': 66,
 'things': 67,
 'thy': 68,
 'peering': 69,
 'the': 70,
 'hillside': 71,
 'day': 72,
 'sun': 73,
 'lets': 74,
 'go': 75,


In [11]:
len(word_to_idx)

2532

In [12]:
train_text_int = []
test_text_int = []

for line in train_text:
    tokens = line.split()
    line_as_int = [ word_to_idx[token] for token in tokens]
    train_text_int.append(line_as_int)
    
for line in test_text:
    tokens = line.split()
    line_as_int = [ word_to_idx.get(token, 0) for token in tokens]
    train_text_int.append(line_as_int)


In [13]:
train_text_int[100:105]

[[347, 28, 348, 349, 350, 81, 99],
 [70, 351, 17, 352, 110, 353, 17, 354],
 [28, 355, 49, 56, 356, 13, 2, 357, 246],
 [49, 358, 359, 49, 4],
 [360, 122, 361, 103, 110, 155, 362, 47, 76, 70, 363]]

In [14]:
V = len(word_to_idx)

A0 = np.ones((V, V))
Pi0 = np.ones(V)

A1 = np.ones((V, V))
Pi1 = np.ones(V)

In [15]:
def compute_counts(train_text_int, A, Pi):
    for token in train_text_int:
        last_idx = None
        
        for idx in token:
            if last_idx is None:
                Pi[idx] += 1
            else:
                A[last_idx, idx] += 1
            
            last_idx = idx


compute_counts([t for t, y in zip(train_text_int, Y_train) if y == 0], A0, Pi0)
compute_counts([t for t, y in zip(train_text_int, Y_train) if y == 1], A1, Pi1)
                

In [16]:
A0 /= A0.sum(axis = 1, keepdims = True)
Pi0 /= Pi0.sum()

A1 /= A1.sum(axis = 1, keepdims = True)

Pi1 /= Pi1.sum()

In [17]:
logA0 = np.log(A0)
logPi0 = np.log(Pi0)

logA1 = np.log(A1)
logPi1 = np.log(Pi1)

In [24]:
count0 = sum(y == 0 for y in Y_train)
count1 = sum(y == 1 for y in Y_train)

total = len(Y_train)

P0 = count0 / total
P1 = count1 / total

logP0 = np.log(P0)
logP1 = np.log(P1)

P0, P1

(0.3368355995055624, 0.6631644004944376)

In [25]:
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors) 

  def _compute_log_likelihood(self, input_, class_):
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # it's the first token
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]
      
      # update last_idx
      last_idx = idx
    
    return logprob
  
  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
             for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i] = pred
    return predictions

In [26]:
clf = Classifier([logA0, logA1], [logPi0, logPi1], [logP0, logP1])

In [32]:
Ptrain = clf.predict(train_text_int)
Ptrain

array([0., 1., 1., ..., 0., 1., 1.])

In [33]:
def mean_accuracy(predicted, actual):
    correct = sum(p == a for p, a in zip(predicted, actual))
    total = len(predicted)
    return correct / total

In [34]:
train_acc = mean_accuracy(Ptrain, Y_train)
train_acc

0.7460611677479148

In [35]:
test_acc = mean_accuracy(Ptest, Y_test)
test_acc

ZeroDivisionError: division by zero