# Text Classifier

Use the Markov principles to build a text classifier that can identify the lines of an unknown poem as belonging to either Edgar Allan Poe or Robert Frost.

Lines of poems belonging to both authors have been saved to separate text files in the data folder.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import string

%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# View the head of text files

with open('data/edgar_allan_poe.txt') as input_file: 
    head = [next(input_file) for _ in range(20)]

print(head)

["LO! Death hath rear'd himself a throne\n", 'In a strange city, all alone,\n', 'Far down within the dim west\n', 'Where the good, and the bad, and the worst, and the best,\n', 'Have gone to their eternal rest.\n', 'â€‰\n', 'There shrines, and palaces, and towers\n', 'Are not like any thing of ours\n', 'Oh no! O no! ours never loom\n', 'To heaven with that ungodly gloom!\n', 'Time-eaten towers that tremble not!\n', 'Resemble nothing that is ours.\n', 'Around, by lifting winds forgot,\n', 'Resignedly beneath the sky\n', 'The melancholy waters lie.\n', 'â€‰\n', 'No holy rays from heaven come down\n', 'On the long night-time of that town,\n', 'But light from out the lurid sea\n', 'Streams up the turrets silently\n']


In [4]:
with open('data/robert_frost.txt') as input_file: 
    head = [next(input_file) for _ in range(20)]

print(head)

['Two roads diverged in a yellow wood,\n', 'And sorry I could not travel both\n', 'And be one traveler, long I stood\n', 'And looked down one as far as I could\n', 'To where it bent in the undergrowth; \n', '\n', 'Then took the other, as just as fair,\n', 'And having perhaps the better claim\n', 'Because it was grassy and wanted wear,\n', 'Though as for that the passing there\n', 'Had worn them really about the same,\n', '\n', 'And both that morning equally lay\n', 'In leaves no step had trodden black.\n', 'Oh, I kept the first for another day! \n', 'Yet knowing how way leads on to way\n', 'I doubted if I should ever come back.\n', '\n', 'I shall be telling this with a sigh\n', 'Somewhere ages and ages hence:\n']


In [5]:
# Note that each line is terminated with '\n' character (new line) - use .rstrip() to remove
# Note that some lines are empty (â€‰ is single quotation mark)
# Note that there are capitals and lower-case letters - use .lower()

In [6]:
input_files = ['data/edgar_allan_poe.txt', 'data/robert_frost.txt']

In [7]:
input_texts = []
labels = []

# enumerate() assigns an index number at the same time
for label, f in enumerate(input_files): 
    print(f"{f} corresponds to label {label}") 
    
    # This will only run if line is not empty
    for line in open(f): 
        line = line.rstrip().lower() 
        
        if line: 
            # Remove punctuation (taken from StackOverflow)
            line = line.translate(str.maketrans('', '', string.punctuation)) 
            input_texts.append(line) 
            labels.append(label)

data/edgar_allan_poe.txt corresponds to label 0
data/robert_frost.txt corresponds to label 1


In [8]:
# Hmmm...those 'â€‰' lines are still there...

input_texts[:10]

['lo death hath reard himself a throne',
 'in a strange city all alone',
 'far down within the dim west',
 'where the good and the bad and the worst and the best',
 'have gone to their eternal rest',
 'â€‰',
 'there shrines and palaces and towers',
 'are not like any thing of ours',
 'oh no o no ours never loom',
 'to heaven with that ungodly gloom']

In [9]:
# i.e. top half belongs to Edgar Allan Poe, so lines appear in the order in which files were processed

labels[:10]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [10]:
train_text, test_text, y_train, y_test = train_test_split(input_texts, labels)

In [11]:
len(y_train), len(y_test)

(1618, 540)

In [12]:
# Order of lines and labels now mixed up

train_text[:10]

['to go west to a worse fight with the desert',
 'that made him throw his bare legs out of bed',
 'oh fly let us fly for we must',
 'were not too much to pay for birth',
 'wherever the ground was low and wet',
 'and the smell of fire drowned in rain',
 'no fãªte today he said',
 'they left the named',
 'to tell the truth suppose the time had come',
 'something i must have learned riding in trains']

In [13]:
y_train[:10]

[1, 1, 0, 1, 1, 1, 1, 1, 1, 1]

## Convert lines of text to lines of integers

You must create a **word-to-index mapping** dictionary that holds the index number for each word. These indices are used to replace the words when converting the text to numbers. The dictionary is built using the training data only, since you would not have access to the test set in the real world.

NOTE: You should always assign an index for unknown words that will be introduced in the test set.

In [14]:
# Acts as counter index as you loop through the words/tokens, i.e. current index
idx = 1

# Intialize word-to-index mapping dictionary with special index in place for unknown words
word2idx = {'<unk>': 0}

In [15]:
# Loop through training text and populate word2idx mapping

for text in train_text: 
    # Split text into tokens, i.e. words
    tokens = text.split() 
    
    for token in tokens: 
        if token not in word2idx: 
            word2idx[token] = idx 
            idx += 1

In [16]:
word2idx['snow']

217

In [17]:
# This will determine the size of Markov matrix (2491x2491), as well as vocabulary size

len(word2idx)

2491

In [18]:
# Convert all text lines into integer lines with word2idx mapping replacing the words

train_text_int = []
test_text_int = []

# Loop through training set
for text in train_text: 
    tokens = text.split() 
    line_as_int = [word2idx[token] for token in tokens] 
    train_text_int.append(line_as_int)

# Loop through test set - need to account for unknown words, i.e. assign 0
for text in test_text: 
    tokens = text.split() 
    # Get token index or return 0
    line_as_int = [word2idx.get(token, 0) for token in tokens] 
    test_text_int.append(line_as_int)

In [19]:
# Each number in list corresponds to a word

train_text_int[100:105]

[[37, 378, 8, 379, 18, 8, 19, 380, 67],
 [42, 370, 381, 382, 18, 8, 383],
 [8, 384, 136, 385, 18, 386, 387, 37, 388],
 [274, 389, 1, 108, 390, 391, 392],
 [18, 393, 7, 394, 100, 395]]

In [20]:
test_text_int[100:105]

[[37, 1488, 100, 168, 0, 1, 2360],
 [756, 275, 42, 8, 0, 0, 18, 233],
 [1658, 1571, 1659, 876],
 [357, 59, 484, 1985],
 [8, 0, 1578, 8, 0, 1149, 362]]

## Setting up the `A` matrix and `pi` array

In [21]:
# Total number of words
V = len(word2idx)

# Initialize A and pi matrices for both models representing each category (Poe model | Frost model)
# All matrices and arrays initialized with 1s to use add-one smoothing

A0 = np.ones((V, V))
pi0 = np.ones(V)

A1 = np.ones((V, V))
pi1 = np.ones(V)

In [22]:
# Create function to compute word counts for A and pi for a single Markov model, i.e. single category
# Input list of integers and arrays to be populated

def compute_counts(text_as_int, A, pi): 
    # Loop through each line of integers
    for tokens in text_as_int: 
        # No index for previous word - this helps to know whether you are dealing with A or pi
        last_idx = None 
        
        for idx in tokens: 
            if last_idx is None: 
                # If previous word is None, it's the first word in a sentence and populate pi
                pi[idx] += 1 
            else: 
                # The previous word exists so count as transition from one word to the next and populate A
                A[last_idx, idx] += 1 
                
            # Update last idx to current idx so it has correct value for next iteration
            last_idx = idx


In [23]:
# Populate matrices with word count probability distributions
compute_counts([t for t, y in zip(train_text_int, y_train) if y == 0], A0, pi0)

In [24]:
compute_counts([t for t, y in zip(train_text_int, y_train) if y == 1], A1, pi1)

In [25]:
# Output objects are the populated A and pi matrices for both Poe and Frost models
A0

array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]])

In [26]:
pi1

array([ 1., 38.,  1., ...,  1.,  1.,  1.])

In [27]:
# Normalize A and pi so they go from counts to probability matrices
# keepdims=True ensures that the 2-dimensions are kept for the output (pi is 1D-array so no need)

A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [28]:
# Find log A and pi since we don't need the actual probability values

logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [29]:
# COMPUTE CLASS BALANCE STATISTICS

# How many samples belong to class 0 and 1 in training set?
count0 = sum(y == 0 for y in y_train)
count1 = sum(y == 1 for y in y_train)

total = len(y_train)

# Calculate proportions of each class in training set
p0 = count0 / total
p1 = count1 / total

# Calculate logs probabilities of each class in training set - these are the 'log priors' to use in classifier
logp0 = np.log(p0)
logp1 = np.log(p1)

In [30]:
# There is approx 34% Edgar vs 66% Frost - quite imbalanced dataset
p0, p1

(0.323238566131026, 0.676761433868974)

In [31]:
# It is not recommended to use Maximum Likelihood method in classifier

## Building the Markov Model

The `Classifier` object below contains the methods to compute the log likelihoods and predict categories for new unseen text (lines of integers) based on the A and pi objects populated by the training data, as well as input from the log probabilities for class representation calculated above.

In [32]:
class Classifier: 
    # The Constructor, which takes in the 'fitted' data and log priors and saves them as attributes of the object
    def __init__(self, logAs, logpis, logpriors): 
        self.logAs = logAs 
        self.logpis = logpis 
        self.logpriors = logpriors 
        self.K = len(logpriors) # Number of classes (2)
        
    # Takes in line of integer text and class     
    def _compute_log_likelihood(self, input_, class_):
        logA = self.logAs[class_] 
        logpi = self.logpis[class_] 
        
        last_idx = None 
        # logprob will hold the final answer
        logprob = 0 
        for idx in input_: 
            if last_idx is None: 
                # If previous word doesn't exist, it's the first token 
                logprob += logpi[idx]
            else: 
                logprob += logA[last_idx, idx] 
                
            # Update last_idx to current idx for next iteration 
            last_idx = idx 
            
        return logprob
    
    # Takes in list of integer lines
    def predict(self, inputs): 
        # Initialize variable to hold predicted classes
        predictions = np.zeros(len(inputs)) 
        # Loop through each line and compute log likelihood for each
        for i, input_ in enumerate(inputs): 
            posteriors = [self._compute_log_likelihood(input_, c) + self.logpriors[c] \
                          for c in range(self.K)]
            pred = np.argmax(posteriors)
            predictions[i] = pred 
            
        return predictions

In [33]:
# Input is list of log(A)s, list of log(pi)s and list of log priors
# Each array MUST BE IN ORDER of classes since the classes index the input lists (0, 1)

clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [34]:
# Test accuracy of classifier on training set, almost perfect as expected

p_train = clf.predict(train_text_int)

print(f"Train accuracy: {np.mean(p_train == y_train)}")

Train accuracy: 0.9913473423980222


In [35]:
# Test accuracy of classifier on test set ~ 78% (hmmm...but dataset is imbalanced)

p_test = clf.predict(test_text_int)

print(f"Test accuracy: {np.mean(p_test == y_test)}")

Test accuracy: 0.7888888888888889


In [36]:
from sklearn.metrics import confusion_matrix, f1_score

In [41]:
# 14 mis-labelled lines in training set

cm_train = confusion_matrix(y_train, p_train)

cm_train

array([[ 509,   14],
       [   0, 1095]], dtype=int64)

In [38]:
# 114 mis-labelled lines in test set

cm_test = confusion_matrix(y_test, p_test)

cm_test

array([[ 91, 108],
       [  6, 335]], dtype=int64)

In [39]:
# As expected, an almost perfect score
f1_score(y_train, p_train)

0.9936479128856625

In [40]:
# F1- score is quite good, so overall a sufficient text classifier - maybe there were a lot more unknown words

f1_score(y_test, p_test)

0.8545918367346939