<a href="https://colab.research.google.com/github/ShivinM-17/nlp-practices/blob/main/Markov_model_text_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
!wget https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt

--2023-09-10 17:20:50--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/edgar_allan_poe.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26622 (26K) [text/plain]
Saving to: ‘edgar_allan_poe.txt’


2023-09-10 17:20:50 (121 MB/s) - ‘edgar_allan_poe.txt’ saved [26622/26622]

--2023-09-10 17:20:50--  https://raw.githubusercontent.com/lazyprogrammer/machine_learning_examples/master/hmm_class/robert_frost.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 56286 (55K) [text/plain]
Saving t

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.model_selection import train_test_split

### Loading the poems and checking the data

In [None]:
input_files = [
    '/content/edgar_allan_poe.txt',
    '/content/robert_frost.txt'
]

In [None]:
!head /content/edgar_allan_poe.txt

LO! Death hath rear'd himself a throne
In a strange city, all alone,
Far down within the dim west
Where the good, and the bad, and the worst, and the best,
Have gone to their eternal rest.
 
There shrines, and palaces, and towers
Are not like any thing of ours
Oh no! O no! ours never loom
To heaven with that ungodly gloom!


In [None]:
!head /content/robert_frost.txt

Two roads diverged in a yellow wood,
And sorry I could not travel both
And be one traveler, long I stood
And looked down one as far as I could
To where it bent in the undergrowth; 

Then took the other, as just as fair,
And having perhaps the better claim
Because it was grassy and wanted wear,
Though as for that the passing there


In [None]:
# Collect the data into lists

input_texts = []
labels = []

for label, f in enumerate(input_files):
  print(f"{f} corresponds to label {label}")

  for line in open(f):
    line = line.rstrip().lower() # this helps to remove the empty line which are present
    # then we lowercase the whole sentence

    if line:
      # remove punctuation
      line = line.translate(str.maketrans('','',string.punctuation))

    input_texts.append(line)
    labels.append(label)

/content/edgar_allan_poe.txt corresponds to label 0
/content/robert_frost.txt corresponds to label 1


In [None]:
input_texts, labels

(['lo death hath reard himself a throne',
  'in a strange city all alone',
  'far down within the dim west',
  'where the good and the bad and the worst and the best',
  'have gone to their eternal rest',
  '',
  'there shrines and palaces and towers',
  'are not like any thing of ours',
  'oh no o no ours never loom',
  'to heaven with that ungodly gloom',
  'timeeaten towers that tremble not',
  'resemble nothing that is ours',
  'around by lifting winds forgot',
  'resignedly beneath the sky',
  'the melancholy waters lie',
  '',
  'no holy rays from heaven come down',
  'on the long nighttime of that town',
  'but light from out the lurid sea',
  'streams up the turrets silently',
  'up thrones up longforgotten bowers',
  'of sculturd ivy and stone flowers',
  'up domes up spires up kingly halls',
  'up fanes up babylonlike walls',
  'up many a melancholy shrine',
  'whose entablatures intertwine',
  'the mask the viol and the vine',
  '',
  'there open temples open graves',
  'are

Here, we can see that labels only consists of (0,1) since only two authors are present to be classified.

Therefore, this is a case of binary classification

In [None]:
# train-test-split the data
train_text, test_text, y_train, y_test = train_test_split(input_texts,
                                                          labels,
                                                          random_state=42)

In [None]:
len(y_train), len(y_test)

(1783, 595)

In [None]:
len(train_text), len(test_text)

(1783, 595)

In [None]:
train_text[:5]

['',
 'youll be expecting john i pity estelle',
 'her pallor i strangely mistrust ',
 'you ought to have the kitchen to yourself',
 'i name all the flowers i am sure they werent']

In [None]:
y_train[:5]

[1, 1, 0, 1, 1]

### Converting text into integers

In [None]:
idx = 1
# making a '<unk>' token, for the cases where vocabulary is unknown
word2idx = {'<unk>':0}

In [None]:
# populate word2idx
for text in train_text:
  tokens = text.split()
  for token in tokens:
    if token not in word2idx:
      word2idx[token]=idx
      idx+=1

In [None]:
word2idx

{'<unk>': 0,
 'youll': 1,
 'be': 2,
 'expecting': 3,
 'john': 4,
 'i': 5,
 'pity': 6,
 'estelle': 7,
 'her': 8,
 'pallor': 9,
 'strangely': 10,
 'mistrust': 11,
 'you': 12,
 'ought': 13,
 'to': 14,
 'have': 15,
 'the': 16,
 'kitchen': 17,
 'yourself': 18,
 'name': 19,
 'all': 20,
 'flowers': 21,
 'am': 22,
 'sure': 23,
 'they': 24,
 'werent': 25,
 'this': 26,
 'is': 27,
 'a': 28,
 'good': 29,
 'home': 30,
 'dont': 31,
 'ask': 32,
 'for': 33,
 'better': 34,
 'and': 35,
 'queenly': 36,
 'lily': 37,
 'adown': 38,
 'dale': 39,
 'what': 40,
 'do': 41,
 'we': 42,
 'see': 43,
 'in': 44,
 'such': 45,
 'hole': 46,
 'wonder': 47,
 'ill': 48,
 'find': 49,
 'that': 50,
 'fountain': 51,
 'if': 52,
 'it': 53,
 'takes': 54,
 'summer': 55,
 'poetess': 56,
 'who': 57,
 'wrote': 58,
 'book': 59,
 'of': 60,
 'verses': 61,
 'noted': 62,
 'not': 63,
 'dim': 64,
 'lake': 65,
 'auber': 66,
 'among': 67,
 'raspberries': 68,
 'hew': 69,
 'shape': 70,
 'he': 71,
 'said': 72,
 'had': 73,
 'besides': 74,
 'becaus

In [None]:
len(word2idx)

2510

We have 2510 unique words present in the training set

### Convert the data into integer format

In [None]:
train_text_int = []
test_text_int = []

for text in train_text:
  tokens = text.split()
  line_as_int = [word2idx[token] for token in tokens]
  train_text_int.append(line_as_int)

for text in test_text:
  tokens = text.split()

  # Setting 0 as default value for a unknown token if it is present
  line_as_int = [word2idx.get(token, 0) for token in tokens]
  test_text_int.append(line_as_int)

In [None]:
train_text_int[90:100]

[[48, 77, 12, 40, 12, 236, 91, 12, 79],
 [328, 12, 41, 105, 329, 330, 12, 191, 16, 331],
 [154, 103, 332, 44, 333, 334, 335, 336, 12],
 [],
 [31, 16, 337, 338, 14, 339],
 [16, 340, 341, 35, 16, 342, 343],
 [57, 122, 24, 33],
 [191, 53, 222, 44, 222, 269, 71, 344, 85, 315],
 [14, 16, 345, 60, 346],
 [35, 347, 44, 348, 349, 173]]

In [None]:
# Initialise A and pi matrices - for both classes
V = len(word2idx)

# These will represent the 2 markov models
A0 = np.ones((V,V))
pi0 = np.ones(V)

A1 = np.ones((V,V))
pi1 = np.ones(V)

In [None]:
# Compute counts for A and pi
# Here, data is passed for a single class at a time
def compute_counts(text_as_int, A, pi):
  for tokens in text_as_int:
    last_idx = None
    for idx in tokens:
      if last_idx is None:
        # If it is None, then this is the first word of the sentence
        pi[idx] += 1
      else:
        # if last word exist, then we count a transition
        A[last_idx, idx] += 1

      # update last idx
      last_idx = idx

In [None]:
compute_counts([t for t,y in zip(train_text_int, y_train) if y==0], A0, pi0)
compute_counts([t for t,y in zip(train_text_int, y_train) if y==1], A1, pi1)

In [None]:
# Normalize A and pi so they are valid probability matrices
A0 /= A0.sum(axis=1, keepdims=True)
pi0 /= pi0.sum()

A1 /= A1.sum(axis=1, keepdims=True)
pi1 /= pi1.sum()

In [None]:
# Taking log of A and pi
logA0 = np.log(A0)
logpi0 = np.log(pi0)

logA1 = np.log(A1)
logpi1 = np.log(pi1)

In [None]:
# compute priors
count0 = sum(y==0 for y in y_train)
count1 = sum(y==1 for y in y_train)

total = len(y_train)

p0 = count0/total
p1 = count1/total

logp0 = np.log(p0)
logp1 = np.log(p1)

In [None]:
p0, p1

(0.3303421200224341, 0.6696578799775659)

In [None]:
logp0, logp1

(-1.1076264342115536, -0.400988323910593)

### Building a classifier

In [None]:
# build a classifier
class Classifier:
  def __init__(self, logAs, logpis, logpriors):
    self.logAs = logAs
    self.logpis = logpis
    self.logpriors = logpriors
    self.K = len(logpriors)

  def _compute_log_likelihood(self, input_, class_):
    # class will tell us which markov model to use
    logA = self.logAs[class_]
    logpi = self.logpis[class_]

    last_idx = None
    logprob = 0
    for idx in input_:
      if last_idx is None:
        # It is the first token, beginning of the sentence
        logprob += logpi[idx]
      else:
        logprob += logA[last_idx, idx]

      # update last_idx
      last_idx = idx
    return logprob

  def predict(self, inputs):
    predictions = np.zeros(len(inputs))
    for i, input_ in enumerate(inputs):
      posteriors = [self._compute_log_likelihood(input_, c)+self.logpriors[c]
                    for c in range(self.K)]
      pred = np.argmax(posteriors)
      predictions[i]=pred
    return predictions


In [None]:
# each array must be in order since classes are assumed to index these lists
clf = Classifier([logA0, logA1], [logpi0, logpi1], [logp0, logp1])

In [None]:
p_train = clf.predict(train_text_int)
print(f"Train acc: {np.mean(p_train==y_train)}")

Train acc: 0.96578799775659


In [None]:
p_test = clf.predict(test_text_int)
print(f"Test acc: {np.mean(p_test==y_test)}")

Test acc: 0.7983193277310925


In [None]:
# Getting a confusion matrix, since the data is imbalanced in nature
from sklearn.metrics import confusion_matrix, f1_score

In [None]:
cm = confusion_matrix(y_train, p_train)

In [None]:
cm

array([[ 528,   61],
       [   0, 1194]])

In [None]:
cm_test = confusion_matrix(y_test, p_test)
cm_test

array([[ 92, 116],
       [  4, 383]])

In [None]:
f1_score(y_train, p_train)

0.9750918742343814

In [None]:
f1_score(y_test, p_test)

0.8645598194130927