In [None]:
import nltk
import numpy as np
import math
# Fetch the NLTK resources used below: the universal tagset mapping and the Brown corpus.
nltk.download('universal_tagset')
nltk.download('brown')

[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


True

In [None]:
# First 10,000 Brown sentences as lists of (word, universal-tag) pairs; all model
# statistics in the cells below are computed from this slice.
corpus = nltk.corpus.brown.tagged_sents(tagset='universal')[:10000]

In [None]:
def get_tag_list_and_mapping(corpus):
    """Collect the distinct tags of a tagged corpus and index them.

    Args:
        corpus: iterable of sentences, each an iterable of items whose
            element at position 1 is the tag.

    Returns:
        A pair ``(unique_tag_list, tag_mapping)``: the distinct tags in sorted
        order, and a dict mapping each tag to its position in that list.
    """
    seen_tags = set()
    for tagged_sentence in corpus:
        for token in tagged_sentence:
            seen_tags.add(token[1])

    ordered_tags = sorted(seen_tags)
    tag_to_index = dict(zip(ordered_tags, range(len(ordered_tags))))
    return ordered_tags, tag_to_index

# Build the tag inventory and the tag -> index lookup from the corpus.
unique_tag_list, tag_mapping = get_tag_list_and_mapping(corpus)

# Show the sorted tag list, then the mapping as the cell's displayed value.
print(unique_tag_list)
tag_mapping

['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


{'.': 0,
 'ADJ': 1,
 'ADP': 2,
 'ADV': 3,
 'CONJ': 4,
 'DET': 5,
 'NOUN': 6,
 'NUM': 7,
 'PRON': 8,
 'PRT': 9,
 'VERB': 10,
 'X': 11}

In [None]:
def get_word_list_and_mapping(corpus, oov_max_count=2, oov_token='oov'):
    """Build the vocabulary of a tagged corpus, folding rare words into one OOV entry.

    Words are lower-cased before counting. Every word seen more than
    ``oov_max_count`` times is kept, in order of first appearance; all other
    words are represented by the single ``oov_token`` entry.

    Args:
        corpus: iterable of sentences, each an iterable of items whose
            element at position 0 is the word (a string).
        oov_max_count: words occurring this many times or fewer are treated as
            out-of-vocabulary. The default of 2 matches the previous
            hard-coded threshold.
        oov_token: vocabulary entry standing in for all rare/unknown words.
            The default ``'oov'`` matches the previous hard-coded token.

    Returns:
        A pair ``(unique_word_list, word_mapping)``: the vocabulary list and a
        dict mapping each vocabulary entry to its position in that list.
    """
    word_counts = {}
    for sentence in corpus:
        for item in sentence:
            word = item[0].lower()
            word_counts[word] = word_counts.get(word, 0) + 1

    # dicts preserve insertion order, so frequent words stay in first-seen order.
    unique_word_list = [
        word for word, count in word_counts.items() if count > oov_max_count
    ]
    # Append the OOV entry unless a frequent real word already occupies that slot;
    # in that case the existing entry doubles as the OOV bucket (as before).
    if word_counts.get(oov_token, 0) <= oov_max_count:
        unique_word_list.append(oov_token)

    word_mapping = {word: idx for idx, word in enumerate(unique_word_list)}
    return unique_word_list, word_mapping

# Build the vocabulary (with rare words folded into 'oov') and the word -> index lookup.
unique_word_list, word_mapping = get_word_list_and_mapping(corpus)

# NOTE(review): this prints the entire vocabulary; consider printing a slice instead.
print(unique_word_list)



In [None]:
# Initial Distribution
def get_initial_state_distribution(corpus, unique_tag_list, smoothing=0.001):
    """Estimate the probability of each tag starting a sentence.

    Uses add-k smoothing, ``(count + k) / (N + k * T)``, where ``N`` is the
    number of non-empty sentences and ``T`` the number of tags, so that the
    probabilities sum to 1 and tags that never start a sentence still get a
    small non-zero probability (previously such tags raised ``KeyError``).

    Args:
        corpus: iterable of sentences, each a sequence of items whose element
            at position 1 is the tag. Empty sentences are skipped.
        unique_tag_list: tags in the order the returned vector should follow.
        smoothing: the additive constant ``k``.

    Returns:
        A pair ``(initial_distribution, pi)``: a dict mapping tag to its
        smoothed start probability, and a NumPy array holding those
        probabilities in ``unique_tag_list`` order.

    Raises:
        ValueError: if there are tags but nothing to normalise by (no
            non-empty sentences and ``smoothing`` equal to 0).
    """
    # Seed every known tag with zero so unseen start tags are still smoothed.
    start_counts = {tag: 0 for tag in unique_tag_list}
    n_sentences = 0
    for sentence in corpus:
        if len(sentence) == 0:
            continue  # an empty sentence has no first tag
        tag = sentence[0][1]
        start_counts[tag] = start_counts.get(tag, 0) + 1
        n_sentences += 1

    denominator = n_sentences + smoothing * len(start_counts)
    if start_counts and denominator == 0:
        raise ValueError("cannot estimate start probabilities: no sentences and smoothing is 0")

    initial_distribution = {
        tag: (count + smoothing) / denominator for tag, count in start_counts.items()
    }
    pi = np.array([initial_distribution[tag] for tag in unique_tag_list])
    return initial_distribution, pi

# Estimate the start-tag probabilities, both as a dict and as a vector in tag-index order.
initial_distribution, pi = get_initial_state_distribution(corpus, unique_tag_list)

# Show the vector, then the per-tag dict as the cell's displayed value.
print(pi)
initial_distribution

[0.06500009 0.0422001  0.12070009 0.07750009 0.05490009 0.24030008
 0.19650008 0.0173001  0.11340009 0.0307001  0.0410001  0.0005001 ]


{'.': 0.06500009349999064,
 'ADJ': 0.04220009577999042,
 'ADP': 0.1207000879299912,
 'ADV': 0.07750009224999077,
 'CONJ': 0.054900094509990546,
 'DET': 0.24030007596999242,
 'NOUN': 0.19650008034999195,
 'NUM': 0.017300098269990172,
 'PRON': 0.11340008865999113,
 'PRT': 0.030700096929990303,
 'VERB': 0.04100009589999041,
 'X': 0.000500099949990005}

**Transition Matrix**

In [None]:
# Transition Matrix
def create_transition_matrix(corpus, tag_mapping, smoothing=0.001):
    """Estimate tag-to-tag transition probabilities from a tagged corpus.

    Entry ``[i][j]`` is the add-k smoothed probability of tag ``j`` directly
    following tag ``i`` within a sentence:
    ``(count[i][j] + k) / (row_total[i] + k * T)`` with ``T`` the number of
    tags. Each row therefore sums to 1, and a tag with no observed outgoing
    transitions gets a uniform row (previously every entry of such a row
    came out as 1.0).

    Args:
        corpus: iterable of sentences, each a sequence of items whose element
            at position 1 is the tag. Sentences with fewer than two items
            contribute no transitions.
        tag_mapping: dict mapping tag to row/column index.
        smoothing: the additive constant ``k``; should be positive so that
            rows without any observed transition remain well defined.

    Returns:
        A NumPy array of shape ``(T, T)``.
    """
    n_tags = len(tag_mapping)
    counts = np.zeros((n_tags, n_tags), dtype=float)
    for sentence in corpus:
        tags = [item[1] for item in sentence]
        # Pair every tag with its successor; empty/one-word sentences yield nothing.
        for previous, current in zip(tags, tags[1:]):
            counts[tag_mapping[previous], tag_mapping[current]] += 1

    # Row totals are computed once per row rather than once per cell.
    row_totals = counts.sum(axis=1, keepdims=True)
    return (counts + smoothing) / (row_totals + smoothing * n_tags)

# Estimate the tag-to-tag transition probabilities and display them.
transition_matrix = create_transition_matrix(corpus, tag_mapping)
transition_matrix

array([[1.41649974e-01, 5.28306906e-02, 1.11393505e-01, 6.30540679e-02,
        9.38423890e-02, 1.20139515e-01, 1.78997803e-01, 2.13332336e-02,
        6.38813932e-02, 2.01513403e-02, 1.30776555e-01, 1.95018308e-03],
       [8.94240116e-02, 5.92502859e-02, 7.97318452e-02, 7.61968865e-03,
        3.32825947e-02, 6.03480609e-03, 6.74855247e-01, 1.28010355e-02,
        2.37738480e-03, 1.85309955e-02, 1.54831444e-02, 6.09631173e-04],
       [1.13778993e-02, 8.45791632e-02, 1.89263194e-02, 1.36240145e-02,
        1.28879147e-03, 4.43147528e-01, 2.87281858e-01, 3.90308919e-02,
        4.96723231e-02, 1.21879736e-02, 3.85153900e-02, 3.68252435e-04],
       [1.39919531e-01, 1.50413488e-01, 1.38329538e-01, 9.63537104e-02,
        1.36740499e-02, 8.06657748e-02, 4.01739410e-02, 1.54760425e-02,
        3.62519571e-02, 2.93619854e-02, 2.59169042e-01, 2.12105129e-04],
       [2.29678116e-02, 1.17558308e-01, 6.76943083e-02, 8.80932173e-02,
        4.53460191e-04, 1.57147302e-01, 2.89815762e-01, 1.94

**Observation Matrix**

In [None]:
# Observation Matrix
def create_observation_matrix(corpus, tag_mapping, word_mapping, smoothing=0.001):
    """Estimate per-tag word emission probabilities from a tagged corpus.

    Entry ``[t][w]`` is the add-k smoothed probability of tag ``t`` emitting
    vocabulary entry ``w``: ``(count[t][w] + k) / (row_total[t] + k * V)``
    with ``V`` the vocabulary size, so each row sums to 1. Words are
    lower-cased, and any word missing from ``word_mapping`` is counted under
    the ``'oov'`` entry.

    Args:
        corpus: iterable of sentences, each an iterable of (word, tag)-like
            items (word at position 0, tag at position 1).
        tag_mapping: dict mapping tag to row index.
        word_mapping: dict mapping lower-cased word to column index; must
            contain ``'oov'`` if the corpus has words outside the vocabulary.
        smoothing: the additive constant ``k``; should be positive so that
            tags without any observed word remain well defined.

    Returns:
        A NumPy array of shape ``(len(tag_mapping), len(word_mapping))``.
    """
    n_tags = len(tag_mapping)
    n_words = len(word_mapping)
    counts = np.zeros((n_tags, n_words), dtype=float)
    for sentence in corpus:
        for word_tag in sentence:
            tag = tag_mapping[word_tag[1]]
            token = word_tag[0].lower()
            # Look up 'oov' only when needed, so a fully covered corpus does not require it.
            word = word_mapping[token] if token in word_mapping else word_mapping['oov']
            counts[tag, word] += 1

    # Row totals are computed once per row; the per-cell sum was quadratic in vocabulary size.
    row_totals = counts.sum(axis=1, keepdims=True)
    return (counts + smoothing) / (row_totals + smoothing * n_words)

# Estimate the per-tag word emission probabilities.
observation_matrix = create_observation_matrix(corpus, tag_mapping, word_mapping)
# Print the row labels (tags) and column labels (vocabulary), then display the matrix.
# NOTE(review): the second print emits the entire vocabulary; consider printing a slice.
print(unique_tag_list)
print(unique_word_list)
observation_matrix

['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']


array([[3.78229117e-08, 3.78229117e-08, 3.78229117e-08, ...,
        3.78229117e-08, 3.78229117e-08, 3.78229117e-08],
       [6.08827969e-08, 6.08827969e-08, 6.08827969e-08, ...,
        6.08827969e-08, 6.08827969e-08, 1.49162913e-01],
       [3.68202056e-08, 3.68202056e-08, 3.68202056e-08, ...,
        3.68202056e-08, 3.68202056e-08, 6.99620726e-04],
       ...,
       [1.97199724e-07, 1.97199724e-07, 1.97199724e-07, ...,
        1.97199724e-07, 1.97199724e-07, 6.90218756e-03],
       [3.08327928e-08, 3.08327928e-08, 3.08327928e-08, ...,
        3.08327928e-08, 3.08327928e-08, 9.93432893e-02],
       [3.93699237e-06, 3.93699237e-06, 3.93699237e-06, ...,
        3.93699237e-06, 3.93699237e-06, 4.80317007e-01]])

**Viterbi Algorithm**

In [None]:
# Viterbi Algorithm
def viterbi(obs, pi, A, B):
    """Return the most probable hidden-state sequence for an observation sequence.

    Scores are kept in log space throughout, so long sequences do not
    underflow, and the backtrace follows the backpointer of the state chosen
    at each step (the previous version always read the backpointers of
    state 0 and returned two states for a single observation).

    Args:
        obs: sequence of observation indices (columns of ``B``).
        pi: initial state probabilities, one per state.
        A: transition probabilities; ``A[k][j]`` is the probability of moving
            from state ``k`` to state ``j``.
        B: emission probabilities; ``B[j][o]`` is the probability of state
            ``j`` emitting observation ``o``.

    Returns:
        A list of state indices (Python ints), one per observation; an empty
        list when ``obs`` is empty. Ties are resolved in favour of the
        lowest state index.
    """
    n_obs = len(obs)
    if n_obs == 0:
        return []

    obs_idx = np.asarray(obs, dtype=int)
    # Zero probabilities legitimately map to -inf; silence the divide warning.
    with np.errstate(divide='ignore'):
        log_pi = np.log(np.asarray(pi, dtype=float))
        log_A = np.log(np.asarray(A, dtype=float))
        log_B = np.log(np.asarray(B, dtype=float))

    n_states = log_pi.shape[0]
    # score[t, j]: best log-probability of any path ending in state j at step t.
    score = np.empty((n_obs, n_states))
    # backptr[t, j]: predecessor of state j on that best path (unused for t == 0).
    backptr = np.zeros((n_obs, n_states), dtype=int)

    score[0] = log_pi + log_B[:, obs_idx[0]]
    for t in range(1, n_obs):
        # candidates[k, j]: score of reaching state j at step t via state k.
        candidates = score[t - 1][:, np.newaxis] + log_A
        backptr[t] = candidates.argmax(axis=0)
        score[t] = candidates.max(axis=0) + log_B[:, obs_idx[t]]

    # Start from the best final state and walk the backpointers to the front.
    states = [int(score[-1].argmax())]
    for t in range(n_obs - 1, 0, -1):
        states.append(int(backptr[t, states[-1]]))
    states.reverse()
    return states

In [None]:
# Three Brown sentences from outside the first 10,000 used to build the model.
test_corpus = nltk.corpus.brown.tagged_sents(tagset='universal')[10150:10153]

for sentence in test_corpus:
    # Encode each lower-cased word as its vocabulary index, falling back to the 'oov' index.
    obs = [word_mapping.get(word_tag[0].lower(), word_mapping.get('oov')) for word_tag in sentence]
    states = viterbi(obs, pi, transition_matrix, observation_matrix)
    # Translate the decoded state indices back into tag names.
    predicted_tags = [unique_tag_list[state] for state in states]
    original_tags = [word_tag[1] for word_tag in sentence]
    original_sentence = [word_tag[0] for word_tag in sentence]
    # Print the words, the corpus tags, and the decoded tags for side-by-side comparison.
    print("Sentence: ", original_sentence)
    print("Original Tags: ",original_tags)
    print("Predicted Tags: ",predicted_tags)

Sentence:  ['Those', 'coming', 'from', 'other', 'denominations', 'will', 'welcome', 'the', 'opportunity', 'to', 'become', 'informed', '.']
Original Tags:  ['DET', 'VERB', 'ADP', 'ADJ', 'NOUN', 'VERB', 'VERB', 'DET', 'NOUN', 'PRT', 'VERB', 'VERB', '.']
Predicted Tags:  ['DET', 'NOUN', 'ADP', 'ADJ', 'NOUN', 'VERB', 'VERB', 'DET', 'NOUN', 'PRT', 'VERB', 'VERB', '.']
Sentence:  ['The', 'preparatory', 'class', 'is', 'an', 'introductory', 'face-to-face', 'group', 'in', 'which', 'new', 'members', 'become', 'acquainted', 'with', 'one', 'another', '.']
Original Tags:  ['DET', 'ADJ', 'NOUN', 'VERB', 'DET', 'ADJ', 'ADJ', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'VERB', 'ADP', 'NUM', 'DET', '.']
Predicted Tags:  ['DET', 'NOUN', 'NOUN', 'VERB', 'DET', 'NOUN', 'NOUN', 'NOUN', 'ADP', 'DET', 'ADJ', 'NOUN', 'VERB', 'NOUN', 'ADP', 'NUM', 'DET', '.']
Sentence:  ['It', 'provides', 'a', 'natural', 'transition', 'into', 'the', 'life', 'of', 'the', 'local', 'church', 'and', 'its', 'organizations', '.']
O