In [None]:
import nltk
from nltk.corpus import brown
from collections import defaultdict
# Fetch the corpus and the tag mapping used below (no-op if already present).
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Step 1: Load the Brown corpus with universal POS tags.
# The first 40000 sentences are used to estimate the model.
tagged_sentences = brown.tagged_sents(tagset='universal')
train_data = tagged_sentences[:40000]
test_sentence = ["Dog", "is", "about", "to", "jump", "on", "a", "cat"]

# Step 2: Count tag->tag transitions and tag->word emissions.
# The tables hold raw counts at this point; they are turned into
# probabilities by the normalization pass further down.
transition_prob = defaultdict(lambda: defaultdict(int))
emission_prob = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)
for sentence in train_data:
    sentence_tags = [tag for _, tag in sentence]
    # Pair every tag with its predecessor; "<s>" and "</s>" pad the ends.
    for prev_tag, tag in zip(["<s>", *sentence_tags], [*sentence_tags, "</s>"]):
        transition_prob[prev_tag][tag] += 1
    # Words are lower-cased so that case variants share one emission entry.
    for word, tag in sentence:
        emission_prob[tag][word.lower()] += 1
        tag_counts[tag] += 1

# Normalize: divide every row of counts by its sum so each row sums to 1.
for table in (transition_prob, emission_prob):
    for row in table.values():
        row_total = sum(row.values())
        for outcome in row:
            row[outcome] /= row_total

# Step 3: Implement Viterbi Algorithm
def viterbi(sentence, tags, transitions=None, emissions=None, unseen_prob=1e-6):
    """Return the most probable tag sequence for ``sentence`` under a bigram HMM.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so long sentences do not underflow to 0.0 (which would
    make every tag tie and the result arbitrary).

    Args:
        sentence: Sequence of word strings. Each word is lower-cased before
            it is looked up in ``emissions``.
        tags: Sequence of candidate tags. When several tags score equally,
            the one that comes first in ``tags`` wins.
        transitions: Nested mapping ``transitions[prev_tag][tag]`` giving the
            transition probability, with ``"<s>"`` as the start-of-sentence
            state. Missing entries count as probability 0. Defaults to the
            module-level ``transition_prob``.
        emissions: Nested mapping ``emissions[tag][word]`` giving the emission
            probability. Defaults to the module-level ``emission_prob``.
        unseen_prob: Emission probability used for a word that has no entry
            under a tag.

    Returns:
        A list with one tag per word of ``sentence``; an empty list when
        ``sentence`` is empty.

    Raises:
        ValueError: If ``tags`` is empty while ``sentence`` is not.
    """
    import math  # local import: keeps this block self-contained

    if transitions is None:
        transitions = transition_prob
    if emissions is None:
        emissions = emission_prob
    if not sentence:
        return []

    neg_inf = float("-inf")

    def log_prob(p):
        # log(0) is undefined; -inf keeps "impossible" ordered below everything.
        return math.log(p) if p > 0 else neg_inf

    def log_transition(prev_tag, tag):
        # .get() instead of [] so lookups never insert keys into a defaultdict.
        return log_prob(transitions.get(prev_tag, {}).get(tag, 0.0))

    def log_emission(tag, word):
        return log_prob(emissions.get(tag, {}).get(word.lower(), unseen_prob))

    # Initialization: score of starting in each tag and emitting the first word.
    scores = [
        {tag: log_transition("<s>", tag) + log_emission(tag, sentence[0]) for tag in tags}
    ]
    backpointers = [{tag: "<s>" for tag in tags}]

    # Recursion: extend the best path into each tag, one word at a time.
    for t in range(1, len(sentence)):
        previous = scores[-1]
        current = {}
        pointers = {}
        for tag in tags:
            best_prev = None
            best_score = None
            for prev_tag in tags:
                score = previous[prev_tag] + log_transition(prev_tag, tag)
                # Strict '>' keeps the first of equally good predecessors.
                if best_score is None or score > best_score:
                    best_prev, best_score = prev_tag, score
            # The emission term does not depend on the predecessor, so it is
            # added once after the best predecessor is known.
            current[tag] = best_score + log_emission(tag, sentence[t])
            pointers[tag] = best_prev
        scores.append(current)
        backpointers.append(pointers)

    # Termination: pick the best-scoring final tag (first one on ties).
    # The "</s>" transition is not applied here.
    final_scores = scores[-1]
    best_last_tag = max(tags, key=lambda tag: final_scores[tag])

    # Trace back the best path from the last word to the first.
    best_path = [best_last_tag]
    for t in range(len(sentence) - 1, 0, -1):
        best_path.append(backpointers[t][best_path[-1]])
    best_path.reverse()
    return best_path

# Decode the demo sentence using every tag observed during training,
# then show the (word, tag) pairs as a single list.
tags = [*tag_counts]
pos_tags = viterbi(test_sentence, tags)
word_tag_pairs = list(zip(test_sentence, pos_tags))
print(word_tag_pairs)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


[('Dog', 'NOUN'), ('is', 'VERB'), ('about', 'ADV'), ('to', 'PRT'), ('jump', 'VERB'), ('on', 'ADP'), ('a', 'DET'), ('cat', 'NOUN')]


In [7]:
import nltk
from nltk.corpus import brown
from collections import defaultdict

# Download required corpora (no-op if already present)
for resource in ('brown', 'universal_tagset'):
    nltk.download(resource)

# Step 1: Load the Brown corpus with universal POS tags into a list.
# The first 40000 sentences are used to estimate the model.
tagged_sentences = list(brown.tagged_sents(tagset='universal'))
train_data = tagged_sentences[:40000]
test_sentence = ["Dog", "is", "about", "to", "jump", "on", "a", "cat"]

# Step 2: Count tag->tag transitions and tag->word emissions.
# The tables hold raw counts at this point; they are turned into
# probabilities by the normalization pass further down.
transition_prob = defaultdict(lambda: defaultdict(int))
emission_prob = defaultdict(lambda: defaultdict(int))
tag_counts = defaultdict(int)

for sentence in train_data:
    sentence_tags = [tag for _, tag in sentence]
    # Pair every tag with its predecessor; "<s>" and "</s>" pad the ends.
    for prev_tag, tag in zip(["<s>", *sentence_tags], [*sentence_tags, "</s>"]):
        transition_prob[prev_tag][tag] += 1
    # Words are lower-cased so that case variants share one emission entry.
    for word, tag in sentence:
        emission_prob[tag][word.lower()] += 1
        tag_counts[tag] += 1

# Normalize: divide every row of counts by its sum so each row sums to 1.
for table in (transition_prob, emission_prob):
    for row in table.values():
        row_total = sum(row.values())
        for outcome in row:
            row[outcome] /= row_total

# Step 3: Viterbi Algorithm
def viterbi(sentence, tags, transitions=None, emissions=None, unseen_prob=1e-6):
    """Return the most probable tag sequence for ``sentence`` under a bigram HMM.

    Scores are accumulated as sums of log-probabilities rather than products
    of probabilities, so long sentences do not underflow to 0.0 (which would
    make every tag tie and the result arbitrary).

    Args:
        sentence: Sequence of word strings. Each word is lower-cased before
            it is looked up in ``emissions``.
        tags: Sequence of candidate tags. When several tags score equally,
            the one that comes first in ``tags`` wins.
        transitions: Nested mapping ``transitions[prev_tag][tag]`` giving the
            transition probability, with ``"<s>"`` as the start-of-sentence
            state. Missing entries count as probability 0. Defaults to the
            module-level ``transition_prob``.
        emissions: Nested mapping ``emissions[tag][word]`` giving the emission
            probability. Defaults to the module-level ``emission_prob``.
        unseen_prob: Emission probability used for a word that has no entry
            under a tag.

    Returns:
        A list with one tag per word of ``sentence``; an empty list when
        ``sentence`` is empty.

    Raises:
        ValueError: If ``tags`` is empty while ``sentence`` is not.
    """
    import math  # local import: keeps this block self-contained

    if transitions is None:
        transitions = transition_prob
    if emissions is None:
        emissions = emission_prob
    if not sentence:
        return []

    neg_inf = float("-inf")

    def log_prob(p):
        # log(0) is undefined; -inf keeps "impossible" ordered below everything.
        return math.log(p) if p > 0 else neg_inf

    def log_transition(prev_tag, tag):
        # .get() instead of [] so lookups never insert keys into a defaultdict.
        return log_prob(transitions.get(prev_tag, {}).get(tag, 0.0))

    def log_emission(tag, word):
        return log_prob(emissions.get(tag, {}).get(word.lower(), unseen_prob))

    # Initialization: score of starting in each tag and emitting the first word.
    scores = [
        {tag: log_transition("<s>", tag) + log_emission(tag, sentence[0]) for tag in tags}
    ]
    backpointers = [{tag: "<s>" for tag in tags}]

    # Recursion: extend the best path into each tag, one word at a time.
    for t in range(1, len(sentence)):
        previous = scores[-1]
        current = {}
        pointers = {}
        for tag in tags:
            best_prev = None
            best_score = None
            for prev_tag in tags:
                score = previous[prev_tag] + log_transition(prev_tag, tag)
                # Strict '>' keeps the first of equally good predecessors.
                if best_score is None or score > best_score:
                    best_prev, best_score = prev_tag, score
            # The emission term does not depend on the predecessor, so it is
            # added once after the best predecessor is known.
            current[tag] = best_score + log_emission(tag, sentence[t])
            pointers[tag] = best_prev
        scores.append(current)
        backpointers.append(pointers)

    # Termination: pick the best-scoring final tag (first one on ties).
    # The "</s>" transition is not applied here.
    final_scores = scores[-1]
    best_last_tag = max(tags, key=lambda tag: final_scores[tag])

    # Backtrace from the last word to the first.
    best_path = [best_last_tag]
    for t in range(len(sentence) - 1, 0, -1):
        best_path.append(backpointers[t][best_path[-1]])
    best_path.reverse()
    return best_path

# Decode the demo sentence using every tag observed during training.
tags = [*tag_counts]
pos_tags = viterbi(test_sentence, tags)

# Report one "word: tag" line per token under a heading.
print("\nPOS tagging using Viterbi Algorithm:")
word_tag_pairs = zip(test_sentence, pos_tags)
for word, tag in word_tag_pairs:
    print(f"{word}: {tag}")


[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\ishan\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!



POS tagging using Viterbi Algorithm:
Dog: NOUN
is: VERB
about: ADV
to: PRT
jump: VERB
on: ADP
a: DET
cat: NOUN
