In [1]:
import nltk 
import random 
import numpy as np 
from nltk.corpus import brown 
from sklearn.metrics import classification_report

In [2]:
nltk.download('brown') 
nltk.download('universal_tagset')

tagged_sentences = brown.tagged_sents(tagset="universal")

sentences = list(tagged_sentences) 
random.seed(42)

num_folds = 5 
fold_size = len(sentences) // num_folds 
folds = []

for i in range(num_folds):
    start = i * fold_size
    
    if i == num_folds - 1:
        fold = sentences[start:]
    else: fold = sentences[start:start + fold_size]
    folds.append(fold)

unique_tags = sorted(set(tag for sent in sentences for _, tag in sent)) 
num_tags = len(unique_tags)

tag_to_index = {tag: i for i, tag in enumerate(unique_tags)} 
index_to_tag = {i: tag for tag, i in tag_to_index.items()}

[nltk_data] Downloading package brown to /home/cs240lab/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/cs240lab/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [3]:
def train_hmm(train_sentences):
    transition_counts = np.ones((num_tags, num_tags)) 
    emission_counts = {}
    
    for sentence in train_sentences:
        prev_tag = None
        for word, tag in sentence:
            tag_idx = tag_to_index[tag]
            if word not in emission_counts:
                emission_counts[word] = np.ones(num_tags)
            emission_counts[word][tag_idx] += 1
            if prev_tag is not None:
                prev_idx = tag_to_index[prev_tag]
                transition_counts[prev_idx][tag_idx] += 1
            prev_tag = tag

    transition_probs = np.log(transition_counts / transition_counts.sum(axis=1, keepdims=True))
    emission_probs = {word: np.log(counts / counts.sum()) for word, counts in emission_counts.items()}

    return transition_probs, emission_probs

In [4]:
def viterbi(sentence, transition_probs, emission_probs): 
    V = [{}] 
    for i, tag in enumerate(unique_tags): 
        V[0][tag] = {"prob": emission_probs.get(sentence[0], np.full(num_tags, -np.inf))[i] + np.log(1/num_tags), "prev": None}

    for t in range(1, len(sentence)):
        V.append({})
        for i, tag in enumerate(unique_tags):
            max_tr_prob = max(V[t-1][prev_tag]["prob"] + transition_probs[tag_to_index[prev_tag], i] for prev_tag in unique_tags)
            for prev_tag in unique_tags:
                if V[t-1][prev_tag]["prob"] + transition_probs[tag_to_index[prev_tag], i] == max_tr_prob:
                    max_prob = max_tr_prob + emission_probs.get(sentence[t], np.full(num_tags, -np.inf))[i]
                    V[t][tag] = {"prob": max_prob, "prev": prev_tag}
                    break

    opt = []
    max_prob = max(value["prob"] for value in V[-1].values())
    previous = None

    for tag, data in V[-1].items():
        if data["prob"] == max_prob:
            opt.append(tag)
            previous = tag
            break

    for t in range(len(V) - 2, -1, -1):
        opt.insert(0, V[t + 1][previous]["prev"])
        previous = V[t + 1][previous]["prev"]

    return opt

actual_tags, predicted_tags = [], [] 
for i in range(num_folds): 
    print("\nFold:", i+1) 
    test_set = folds[i] 
    train_set = [sent for j in range(num_folds) if j != i for sent in folds[j]]

    trans_probs, emiss_probs = train_hmm(train_set)

    for sentence in test_set:
        words = [word for word, _ in sentence]
        actual = [tag for _, tag in sentence]
        predicted = viterbi(words, trans_probs, emiss_probs)
        actual_tags.extend(actual)
        predicted_tags.extend(predicted)

    print("\nClassification Report:") 
    print(classification_report(actual_tags, predicted_tags, labels=unique_tags))



Fold: 1

Classification Report:
              precision    recall  f1-score   support

           .       0.24      1.00      0.38     30115
         ADJ       0.91      0.48      0.63     18941
         ADP       0.91      0.56      0.69     31048
         ADV       0.92      0.54      0.68     10971
        CONJ       0.99      0.53      0.69      7662
         DET       0.95      0.62      0.75     29455
        NOUN       0.93      0.46      0.62     66790
         NUM       0.99      0.33      0.50      3974
        PRON       0.95      0.65      0.78      8204
         PRT       0.93      0.50      0.65      5791
        VERB       0.97      0.56      0.71     37128
           X       0.75      0.01      0.02       281

    accuracy                           0.58    250360
   macro avg       0.87      0.52      0.59    250360
weighted avg       0.85      0.58      0.64    250360


Fold: 2

Classification Report:
              precision    recall  f1-score   support

           .

In [7]:
def pos_tag_sentence(sentence): 
    trans_probs, emiss_probs = train_hmm(sentences) 
    return viterbi(sentence.split(), trans_probs, emiss_probs)

example_sentence = "I have been using this for more than ten years 10 guy" 
m=example_sentence.lower()
print(m)

print("\n pos Tags for input sentence:", pos_tag_sentence(m))




i have been using this for more than ten years 10 guy

 pos Tags for input sentence: ['PRON', 'VERB', 'VERB', 'VERB', 'DET', 'ADP', 'ADJ', 'ADP', 'NUM', 'NOUN', 'NUM', 'NOUN']
