In [42]:

import nltk
import random
import numpy as np
from nltk.corpus import brown

# Fetch the corpus and the tag mapping used below (nltk reports when a
# package is already up to date, as seen in this cell's output).
for package in ('brown', 'universal_tagset'):
    nltk.download(package)

# Keep the corpus view under its own name and materialise a plain list from
# it, so that len() and slicing in later cells operate on an ordinary list.
tagged_sentences = brown.tagged_sents(tagset="universal")
sentences = [sentence for sentence in tagged_sentences]


[nltk_data] Downloading package brown to /home/cs240lab/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/cs240lab/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [45]:
random.seed(42)
# NOTE(review): the shuffle below is disabled, so the seed currently has no
# effect in this cell and the folds follow the corpus order - confirm intended.
#random.shuffle(sentences)

num_folds = 5
fold_size = len(sentences) // num_folds

# Contiguous, non-overlapping slices of `sentences`; the last fold also takes
# whatever is left over when the length is not a multiple of num_folds.
folds = [
    sentences[k * fold_size:]
    if k == num_folds - 1
    else sentences[k * fold_size:(k + 1) * fold_size]
    for k in range(num_folds)
]

# Tag inventory over the whole corpus, sorted so the tag -> index mapping is
# the same on every run.
unique_tags = sorted({tag for sent in sentences for _, tag in sent})
num_tags = len(unique_tags)
print(num_tags)
print(unique_tags)
tag_to_index = dict(zip(unique_tags, range(num_tags)))
print(tag_to_index)

# train HMM from training data
def train_hmm(train_sentences, tag_index=None):
    """Estimate add-one-smoothed transition and per-word tag tables.

    Args:
        train_sentences: iterable of sentences, each an iterable of
            (word, tag) pairs.
        tag_index: optional mapping from tag to a dense index in
            0..len(tag_index)-1. When omitted, the notebook-level
            `tag_to_index` is used, which keeps existing calls working while
            letting the function run without that global.

    Returns:
        A pair (transition_probs, emission_probs):

        * transition_probs[i][j] is
          (count(tag i followed by tag j) + 1) / (sum of row i + number of tags).
          Only adjacent tags inside the same sentence are counted; nothing is
          counted for the first tag of a sentence.
        * emission_probs[word][t] is
          (count(word, tag t) + 1) / (count(word) + number of tags),
          with words in order of first appearance.

    Raises:
        KeyError: if a sentence contains a tag missing from the tag index.
    """
    if tag_index is None:
        tag_index = tag_to_index
    n_tags = len(tag_index)

    transition_counts = [[0] * n_tags for _ in range(n_tags)]
    emission_counts = {}

    for sentence in train_sentences:
        # Reset at every sentence start so no transition is counted across
        # a sentence boundary.
        prev_idx = None
        for word, tag in sentence:
            tag_idx = tag_index[tag]

            counts = emission_counts.get(word)
            if counts is None:
                # First time this word is seen.
                counts = emission_counts[word] = [0] * n_tags
            counts[tag_idx] += 1

            if prev_idx is not None:
                transition_counts[prev_idx][tag_idx] += 1
            prev_idx = tag_idx

    # Add-one (Laplace) smoothing: +1 per cell, +n_tags per row total.
    transition_probs = []
    for row in transition_counts:
        row_total = sum(row) + n_tags
        transition_probs.append([(count + 1) / row_total for count in row])

    # NOTE(review): each entry is normalised over tags for a fixed word, i.e.
    # a smoothed P(tag | word), not the P(word | tag) of a textbook HMM. The
    # values are kept as they were because the cells consuming this table are
    # not visible here - confirm which quantity the decoder expects.
    emission_probs = {}
    for word, counts in emission_counts.items():
        total = sum(counts) + n_tags
        emission_probs[word] = [(count + 1) / total for count in counts]

    return transition_probs, emission_probs

# Leave-one-fold-out loop: fold `fold_idx` is held out and the model is
# estimated on the concatenation of all the other folds.
for fold_idx in range(num_folds):
    print("\nFold: ", fold_idx + 1)

    test_set = folds[fold_idx]
    train_set = [
        sentence
        for other_idx in range(num_folds)
        if other_idx != fold_idx
        for sentence in folds[other_idx]
    ]

    trans_probs, emiss_probs = train_hmm(train_set)

    print("First few rows of the transition matrix:")
    for transition_row in trans_probs[:12]:
        print([format(prob, ".4f") for prob in transition_row])

    # The first five words in the table's insertion order serve as a spot check.
    sample_words = list(emiss_probs)[:5]
    print("\nSample emission probabilities:")
    for word in sample_words:
        formatted = [format(prob, ".4f") for prob in emiss_probs[word]]
        print(f"Word: '{word}' ->", formatted)

12
['.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X']
{'.': 0, 'ADJ': 1, 'ADP': 2, 'ADV': 3, 'CONJ': 4, 'DET': 5, 'NOUN': 6, 'NUM': 7, 'PRON': 8, 'PRT': 9, 'VERB': 10, 'X': 11}

Fold:  1
First few rows of the transition matrix:
['0.1807', '0.0449', '0.1036', '0.0722', '0.1159', '0.1065', '0.1237', '0.0187', '0.0774', '0.0320', '0.1223', '0.0021']
['0.1030', '0.0564', '0.0909', '0.0103', '0.0387', '0.0059', '0.6466', '0.0056', '0.0043', '0.0196', '0.0182', '0.0005']
['0.0093', '0.0816', '0.0206', '0.0159', '0.0020', '0.4590', '0.2520', '0.0283', '0.0743', '0.0146', '0.0418', '0.0005']
['0.1783', '0.1333', '0.1428', '0.0966', '0.0181', '0.0714', '0.0312', '0.0128', '0.0508', '0.0286', '0.2360', '0.0001']
['0.0207', '0.1101', '0.0734', '0.0918', '0.0003', '0.1494', '0.2341', '0.0188', '0.0715', '0.0256', '0.2037', '0.0006']
['0.0127', '0.2364', '0.0091', '0.0171', '0.0007', '0.0057', '0.6305', '0.0087', '0.0102', '0.0021', '0.0654', '0.0014']
['0.2902', '