# Practical 6
## POS tagging part 2: Analyze the result of POS Tagging.

In [1]:
from nlp_lib import *
from collections import defaultdict, Counter

### Load Dataset and Normalize it

In [8]:
# Read the tagged corpus into a DataFrame (one row per sentence)
df = pd.read_json('pos.json')

# Pull the token lists and their parallel tag lists out as plain Python lists
sentences = list(df['sentence'])
tags = list(df['labels'])

# Tabular view of the tag sequences: column 0 holds each sentence's first tag
tags_df = pd.DataFrame(tags)

df.head()

Unnamed: 0,index,sentence,labels
0,0,"[The, Arizona, Corporations, Commission, autho...","[DT, NNP, NNP, NNP, VBD, DT, CD, NN, NN, NN, I..."
1,1,"[The, ruling, follows, a, host, of, problems, ...","[DT, NN, VBZ, DT, NN, IN, NNS, IN, NNP, NNP, ,..."
2,2,"[The, Arizona, regulatory, ruling, calls, for,...","[DT, NNP, JJ, NN, VBZ, IN, $, CD, CD, IN, JJ, ..."
3,3,"[The, company, had, sought, increases, totalin...","[DT, NN, VBD, VBN, NNS, VBG, $, CD, CD, ,, CC,..."
4,4,"[The, decision, was, announced, after, trading...","[DT, NN, VBD, VBN, IN, NN, VBD, .]"


### Calculate Initial Probabilities

In [9]:
# Relative frequency of each tag in the sentence-initial position (column 0),
# i.e. the HMM start distribution, ordered from most to least frequent.
initial_prob = (
    tags_df.loc[:, 0]
    .value_counts(normalize=True)
    .to_dict()
)
print("Initial Probabilities:", initial_prob)

Initial Probabilities: {'NNP': 0.22561968518183462, 'DT': 0.213859236475484, 'IN': 0.11416681744165008, '``': 0.08015198118328207, 'PRP': 0.05844038357155781, 'RB': 0.054640853989506064, 'CC': 0.053374344128822145, 'NNS': 0.0499366745069658, 'JJ': 0.04034738556178759, 'NN': 0.035824136059345035, 'VBG': 0.012303238646643748, 'PRP$': 0.009227428984982812, 'CD': 0.007960919124298896, 'WRB': 0.00741812918400579, 'VBN': 0.006151619323321874, 'EX': 0.0041613895422471505, '-LRB-': 0.0036185996019540436, 'WP': 0.0032567396417586395, 'TO': 0.002894879681563235, 'VB': 0.002171159761172426, 'NNPS': 0.001990229781074724, 'JJS': 0.001990229781074724, 'RBR': 0.0012665098606839153, 'VBZ': 0.0012665098606839153, 'MD': 0.0012665098606839153, 'JJR': 0.0012665098606839153, ':': 0.001085579880586213, 'VBD': 0.001085579880586213, 'RBS': 0.0007237199203908088, 'PDT': 0.0007237199203908088, 'SYM': 0.0007237199203908088, "''": 0.0005427899402931065, 'WDT': 0.0003618599601954044, 'VBP': 0.0001809299800977022}


### Calculate Transition Probabilities

In [None]:
# Unigram counts: how often each tag occurs anywhere in the corpus
tag_counter = Counter(tag for tag_seq in tags for tag in tag_seq)

# Bigram counts: transition_counter[prev][next] = times `next` directly follows `prev`
transition_counter = defaultdict(Counter)
for tag_seq in tags:
    for prev_tag, next_tag in zip(tag_seq, tag_seq[1:]):
        transition_counter[prev_tag][next_tag] += 1

# P(next | prev) = C(prev, next) / C(prev).
# The denominator counts every occurrence of `prev`, including sentence-final
# ones, so a row sums to less than 1 when `prev` sometimes ends a sentence.
transition_prob = {
    prev_tag: {
        next_tag: count / tag_counter[prev_tag]
        for next_tag, count in followers.items()
    }
    for prev_tag, followers in transition_counter.items()
}
print(transition_prob)

### Calculate Emission Probabilities

In [None]:
# Corpus-wide word frequencies
word_counter = Counter(word for sentence in sentences for word in sentence)

# emission_counter[tag][word] = times `word` was labelled with `tag`
emission_counter = defaultdict(Counter)
for words, word_tags in zip(sentences, tags):
    for token, token_tag in zip(words, word_tags):
        emission_counter[token_tag][token] += 1

# P(word | tag) = C(tag, word) / C(tag)
emission_prob = {
    tag: {
        word: count / tag_counter[tag]
        for word, count in word_counts.items()
    }
    for tag, word_counts in emission_counter.items()
}
print(emission_prob)

### Implement the Viterbi Algorithm for POS Tagging

In [12]:
def viterbi(sentence, tags, initial_prob, transition_prob, emission_prob, unseen_prob=1e-6):
    """Decode the most likely tag sequence for ``sentence`` under an HMM.

    Scores are accumulated in log space so that long sentences do not
    underflow to 0.0 (which would make every path tie and corrupt the
    decoded sequence).

    Parameters
    ----------
    sentence : sequence of str
        The tokens to tag.
    tags : sequence of str
        Candidate tags. Duplicates are dropped, keeping the first occurrence.
    initial_prob : dict[str, float]
        P(tag at position 0). A tag missing from the dict gets probability 0.
    transition_prob : dict[str, dict[str, float]]
        ``transition_prob[prev][curr]`` = P(curr | prev). A missing row or
        entry falls back to ``unseen_prob``.
    emission_prob : dict[str, dict[str, float]]
        ``emission_prob[tag][word]`` = P(word | tag). A missing row or entry
        falls back to ``unseen_prob``.
    unseen_prob : float, optional
        Probability assigned to unseen transitions and emissions
        (default ``1e-6``).

    Returns
    -------
    best_path : list of str
        One tag per token of ``sentence``.
    best_path_prob : float
        Probability of ``best_path``. For very long sentences this value
        itself may underflow to 0.0 even though ``best_path`` is still valid.
        ``([], 0.0)`` is returned when ``sentence`` or ``tags`` is empty.
    """
    # De-duplicate while preserving order so row indices map 1:1 onto tags.
    tags = list(dict.fromkeys(tags))
    n = len(sentence)
    m = len(tags)
    if n == 0 or m == 0:
        return [], 0.0

    def _log(p):
        # Map zero probability to -inf explicitly instead of triggering a
        # numpy divide-by-zero warning.
        return np.log(p) if p > 0 else -np.inf

    def _log_emissions(word):
        # Log P(word | tag) for every candidate tag, in tag order.
        return np.array([
            _log(emission_prob.get(tag, {}).get(word, unseen_prob)) for tag in tags
        ])

    # log_trans[i, j] = log P(tags[j] | tags[i]); built once per call rather
    # than looked up for every token.
    log_trans = np.array([
        [_log(transition_prob.get(prev_tag, {}).get(curr_tag, unseen_prob)) for curr_tag in tags]
        for prev_tag in tags
    ])

    scores = np.full((m, n), -np.inf)          # best log-score ending in tag i at position t
    backpointer = np.zeros((m, n), dtype=int)  # argmax predecessor for each cell

    # Initialization step
    log_init = np.array([_log(initial_prob.get(tag, 0)) for tag in tags])
    scores[:, 0] = log_init + _log_emissions(sentence[0])

    # Recursion step
    columns = np.arange(m)
    for t in range(1, n):
        # candidates[i, j] = score of being in tag i at t-1 and moving to tag j
        candidates = scores[:, t - 1][:, None] + log_trans
        # On ties choose the highest predecessor index, matching the previous
        # tuple-based max() behaviour.
        best_prev = (m - 1) - np.argmax(candidates[::-1], axis=0)
        backpointer[:, t] = best_prev
        scores[:, t] = candidates[best_prev, columns] + _log_emissions(sentence[t])

    # Termination step
    best_last = int(np.argmax(scores[:, n - 1]))
    best_path_prob = float(np.exp(scores[best_last, n - 1]))

    # Backtrace: collect indices from the end, then reverse once.
    path = [best_last]
    for t in range(n - 1, 0, -1):
        path.append(int(backpointer[path[-1], t]))
    path.reverse()

    best_path = [tags[i] for i in path]
    return best_path, best_path_prob

### Testing the Model with a Sample Sentence

In [13]:
# Sanity check on the first corpus sentence
test_sentence = sentences[0]

# Every tag observed in the corpus, in first-seen order
unique_tags = list(tag_counter)

# Decode the sample with the estimated HMM parameters
best_path, best_prob = viterbi(
    test_sentence,
    unique_tags,
    initial_prob,
    transition_prob,
    emission_prob,
)

print("Sentence:", test_sentence)
print("Predicted Tags:", best_path)
print("Probability of Best Path:", best_prob)

Sentence: ['The', 'Arizona', 'Corporations', 'Commission', 'authorized', 'an', '11.5', '%', 'rate', 'increase', 'at', 'Tucson', 'Electric', 'Power', 'Co.', ',', 'substantially', 'lower', 'than', 'recommended', 'last', 'month', 'by', 'a', 'commission', 'hearing', 'officer', 'and', 'barely', 'half', 'the', 'rise', 'sought', 'by', 'the', 'utility', '.']
Predicted Tags: ['DT', 'NNP', 'NNP', 'NNP', 'VBD', 'DT', 'CD', 'NN', 'NN', 'NN', 'IN', 'NNP', 'NNP', 'NNP', 'NNP', ',', 'RB', 'JJR', 'IN', 'VBN', 'JJ', 'NN', 'IN', 'DT', 'NN', 'NN', 'NN', 'CC', 'RB', 'PDT', 'DT', 'NN', 'VBN', 'IN', 'DT', 'NN', '.']
Probability of Best Path: 4.746027399410256e-112


### Original Sentence and its tags

In [15]:
# Gold-standard tokens and tags for the first sentence, for comparison with the prediction
gold_sentence, gold_tags = sentences[0], tags[0]
print(gold_sentence, gold_tags)

['The', 'Arizona', 'Corporations', 'Commission', 'authorized', 'an', '11.5', '%', 'rate', 'increase', 'at', 'Tucson', 'Electric', 'Power', 'Co.', ',', 'substantially', 'lower', 'than', 'recommended', 'last', 'month', 'by', 'a', 'commission', 'hearing', 'officer', 'and', 'barely', 'half', 'the', 'rise', 'sought', 'by', 'the', 'utility', '.'] ['DT', 'NNP', 'NNP', 'NNP', 'VBD', 'DT', 'CD', 'NN', 'NN', 'NN', 'IN', 'NNP', 'NNP', 'NNP', 'NNP', ',', 'RB', 'JJR', 'IN', 'VBN', 'JJ', 'NN', 'IN', 'DT', 'NN', 'NN', 'NN', 'CC', 'RB', 'PDT', 'DT', 'NN', 'VBN', 'IN', 'DT', 'NN', '.']


### Evaluate the Model on the Dataset

In [23]:
# Token-level accuracy over the whole corpus.
# NOTE: these are the same sentences the probabilities were estimated from,
# so this is training accuracy rather than a held-out estimate.
correct = total = 0

for words, gold in zip(sentences, tags):
    predicted = viterbi(words, unique_tags, initial_prob, transition_prob, emission_prob)[0]
    for guess, truth in zip(predicted, gold):
        if guess == truth:
            correct += 1
    total += len(gold)

accuracy = correct / total
print("HMM POS Tagging Accuracy:", accuracy)

HMM POS Tagging Accuracy: 0.975623823690122


### Explanation of the Code

1. **Loading Data with Pandas**: The dataset is loaded with `pd.read_json()`, and the `sentence` and `labels` columns are extracted as Python lists.
2. **Initial Probabilities**: We calculate the probability of each tag being the starting tag.
3. **Transition Probabilities**: Counts transitions between tags and calculates probabilities based on those transitions.
4. **Emission Probabilities**: Calculates the probability of a word given a tag.
5. **Viterbi Algorithm**: Implements Viterbi for decoding the sequence, finding the most likely POS tag sequence.
6. **Testing and Evaluation**: Tests the model on a sample sentence and calculates overall accuracy across the dataset.

This code works as a standalone POS tagging model based on HMM principles.

### Save the Model

In [16]:
import joblib

# Bundle everything the decoder needs: the three probability tables plus the tag inventory
model = dict(
    initial_prob=initial_prob,
    transition_prob=transition_prob,
    emission_prob=emission_prob,
    unique_tags=unique_tags,
)

# Persist the bundle to disk; the call's return value is shown as the cell output
joblib.dump(model, 'hmm_pos_tagger_model.joblib')

['hmm_pos_tagger_model.joblib']

### Using the Saved Model

In [17]:
# Restore the saved bundle (joblib.load unpickles the file, so only load files you trust)
loaded_model = joblib.load('hmm_pos_tagger_model.joblib')

# Unpack the probability tables and the tag inventory
initial_prob, transition_prob, emission_prob, unique_tags = (
    loaded_model[key]
    for key in ('initial_prob', 'transition_prob', 'emission_prob', 'unique_tags')
)

# Tag a new sentence with the restored parameters
test_sentence = ["The", "company", "announced", "a", "new", "policy"]
best_path, best_prob = viterbi(
    test_sentence,
    unique_tags,
    initial_prob,
    transition_prob,
    emission_prob,
)

print("Sentence:", test_sentence)
print("Predicted Tags:", best_path)
print("Probability of Best Path:", best_prob)

Sentence: ['The', 'company', 'announced', 'a', 'new', 'policy']
Predicted Tags: ['DT', 'NN', 'VBD', 'DT', 'JJ', 'NN']
Probability of Best Path: 2.293859621014805e-15


### POS Tagging using spaCy

In [19]:
# Tag the same example with `pos_tag` (comes from the nlp_lib wildcard import;
# presumably the spaCy wrapper, per the section heading — confirm).
spacy_sample = "The company announced a new policy"
pos_tag(spacy_sample)

[('The', 'DET'),
 ('company', 'NOUN'),
 ('announced', 'VERB'),
 ('a', 'DET'),
 ('new', 'ADJ'),
 ('policy', 'NOUN')]

### POS Tagging using NLTK

In [22]:
# Tag the same example with `word_pos_tag` (comes from the nlp_lib wildcard import;
# presumably the NLTK wrapper, per the section heading — confirm).
# NOTE(review): the recorded output has no entry for 'a'; the helper may drop
# stop words before tagging — verify against nlp_lib.
nltk_sample = "The company announced a new policy"
word_pos_tag(nltk_sample)

[('The', 'DT'),
 ('company', 'NN'),
 ('announced', 'VBD'),
 ('new', 'JJ'),
 ('policy', 'NN')]