# Part 2: Estimate Transition Parameters


### Code from Part 1 Needed

In [19]:
def process_file(filepath):
    # we make use of the default library "collections" to make processing the tags and word-tag pairs easier
    import collections #used for counting
    tag_count = collections.defaultdict(int)  # counting for tags
    word_tag_count = collections.defaultdict(int)  # counting for word-tag pairs
    vocabulary = set()  # stores unique words
    sentences = []
    current_sentence = []

    with open(filepath, 'r', encoding='utf-8') as file:
        # reading file line-by-line
        for line in file:
            stripped_line = line.strip() #removes the /n and then splits it to separete the word and its label
            if stripped_line:  # check if there even is a word or tag in the line
                word, tag = stripped_line.split()  # Split line into word and tag
                word_tag_count[(word, tag)] += 1
                tag_count[tag] += 1
                vocabulary.add(word) #doesnt add duplicates
                current_sentence.append(word)
            else:
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
        if current_sentence:
            sentences.append(current_sentence)

    return tag_count, word_tag_count, vocabulary, sentences


#tag count : dictionary with the count of each tag e.g ('B-NP') : 45
#word_tag_count : dictionary with the count of each word-tag pair e.g ('Municipal','B-NP') : 1

tag_count, word_tag_count, vocabulary, sentences = process_file('EN/train')


In [20]:
def estimate_all_emission_probabilities_with_unknown(tag_count, word_tag_count, k =0.1):
  
    emission_probabilities = {}
    # iterate through all the word tag pairs to get all the emission probabilities
    # store the results in the dictionary emission_probabilities
    for (word, tag), count in word_tag_count.items():
        
        emission_probabilities[(word, tag)] = count / (tag_count[tag]+k)
        
    for tag, count in tag_count.items():
        emission_probabilities[("#UNK#", tag)] = count / (tag_count[tag]+k)
    return emission_probabilities

In [21]:
emission_probabilities = estimate_all_emission_probabilities_with_unknown(tag_count, word_tag_count)


In [22]:
def process_file_for_transitions(filepath):
    # we make use of the default library "collections" to make processing the tags and word-tag pairs easier
    import collections
    transition_count = collections.defaultdict(int) #y_u to y_v, including start and stop
    tag_count = collections.defaultdict(int)  # counting for tags
    vocab = set()
    
    # we still need counters for stop and start, to add them into the transition parameters
    start_counter = 0 
    stop_counter = 0
    
    START = "START"
    STOP = "STOP"
    previous_tag = START

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()
            if stripped_line:
                word, tag = stripped_line.split()
                transition_count[(previous_tag, tag)] += 1
                if previous_tag == "START":
                    start_counter += 1
                tag_count[tag] += 1
                previous_tag = tag
                vocab.add(word)
            else:  # when the sentence has ended
                transition_count[(previous_tag, STOP)] += 1
                stop_counter += 1
                previous_tag = START  # reset for the next sentence
     #adding counts for start and stop
    tag_count["START"] = start_counter
    tag_count["STOP"] = stop_counter
    
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    # split on double newlines which denote separated sentences in our case
    sentences = [sentence.split() for sentence in content.split('\n\n')]

    return transition_count, tag_count, sentences, vocab

In [23]:
def get_sentences(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    # split on double newlines which denote separated sentences in our case
    return [sentence.split() for sentence in content.split('\n\n')]


In [24]:
# run the functions 
transition_count, tag_count, sentences, vocabulary = process_file_for_transitions('EN/train')

## Functions to write to output file

##### We make use of these functions to write the predictions to an output file

In [25]:
# we need to use viterbi to predict sentence by sentence
def get_prediction(filepath, tag_count, transition_probabilities, emission_probabilities, vocabulary):
    sentences = get_sentences(filepath)
    predictions = [] #initialise the list of sentences
    for sentence in sentences:
       
        # predict the best path, sentence by sentence
        best_path_prediction = viterbi_algorithm(sentence, tag_count, transition_probabilities, emission_probabilities, vocabulary)
       
        #puts the word - predicted tag pairs in the predictions array pairwise
        predictions.append(list(zip(sentence, best_path_prediction))) 
        
    return predictions
    

In [26]:
def write_tag_predictions_to_file(prediction, output_filepath):
    # open the output file for writing
    with open(output_filepath, 'w', encoding='utf-8') as file:
        for sentence in predictions:
            for word, tag in sentence:
                # write each word and its predicted tag to the file, with a spacing to separate them.
                file.write(f"{word} {tag}\n")
            # leave an empty line between sentences
            file.write("\n")

## Part 2a, 10 points (estimate transition parameters)
###### we have a function to calculate the transitiuon probability of a set of u,v states, based on the tag and transition counts. We also have another function to calculate the transition probability of all the different transitions present in the file.

###### we will be using the latter mainly for this projject

In [27]:
def estimate_one_transition_probability(y_u, y_v, transition_count, tag_count):
        # get the total times y->x occurs
    tag_transition_freq = transition_count.get((y_u, y_v), 0)
    # total times y appears
    tag_total_freq = tag_count.get(y, 1)
    
    return tag_transition_freq / tag_total_freq
    

In [28]:
def estimate_all_transition_probability(transition_count, tag_count):
  
    transition_probabilities = {}
    # iterate through all the transition tag pairs to get all the transition probabilities
    # store the results in the dictionary transition_probabilities
    for (y_u, y_v), count in transition_count.items():
        transition_probabilities[(y_u, y_v)] = count / tag_count[y_u]
        
    return transition_probabilities

In [29]:
# run the function to get all the transiiton probaibilities
transition_probabilities = estimate_all_transition_probability(transition_count, tag_count)

## Part 2b, 15 points (viterbi algorithm implementation)

###### We have a function that runs the Viterbi algorithm to obtain the ideal sequence of tags for a given sequence of observations


In [30]:
def viterbi_algorithm(sentence, tag_count, transition_probabilities, emission_probabilities, vocabulary, unk = 0.1):
    # make sure that "sentence" is a sequence of x observations
    
    tags = [tag for tag in tag_count if tag not in ['START', 'STOP']] 
    # makes a dictionary of tags that doesnt include start and stop, so we dont iterate through them unncessarily except 
    # at the actual start and end of the sentence
    
    n = len(sentence)  # number of words in the sentence (k)
    m = len(tags)      # number of tags (u / v)
    
    # to account for unknown words. For this to work effectively, the emission parameters should include the probabilities for unknown words.
    for i in range(0,n):
        if sentence[i] not in vocabulary:
            sentence[i] = '#UNK#'
    
    # create a matrix to store all the pi values, initialised at "-inf" so we can easily look for max score or probability to input
    pi = [[float('-inf')] * m for _ in range(n+1)] #+1 tp account for the stop state, but we dont actually store anything theer
    backpointer = [[0] * m for _ in range(n)] # to store y*

    # we skip the step of assigning pi(0,v) = 1 if v is start and 0 otherwise, so our base case is when u is "START"
    for i, tag in enumerate(tags): # i is the index of the tag
        t_count = tag_count[tag]
        pi[0][i] = transition_probabilities.get(('START', tag), 0) * emission_probabilities.get((sentence[0], tag),0)

            
    #bottom up dynamic programming, updating the next values of pi based on the previous values of pi
    for i in range(1, n): 
        for j, tag in enumerate(tags):
            max_prob = float('-inf')
            max_state = None
            for kk, prev_tag in enumerate(tags):
                prob = pi[i-1][kk] * transition_probabilities.get((prev_tag, tag), 0) * emission_probabilities.get((sentence[i], tag), 0)
                
                if prob > max_prob:
                    max_prob = prob
                    max_state = kk
                    
            pi[i][j] = max_prob
            backpointer[i][j] = max_state # to store y*

    # termination step 
    max_prob = float('-inf')
    max_state = None
    for i, tag in enumerate(tags):
        prob = pi[n-1][i] * transition_probabilities.get((tag, 'STOP'), 0)
        if prob > max_prob:
            max_prob = prob
            max_state = i
            # no need to store in backpointer
            
    #initialise an array for the best sequence of states using the max state from the termination step    
    best_path = [tags[max_state]]
    
    #go backwards along the backpointer, iteratively finding the best state
    for i in range(n-1, 0, -1):
        max_state = backpointer[i][max_state]
        best_path.append(tags[max_state])
        
    # reverse the array to get the sequence of states or tags for the observations in the right order
    best_path.reverse()
    
    return best_path

In [31]:
# run the functions to write predictions to file
predictions = get_prediction('EN/dev.in', tag_count, transition_probabilities, emission_probabilities, vocabulary)
write_tag_predictions_to_file(predictions, 'EN/dev.p2.out')

In [33]:
# evaluate the scores of the viterbi algorithm implementation
!python3 EvalScript/evalResult.py EN/dev.out EN/dev.p2.out


#Entity in gold data: 13179
#Entity in prediction: 14279

#Correct Entity : 10858
Entity  precision: 0.7604
Entity  recall: 0.8239
Entity  F: 0.7909

#Correct Sentiment : 10056
Sentiment  precision: 0.7043
Sentiment  recall: 0.7630
Sentiment  F: 0.7325
