## Part 1: Estimate Emission Parameters

##### Function to process the file

In [3]:
def process_file(filepath):
    # we make use of the default library "collections" to make processing the tags and word-tag pairs easier
    import collections #used for counting
    tag_count = collections.defaultdict(int)  # counting for tags
    word_tag_count = collections.defaultdict(int)  # counting for word-tag pairs
    vocabulary = set()  # stores unique words
    sentences = []
    current_sentence = []

    with open(filepath, 'r', encoding='utf-8') as file:
        # reading file line-by-line
        for line in file:
            stripped_line = line.strip() #removes the /n and then splits it to separete the word and its label
            if stripped_line:  # check if there even is a word or tag in the line
                word, tag = stripped_line.split()  # Split line into word and tag
                word_tag_count[(word, tag)] += 1
                tag_count[tag] += 1
                vocabulary.add(word) #doesnt add duplicates
                current_sentence.append(word)
            else:
                if current_sentence:
                    sentences.append(current_sentence)
                    current_sentence = []
        if current_sentence:
            sentences.append(current_sentence)

    return tag_count, word_tag_count, vocabulary, sentences


#tag count : dictionary with the count of each tag e.g ('B-NP') : 45
#word_tag_count : dictionary with the count of each word-tag pair e.g ('Municipal','B-NP') : 1

tag_count, word_tag_count, vocabulary, sentences = process_file('EN/train')


##### Part 1a, 5 points

In [4]:
def estimate_all_emission_probabilities(tag_count, word_tag_count):
  
    emission_probabilities = {}
    # iterate through all the word tag pairs to get all the emission probabilities
    # store the results in the dictionary emission_probabilities
    for (word, tag), count in word_tag_count.items():
        emission_probabilities[(word, tag)] = count / tag_count[tag] 
        #setting a soft defauly emission probability of (no.of times word-tag appears)/(number of occurence of the tag in question in document)
        
    return emission_probabilities

In [5]:
def estimate_one_emission_probabilities(x, y, tag_count, word_tag_count):
    
    # get the total times y->x occurs
    word_tag_freq = word_tag_count.get((x, y), 0)
    # total times y appears
    tag_total_freq = tag_count.get(y, 1)
    
    return word_tag_freq / tag_total_freq


In [6]:
emission_probabilities = estimate_all_emission_probabilities(tag_count, word_tag_count)

In [7]:
emission_probabilities

{('Municipal', 'B-NP'): 2.1139414438220062e-05,
 ('bonds', 'I-NP'): 0.0018134857394075947,
 ('are', 'B-VP'): 0.03707354471277586,
 ('generally', 'B-ADVP'): 0.0033660589060308557,
 ('a', 'B-ADJP'): 0.0017133066818960593,
 ('bit', 'I-ADJP'): 0.003484320557491289,
 ('safer', 'I-ADJP'): 0.0017421602787456446,
 ('than', 'B-PP'): 0.006961440147930603,
 ('corporate', 'B-NP'): 0.0005919036042701618,
 ('in', 'B-PP'): 0.15565345080763582,
 ('a', 'B-NP'): 0.0758693584187718,
 ('recession', 'I-NP'): 0.000641131322012786,
 (',', 'O'): 0.36465315013404825,
 ('but', 'O'): 0.012231903485254691,
 ('not', 'B-ADJP'): 0.0034266133637921186,
 ('as', 'I-ADJP'): 0.012195121951219513,
 ('safe', 'I-ADJP'): 0.003484320557491289,
 ('as', 'B-PP'): 0.019796595420677653,
 ('bonds', 'B-NP'): 0.0004439277032026213,
 ('issued', 'B-VP'): 0.0007666611905153059,
 ('by', 'B-PP'): 0.047207266003154405,
 ('the', 'B-NP'): 0.1639572983828348,
 ('federal', 'I-NP'): 0.001117400304079427,
 ('government', 'I-NP'): 0.0022348006081

##### Part 1b, 10 points

In [8]:
def estimate_emission_probability_with_unknown(tag, word, tag_count, word_tag_count, vocabulary, k=0.1):
    
    # total times y appears + k
    tag_total_freq = tag_count.get(tag, 0) + k
    
    # Check if the word was seen in the training set; if not, use the special UNK token
    # e(x|y) = k/(count(y)+k) if word token is UNK
    if word not in vocabulary:
        word = '#UNK#'
        word_tag_freq = k
        return word_tag_freq / tag_total_freq
    
    # get the total times y->x occurs
    word_tag_freq = word_tag_count.get((word, tag), 0)
  
    return word_tag_freq / tag_total_freq


In [9]:
def estimate_all_emission_probabilities_with_unknown(tag_count, word_tag_count, k =0.1):
  
    emission_probabilities = {}
    # iterate through all the word tag pairs to get all the emission probabilities
    # store the results in the dictionary emission_probabilities
    for (word, tag), count in word_tag_count.items():
        
        emission_probabilities[(word, tag)] = count / (tag_count[tag]+k)
        
    for tag, count in tag_count.items():
        emission_probabilities[("#UNK#", tag)] = count / (tag_count[tag]+k)
    return emission_probabilities

In [10]:
emission_probabilities = estimate_all_emission_probabilities_with_unknown(tag_count, word_tag_count)


In [11]:
# emission_probabilities

In [12]:
print(emission_probabilities)

emission_probabilities_filtered = {word_tag: probability for word_tag, probability in emission_probabilities.items() if word_tag[0] == '#UNK#'}
print(emission_probabilities_filtered) 




{('#UNK#', 'B-NP'): 0.999997886063025, ('#UNK#', 'I-NP'): 0.9999981681995783, ('#UNK#', 'B-VP'): 0.9999945238786273, ('#UNK#', 'B-ADVP'): 0.9999719502959244, ('#UNK#', 'B-ADJP'): 0.9999428930386615, ('#UNK#', 'I-ADJP'): 0.999825814318063, ('#UNK#', 'B-PP'): 0.999994561404463, ('#UNK#', 'O'): 0.9999958110095049, ('#UNK#', 'B-SBAR'): 0.9999473434784899, ('#UNK#', 'I-VP'): 0.999990156608361, ('#UNK#', 'I-ADVP'): 0.9997245937758192, ('#UNK#', 'B-PRT'): 0.999786370433668, ('#UNK#', 'I-PP'): 0.9995517705064993, ('#UNK#', 'B-CONJP'): 0.9979633401221996, ('#UNK#', 'I-CONJP'): 0.9984399375975039, ('#UNK#', 'B-INTJ'): 0.9961685823754789, ('#UNK#', 'I-INTJ'): 0.9859154929577465, ('#UNK#', 'I-SBAR'): 0.9979209979209979, ('#UNK#', 'B-UCP'): 0.9090909090909091, ('#UNK#', 'I-UCP'): 0.9756097560975611, ('#UNK#', 'B-LST'): 0.990990990990991}


In [13]:
tag_count

defaultdict(int,
            {'B-NP': 47305,
             'I-NP': 54591,
             'B-VP': 18261,
             'B-ADVP': 3565,
             'B-ADJP': 1751,
             'I-ADJP': 574,
             'B-PP': 18387,
             'O': 23872,
             'B-SBAR': 1899,
             'I-VP': 10159,
             'I-ADVP': 363,
             'B-PRT': 468,
             'I-PP': 223,
             'B-CONJP': 49,
             'I-CONJP': 64,
             'B-INTJ': 26,
             'I-INTJ': 7,
             'I-SBAR': 48,
             'B-UCP': 1,
             'I-UCP': 4,
             'B-LST': 11})

In [14]:
max(tag_count)

'O'

#### Part 1c, 10 points

In [15]:
def predict_one_tag(word, emission_probabilities, vocabulary,tag_count):
    # use UNK token if the word is not in the vocabulary
    best_tag = None

    if word not in vocabulary:
        word = '#UNK#'
        best_tag = max(tag_count)
#         return '#UNK#'

    # initialise the variables to keep track of the best tag and its highest probability
    max_probability = -1  # Start with a very low probability

    # iterate over all possible tags for the word in the emission probabilities
    for (current_word, tag), probability in emission_probabilities.items():
        if current_word == word and probability > max_probability:
            max_probability = probability
            best_tag = tag

    return word, best_tag


In [16]:
def predict_all_tags(filepath, emission_probabilities, vocabulary, tag_count):
    predictions = []
    current_sentence = []
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            word = line.strip()
            if word:  # Ensure the line is not empty
                word, best_tag = predict_one_tag(word, emission_probabilities, vocabulary, tag_count)
                current_sentence.append((word, best_tag))
            else:
                if current_sentence:
                    predictions.append(current_sentence)
                    current_sentence = []
        if current_sentence:
                    predictions.append(current_sentence)
#                     current_sentence = []

    return predictions

In [17]:
predictions = predict_all_tags("EN/dev.in", emission_probabilities, vocabulary, tag_count)

In [18]:
def write_predictions_to_file(predictions, output_filepath):
    # open the output file for writing
    with open(output_filepath, 'w', encoding='utf-8') as file:
        for sentence in predictions:
            for word, tag in sentence:
                # write each word and its predicted tag to the file, with a spacing to separate.
                file.write(f"{word} {tag}\n")
            file.write("\n")

In [19]:
write_predictions_to_file(predictions, "EN/dev_try.out")

In [20]:
predictions

[[('HBO', 'B-NP'),
  ('has', 'B-VP'),
  ('close', 'B-ADJP'),
  ('to', 'B-PP'),
  ('24', 'I-NP'),
  ('million', 'I-NP'),
  ('subscribers', 'I-NP'),
  ('to', 'B-PP'),
  ('its', 'B-NP'),
  ('HBO', 'B-NP'),
  ('and', 'I-UCP'),
  ('#UNK#', 'I-NP'),
  ('networks', 'I-NP'),
  (',', 'O'),
  ('while', 'B-SBAR'),
  ('Showtime', 'B-NP'),
  ('and', 'I-UCP'),
  ('its', 'B-NP'),
  ('sister', 'I-NP'),
  ('service', 'I-NP'),
  (',', 'O'),
  ('The', 'B-NP'),
  ('#UNK#', 'I-NP'),
  ('Channel', 'I-NP'),
  (',', 'O'),
  ('have', 'I-VP'),
  ('only', 'I-CONJP'),
  ('about', 'B-PP'),
  ('10', 'I-ADVP'),
  ('million', 'I-NP'),
  (',', 'O'),
  ('according', 'B-PP'),
  ('to', 'B-PP'),
  ('Paul', 'B-NP'),
  ('#UNK#', 'I-NP'),
  ('Associates', 'I-NP'),
  (',', 'O'),
  ('a', 'B-LST'),
  ('#UNK#', 'I-NP'),
  (',', 'O'),
  ('Calif.', 'B-NP'),
  (',', 'O'),
  ('research', 'I-NP'),
  ('firm', 'I-NP'),
  ('.', 'O')],
 [('#UNK#', 'I-NP'),
  ('#UNK#', 'I-NP'),
  ('#UNK#', 'I-NP'),
  ('after', 'B-SBAR'),
  ('the', 'B-NP')

In [21]:
# Evaluation of ES


!python3 EvalScript/evalResult.py EN/dev.out EN/dev_try.out



Python was not found; run without arguments to install from the Microsoft Store, or disable this shortcut from Settings > Manage App Execution Aliases.


# Part 2

In [31]:
for i,k in enumerate(tag_count):
    print(i,k)

0 B-NP
1 I-NP
2 B-VP
3 B-ADVP
4 B-ADJP
5 I-ADJP
6 B-PP
7 O
8 B-SBAR
9 I-VP
10 I-ADVP
11 B-PRT
12 I-PP
13 B-CONJP
14 I-CONJP
15 B-INTJ
16 I-INTJ
17 I-SBAR
18 B-UCP
19 I-UCP
20 B-LST


#### 2a, 10 points

In [32]:
def get_vocab(filepath):
#     import collections #used for counting
    vocab = set()
    with open(filepath, 'r', encoding='utf-8') as file:
        # reading file line-by-line
        for line in file:
            stripped_line = line.strip() #removes the /n and then splits it to separete the word and its label
            if stripped_line:  # check if there even is a word or tag in the line
                word, tag = stripped_line.split()  # Split line into word and tag
                vocab.add(word) #doesnt add duplicates

    return vocab


In [33]:
def gettt(filepath):
    vocab = set()
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
#             else:
            word = line.split()
            vocab.update(word)
    return vocab


In [34]:
vocabu = get_vocab('EN/train')

In [35]:
print("24" not in vocabu)

False


In [36]:
vv = gettt('EN/dev.in')

In [37]:
print(vv)

{'Crutcher', 'summer', 'day', 'mortality', 'Stephanie', 'Turner', 'races', 'an', 'Cable', 'Du', 'euphoria', 'cuts', 'Budweiser', 'Board', 'inundated', 'parliament', 'happens', 'eye', 'Disk', 'June', 'reasonable', 'leaving', 'reserves', 'River', 'redeemed', 'equities', 'interview', 'U.S.', 'jobs', 'first-ever', 'ages', 'might', 'commission', 'rapid', 'individual', 'environmental', 'explain', 'assault', 'banning', 'communists', 'nevertheless', 'He', 'curbs', '43', 'payment', 'rein', 'sporadic', 'overall', 'pleased', 'while', '6.94', 'declining', 'Public', 'steel', 'Roberts', 'audit', 'operating', 'chances', 'COPPER', 'representatives', 'Paramount-MCA', 'treatment', 'forward', '450,000', 'represent', 'Nausea', 'luminaries', 'arrive', 'Net', 'tumble', 'assurances', 'utilities', 'would', 'principal', 'committed', 'generates', 'had', 'turning', 'likewise', 'Railroad', 'Herrera', 'undo', 'successors', 'hot-dipped', 'live', 'Baden-Wuerttemberg', 'emergency-medical', 'gone', 'repeated', 'scalp'

In [38]:
def check_new_words(training_set, test_set):
    # Use set difference to find words in the test set that aren't in the training set
    new_words = test_set - training_set
    
    # Return True if there are new words, otherwise False
    return len(new_words) > 0, new_words

x,z = check_new_words(vocabu, vv)
print(len(z))
print(len(vv))
print(len(vocabu))

1284
6003
18212


In [39]:
def process_file_for_transitions(filepath):
    # we make use of the default library "collections" to make processing the tags and word-tag pairs easier
    import collections
    transition_count = collections.defaultdict(int) #y_u to y_v, including start and stop

    tag_count = collections.defaultdict(int)  # counting for tags
    start_counter = 0
    stop_counter = 0
    
    START = "START"
    STOP = "STOP"
    previous_tag = START

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()
            if stripped_line:
                word, tag = stripped_line.split()
                transition_count[(previous_tag, tag)] += 1
                if previous_tag == "START":
                    start_counter += 1
                tag_count[tag] += 1
                previous_tag = tag
            else:  # when the sentence has ended
                transition_count[(previous_tag, STOP)] += 1
                stop_counter += 1
                previous_tag = START  # reset for the next sentence
    tag_count["START"] = start_counter
    tag_count["STOP"] = stop_counter

    return transition_count, tag_count



In [40]:
transition_count, tag_count = process_file_for_transitions('EN/train')


In [41]:
transition_count

defaultdict(int,
            {('START', 'B-NP'): 4966,
             ('B-NP', 'I-NP'): 32390,
             ('I-NP', 'B-VP'): 7365,
             ('B-VP', 'B-ADVP'): 570,
             ('B-ADVP', 'B-ADJP'): 59,
             ('B-ADJP', 'I-ADJP'): 490,
             ('I-ADJP', 'I-ADJP'): 84,
             ('I-ADJP', 'B-PP'): 164,
             ('B-PP', 'B-NP'): 17064,
             ('I-NP', 'B-PP'): 8544,
             ('I-NP', 'O'): 12410,
             ('O', 'O'): 2710,
             ('O', 'B-ADJP'): 209,
             ('B-NP', 'B-VP'): 6164,
             ('B-VP', 'B-PP'): 1803,
             ('I-NP', 'I-NP'): 22201,
             ('O', 'STOP'): 7598,
             ('B-VP', 'B-SBAR'): 467,
             ('B-SBAR', 'B-NP'): 1657,
             ('B-VP', 'B-NP'): 6304,
             ('B-VP', 'O'): 1231,
             ('O', 'B-NP'): 8288,
             ('I-NP', 'B-NP'): 2601,
             ('B-ADVP', 'B-PP'): 608,
             ('B-VP', 'I-VP'): 6828,
             ('I-VP', 'B-NP'): 3610,
             ('O', 'B-V

In [42]:
def estimate_one_transmission_probability(y_u, y_v, transition_count, tag_count):
        # get the total times y->x occurs
    tag_transition_freq = transition_count.get((y_u, y_v), 0)
    # total times y appears
    tag_total_freq = tag_count.get(y, 1)
    
    return tag_transition_freq / tag_total_freq
    

In [43]:
def estimate_all_transmission_probability(transition_count, tag_count):
  
    transmission_probabilities = {}
    # iterate through all the transition tag pairs to get all the transition probabilities
    # store the results in the dictionary transition_probabilities
    for (y_u, y_v), count in transition_count.items():
        transmission_probabilities[(y_u, y_v)] = count / tag_count[y_u]
        
    return transmission_probabilities

In [44]:
transmission_probabilities = estimate_all_transmission_probability(transition_count, tag_count)

In [45]:
transmission_probabilities

{('START', 'B-NP'): 0.6480490669450607,
 ('B-NP', 'I-NP'): 0.6847056336539478,
 ('I-NP', 'B-VP'): 0.13491234818926195,
 ('B-VP', 'B-ADVP'): 0.031214062756694597,
 ('B-ADVP', 'B-ADJP'): 0.016549789621318374,
 ('B-ADJP', 'I-ADJP'): 0.27984009137635635,
 ('I-ADJP', 'I-ADJP'): 0.14634146341463414,
 ('I-ADJP', 'B-PP'): 0.2857142857142857,
 ('B-PP', 'B-NP'): 0.9280469897209985,
 ('I-NP', 'B-PP'): 0.15650931472220694,
 ('I-NP', 'O'): 0.2273268487479621,
 ('O', 'O'): 0.11352211796246649,
 ('O', 'B-ADJP'): 0.008755026809651475,
 ('B-NP', 'B-VP'): 0.13030335059718845,
 ('B-VP', 'B-PP'): 0.09873500903564975,
 ('I-NP', 'I-NP'): 0.4066787565715961,
 ('O', 'STOP'): 0.3182808310991957,
 ('B-VP', 'B-SBAR'): 0.025573626855046272,
 ('B-SBAR', 'B-NP'): 0.8725645076355977,
 ('B-VP', 'B-NP'): 0.3452165817863206,
 ('B-VP', 'O'): 0.06741142325173868,
 ('O', 'B-NP'): 0.34718498659517427,
 ('I-NP', 'B-NP'): 0.0476452162444359,
 ('B-ADVP', 'B-PP'): 0.17054698457223003,
 ('B-VP', 'I-VP'): 0.3739116149170363,
 ('

#### 2b, 15 points (viterbii)

In [46]:
tag_count

defaultdict(int,
            {'B-NP': 47305,
             'I-NP': 54591,
             'B-VP': 18261,
             'B-ADVP': 3565,
             'B-ADJP': 1751,
             'I-ADJP': 574,
             'B-PP': 18387,
             'O': 23872,
             'B-SBAR': 1899,
             'I-VP': 10159,
             'I-ADVP': 363,
             'B-PRT': 468,
             'I-PP': 223,
             'B-CONJP': 49,
             'I-CONJP': 64,
             'B-INTJ': 26,
             'I-INTJ': 7,
             'I-SBAR': 48,
             'B-UCP': 1,
             'I-UCP': 4,
             'B-LST': 11,
             'START': 7663,
             'STOP': 7663})

In [47]:
tags = [tag for tag in tag_count if tag not in ['START', 'STOP']]
print(tags)

['B-NP', 'I-NP', 'B-VP', 'B-ADVP', 'B-ADJP', 'I-ADJP', 'B-PP', 'O', 'B-SBAR', 'I-VP', 'I-ADVP', 'B-PRT', 'I-PP', 'B-CONJP', 'I-CONJP', 'B-INTJ', 'I-INTJ', 'I-SBAR', 'B-UCP', 'I-UCP', 'B-LST']


In [48]:
# transmission_probabilities
# print(emission_probabilities)
for word, tag in emission_probabilities:
    if word == '#UNK#':
        print(word,tag)

#UNK# B-NP
#UNK# I-NP
#UNK# B-VP
#UNK# B-ADVP
#UNK# B-ADJP
#UNK# I-ADJP
#UNK# B-PP
#UNK# O
#UNK# B-SBAR
#UNK# I-VP
#UNK# I-ADVP
#UNK# B-PRT
#UNK# I-PP
#UNK# B-CONJP
#UNK# I-CONJP
#UNK# B-INTJ
#UNK# I-INTJ
#UNK# I-SBAR
#UNK# B-UCP
#UNK# I-UCP
#UNK# B-LST


In [49]:
def viterbi_algorithm(sentence, tag_count, transmission_probabilities, emission_probabilities, vocabulary, unk = 0.1):
    # make sure that "sentence" is a sequence of x observations
    tags = [tag for tag in tag_count if tag not in ['START', 'STOP']] # makes a dictionary of tags that doesnt include start and stop, so we dont iterate through them unncessarily
#     tags.pop('START')
#     tags.pop('STOP')
    
    n = len(sentence)  # number of words in the sentence (k)
    m = len(tags)      # number of tags (u / v)
    for i in range(0,n):
        if sentence[i] not in vocabulary:
            sentence[i] = '#UNK#'
    # create a matrix to store all the pi values
    pi = [[float('-inf')] * m for _ in range(n+1)] #+1 tp account for the stop state, but we dont actually store anything theer
    backpointer = [[0] * m for _ in range(n)] # to store y*

    # base case!!!, here, we actually just initialise the first column to be pi(0,v) where u is "START". 
    # we skip the step of assigning pi(0,v) = 1 if v is start and 0 otherwise
    for i, tag in enumerate(tags): # i is the index of the tag
        t_count = tag_count[tag]
        pi[0][i] = transmission_probabilities.get(('START', tag), 0) * emission_probabilities.get((sentence[0], tag),0)

            
    #bottom up dynamic programming        
    for i in range(1, n): 
        for j, tag in enumerate(tags):
            max_prob = float('-inf')
            max_state = None
            for kk, prev_tag in enumerate(tags):
                # for now, if there's an unknown word, default to 0, but we should change this to the k/count + k thing
#                 t_count
                prob = pi[i-1][kk] * transmission_probabilities.get((prev_tag, tag), 0) * emission_probabilities.get((sentence[i], tag), 0)
                
                if prob > max_prob:
                    max_prob = prob
                    max_state = kk
            pi[i][j] = max_prob
            backpointer[i][j] = max_state # to store y*

    # termination step 
    max_prob = float('-inf')
    max_state = None
    for i, tag in enumerate(tags):
        prob = pi[n-1][i] * transmission_probabilities.get((tag, 'STOP'), 0)
        if prob > max_prob:
            max_prob = prob
            max_state = i
            
    #initialise an array for the best sequence of states         
    best_path = [tags[max_state]]
    
    #go backwards along the backpointer, iteratively finding the best state
    for i in range(n-1, 0, -1):
        max_state = backpointer[i][max_state]
        best_path.append(tags[max_state])
        
    # reverse the array to get it in the right order
    best_path.reverse()
    
    return best_path

In [50]:
def get_sentences(filepath):
    with open(filepath, 'r', encoding='utf-8') as file:
        content = file.read().strip()
    # split on double newlines which denote separated sentences
    return [sentence.split() for sentence in content.split('\n\n')]


In [51]:
def get_prediction(filepath, tag_count, transmission_probabilities, emission_probabilities, vocabulary):
    sentences = get_sentences(filepath)
#     print(sentences)
    predictions = []
    for sentence in sentences:
        best_path_prediction = viterbi_algorithm(sentence, tag_count, transmission_probabilities, emission_probabilities, vocabulary)
        predictions.append(list(zip(sentence, best_path_prediction))) #puts them in the predictions array pairwise
    return predictions
    

In [52]:
def write_tag_predictions_to_file(predictions, output_filepath):
    # open the output file for writing
    with open(output_filepath, 'w', encoding='utf-8') as file:
        for sentence in predictions:
            for word, tag in sentence:
                # write each word and its predicted tag to the file, with a spacing to separate.
                file.write(f"{word} {tag}\n")
            file.write("\n")

In [53]:
predictions = get_prediction('EN/dev.in', tag_count, transmission_probabilities, emission_probabilities, vocabulary)

In [54]:
predictions

[[('HBO', 'B-NP'),
  ('has', 'B-VP'),
  ('close', 'I-VP'),
  ('to', 'B-PP'),
  ('24', 'B-NP'),
  ('million', 'I-NP'),
  ('subscribers', 'I-NP'),
  ('to', 'B-PP'),
  ('its', 'B-NP'),
  ('HBO', 'I-NP'),
  ('and', 'O'),
  ('#UNK#', 'B-NP'),
  ('networks', 'I-NP'),
  (',', 'O'),
  ('while', 'B-SBAR'),
  ('Showtime', 'B-NP'),
  ('and', 'O'),
  ('its', 'B-NP'),
  ('sister', 'I-NP'),
  ('service', 'I-NP'),
  (',', 'O'),
  ('The', 'B-NP'),
  ('#UNK#', 'I-NP'),
  ('Channel', 'I-NP'),
  (',', 'O'),
  ('have', 'B-VP'),
  ('only', 'I-VP'),
  ('about', 'B-PP'),
  ('10', 'B-NP'),
  ('million', 'I-NP'),
  (',', 'O'),
  ('according', 'B-PP'),
  ('to', 'B-PP'),
  ('Paul', 'B-NP'),
  ('#UNK#', 'I-NP'),
  ('Associates', 'I-NP'),
  (',', 'O'),
  ('a', 'B-NP'),
  ('#UNK#', 'I-NP'),
  (',', 'O'),
  ('Calif.', 'B-NP'),
  (',', 'O'),
  ('research', 'B-NP'),
  ('firm', 'I-NP'),
  ('.', 'O')],
 [('#UNK#', 'B-NP'),
  ('#UNK#', 'I-NP'),
  ('#UNK#', 'I-NP'),
  ('after', 'B-PP'),
  ('the', 'B-NP'),
  ('stock', 'I-N

In [55]:
write_tag_predictions_to_file(predictions, 'EN/dev.p2.out')

In [64]:
!python EvalScript/evalResult.py EN/dev.out EN/dev.p2.out


#Entity in gold data: 13179
#Entity in prediction: 14279

#Correct Entity : 10858
Entity  precision: 0.7604
Entity  recall: 0.8239
Entity  F: 0.7909

#Correct Sentiment : 10056
Sentiment  precision: 0.7043
Sentiment  recall: 0.7630
Sentiment  F: 0.7325


### Part 3

In [None]:
# lets try just storing the best 2 paths at each point, then at the final stop state, 

In [65]:
#Just copy pastad this here for easy reference first.
def viterbi_algorithm2(sentence, tag_count, transmission_probabilities, emission_probabilities, unk = 0.1):

    # make sure that "sentence" is a sequence of x observations
    tags = [tag for tag in tag_count if tag not in ['START', 'STOP']] # makes a dictionary of tags that doesnt include start and stop, so we dont iterate through them unncessarily
    tag_count.pop('START')
    tag_count.pop('STOP')
    
    n = len(sentence)  # number of words in the sentence (k)
    m = len(tags)      # number of tags (u / v)

    # create a matrix to store all the pi values
    pi = [[float('-inf'), float('-inf')] * m for _ in range(n+1)] #+1 tp account for the stop state, but we dont actually store anything theer
    backpointer = [[0] * m for _ in range(n)] # to store y*

    pi_2 = [[float('-inf'), float('-inf')] * m for _ in range(n+1)]
    backpointer_2 = [[0] * m for _ in range(n)] # to store y*

    # base case!!!, here, we actually just initialise the first column to be pi(0,v) where u is "START". 
    # we skip the step of assigning pi(0,v) = 1 if v is start and 0 otherwise
    for i, tag in enumerate(tags): # i is the index of the tag
        t_count = tag_count[tag]
        pi[0][i] = transmission_probabilities.get(('START', tag), 0) * emission_probabilities.get((sentence[0], tag), 0)
#         if tag == "START":
#             pi[0,i] = 1
#         elif tag == "STOP": # we only do this because tag_count includes both start and stop states
#             continue
#         else:
#             pi[0,i] = 0
            
    #bottom up dynamic programming        
    for i in range(1, n): 
        for j, tag in enumerate(tags):
            max_prob = float('-inf')
            second_max = float('-inf')
            max_state = None
            second_max_state = None
            for kk, prev_tag in enumerate(tags):
                # for now, if there's an unknown word, default to 0, but we should change this to the k/count + k thing
                t_count
                prob = pi[i-1][kk] * transmission_probabilities.get((prev_tag, tag), 0) * emission_probabilities.get((sentence[i], tag), 0)
                prob_2 = pi_2[i-1][kk] * transmission_probabilities.get((prev_tag, tag), 0) * emission_probabilities.get((sentence[i], tag), 0)

                #changes made here to store the 2nd max
                if prob > max_prob:
                    max_prob = prob
                    max_state = kk
                elif prob_2 > second_max and prob_2 != max_prob:
                    second_max = prob_2
                    second_max_state = kk


            pi[i][j] = max_prob
            backpointer[i][j] = max_state # to store y*

            pi_2[i][j] = second_max
            backpointer_2[i][j] = second_max_state

    # termination step 
    max_prob = float('-inf')
    max_state = None
    for i, tag in enumerate(tags):
        prob = pi[n-1][i] * transmission_probabilities.get((tag, 'STOP'), 0)
        prob_2 = pi_2[n-1][i] * transmission_probabilities.get((tag, 'STOP'), 0)

        if prob > max_prob:
            max_prob = prob
            max_state = i

        elif prob_2 > second_max and prob_2 != max_prob:
            second_max = prob_2
            second_max_state = i
            
    #initialise an array for the best sequence of states         
    best_path = [tags[max_state]]
    second_best_path = [tags[second_max_state]]
    
    #go backwards along the backpointer, iteratively finding the best state
    for i in range(n-1, 0, -1):
        max_state = backpointer[i][max_state]
        second_max_state = backpointer_2[i][second_max_state]

        best_path.append(tags[max_state])
        second_best_path.append(tags[second_max_state])

        
    # reverse the array to get it in the right order
    best_path.reverse()
    second_best_path.reverse()
    
    return best_path,second_best_path

In [66]:
filepath = 'EN/dev.in'
sentences = get_sentences(filepath)
for sentence in sentences:
    print(best_path_prediction = viterbi_algorithm2(sentence, tag_count, transmission_probabilities, emission_probabilities, vocabulary))

TypeError: list indices must be integers or slices, not NoneType

### Old stuff

In [207]:
def processfile(filepath):
#     counter = 0
    with open(filepath,'r',encoding='utf-8') as f:
        ys = {}
        words = []
        lines = f.readlines()
        for line in lines:
            clean_text = line.strip().split(" ") #removes the /n and then splits it to separete the word and its label
#             print(counter)
#             counter += 1
            if len(clean_text) > 1: #sometimes clean_text = [] whenever there is an empty line between paragraphs
                word = clean_text[0]
                label = clean_text[1]
                if word not in words:
                    words.append(word)
                if label not in ys:
                    ys[label] = 1
                else:
                    ys[label] += 1
        return ys, words, lines

In [208]:
ytrain, wordstrain, linestrain = processfile("EN/train")