# 50.007 Machine Learning Project
____

In [2]:
import pandas as pd
import math
import copy
import numpy as np
from collections import Counter, defaultdict


In [3]:
def read_dataset(path, labeled=True):
    with open(path, encoding="utf8") as fp:
        data_read = []
        sentence = []
        for line in fp:
            if line == "\n":
                data_read.append(sentence)
                sentence = []
            else:
                if labeled:
                    tokens = line.strip().split()
                    sentence.append((' '.join(tokens[:-1]), tokens[-1]))
                else:
                    sentence.append((line.strip()))
    return data_read

## 1.1 Calculating MLE
#### **Estimating Emission Parameters**

In [4]:
def get_emissions(data):
    label_counts = defaultdict(int)
    emission_counts = defaultdict(lambda:defaultdict(int))
    for sentence in data:
        for word, label in sentence:
            label_counts[label] += 1
            emission_counts[label][word] += 1
    emissions = defaultdict(float)
    for sentence in data:
        for word,label in sentence:
            emission_prob = emission_counts[label][word]/label_counts[label]
            emissions[(word, label)] = emission_prob
    return emissions

## 1.2 Calculating MLE
#### **Estimating Emission Parameters accounting for unknown words**

In [5]:
def get_e(data, k=1):
    words_list = [x for sentence in data for(x,y) in sentence] 
    #all unique words in the training set
    words = set(words_list)
    label_counts = defaultdict(int)
    emission_counts = defaultdict(lambda:defaultdict(int))
    for sentence in data:
        for word, label in sentence:
            label_counts[label] += 1
            emission_counts[label][word] += 1
    emissions = defaultdict(float)
    for sentence in data:
        for word,label in sentence:
            emission_prob = emission_counts[label][word]/label_counts[label]
            emissions[(word, label)] = emission_prob
    #this function is designed for testing phase so that we can safely handle new words that do not exist in the training set
    def e(x,y): #x:word, y:label
        if x not in words: 
            return k/(label_counts[y] + k)
        else:
            return emissions[(x,y)]
    return e, emissions

ES_data = read_dataset("./Data/ES/train")
e, emissions = get_e(ES_data)
value = e('PLATO', 'B-positive')
print(e('Plato','B-positive'))
print(value)
np.log(value)

0.0
0.0008613264427217916


-7.057036981697891

## 1.3 Simple Sentiment Analysis
#### **Implementing a simple sentiment analysis system**

In [6]:
def write_file(data, path):
    with open(path, "w", encoding='utf8') as fp:
        for sentence in data:
            for row in sentence:
                fp.write(" ".join(row)+'\n') # rebuild the rows
            fp.write("\n")


In [7]:
def checkforUNK(dev,words):
    dev_2 = copy.deepcopy(dev)
    for sentence in dev_2:
            for i, word in enumerate(sentence):
                if word not in words:
                    sentence[i] = "#UNK#"
    return dev_2

In [8]:
def simple_sentiment(dataset, trainset, output_path):
    labels_list = [y for sentence in trainset for (x,y) in sentence] 
    #all unique words in the training set
    labels = set(labels_list)
    words = set([x for sentence in trainset for (x,y) in sentence])
    dataset = checkforUNK(dataset, words)
    e, emissions = get_e(trainset)
    output=[]
    for sentence in dataset:
        sentence_wlabels=[]
        for word in sentence:
            max_p = 0
            label = None
            for y in labels:
                p = e(word,y)
                if p >max_p:
                    max_p = p
                    label = y
            sentence_wlabels.append((word,label))
        output.append(sentence_wlabels)
    write_file(output, output_path)

In [9]:
ES_dev = read_dataset('./Data/ES/dev.in', labeled=False)
ES_data = read_dataset("./Data/ES/train")
simple_sentiment(ES_dev ,ES_data, "Data/ES/dev.p1.out")
print("ES Evaluation")
!python3 EvalScript/evalResult.py Data/ES/dev.out Data/ES/dev.p1.out

ES Evaluation

#Entity in gold data: 229
#Entity in prediction: 1466

#Correct Entity : 178
Entity  precision: 0.1214
Entity  recall: 0.7773
Entity  F: 0.2100

#Correct Sentiment : 97
Sentiment  precision: 0.0662
Sentiment  recall: 0.4236
Sentiment  F: 0.1145


In [10]:
RU_dev = read_dataset('./Data/RU/dev.in', labeled=False)
RU_data = read_dataset("./Data/RU/train")
simple_sentiment(RU_dev ,RU_data, "Data/RU/dev.p1.out")
print("RU Evaluation")
!python3 EvalScript/evalResult.py Data/RU/dev.out Data/RU/dev.p1.out

RU Evaluation

#Entity in gold data: 389
#Entity in prediction: 1816

#Correct Entity : 266
Entity  precision: 0.1465
Entity  recall: 0.6838
Entity  F: 0.2413

#Correct Sentiment : 129
Sentiment  precision: 0.0710
Sentiment  recall: 0.3316
Sentiment  F: 0.1170


## 2.1 Estimating Transition Probabilities


In [11]:
def estimate_transition_parameters_test(data):
    transition_counts = defaultdict(lambda: defaultdict(int))
    state_counts = defaultdict(int)
    state_counts['START'] = len(data)
    state_counts['STOP'] = len(data)
    # # Group sentences
    # list_of_sentences = [list(sub) for ele, sub in groupby(sentences, key=bool) if ele]
    # Count state and transition occurrences
    for sentence in data:
        prev_state = "START"
        n = len(sentence)
        for i in range(n):
            state = sentence[i][1]
            state_counts[state] += 1
            if i == n-1:
                transition_counts[prev_state][state] +=1
                transition_counts[state]["STOP"] += 1
            else:
                transition_counts[prev_state][state] += 1 
                prev_state = state
    transitions = defaultdict(lambda: defaultdict(float))
    for prev_state in transition_counts:
        for curr_state in transition_counts[prev_state]:
            transition_prob = transition_counts[prev_state][curr_state]/(state_counts[prev_state])
            transitions[prev_state][curr_state] = transition_prob 
    def q(x,y):
        return transitions[x][y]
    return q, transitions
  
# Estimate transition parameters
transition_params = estimate_transition_parameters_test(ES_data)
print("Transition Parameters:", transition_params)


Transition Parameters: (<function estimate_transition_parameters_test.<locals>.q at 0x7fb1f1acbf70>, defaultdict(<function estimate_transition_parameters_test.<locals>.<lambda> at 0x7fb2007acca0>, {'START': defaultdict(<class 'float'>, {'O': 0.9289176090468497, 'B-positive': 0.052234787291330104, 'B-negative': 0.014001077005923533, 'B-neutral': 0.004846526655896607}), 'O': defaultdict(<class 'float'>, {'O': 0.8856896848630963, 'B-positive': 0.03650766316514551, 'STOP': 0.06344067504735663, 'B-negative': 0.012226623041157224, 'B-neutral': 0.0021353538832443605}), 'B-positive': defaultdict(<class 'float'>, {'O': 0.871551724137931, 'I-positive': 0.11637931034482758, 'STOP': 0.008620689655172414, 'B-neutral': 0.0008620689655172414, 'B-positive': 0.002586206896551724}), 'B-negative': defaultdict(<class 'float'>, {'O': 0.8110236220472441, 'I-negative': 0.1784776902887139, 'STOP': 0.010498687664041995}), 'B-neutral': defaultdict(<class 'float'>, {'I-neutral': 0.20833333333333334, 'O': 0.79166

In [12]:
# def viterbi_algorithm(sentence, q, e, states):
#     pi = [{}]
#     for state in states:
#         if state == "START":
#             pi[-1][state] = 1
#         else:
#             pi[-1][state] = np.log(1e-10)

#     n = len(sentence)
#     for i in range(n):
#         word = sentence[i]
#         pi.append({})
#         for state in states:
#             pi[-1][state]= max([p + np.log(q(u, state)+1e-10) + np.log(e(word, state)+1e-10) for u, p in pi[-2].items()])
#     pi.append({})
#     pi[-1]['STOP'] = max([pi[-2][state] +np.log(q(state, 'STOP')+1e-10) for state in states])
    
#     print(pi)
#     #predicting states for sentence
#     labels = [None] * n
#     next = 'STOP'
#     for i in range(n, 0, -1):
#         max_p = np.log(0)
#         best_state = None
#         #start from the back
#         for state in states:
#             print(state)
#             prob= pi[i][state]+ np.log(q(next, state)+1e-10)
#             print(prob,state)
#             if prob >= max_p:
#                 max_p = prob
#                 best_state = state
#         next = best_state
#         print(next)
#         labels[i - 1] = best_state
#     return labels

# def viterbi(dataset, train_set):
#     states = set([y for sentence in train_set for (x, y) in sentence])
#     q,transitions= estimate_transition_parameters_test(train_set)
#     e, emissions= get_e(train_set, k=1)
#     output = []
#     for sentence in dataset:
#         labels = viterbi_algorithm(sentence, q, e, states)
#         sentence = list(zip(sentence, labels)) # new sentence
#         output.append(sentence)
#     return output

## 2.2 Viterbi Algorithm

In [13]:
def viterbi_algorithm(sentence, transition_params,e,states):
    n = len(sentence)
    num_states = len(states)
    pi = [{} for i in range(n)]
    backpointers = [{} for i in range(n)]


    for state in states:
        emission_prob = max(e(sentence[0],state),1e-10)
        pi[0][state] = math.log(transition_params['START'].get(state, 1e-10)) + math.log(emission_prob)
        backpointers[0][state] = 'START'

    # Forward pass
    for t in range(1, n):
        for state in states:
            max_prob = float('-inf')
            prev_state = None
            for prev_state in states:
                transition_prob = transition_params[prev_state].get(state, 1e-10)
                emission_prob = max(e(sentence[t],state),1e-10)
                prob = pi[t - 1].get(prev_state, 1e-10) + math.log(transition_prob) + math.log(emission_prob)
                if prob > max_prob:
                    max_prob = prob
                    backpointers[t][state] = prev_state
            pi[t][state] = max_prob

    # Termination step
    max_prob = float('-inf')
    final_state = None
    for state in states:
        transition_prob = transition_params[state].get('STOP', 1e-10)
        prob = pi[n - 1][state] + math.log(transition_prob)
        if prob > max_prob:
            max_prob = prob
            final_state = state

    # Backtracking step
    best_path = [final_state]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointers[t][best_path[0]])

    return best_path

# Run Viterbi algorithm on the development set using viterbi_algorithm_2
def viterbi(dev, transition_params, e,states,words):
    output = []
    dev = checkforUNK(dev,words)
    print(dev)
    for sentence in dev:
        best_path = viterbi_algorithm(sentence, transition_params, e, states)
        sentence = list(zip(sentence, best_path)) # new sentence
        output.append(sentence)
    return output

ES_dev = read_dataset('./Data/ES/dev.in', labeled=False)
ES_data = read_dataset("./Data/ES/train")
print(ES_dev)
q, transition_params = estimate_transition_parameters_test(ES_data)
e,emissions = get_e(ES_data, k=1)
words = set([x for sentence in ES_data for (x,y) in sentence ])
states = set([y for sentence in ES_data for (x,y) in sentence ])
output = viterbi(ES_dev, transition_params, e,states,words)
print(output)

[['Plato', 'degustación', ':', 'un', 'poco', 'abundante', 'de', 'más', ',', 'pero', 'bien', 'cocinado', '.'], ['restaurante', 'excelente', 'con', 'carne', 'de', 'alta', 'calidad', '.'], ['Las', 'posibilidades', 'en', 'el', 'restaurante', 'son', 'fundamentalmente', 'tres', ';', 'carta', 'normal', ',', 'menú', 'degustacion', 'y', 'una', 'opción', 'intermedia', 'que', 'es', 'una', 'selección', 'de', 'primeros', 'y', 'postres', 'y', 'carta', 'para', 'el', 'segundo', '.'], ['No', 'perderse', 'el', 'sorbete', 'de', 'mojito', '.'], ['para', 'mi', 'perfecto', '!'], ['Devolucion', 'a', 'cocina', ',', 'amabilidad', 'de', 'camarera', ',', 'requerimiento', 'de', 'cuenta', 'y', 'adios', '.'], ['Así', 'como', 'el', 'romesco', ',', 'que', 'era', 'un', 'poco', '"', 'de', 'bote', '"', '.'], ['Destacar', 'los', 'arroces', ',', 'la', 'caldereta', 'de', 'bogavante', ',', 'las', 'zamburiñas', 'al', 'horno', 'y', 'los', 'platos', 'de', '"', 'picoteo', '"', 'y', 'los', 'pescados', 'en', 'general', '.'], ['So

In [14]:
ES_dev = read_dataset('./Data/ES/dev.in', labeled=False)
ES_data = read_dataset("./Data/ES/train")
q, transition_params = estimate_transition_parameters_test(ES_data)
e,emissions = get_e(ES_data, k=1)
states = set([y for sentence in ES_data for (x,y) in sentence ])
words = set([x for sentence in ES_data for (x,y) in sentence ])

output = viterbi(ES_dev, transition_params, e,states,words)
write_file(output, './Data/ES/dev.p2.out')
print("ES Viterbi Evaluation")
!python3 EvalScript/evalResult.py Data/ES/dev.out Data/ES/dev.p2.out

[['Plato', 'degustación', ':', 'un', 'poco', 'abundante', 'de', 'más', ',', 'pero', 'bien', 'cocinado', '.'], ['restaurante', 'excelente', 'con', 'carne', 'de', 'alta', 'calidad', '.'], ['Las', '#UNK#', 'en', 'el', 'restaurante', 'son', '#UNK#', 'tres', ';', 'carta', 'normal', ',', 'menú', 'degustacion', 'y', 'una', 'opción', '#UNK#', 'que', 'es', 'una', 'selección', 'de', 'primeros', 'y', 'postres', 'y', 'carta', 'para', 'el', 'segundo', '.'], ['No', '#UNK#', 'el', 'sorbete', 'de', '#UNK#', '.'], ['para', 'mi', 'perfecto', '!'], ['#UNK#', 'a', 'cocina', ',', 'amabilidad', 'de', 'camarera', ',', '#UNK#', 'de', 'cuenta', 'y', 'adios', '.'], ['Así', 'como', 'el', '#UNK#', ',', 'que', 'era', 'un', 'poco', '"', 'de', 'bote', '"', '.'], ['#UNK#', 'los', 'arroces', ',', 'la', '#UNK#', 'de', 'bogavante', ',', 'las', '#UNK#', 'al', 'horno', 'y', 'los', 'platos', 'de', '"', '#UNK#', '"', 'y', 'los', 'pescados', 'en', 'general', '.'], ['Somos', 'clientes', '#UNK#', '.', '#UNK#', 'que', 'lo', '#U

In [15]:
RU_dev = read_dataset('./Data/RU/dev.in', labeled=False)
RU_data = read_dataset("./Data/RU/train")
q, transition_params = estimate_transition_parameters_test(RU_data)
e,emissions = get_e(RU_data, k=1)
states = set([y for sentence in RU_data for (x,y) in sentence ])
words = set([x for sentence in RU_data for (x,y) in sentence ])

output = viterbi(RU_dev, transition_params, e,states,words)
write_file(output, './Data/RU/dev.p2.out')
print("RU Viterbi Evaluation")
!python3 EvalScript/evalResult.py Data/RU/dev.out Data/RU/dev.p2.out

[['Интерьер', ',', 'интерьер', ',', 'и', 'еще', 'раз', 'интерьер', '!', '!', '!', 'общее', '#UNK#', 'решение', 'и', '#UNK#', '#UNK#', '-', 'просто', '#UNK#', '!', '!', '!', 'особенно', 'на', '#UNK#', 'с', '#UNK#', 'сейчас', '#UNK#', ',', 'Дача', '-', 'самое', '#UNK#', 'и', 'при', 'этом', 'очень', '#UNK#', 'место', '.'], ['#UNK#', 'счет', 'оказался', 'весьма', 'приличным', '.', 'но', 'только', '#UNK#', 'из-за', 'вина', '.'], ['То', 'же', 'касается', 'и', '#UNK#', 'вида', 'официанток', '.'], ['Затем', 'мы', 'попросили', 'счет', ',', 'его', 'так', 'же', 'долго', 'не', 'могли', 'принести', ',', 'как', 'и', 'заказ', '.'], ['В', 'день', 'банкета', 'с', 'первой', 'же', '#UNK#', 'в', 'ресторане', '#UNK#', 'настроение', 'праздника', 'весь', 'персонал', 'улыбался', 'и', '#UNK#', '!', 'было', 'очень', 'приятно', '!', '=)', 'кухня', '#UNK#', ',', 'все', 'гости', 'наелись', '"', 'до', '#UNK#', '"', 'все', 'было', 'очень', 'вкусно', '!', 'превосходная', '#UNK#', 'на', 'гриле', 'и', 'хачапури', '!'],

## 3.1 Viterbi Algorithm with kth-best sequence

In [16]:
def viterbi_algorithm(sentence, transition_params,e,states, k):
    n = len(sentence)
    num_states = len(states)
    pi = [{} for i in range(n)]
    backpointers = [{} for i in range(n)]


    for state in states:
        emission_prob = max(e(sentence[0],state),1e-10)
        pi[0][state] = math.log(transition_params['START'].get(state, 1e-10)) + math.log(emission_prob)
        backpointers[0][state] = 'START'

    # Forward pass
    for t in range(1, n):
        for state in states:
            max_prob = float('-inf')
            prev_state = None
            prob_list=[]
            for prev_state in states:
                transition_prob = transition_params[prev_state].get(state, 1e-10)
                emission_prob = max(e(sentence[t],state),1e-10)
                prob = pi[t - 1].get(prev_state, 1e-10) + math.log(transition_prob) + math.log(emission_prob)
                prob_list.append((prev_state,prob))
    
            sorted_prob_list = sorted(prob_list, key=lambda x: x[1], reverse=True)
            sorted_states = sorted_prob_list[:k]
            max_prob = sorted_states[-1][1]
            backpointers[t][state] = sorted_states[-1][0]
            # print(backpointers[t][state])
            pi[t][state] = max_prob

    # Termination step
    max_prob = float('-inf')
    final_state = None
    for state in states:
        transition_prob = transition_params[state].get('STOP', 1e-10)
        prob = pi[n - 1][state] + math.log(transition_prob)
        if prob > max_prob:
            max_prob = prob
            final_state = state

    # Backtracking step
    best_path = [final_state]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointers[t][best_path[0]])
    # print(best_path)
    return best_path

# Run Viterbi algorithm on the development set using viterbi_algorithm_2
def modified_viterbi(dev, transition_params, e,states,words,k):
    output = []
    dev = checkforUNK(dev,words)
    for sentence in dev:
        best_path = viterbi_algorithm(sentence, transition_params, e, states,k)
        sentence = list(zip(sentence, best_path)) # new sentence
        output.append(sentence)
    return output
ES_dev = read_dataset('./Data/ES/dev.in', labeled=False)
ES_data = read_dataset("./Data/ES/train")
q, transition_params = estimate_transition_parameters_test(ES_data)
e,emissions = get_e(ES_data, k=1)
states = set([y for sentence in ES_data for (x,y) in sentence ])
words = set([x for sentence in ES_data for (x,y) in sentence ])

k = 1
print(f"the {k} best sequence" ,modified_viterbi(ES_dev, transition_params, e,states,words,k))
k = 2
print(f"the {k} best sequence" ,modified_viterbi(ES_dev, transition_params, e,states,words,k))
k = 8
print(f"the {k} best sequence" ,modified_viterbi(ES_dev, transition_params, e,states,words,k))


the 1 best sequence [[('Plato', 'B-negative'), ('degustación', 'I-negative'), (':', 'O'), ('un', 'O'), ('poco', 'O'), ('abundante', 'O'), ('de', 'O'), ('más', 'O'), (',', 'O'), ('pero', 'O'), ('bien', 'O'), ('cocinado', 'O'), ('.', 'O')], [('restaurante', 'O'), ('excelente', 'O'), ('con', 'O'), ('carne', 'B-positive'), ('de', 'O'), ('alta', 'O'), ('calidad', 'O'), ('.', 'O')], [('Las', 'O'), ('#UNK#', 'B-positive'), ('en', 'O'), ('el', 'O'), ('restaurante', 'O'), ('son', 'O'), ('#UNK#', 'B-negative'), ('tres', 'I-negative'), (';', 'O'), ('carta', 'O'), ('normal', 'O'), (',', 'O'), ('menú', 'B-positive'), ('degustacion', 'I-positive'), ('y', 'O'), ('una', 'O'), ('opción', 'O'), ('#UNK#', 'B-positive'), ('que', 'O'), ('es', 'O'), ('una', 'O'), ('selección', 'O'), ('de', 'O'), ('primeros', 'O'), ('y', 'O'), ('postres', 'B-positive'), ('y', 'O'), ('carta', 'O'), ('para', 'O'), ('el', 'O'), ('segundo', 'O'), ('.', 'O')], [('No', 'O'), ('#UNK#', 'B-positive'), ('el', 'O'), ('sorbete', 'B-pos

In [17]:
ES_dev = read_dataset('./Data/ES/dev.in', labeled=False)
ES_data = read_dataset("./Data/ES/train")
q, transition_params = estimate_transition_parameters_test(ES_data)
e,emissions = get_e(ES_data, k=1)
states = set([y for sentence in ES_data for (x,y) in sentence ])
words = set([x for sentence in ES_data for (x,y) in sentence ])

best_output = modified_viterbi(ES_dev, transition_params, e,states,words,1)
second_best_output = modified_viterbi(ES_dev, transition_params, e,states,words,2)
eight_best_ouput = modified_viterbi(ES_dev, transition_params, e,states,words,8)

write_file(second_best_output, './Data/ES/dev.p3.2nd.out')
write_file(eight_best_ouput, './Data/ES/dev.p3.8th.out')

In [18]:
print("Evaluation for 2nd best sequence")
!python3 EvalScript/evalResult.py Data/ES/dev.out Data/ES/dev.p3.2nd.out
print("\n")
print("Evaluation for 8th best sequence")
!python3 EvalScript/evalResult.py Data/ES/dev.out Data/ES/dev.p3.8th.out


Evaluation for 2nd best sequence

#Entity in gold data: 229
#Entity in prediction: 2252

#Correct Entity : 123
Entity  precision: 0.0546
Entity  recall: 0.5371
Entity  F: 0.0992

#Correct Sentiment : 77
Sentiment  precision: 0.0342
Sentiment  recall: 0.3362
Sentiment  F: 0.0621


Evaluation for 8th best sequence

#Entity in gold data: 229
#Entity in prediction: 4046

#Correct Entity : 191
Entity  precision: 0.0472
Entity  recall: 0.8341
Entity  F: 0.0894

#Correct Sentiment : 98
Sentiment  precision: 0.0242
Sentiment  recall: 0.4279
Sentiment  F: 0.0458


In [19]:
RU_dev = read_dataset('./Data/RU/dev.in', labeled=False)
RU_data = read_dataset("./Data/RU/train")
q, transition_params = estimate_transition_parameters_test(RU_data)
e,emissions = get_e(RU_data, k=1)
states = set([y for sentence in RU_data for (x,y) in sentence ])
words = set([x for sentence in RU_data for (x,y) in sentence ])


best_output = modified_viterbi(RU_dev, transition_params, e,states,words,1)
second_best_output = modified_viterbi(RU_dev, transition_params, e,states,words,2)
eight_best_ouput = modified_viterbi(RU_dev, transition_params, e,states,words,8)

write_file(second_best_output, './Data/RU/dev.p3.2nd.out')
write_file(eight_best_ouput, './Data/RU/dev.p3.8th.out')

In [20]:
print("Evaluation for 2nd best sequence")
!python3 EvalScript/evalResult.py Data/RU/dev.out Data/RU/dev.p3.2nd.out
print("\n")
print("Evaluation for 8th best sequence")
!python3 EvalScript/evalResult.py Data/RU/dev.out Data/RU/dev.p3.8th.out

Evaluation for 2nd best sequence

#Entity in gold data: 389
#Entity in prediction: 3441

#Correct Entity : 163
Entity  precision: 0.0474
Entity  recall: 0.4190
Entity  F: 0.0851

#Correct Sentiment : 108
Sentiment  precision: 0.0314
Sentiment  recall: 0.2776
Sentiment  F: 0.0564


Evaluation for 8th best sequence

#Entity in gold data: 389
#Entity in prediction: 6152

#Correct Entity : 300
Entity  precision: 0.0488
Entity  recall: 0.7712
Entity  F: 0.0917

#Correct Sentiment : 122
Sentiment  precision: 0.0198
Sentiment  recall: 0.3136
Sentiment  F: 0.0373
