In [None]:
from collections import defaultdict

with open("Data/ES/train") as f:
    es = f.read().splitlines()

with open("Data/RU/train") as f:
    ru = f.read().splitlines()

In [2]:
with open("Data/ES/dev.in") as f:
    dev_in_es = f.read().splitlines()
with open("Data/ES/dev.out") as f:
    dev_out_es = f.read().splitlines()
with open("Data/RU/dev.in") as f:
    dev_in_ru = f.read().splitlines()
with open("Data/RU/dev.out") as f:
    test_out_ru = f.read().splitlines()

In [3]:
def estimate_emission_params(train_data,k=1):

    word_sentiment_counts = defaultdict(lambda: defaultdict(int))
    sentiment_counts =  defaultdict(int)
    emission_params = {}
    
    #getting the count(y) and count(y --> x)
    for sentence in train_data:
        try:
            if(sentence!=""):
                x, label = sentence.split(" ")
        except:
            continue

        sentiment_counts[label]+=1

        word_sentiment_counts[label][x]+=1



    #calculating the emission parameters

    for key in word_sentiment_counts:
        for word in word_sentiment_counts[key]:
            emission_params[(word,key)] = word_sentiment_counts[key][word]/(sentiment_counts[key])
    # print(emission_params)
    return emission_params, sentiment_counts

In [4]:
es_para, count =estimate_emission_params(es)
print(es_para[("palo", "O")])
states = []
for i in count.keys():
    states.append(i)
print("count", states)

3.238656605240146e-05
count ['O', 'B-positive', 'B-negative', 'B-neutral', 'I-neutral', 'I-positive', 'I-negative']


In [5]:
from itertools import groupby

def estimate_transition_parameters_test(sentences):
    transition_counts = {}
    state_counts = {}
    list_of_sentences = [list(sub) for ele, sub in groupby(sentences, key = bool) if ele]
    for one_sentence in list_of_sentences:
        # print(one_sentence)
        prev_state = 'START'
        for one_word in one_sentence:
            if(one_word!=""):
                #print(one_word)
                word, state = one_word.split(" ")
                if state_counts.get(prev_state):
                    state_counts[prev_state] +=1
                else:
                    state_counts[prev_state] = 1
                

                if prev_state not in transition_counts:
                    transition_counts[prev_state] = {}
                if state not in transition_counts[prev_state]:
                    transition_counts[prev_state][state] = 1
                else:
                    transition_counts[prev_state][state] += 1
                prev_state = state
        if "END" not in transition_counts[prev_state]:
            transition_counts[prev_state]["END"] = 1
        else: 
            transition_counts[prev_state]["END"] +=1
        # print(transition_counts)
    for from_state, to_states in transition_counts.items():
        # print(f"From State: {from_state}")
        for to_state, count in to_states.items():
            transition_counts[from_state][to_state] = count/state_counts[from_state]
            # print(f"  To State: {to_state}, Count: {count}")
    # print(transition_counts)    

            

    # print("transition", transition_counts, "state", state_counts)       
    return transition_counts

In [6]:
estimate_transition_parameters_test(es)

{'START': {'O': 0.9289176090468497,
  'B-positive': 0.052234787291330104,
  'B-negative': 0.014001077005923533,
  'B-neutral': 0.004846526655896607},
 'O': {'O': 0.9456845511712573,
  'B-positive': 0.038980620012503214,
  'END': 0.06773802081418012,
  'B-negative': 0.013054830287206266,
  'B-neutral': 0.002279998529033207},
 'B-positive': {'O': 0.8791304347826087,
  'I-positive': 0.11739130434782609,
  'END': 0.008695652173913044,
  'B-neutral': 0.0008695652173913044,
  'B-positive': 0.0026086956521739132},
 'B-negative': {'O': 0.8196286472148541,
  'I-negative': 0.18037135278514588,
  'END': 0.010610079575596816},
 'B-neutral': {'I-neutral': 0.20833333333333334, 'O': 0.7916666666666666},
 'I-neutral': {'I-neutral': 0.6511627906976745, 'O': 0.3488372093023256},
 'I-positive': {'I-positive': 0.5718849840255591,
  'O': 0.4281150159744409,
  'END': 0.003194888178913738},
 'I-negative': {'O': 0.39766081871345027, 'I-negative': 0.6023391812865497}}

In [37]:
import math

def viterbi_algorithm_2(sentence, transition_params, emission_params, states):
    n = len(sentence)
    num_states = len(states)
    viterbi = [{} for _ in range(n)]
    backpointers = [{} for _ in range(n)]

    # Initialization at time step 0
    for state in states:
        emission_prob = emission_params.get((sentence[0], state), 1e-10)
        viterbi[0][state] = math.log(transition_params['START'].get(state, 1e-10)) + math.log(emission_prob)
        backpointers[0][state] = 'START'

    # Forward pass
    for t in range(1, n):
        for state in states:
            max_prob = float('-inf')
            prev_state = None
            for prev_state in states:
                transition_prob = transition_params[prev_state].get(state, 1e-10)
                emission_prob = emission_params.get((sentence[t], state), 1e-10)
                prob = viterbi[t - 1].get(prev_state,1e-10) + math.log(transition_prob) + math.log(emission_prob)
                if prob > max_prob:
                    max_prob = prob
                    backpointers[t][state] = prev_state
            viterbi[t][state] = max_prob

    # Termination step
    max_prob = float('-inf')
    final_state = None
    for state in states:
        # print(viterbi[n - 1][state])
        transition_prob = transition_params[state].get('STOP', 1e-10)
        prob = viterbi[n - 1][state] + math.log(transition_prob)
        if prob > max_prob:
            max_prob = prob
            final_state = state

    # Backtracking step
    best_path = [final_state]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointers[t][best_path[0]])

    return best_path

def run_viterbi_on_dev_set_2(dev_set, transition_params, emission_params, states):
    output = []
    list_of_sentences = [list(sub) for ele, sub in groupby(dev_set, key = bool) if ele]
    for sentence in list_of_sentences:
        best_path = viterbi_algorithm_2(sentence, transition_params, emission_params, states)
        output.append(best_path)

    return output




In [38]:
# Assume train_set, dev_in, and dev_out are lists of sentences and words
# with their corresponding tags

# Train the model on the training set
transition_params = estimate_transition_parameters_test(es)
emission_params,count = estimate_emission_params(es)
# print(transition_params.get("START",1e-10))
states = []
for i in count.keys():
    states.append(i)
# Run Viterbi algorithm on the development set
predicted_tags = run_viterbi_on_dev_set(dev_in_es, transition_params, emission_params, states)

predicted_tags_2 = run_viterbi_on_dev_set_2(dev_in_es, transition_params, emission_params, states)
# Compute metrics
def actual_tags(test_set):
    tags =[]
    list_of_sentences = [list(sub) for ele, sub in groupby(test_set, key = bool) if ele]
    for sentence in list_of_sentences:
        innerlist =[]
        for word in sentence:
            w, state = word.split()
            innerlist.append(state)
        tags.append(innerlist)
    return tags

# precision, recall, f_score = compute_metrics(actual_tags(dev_out_es), predicted_tags)
precision = scores(actual_tags(dev_out_es), predicted_tags)
print("Precision:", precision)
precision_2 = scores(actual_tags(dev_out_es), predicted_tags_2)
print("Precision2:", precision_2)
# print("Recall:", recall)
# print("F-score:", f_score)


3998 4312
Precision: 0.9271799628942486
4032 4312
Precision2: 0.935064935064935
