In [1]:
from collections import defaultdict

with open("Data/ES/train") as f:
    es = f.read().splitlines()

with open("Data/RU/train") as f:
    ru = f.read().splitlines()

In [2]:
with open("Data/ES/dev.in") as f:
    dev_in_es = f.read().splitlines()
with open("Data/ES/dev.out") as f:
    dev_out_es = f.read().splitlines()
with open("Data/RU/dev.in") as f:
    dev_in_ru = f.read().splitlines()
with open("Data/RU/dev.out") as f:
    test_out_ru = f.read().splitlines()

In [3]:
def estimate_emission_params(train_data,k=1):

    word_sentiment_counts = defaultdict(lambda: defaultdict(int))
    sentiment_counts =  defaultdict(int)
    emission_params = {}
    
    #getting the count(y) and count(y --> x)
    for sentence in train_data:
        try:
            if(sentence!=""):
                x, label = sentence.split(" ")
        except:
            continue

        sentiment_counts[label]+=1

        word_sentiment_counts[label][x]+=1



    #calculating the emission parameters

    for key in word_sentiment_counts:
        for word in word_sentiment_counts[key]:
            emission_params[(word,key)] = word_sentiment_counts[key][word]/(sentiment_counts[key])
    # print(emission_params)
    return emission_params, sentiment_counts

In [4]:
es_para, count =estimate_emission_params(es)
print(es_para[("palo", "O")])
states = []
for i in count.keys():
    states.append(i)
print("count", states)

3.238656605240146e-05
count ['O', 'B-positive', 'B-negative', 'B-neutral', 'I-neutral', 'I-positive', 'I-negative']


In [7]:
def format_sentences(data):
    formatted =[]
    innerlist = []
    for sentence in data:
        try:
            if(sentence!=""):
                x, label = sentence.split(" ")
                innerlist.append(label)
            if(sentence==""):
                formatted.append(innerlist.copy())
                innerlist.clear()
                
        except:
            continue
    # print(formatted)
    return formatted

In [13]:
from itertools import groupby

def estimate_transition_parameters(sentences):
    transition_counts = {}
    state_counts = {}
    list_of_sentences = format_sentences(sentences)
    for one_sentence in list_of_sentences:
        # print(one_sentence)
        prev_state = 'START'
        for state in one_sentence:
            if state_counts.get(prev_state):
                state_counts[prev_state] +=1
            else:
                state_counts[prev_state] = 1

            if prev_state not in transition_counts:
                transition_counts[prev_state] = {}
            if state not in transition_counts[prev_state]:
                transition_counts[prev_state][state] = 1
            else:
                transition_counts[prev_state][state] += 1
            prev_state = state
        if "END" not in transition_counts[prev_state]:
            transition_counts[prev_state]["END"] = 1
            state_counts[prev_state] +=1
        else: 
            transition_counts[prev_state]["END"] +=1
            state_counts[prev_state] +=1
    # print(state_counts) 


    # Calculate transition probabilities and assign a default value of 0 if the entry doesn't exist
    for from_state, to_states in transition_counts.items():
        for state, x in to_states.items():
            # print(from_state,state)
            if state not in transition_counts[from_state]:
                transition_counts[from_state][state] = 0
            else:
                transition_counts[from_state][state] = transition_counts[from_state][state] / state_counts[from_state]

            

    # print("transition", transition_counts, "state", state_counts)       
    return transition_counts

In [14]:
estimate_transition_parameters(es)

{'START': {'O': 0.9289176090468497,
  'B-positive': 0.052234787291330104,
  'B-negative': 0.014001077005923533,
  'B-neutral': 0.004846526655896607},
 'O': {'O': 0.8856896848630963,
  'B-positive': 0.03650766316514551,
  'END': 0.06344067504735663,
  'B-negative': 0.012226623041157224,
  'B-neutral': 0.0021353538832443605},
 'B-positive': {'O': 0.871551724137931,
  'I-positive': 0.11637931034482758,
  'END': 0.008620689655172414,
  'B-neutral': 0.0008620689655172414,
  'B-positive': 0.002586206896551724},
 'B-negative': {'O': 0.8110236220472441,
  'I-negative': 0.1784776902887139,
  'END': 0.010498687664041995},
 'B-neutral': {'I-neutral': 0.20833333333333334, 'O': 0.7916666666666666},
 'I-neutral': {'I-neutral': 0.6511627906976745, 'O': 0.3488372093023256},
 'I-positive': {'I-positive': 0.5700636942675159,
  'O': 0.4267515923566879,
  'END': 0.0031847133757961785},
 'I-negative': {'O': 0.39766081871345027, 'I-negative': 0.6023391812865497}}

In [15]:
import math

def viterbi_algorithm(sentence, transition_params, emission_params, states):
    n = len(sentence)
    viterbi = [{} for _ in range(n)]
    backpointers = [{} for _ in range(n)]

    # Initialization at time step 0
    for state in states:
        emission_prob = emission_params.get((sentence[0], state), 1e-10)
        viterbi[0][state] = math.log(transition_params['START'].get(state, 1e-10)) + math.log(emission_prob)
        backpointers[0][state] = 'START'

    # Forward pass
    for t in range(1, n):
        for state in states:
            max_prob = float('-inf')
            prev_state = None
            for prev_state in states:
                transition_prob = transition_params[prev_state].get(state, 1e-10)
                emission_prob = emission_params.get((sentence[t], state), 1e-10)
                prob = viterbi[t - 1].get(prev_state,1e-10) + math.log(transition_prob) + math.log(emission_prob)
                if prob > max_prob:
                    max_prob = prob
                    backpointers[t][state] = prev_state
            viterbi[t][state] = max_prob

    # Termination step
    max_prob = float('-inf')
    final_state = None
    for state in states:
        # print(viterbi[n - 1][state])
        transition_prob = transition_params[state].get('STOP', 1e-10)
        prob = viterbi[n - 1][state] + math.log(transition_prob)
        if prob > max_prob:
            max_prob = prob
            final_state = state

    # Backtracking step
    best_path = [final_state]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointers[t][best_path[0]])

    return best_path

def run_viterbi_on_dev_set(dev_set, transition_params, emission_params, states):
    output = []
    list_of_sentences = [list(sub) for ele, sub in groupby(dev_set, key = bool) if ele]
    for sentence in list_of_sentences:
        best_path = viterbi_algorithm(sentence, transition_params, emission_params, states)
        output.append(best_path)

    return output

In [16]:
def actual_word(test_set): 
    tags =[]
    list_of_sentences = [list(sub) for ele, sub in groupby(test_set, key = bool) if ele]
    for sentence in list_of_sentences:
        innerlist =[]
        for word in sentence:
            innerlist.append(word)
        tags.append(innerlist)
    return tags

In [17]:
transition_params_es = estimate_transition_parameters(es)
emission_params_es,count = estimate_emission_params(es)
transition_params_ru = estimate_transition_parameters(ru)
emission_params_ru,count = estimate_emission_params(ru)

states = []
for i in count.keys():
    states.append(i)
predicted_tags_ES = run_viterbi_on_dev_set(dev_in_es, transition_params_es, emission_params_es, states)
predicted_tags_RU = run_viterbi_on_dev_set(dev_in_ru, transition_params_ru, emission_params_ru, states)


actual_words_ES = actual_word(dev_in_es)
actual_words_RU = actual_word(dev_in_ru)

In [18]:
import sys

def compute_output(words, predicted_tags):
    output_captured = []

    for word, pred_tag in zip(words, predicted_tags):
        for w, pred in zip(word, pred_tag):
            output_captured.append(w + " " + pred)  # Append the combined word and tag
        output_captured.append("")  # Append an empty line after each sentence
    return output_captured  # Return the captured output as a list


# Call the compute_output function with actual_words and sentences_8
output_captured_ES = compute_output(actual_words_ES, predicted_tags_ES)
output_captured_RU = compute_output(actual_words_RU, predicted_tags_RU)

# Write the captured output to a text file
output_filename = "dev.p2.out"  # Change this to your desired filename
output_path_ES = "Data/ES/"  # Change this to your desired folder
output_path_RU = "Data/RU/"

with open(output_path_ES + output_filename, "w") as file:
    for line in output_captured_ES:
        file.write(line + "\n")

with open(output_path_RU + output_filename, "w") as file:
    for line in output_captured_RU:
        file.write(line + "\n")