In [3]:
with open("Data/ES/train", 'rb') as file:
    EStrain_content = file.read()

with open("Data/RU/train", 'rb') as file:
    RUtrain_content = file.read()

with open("Data/ES/dev.in", 'rb') as file:
    ESin_content = file.read()

with open("Data/RU/dev.in", 'rb') as file:
    RUin_content = file.read()

estrain = [line.decode('utf-8')  for line in EStrain_content.split(b'\n')]
rutrain = [line.decode('utf-8') for line in RUtrain_content.split(b'\n')]
esin = [line.decode('utf-8') for line in ESin_content.split(b'\n')]
ruin = [line.decode('utf-8') for line in RUin_content.split(b'\n')]



def estimate_emissions(data, k=1):
    sentiment_count = {}
    emission_parameters = {}
    
    for line in data:
        if line == "" or ' ' not in line:  # Modified this condition
            continue
        
        word, sentiment = line.rsplit(' ', 1)
        if sentiment not in sentiment_count.keys():
            sentiment_count[sentiment] = 1
        else:
            sentiment_count[sentiment] += 1
            
        if word not in emission_parameters.keys():
            emission_parameters[word] = {sentiment:1}
        elif word in emission_parameters.keys() and sentiment not in emission_parameters[word]:
            emission_parameters[word][sentiment] = 1
        else:
            emission_parameters[word][sentiment] += 1
            
    emission_parameters["#UNK#"] = {}
    
    for sentiment in sentiment_count.keys():
        emission_parameters['#UNK#'][sentiment] = k
        sentiment_count[sentiment] += k
        
    for word, sentiment in emission_parameters.items():
        for sentiment, count in sentiment.items():
            emission_parameters[word][sentiment] = count/sentiment_count[sentiment]
            
    return emission_parameters


def sentiment_analysis(data):
    params = estimate_emissions(data)
    tag_dict = {}
    for word, sentiment in params.items():
        tag = max(sentiment, key = sentiment.get)
        tag_dict[word] = tag
    return tag_dict
        
estags = sentiment_analysis(estrain)
rutags = sentiment_analysis(rutrain)

with open("Data/ES/dev.p1.out", "w", encoding = 'utf-8') as file:
    for word in esin:
        if word == "":
            line = ""
        elif word not in estags.keys():
            sentiment = estags["#UNK#"]
            line = f"{word} {sentiment}"
        else:
            sentiment = estags[word]
            line = f"{word} {sentiment}"
        file.write(line + "\n")

with open("Data/RU/dev.p1.out", "w", encoding = 'utf-8') as file:
    for word in ruin:
        if word == "":
            sentiment = ""
        elif word not in rutags.keys():
            sentiment = rutags["#UNK#"]
        else:
            sentiment = rutags[word]
        line = f"{word} {sentiment}"
        file.write(line + "\n")

In [5]:
import numpy as np

def estimate_transition(data):
    count = {}
    params = {}
    states = []
    sentence = []
    for line in data:
        if line.strip() != "":
            sentence.append(line.split()[-1])
        elif sentence != [] and line == "":
            states.append(sentence)
            sentence = []

    for sentence in states:
        if "START" not in count.keys():
            count["START"] = 1
        else:
            count["START"] += 1
    
        if ("START", sentence[0]) not in params.keys():
            params[("START", sentence[0])] = 1
        else:
            params[("START", sentence[0])] += 1
        
        for i in range(len(sentence)):
            if sentence[i] not in count.keys():
                count[sentence[i]] = 1
            else:
                count[sentence[i]] += 1
            if i != 0:
                if (sentence[i-1], sentence[i]) not in params.keys():
                    params[(sentence[i-1], sentence[i])] = 1
                else:
                    params[(sentence[i-1], sentence[i])] += 1
        if "STOP" not in count.keys():
            count["STOP"] = 1
        else:
            count["STOP"] += 1
        if (sentence[-1], "STOP") not in params.keys():
            params[(sentence[-1], "STOP")] = 1
        else:
            params[(sentence[-1], "STOP")] += 1
    for pair in params:
        params[pair] = params[pair]/count[pair[0]]
    return params

es_e_params = estimate_emissions(estrain)
es_t_params = estimate_transition(estrain)
print(es_t_params)
ru_e_params = estimate_emissions(rutrain)
ru_t_params = estimate_transition(rutrain)

def viterbi(e_params, t_params, sentence):
    n = len(sentence)
    path = []
    states = []
    policy = {}
    for pair in t_params.keys():
        states.append(pair[0])
    states = set(states)
    matrix = [{"START":0}]
    for i in range(1, n+1):
        policy[i] = {}
        matrix.append({})
        word = sentence[i-1]
        if word not in e_params.keys():
            word = "#UNK#"
        for v in e_params[word]:
            matrix[i][v] = -np.inf
            for u in matrix[i-1]:
                if (u, v) not in t_params.keys():
                    continue
                score = matrix[i-1][u] + np.log(e_params[word][v] * t_params[(u, v)])
                if score > matrix[i][v]:
                    matrix[i][v] = score
                    policy[i][v] = u

    for i in range(n):
        if all(score == -np.inf for score in matrix[i].values()):
            return ["O"]*n
    end_state = max(matrix[n], key = lambda state: matrix[n][state])
    prev_state = end_state
    path.append(prev_state)
    for i in range(n, 1, -1):
        path.append(policy[i][prev_state])
        prev_state = policy[i][prev_state]
    return path[::-1]

{('START', 'O'): 1.0, ('O', 'O'): 0.9445496814189771, ('O', 'B-positive'): 0.03984845875667298, ('B-positive', 'O'): 0.8801724137931034, ('O', 'B-negative'): 0.013122094024453246, ('B-negative', 'O'): 0.821522309711286, ('O', 'B-neutral'): 0.002445324608231445, ('B-neutral', 'I-neutral'): 0.20833333333333334, ('I-neutral', 'I-neutral'): 0.6511627906976745, ('I-neutral', 'O'): 0.3488372093023256, ('B-positive', 'I-positive'): 0.11637931034482758, ('I-positive', 'I-positive'): 0.5700636942675159, ('I-positive', 'O'): 0.4299363057324841, ('B-neutral', 'O'): 0.7916666666666666, ('B-negative', 'I-negative'): 0.1784776902887139, ('I-negative', 'O'): 0.39766081871345027, ('I-negative', 'I-negative'): 0.6023391812865497, ('B-positive', 'B-neutral'): 0.0008620689655172414, ('B-positive', 'B-positive'): 0.002586206896551724, ('O', 'STOP'): 3.4441191665231616e-05}


In [51]:
with open("ES/dev.p2.out", "w", encoding = 'utf-8') as file:
    sentences = []
    sentence = []
    for line in esin:
        if line != "":
            sentence.append(line)
        elif line == "" and sentence != []:
            sentences.append(sentence)
            sentence = []
    for sentence in sentences:
        path = viterbi(es_e_params, es_t_params, sentence)
        for word, sentiment in zip(sentence, path):
            line = f"{word} {sentiment}\n"
            file.write(line)
        file.write("\n")
    

with open("RU/dev.p2.out", "w", encoding = 'utf-8') as file:
    sentences = []
    sentence = []
    for line in ruin:
        if line != "":
            sentence.append(line)
        elif line == "" and sentence != []:
            sentences.append(sentence)
            sentence = []
    for sentence in sentences:
        path = viterbi(ru_e_params, ru_t_params, sentence)
        for word, sentiment in zip(sentence, path):
            line = f"{word} {sentiment}\n"
            file.write(line)
        file.write("\n")