In [2]:
# Import libraries
import os
import numpy as np
from collections import Counter

In [3]:
# Warn on floating-point underflow; stay silent on divide-by-zero
np.seterr(divide="ignore", under="warn")

# Show arrays with three decimals in fixed-point notation
np.set_printoptions(suppress=True, precision=3)

# OS-independent dataset locations, relative to the current directory
(es_train_path, es_dev_in_path, es_dev_out_path,
 es_dev_p1_out_path, es_dev_p2_out_path, es_dev_p3_out_path) = (
    os.path.join("data", "ES", filename)
    for filename in ("train", "dev.in", "dev.out", "dev.p1.out", "dev.p2.out", "dev.p3.out")
)
(ru_train_path, ru_dev_in_path, ru_dev_out_path,
 ru_dev_p1_out_path, ru_dev_p2_out_path, ru_dev_p3_out_path) = (
    os.path.join("data", "RU", filename)
    for filename in ("train", "dev.in", "dev.out", "dev.p1.out", "dev.p2.out", "dev.p3.out")
)

# Constants
# NOTE(review): N is not used in the visible cells; presumably the number of
# real (non START/END) labels - confirm against the cells that use it.
N = 7

# Integer ids for every label, with START and END as boundary markers
START, O, BPOS, IPOS, BNEU, INEU, BNEG, INEG, END = range(9)

# Label names ordered so that labels_list[id] gives the name for an id
labels_list = ["START", "O", "B-positive", "I-positive", "B-neutral", "I-neutral", "B-negative", "I-negative", "END"]

# Reverse lookup: label name -> integer id
labels = dict(zip(labels_list, (START, O, BPOS, IPOS, BNEU, INEU, BNEG, INEG, END)))

In [4]:
# Read dev.in data
def read_dev_in_data(filepath):
    """
    Load an unlabelled input file as a flat list of stripped lines.

    Args:
    - filepath: Path to a UTF-8 text file.

    Returns:
    - List with one string per line of the file, surrounding whitespace
      removed. Blank lines are kept and appear as empty strings.
    """
    with open(filepath, "r", encoding="utf-8") as handle:
        return [raw_line.strip() for raw_line in handle]

# Read dev.out data
def read_dev_out_data(filepath):
    """
    Load a labelled file as a flat list of (token, label_id) pairs.

    Each stripped line is split on its last space into token and label name;
    the label name is converted to its integer id through `labels`. Lines
    that do not split into exactly two parts (e.g. blank lines) are skipped.

    Args:
    - filepath: Path to a UTF-8 text file.

    Returns:
    - List of (token, label_id) tuples in file order.

    Raises:
    - KeyError: if a label name is not a key of `labels`.
    """
    pairs = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().rsplit(" ", 1)
            if len(fields) != 2:
                continue
            token, tag_name = fields
            pairs.append((token, labels[tag_name]))
    return pairs

# Read dev.out data with line ending
def read_dev_out_data_w_end(filepath):
    """
    Load a labelled file as (token, label_id) pairs with boundary markers.

    Lines that split on their last space into token and label name become
    (token, label_id). Every other line (e.g. a blank line) contributes two
    entries, ('', END) followed by ('', START). No START entry is added
    before the first line of the file.

    Args:
    - filepath: Path to a UTF-8 text file.

    Returns:
    - List of (token, label_id) tuples in file order.

    Raises:
    - KeyError: if a label name is not a key of `labels`.
    """
    sequence = []
    with open(filepath, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            fields = raw_line.strip().rsplit(" ", 1)
            if len(fields) == 2:
                sequence.append((fields[0], labels[fields[1]]))
            else:
                sequence.extend([('', END), ('', START)])
    return sequence

# Read training data
def read_training_data(filepath):
    """
    Load a training file as a flat list of (token, label_id) pairs.

    Each stripped line is split once from the right on a space. Lines that
    yield exactly two parts become (token, labels[label_name]); all other
    lines (e.g. blank lines) are dropped.

    Args:
    - filepath: Path to a UTF-8 text file.

    Returns:
    - List of (token, label_id) tuples in file order.

    Raises:
    - KeyError: if a label name is not a key of `labels`.
    """
    with open(filepath, "r", encoding="utf-8") as source:
        split_rows = (row.strip().rsplit(" ", 1) for row in source)
        return [(parts[0], labels[parts[1]]) for parts in split_rows if len(parts) == 2]

# Read training data with line ending
def read_training_data_w_end(filepath):
    """
    Load a training file as (token, label_id) pairs with boundary markers.

    A line that splits from the right on a space into two parts is stored as
    (token, labels[label_name]). Any other line (e.g. a blank line) is
    replaced by the pair of entries ('', END), ('', START). The list does
    not start with a START entry.

    Args:
    - filepath: Path to a UTF-8 text file.

    Returns:
    - List of (token, label_id) tuples in file order.

    Raises:
    - KeyError: if a label name is not a key of `labels`.
    """
    boundary = (('', END), ('', START))
    entries = []
    with open(filepath, "r", encoding="utf-8") as source:
        for row in source:
            parts = row.strip().rsplit(" ", 1)
            if len(parts) != 2:
                entries.extend(boundary)
                continue
            word, tag_name = parts
            entries.append((word, labels[tag_name]))
    return entries

In [19]:
from collections import defaultdict

def train_emission_parameters(training_data, k=1.0):
    """
    Estimate smoothed emission probabilities e(word | label) from tagged tokens.

    Words are lower-cased before counting. For every label y in the data:

        e(x | y)     = Count(y -> x) / (Count(y) + k)   for each word x seen with y
        e(#UNK# | y) = k / (Count(y) + k)

    The k in the denominator is the mass reserved for unseen words, so the
    values stored for one label sum to 1. (Adding k to the numerator of the
    seen words as well would push that sum above 1.)

    Args:
    - training_data: Iterable of (word, label) tuples.
    - k: Smoothing parameter; the pseudo-count given to the '#UNK#' token.

    Returns:
    - emission_params: defaultdict mapping each label to a dict of
      {word: probability}, including a '#UNK#' entry for words that do not
      appear in the training set.
    """
    # Count(y -> x) and Count(y)
    emission_counts = defaultdict(lambda: defaultdict(int))
    label_counts = defaultdict(int)

    for word, label in training_data:
        # Lower-case so that e.g. "Apple" and "apple" share one count
        emission_counts[label][word.lower()] += 1
        label_counts[label] += 1

    emission_params = defaultdict(dict)
    for label, word_counts in emission_counts.items():
        denominator = label_counts[label] + k
        params = emission_params[label]
        for word, count in word_counts.items():
            params[word] = count / denominator

        # Probability assigned to any word not seen with this label
        params['#UNK#'] = k / denominator
    print("Emission Paramslen : ", len(emission_params))
    return emission_params

def simple_sentiment_analysis(sentence, emission_params):
    """
    Tag each token with the label that maximises its emission probability.

    The lookup is case-insensitive (tokens are lower-cased before they are
    looked up), but the token is returned exactly as it was given so that
    output built from the result reproduces the input tokens. Empty strings
    get no special treatment: they are looked up like any other token.

    Args:
    - sentence: List of tokens.
    - emission_params: Mapping label -> {word: probability}; every inner
      dict must contain a '#UNK#' entry, used for words it does not list.

    Returns:
    - tags: List of (token, best_label) tuples, one per input token. When
      several labels tie, the one that comes first in emission_params wins.

    Raises:
    - KeyError: if a label's parameters have no '#UNK#' entry.
    - ValueError: if emission_params is empty and sentence is not.
    """
    tags = []
    for word in sentence:
        # Lower-case only the lookup key; keep `word` intact for the output
        lookup_key = word.lower()
        best_label = max(
            emission_params.items(),
            key=lambda item: item[1].get(lookup_key, item[1]['#UNK#']),
        )[0]
        tags.append((word, best_label))

    return tags

def calculate_scores(predicted_tags, gold_tags):
    """
    Score predictions against gold labels, position by position.

    The two lists are paired with zip, so pairing stops at the end of the
    shorter list; a pair counts as correct when its labels are equal (the
    tokens are not compared).

    Args:
    - predicted_tags: List of (token, label) tuples produced by the tagger.
    - gold_tags: List of (token, label) tuples from the gold standard.

    Returns:
    - precision: correct / len(predicted_tags), or 0 for an empty list.
    - recall: correct / len(gold_tags), or 0 for an empty list.
    - f_score: harmonic mean of the two, or 0 when both are 0.
    """
    matches = sum(
        1
        for (_, guessed), (_, expected) in zip(predicted_tags, gold_tags)
        if guessed == expected
    )
    n_predicted = len(predicted_tags)
    n_gold = len(gold_tags)

    precision = 0 if n_predicted == 0 else matches / n_predicted
    recall = 0 if n_gold == 0 else matches / n_gold

    combined = precision + recall
    f_score = 0 if combined == 0 else (2 * precision * recall) / combined

    return precision, recall, f_score



# Reading data:
training_data = read_training_data(ru_train_path)
test_data = read_dev_in_data(ru_dev_in_path)
correct_data = read_dev_out_data(ru_dev_out_path)

training_data_es = read_training_data(es_train_path)
test_data_es = read_dev_in_data(es_dev_in_path)
correct_data_es = read_dev_out_data(es_dev_out_path)


def _write_predictions(filepath, tagged_tokens):
    """Write one "token label" line per tagged token.

    read_dev_in_data keeps blank lines as empty strings, so an empty token
    marks a sentence boundary; it is written back as a blank line instead of
    a line holding only a label.
    """
    with open(filepath, 'w', encoding="utf-8") as file:
        for word, tag in tagged_tokens:
            if word:
                file.write(f"{word} {labels_list[tag]}\n")
            else:
                file.write("\n")


def _without_separators(tagged_tokens):
    """Drop the empty-token (sentence boundary) entries.

    read_dev_out_data skips blank lines, so the predictions must skip them
    too, otherwise every pair after the first boundary is compared against
    the wrong gold token.
    """
    return [(word, tag) for word, tag in tagged_tokens if word]


# Calculating emission params with train data and tagging the dev input (Russian)
emission_params = train_emission_parameters(training_data)
predicted_tags = simple_sentiment_analysis(test_data, emission_params)
output_file_path = "predicted_tags.txt"
_write_predictions(output_file_path, predicted_tags)

print(f"Predicted tags saved to {output_file_path}")

# Same steps for Spanish
emission_params_es = train_emission_parameters(training_data_es)
predicted_tags_es = simple_sentiment_analysis(test_data_es, emission_params_es)
output_file_path = "predicted_tags_es.txt"
_write_predictions(output_file_path, predicted_tags_es)

print(f"Predicted tags saved to {output_file_path}")


# Correct data
gold_tags = correct_data
gold_tags_es = correct_data_es


# Calculating the Precision, Recall and F-Score on real tokens only
print("-----------------------------Russian---------------------------------------------------")
precision, recall, f_score = calculate_scores(_without_separators(predicted_tags), gold_tags)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F Score: {f_score:.2f}")
print("-----------------------------Spanish--------------------------------------------------")
precision, recall, f_score = calculate_scores(_without_separators(predicted_tags_es), gold_tags_es)
print(f"Precision: {precision:.2f}, Recall: {recall:.2f}, F Score: {f_score:.2f}")



Emission Paramslen :  7


KeyError: 4