In [2]:
from collections import  defaultdict

# Read each training file into a list of lines (line terminators removed).
# The encoding is passed explicitly so the Spanish/Russian text is decoded the
# same way on every platform rather than with the locale default.
# NOTE(review): assumes the data files are UTF-8 encoded -- confirm.
with open("Data/ES/train", encoding="utf-8") as f:
    es = f.read().splitlines()

with open("Data/RU/train", encoding="utf-8") as f:
    ru = f.read().splitlines()


In [3]:
# CONSTANTS
# Every label the tagger may assign. The order matters: when two tags have the
# same emission probability, the one listed first is chosen.
tags = [
    "B-neutral",
    "B-positive",
    "B-negative",
    "I-neutral",
    "I-positive",
    "I-negative",
    "O",
]

In [4]:
def estimate_emission_params(train_data, k=1):
    """Estimate emission probabilities e(x | y) = count(y -> x) / count(y).

    Args:
        train_data: iterable of strings, each expected to be "word label"
            separated by exactly one space. Any line that does not split into
            exactly two fields on " " (including an empty line) is skipped.
        k: accepted for call compatibility; not used by this estimator.

    Returns:
        dict mapping (word, label) to count(label -> word) / count(label).
    """
    word_sentiment_counts = defaultdict(lambda: defaultdict(int))
    sentiment_counts = defaultdict(int)

    # Collect count(y) and count(y -> x).
    for sentence in train_data:
        fields = sentence.split(" ")
        # Skip blank and malformed lines outright. Falling through here would
        # re-count the (word, label) pair left over from the previous line.
        if len(fields) != 2:
            continue
        x, label = fields

        sentiment_counts[label] += 1
        word_sentiment_counts[label][x] += 1

    # Turn the raw counts into emission parameters.
    emission_params = {}
    for label, word_counts in word_sentiment_counts.items():
        label_total = sentiment_counts[label]
        for word, count in word_counts.items():
            emission_params[(word, label)] = count / label_total
    return emission_params

In [5]:
def estimate_emission_params_unk(train_data, k=1):
    """Estimate emission probabilities with a "#UNK" pseudo-word per tag.

    For every label y, e(x | y) = count(y -> x) / (count(y) + k), and for every
    tag in the module-level `tags` list e("#UNK" | tag) = k / (count(tag) + k).

    Args:
        train_data: iterable of strings, each expected to be "word label"
            separated by exactly one space. Any line that does not split into
            exactly two fields on " " (including an empty line) is skipped.
        k: count assigned to "#UNK" for every tag and added to each label's
            denominator.

    Returns:
        dict mapping (word, label) to the smoothed emission probability; it
        includes a ("#UNK", tag) entry for every tag in `tags`.

    Raises:
        ZeroDivisionError: if k is 0 and some tag in `tags` never occurs in
            train_data (its denominator is then 0).
    """
    word_sentiment_counts = defaultdict(lambda: defaultdict(int))
    sentiment_counts = defaultdict(int)

    for sentence in train_data:
        fields = sentence.split(" ")
        # Skip blank and malformed lines outright. Falling through here would
        # re-count the (word, label) pair left over from the previous line.
        if len(fields) != 2:
            continue
        x, label = fields

        sentiment_counts[label] += 1
        word_sentiment_counts[label][x] += 1

    # Give every known tag an #UNK entry with count k.
    for tag in tags:
        word_sentiment_counts[tag]["#UNK"] = k

    # Turn the raw counts into emission parameters.
    emission_params = {}
    for label, word_counts in word_sentiment_counts.items():
        denominator = sentiment_counts[label] + k
        for word, count in word_counts.items():
            emission_params[(word, label)] = count / denominator
    return emission_params

In [6]:
# Part 1: emission estimates trained on the ES and RU datasets.
result_es, result_ru = map(estimate_emission_params, (es, ru))
# Part 2: the same datasets, trained with the #UNK pseudo-word included.
result_es_unk, result_ru_unk = map(estimate_emission_params_unk, (es, ru))

PART 3

In [7]:
# Calculating the y_optimal using training_data
def calc_y_optimal(emission_params):
    """Pick, for every word, the tag with the highest emission probability.

    Args:
        emission_params: dict mapping (word, tag) to an emission probability.

    Returns:
        defaultdict(str) mapping each word to its best tag among the
        module-level `tags`. Ties go to the tag listed first in `tags`. A word
        that only appears with labels outside `tags` gets no entry.
    """
    y_optimal = defaultdict(str)
    for word, _ in emission_params:
        # A word shows up once per tag it was seen with; resolve it only once.
        if word in y_optimal:
            continue
        candidates = [
            (tag, emission_params[(word, tag)])
            for tag in tags
            if (word, tag) in emission_params
        ]
        # No candidate tag in `tags`: skip the word instead of calling max()
        # on an empty list.
        if not candidates:
            continue
        y_optimal[word] = max(candidates, key=lambda pair: pair[1])[0]
    return y_optimal

In [8]:
# writing the output onto dev.p1.out
def write_output(path, emission_params, emission_unk):
    """Tag every token of `{path}/dev.in` and write `{path}/dev.p1.out`.

    Lines equal to "\n" are copied through unchanged. Every other line is
    stripped and written as "word tag": words known to `emission_params` get
    their best tag from calc_y_optimal, all other words get the tag with the
    largest "#UNK" emission in `emission_unk`.

    Args:
        path: directory containing dev.in; dev.p1.out is written there too.
        emission_params: dict (word, tag) -> probability for known words.
        emission_unk: dict that holds a ("#UNK", tag) entry for every tag in
            the module-level `tags`.

    Raises:
        KeyError: if an unknown word is met and `emission_unk` lacks a
            ("#UNK", tag) entry for some tag.
    """
    y_optimal = calc_y_optimal(emission_params)

    def best_unk_tag():
        # Strict '<' starting from 0: the first tag with the largest positive
        # #UNK emission wins, and "" is returned if none is positive.
        e_max = 0
        y_max = ""
        for tag in tags:
            e = emission_unk[("#UNK", tag)]
            if e_max < e:
                y_max = tag
                e_max = e
        return y_max

    # The #UNK tag is the same for every unknown word, so it is computed at
    # most once, and only if an unknown word actually occurs.
    unk_tag = None

    # Explicit encoding so non-ASCII tokens are read and written identically on
    # every platform. NOTE(review): assumes dev.in is UTF-8 encoded -- confirm.
    with open(f'{path}/dev.in', encoding="utf-8") as f:
        lines = f.readlines()
    with open(f'{path}/dev.p1.out', 'w', encoding="utf-8") as f:
        for test_word in lines:
            if test_word == '\n':
                f.write(test_word)
                continue
            word = test_word.strip()
            if word in y_optimal:
                chosen = y_optimal[word]
            else:
                if unk_tag is None:
                    unk_tag = best_unk_tag()
                chosen = unk_tag
            f.write(word + ' ' + chosen + '\n')

# Instructions to run for each case

- Run the cell below after making sure all the cells above have been executed.
- After the cell has run, a `dev.p1.out` file should appear in each of the `Data/ES` and `Data/RU` folders.
- You can then run `evalScript.py` to get the precision, recall and F-score.

In [9]:
# writing for ES dataset
write_output("./Data/ES", result_es, result_es_unk)

# writing for RU dataset 
write_output("./Data/RU", result_ru, result_ru_unk)

## Evaluation Result:

### ES (Spanish) 
- **Entity Statistics:**
  - Gold data entities: `229`
  - Predicted entities: `1466`
  - Correct entities: `178`
    - **Precision:** `0.1214`
    - **Recall:** `0.7773`
    - **F-score:** `0.2100`

- **Sentiment Statistics:**
  - Correct sentiment count: `97`
    - **Precision:** `0.0662`
    - **Recall:** `0.4236`
    - **F-score:** `0.1145`

---

### RU (Russian)
- **Entity Statistics:**
  - Gold data entities: `389`
  - Predicted entities: `1816`
  - Correct entities: `266`
    - **Precision:** `0.1465`
    - **Recall:** `0.6838`
    - **F-score:** `0.2413`

- **Sentiment Statistics:**
  - Correct sentiment count: `129`
    - **Precision:** `0.0710`
    - **Recall:** `0.3316`
    - **F-score:** `0.1170`
