Reading the data 

In [145]:
def read_data(file_path):
    sentences = []          # Holds all the sentences parsed from the file
    current_sentence = []   # Holds the words and tags of current sentence being processed
    
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            if line == "" "" or line == '' or line == "\n":
                continue
            else:
                line = line.strip() # Remove leading/trailing whitespaces
            
            # Check if the line is not empty
            if line:
                parts = line.split() # split the lines into words and tags
                if len(parts) >= 2:
                    word, tag = ' '.join(parts[:-1]), parts[-1] # join parts except the last one for multi-word tokens
                    current_sentence.append((word, tag))
                else:
                    print(f"Error in line {line_number}: Unexpected format - {line}")
            else:
                # If the line is empty, finish the current sentence and start a new one
                if current_sentence:
                    sentences.append(current_sentence)
                current_sentence = []
    
    # If there's remaining sentence at the end, add it to the list of sentences
    if current_sentence:
        sentences.append(current_sentence)
    
    return sentences


# Load data
RU_file = './RU/train'
RU_training_data = read_data(RU_file)
print(RU_training_data)

# Counting B-positive tag occurrences
b_positive_count = 0
for sentence in RU_training_data:
    for _, tag in sentence:
        if tag == "B-positive":
            b_positive_count += 1

print("Total B-positive tags:", b_positive_count)

[[('Еда', 'B-positive'), ('вкусная', 'O'), (',', 'O'), ('но', 'O'), ('отдельно', 'O'), ('хочу', 'O'), ('отметить', 'O'), ('красивую', 'O'), ('сервировку', 'B-positive'), ('блюд', 'I-positive'), (';', 'O'), ('.', 'O'), ('Филадельфию', 'B-positive'), ('мне', 'O'), ('удалось', 'O'), ('только', 'O'), ('попробовать', 'O'), (',', 'O'), ('сделали', 'O'), ('на', 'O'), ('отлично', 'O'), (',', 'O'), ('хороший', 'O'), ('кусок', 'O'), ('лосося-филадельфии-чуток', 'B-positive'), ('риса', 'I-positive'), (',', 'O'), ('смело', 'O'), ('возьму', 'O'), ('в', 'O'), ('следующий', 'O'), ('раз', 'O'), ('Десерт', 'B-positive'), ('Тирамису', 'I-positive'), ('просто', 'O'), ('таял', 'O'), ('во', 'O'), ('рту', 'O'), (')', 'O'), (')', 'O'), (')', 'O'), ('Очень', 'O'), ('благодарны', 'O'), ('персоналу', 'B-positive'), ('за', 'O'), ('качественное', 'O'), ('обслуживание', 'B-positive'), ('.', 'O'), ('Были', 'O'), ('здесь', 'O'), ('первый', 'O'), ('раз', 'O'), ('и', 'O'), (',', 'O'), ('точно', 'O'), (',', 'O'), ('не'

In [133]:
ES_data = read_data('./ES/train')
print(ES_data)

[[('Estuvimos', 'O'), ('hace', 'O'), ('poco', 'O'), ('mi', 'O'), ('pareja', 'O'), ('y', 'O'), ('yo', 'O'), ('comiendo', 'O'), ('y', 'O'), ('resultó', 'O'), ('todo', 'O'), ('muy', 'O'), ('bien', 'O'), (',', 'O'), ('tanto', 'O'), ('la', 'O'), ('comida', 'B-positive'), (',', 'O'), ('el', 'O'), ('vino', 'B-positive'), (',', 'O'), ('el', 'O'), ('trato', 'B-positive'), (',', 'O'), ('la', 'O'), ('decoración', 'B-positive'), ('…', 'O'), ('nos', 'O'), ('gustó', 'O'), ('todo', 'O'), ('mucho', 'O'), ('.', 'O'), ('Por', 'O'), ('poner', 'O'), ('algún', 'O'), ('pero', 'O'), (',', 'O'), ('quizá', 'O'), ('el', 'O'), ('jamón', 'B-negative'), ('no', 'O'), ('era', 'O'), ('todo', 'O'), ('lo', 'O'), ('"', 'O'), ('ibérico', 'O'), ('"', 'O'), ('que', 'O'), ('cabía', 'O'), ('esperar', 'O'), ('.', 'O'), ('Bien', 'O'), ('lo', 'O'), ('sabe', 'O'), ('el', 'O'), ('autor', 'O'), ('del', 'O'), ('blog', 'O'), ('.', 'O'), (')', 'O'), ('Comida', 'B-positive'), ('exquisita', 'O'), ('.', 'O'), ('Restaurante', 'B-positive

PART 1 (Section 1)

In [146]:
from fractions import Fraction

#function that estimates the emission parameters from the training set using MLE
def estimate_emission_parameters_mle(training_data):
    emission_params = {}    # emission parameters initialization
    emission_counts = {}    # initialize emission counts of each word given each tag
    
    # Looping through the training data
    for sentence in training_data:
        for word, tag in sentence:
            if tag not in emission_counts:
                emission_counts[tag] = {}
            if word not in emission_counts[tag]:
                emission_counts[tag][word] = 0
            emission_counts[tag][word] += 1
            
    # Calculating MLE
    for tag, word_counts in emission_counts.items():
        total_count = sum(word_counts.values())
        # sub-dictionary for current tag in emission_params
        emission_params[tag] = {}
        for word, count in word_counts.items():
            emission_params[tag][word] = count / total_count
            # emission_params[tag][word] = Fraction(count, total_count)
                
    return emission_params
        

In [147]:
ES_file = './ES/train'
ES_training_data = read_data(ES_file)
emission_params = estimate_emission_parameters_mle(ES_training_data)
print(emission_params)

RU_file = './RU/train'
RU_training_data = read_data(RU_file)
emission_params = estimate_emission_parameters_mle(RU_training_data)
print(emission_params)

# check_emission = sum(emission_params['B-positive'].values())
# print(f"Sum of all words emitting label (B-positive) should be equal to 1 : {True if check_emission == 1 else False}")

{'O': {'Estuvimos': 0.0002066471499913897, 'hace': 0.0008954709832960221, 'poco': 0.0018942655415877389, 'mi': 0.0024797657998966763, 'pareja': 0.00044773549164801103, 'y': 0.035267780265197175, 'yo': 0.0012398828999483382, 'comiendo': 0.00034441191665231615, 'resultó': 0.00013776476666092647, 'todo': 0.003960737041501636, 'muy': 0.01363871189943172, 'bien': 0.005682796624763217, ',': 0.05731014293094541, 'tanto': 0.001343206474944033, 'la': 0.02600309970724987, 'el': 0.022111245049078698, '…': 0.0015498536249354228, 'nos': 0.005028413983123816, 'gustó': 0.00037885310831754776, 'mucho': 0.0018253831582572756, '.': 0.055898054072670914, 'Por': 0.0010332357499569485, 'poner': 0.00017220595832615808, 'algún': 0.0002066471499913897, 'pero': 0.006578267608059239, 'quizá': 0.00013776476666092647, 'no': 0.012708799724470466, 'era': 0.0017909419665920441, 'lo': 0.009299121749612537, '"': 0.001343206474944033, 'ibérico': 6.888238333046323e-05, 'que': 0.029102806957120718, 'cabía': 3.44411916652

PART 1 (Section 2)

In [148]:
# def estimate_emissions(data, k=1):
#     sentiment_count = {}
#     emission_parameters = {}
    
#     for line in data:
#         if line == "":
#             continue
#         else:
#             word, sentiment = line.rsplit(' ', 1)
#             if sentiment not in sentiment_count.keys():
#                 sentiment_count[sentiment] = 1
#             else:
#                 sentiment_count[sentiment] += 1
#             if word not in emission_parameters.keys():
#                 emission_parameters[word] = {sentiment: 1}
#             elif word in emission_parameters.keys() and sentiment not in emission_parameters[word]:
#                 emission_parameters[word][sentiment] = 1
#             else:
#                 emission_parameters[word][sentiment] += 1
    
#     emission_parameters["#UNK#"] = {}
#     for sentiment in sentiment_count.keys():
#         emission_parameters['#UNK#'][sentiment] = k
#         sentiment_count[sentiment] += k
    
#     for word, sentiment in emission_parameters.items():
#         for sentiment, count in sentiment.items():
#             emission_parameters[word][sentiment] = count / sentiment_count[sentiment]
    
#     return emission_parameters


# def sentiment_analysis(data):
#     params = estimate_emissions(data)
#     tag_dict = {}
#     for word, sentiment in params.items():
#         tag = max(sentiment, key = sentiment.get)
#         tag_dict[word] = tag
#     return tag_dict

In [149]:
with open('./ES/train', "rb") as f:
    train_data = f.read()
passInto = [line.decode('utf-8') for line in train_data.split(b'\n')]


# emission_params = estimate_emissions(passInto, k=1)
# result = sentiment_analysis(passInto)
# # Display the results
# print("Emission Parameters:", emission_params)
# print("Sentiment Analysis Result:", result)

In [154]:
def estimate_emissions_from_data(data, k=1):
    sentiment_count = {}
    emission_parameters = {}

    for sentence in data:
        for word, sentiment in sentence:
            sentiment_count.setdefault(sentiment, 0)
            sentiment_count[sentiment] += 1

            emission_parameters.setdefault(word, {}).setdefault(sentiment, 0)
            emission_parameters[word][sentiment] += 1

    emission_parameters["#UNK#"] = {}
    for sentiment in sentiment_count:
        emission_parameters['#UNK#'][sentiment] = k
        sentiment_count[sentiment] += k

    for word, sentiment_count_dict in emission_parameters.items():
        for sentiment, count in sentiment_count_dict.items():
            emission_parameters[word][sentiment] = count / sentiment_count[sentiment]

    return emission_parameters


def sentiment_analysis_from_data(data):
    # params = estimate_emissions_from_data(data)
    # tag_dict = {word: max(sentiment, key=sentiment.get) for word, sentiment in params.items()}
    # return tag_dict
    tag_dict = {word: max(sentiment, key=sentiment.get) for word, sentiment in emission_params.items()}
    return tag_dict

In [155]:
data = read_data('./ES/train')
# Assuming 'data' contains your pre-processed list of sentences
emission_params = estimate_emissions_from_data(data, k=1)

for word, sentiment_dict in emission_params.items():
    for sentiment, value in sentiment_dict.items():
        print("Word:", word)
        print("Sentiment:", sentiment)
        print("Value:", value)
        
results = sentiment_analysis_from_data(data)

# Display the result
print("Emission Parameters:", emission_params)
print("Sentiment Analysis Result:", results)

Word: Estuvimos
Sentiment: O
Value: 0.00020664003306240529
Word: hace
Sentiment: O
Value: 0.0008954401432704229
Word: poco
Sentiment: O
Value: 0.0018942003030720485
Word: mi
Sentiment: O
Value: 0.0024796803967488635
Word: pareja
Sentiment: O
Value: 0.00044772007163521146
Word: y
Sentiment: O
Value: 0.0352665656426505
Word: y
Sentiment: I-negative
Value: 0.011627906976744186
Word: y
Sentiment: I-positive
Value: 0.015873015873015872
Word: yo
Sentiment: O
Value: 0.0012398401983744318
Word: yo
Sentiment: I-negative
Value: 0.005813953488372093
Word: comiendo
Sentiment: O
Value: 0.00034440005510400884
Word: resultó
Sentiment: O
Value: 0.00013776002204160352
Word: todo
Sentiment: O
Value: 0.0039606006336961016
Word: todo
Sentiment: I-negative
Value: 0.005813953488372093
Word: muy
Sentiment: O
Value: 0.013638242182118749
Word: muy
Sentiment: I-positive
Value: 0.006349206349206349
Word: bien
Sentiment: O
Value: 0.005682600909216145
Word: bien
Sentiment: B-positive
Value: 0.0008613264427217916
W

PART 1 (Section 3)
-> It is in dev.p1.out in each of the folder ES and folder RU

In [139]:
# def predSentiment(input_sentence, emission_params):
#     predicted_tags = []
#     for word in input_sentence:
#         tag_probabilities = [(tag, prob) for tag, prob in emission_params.items() if word in tag]
#         if tag_probabilities:
#             max_tag, max_prob = max(tag_probabilities, key=lambda item: item[1])
#             predicted_tags.append(max_tag)
#         else:
#             predicted_tags.append("O")
#     return predicted_tags


In [164]:
data_file_path = './ES/train'
processedDevInFile = './ES/dev.in'
devOutFilePath = './ES/dev.p1.out'

# Read and parse the content of the data file
with open(data_file_path, 'r', encoding='utf-8') as f:
    data = [line.strip().split() for line in f]
    
# Read and parse the content of the processedDevInFile
with open(processedDevInFile, 'r', encoding='utf-8') as f:
    processed_data = [line.strip().split() for line in f]
#print(processed_data)

sentiment_prediction = [sentiment_analysis_from_data(processed_data)]
print(sentiment_prediction)
# for sentiment_dic in sentiment_prediction:
#     print(sentiment_dict)
#     for sentence_tags, sentiment in sentiment_dict.items():
#         print("sentence:", sentence_tags)
#         print("sentiment:", sentiment)



# Write the predictions to dev.p1.out
with open(devOutFilePath, "w", encoding="utf-8") as f:
    for sentiment_dic in sentiment_prediction:
        for sentiment, val in sentiment_dict.items():
            
        f.write("\n".join(sentence_tags) + " " + sentiment +"\n\n")

[{'Estuvimos': 'O', 'hace': 'O', 'poco': 'O', 'mi': 'O', 'pareja': 'O', 'y': 'O', 'yo': 'I-negative', 'comiendo': 'O', 'resultó': 'O', 'todo': 'I-negative', 'muy': 'O', 'bien': 'O', ',': 'O', 'tanto': 'O', 'la': 'I-positive', 'comida': 'B-positive', 'el': 'I-neutral', 'vino': 'B-negative', 'trato': 'B-positive', 'decoración': 'B-positive', '…': 'B-negative', 'nos': 'O', 'gustó': 'O', 'mucho': 'I-negative', '.': 'O', 'Por': 'O', 'poner': 'O', 'algún': 'O', 'pero': 'I-neutral', 'quizá': 'O', 'jamón': 'I-negative', 'no': 'O', 'era': 'O', 'lo': 'O', '"': 'I-negative', 'ibérico': 'O', 'que': 'O', 'cabía': 'O', 'esperar': 'O', 'Bien': 'O', 'sabe': 'O', 'autor': 'O', 'del': 'I-negative', 'blog': 'O', ')': 'O', 'Comida': 'B-neutral', 'exquisita': 'O', 'Restaurante': 'B-neutral', 'diferente': 'O', 'creativo': 'O', 'agradable': 'O', 'Si': 'O', 'has': 'O', 'probado': 'O', 'sus': 'O', 'carnes': 'B-neutral', 'te': 'O', 'estas': 'O', 'perdiendo': 'O', 'algo': 'O', 'grande': 'O', '!': 'O', 'En': 'B-n

TypeError: can only concatenate str (not "float") to str

PART 4

In [None]:
# import numpy as np

In [None]:
# def tokenize_and_tag(lines):
#     token_tags = []

#     for line in lines:
#         tokens = line.strip().split()
#         if len(tokens) == 2:
#             token, tag = tokens
#             token_tags.append((token, tag))
#         else:
#             token_tags.append(("", ""))

#     return token_tags

# def mini_viterbi(token, labels, vocabulary, emission_probs, transition_probs, initial_probs):
#     possible_states = [label for label in labels if label != 'START']
#     token_lower = token.lower() if token.lower() in vocabulary else "#UNK#"

#     possible_curr_states = [label for label in possible_states if emission_probs.get(label, {}).get(token_lower)]
#     possible_next_states = {curr_state: [label for label in possible_states if transition_probs.get(label, {}).get(curr_state)] for curr_state in possible_curr_states}
    
#     states_matrix = []
#     emission_matrix = []
#     transition_matrix = []
    
#     for curr_state, next_states in possible_next_states.items():
#         for next_state in next_states:
#             states_matrix.append(curr_state)
#             emission_matrix.append(float(emission_probs[curr_state][token_lower]))
#             transition_matrix.append(float(transition_probs[next_state][curr_state]))
    
#     calculation = np.log(emission_matrix) + np.log(transition_matrix)
#     return calculation, transition_matrix, states_matrix

# def teach_perceptron(data, label_set, vocab, num_epochs, emission_probs, transition_probs):
#     weights = {label: 0 for label in label_set if label != "START"}
#     learn_rate = 0.1
#     starting_probs = np.ones((1, 1))
#     transition_counts = {}
    
#     for _ in range(num_epochs):
#         for i, (term, tag) in enumerate(data):
#             if data[i-1][0] == "":
#                 current_term = term
#                 j = i
#                 while current_term != "":
#                     j += 1
#                     current_state = tag
#                     next_state = data[j][1]
#                     if current_state not in transition_counts:
#                         transition_counts[current_state] = {next_state: 1}
#                     elif next_state in transition_counts[current_state]:
#                         transition_counts[current_state][next_state] += 1
#                     current_term = data[j][0]
#                 current_term = ''
#                 j = 0
            
#             if term:
#                 calc, trans_matrix, state_matrix = mini_viterbi(term, label_set, vocab, emission_probs, transition_probs, starting_probs)
#                 starting_probs = starting_probs + np.partition(calc, -1)[-1:]
#                 arg_max = np.argpartition(calc, -1)[-1:]
#                 predicted_state = state_matrix[arg_max[0]]
                
#                 if predicted_state != tag:
#                     next_term, next_state = data[i+1]
#                     real_transition = transition_probs.get(next_state, {}).get(tag, 0)
#                     real_counts = transition_counts.get(tag, {}).get(next_state, 0)
#                     pred_counts = transition_counts.get(tag, {}).get(predicted_state, 0)
                    
#                     weights[tag] = learn_rate * (weights.get(tag) + (real_transition * real_counts))
#                     weights[predicted_state] = learn_rate * (weights.get(predicted_state) - (trans_matrix[arg_max[0]]) * pred_counts)
#             else:
#                 starting_probs = np.ones((1, 1))
    
#     return weights


# # def estimate_structured_perceptron(lines, weights, labels, vocabulary, emission_probs, transition_probs):
# #     # out = open(output_path, "w", encoding="utf-8")
# #     # initial_probs = np.ones((1, 1))

# #     for i in range(len(lines)):
# #         if lines[i] in ["", "", "\n"]:
# #             out.write("\n")
# #             initial_probs = np.ones((1, 1))
# #         elif i != len(lines) - 1:
# #             token = lines[i].strip().split()[0]
# #             calculation, transition_matrix, states_matrix = mini_viterbi(token, labels, vocabulary, emission_probs, transition_probs, initial_probs)
# #             weight_matrix = [weights.get(state) for state in states_matrix]
# #             final_calc = calculation * weight_matrix
# #             initial_probs = initial_probs + np.atleast_1d(np.partition(final_calc, -1)[-1:])
# #             arg_pi = np.atleast_1d(np.argpartition(final_calc, -1)[-1:])
# #             predicted_tag = states_matrix[arg_pi[0]]
# #             out_str = " ".join([token, predicted_tag, "\n"])
# #             out.write(out_str)

# def apply_perceptron(lines, weights, label_set, vocab, emission_probs, transition_probs):
#     results = []
#     starting_probs = np.ones((1, 1))

#     for i in range(len(lines)):
#         if lines[i] in ["", "", "\n"]:
#             results.append("\n")
#             starting_probs = np.ones((1, 1))
#         elif i != len(lines) - 1:
#             token = lines[i].strip().split()[0]
#             calc, trans_matrix, state_matrix = mini_viterbi(token, label_set, vocab, emission_probs, transition_probs, starting_probs)
#             weight_matrix = [weights.get(state) for state in state_matrix]
#             final_calc = calc * weight_matrix
#             starting_probs = starting_probs + np.atleast_1d(np.partition(final_calc, -1)[-1:])
#             arg_max = np.atleast_1d(np.argpartition(final_calc, -1)[-1:])
#             predicted_tag = state_matrix[arg_max[0]]
#             result_str = " ".join([token, predicted_tag, "\n"])
#             results.append(result_str)
    
#     return results

