# 50.007 Machine Learning Project Part 2

____

In [21]:
from collections import defaultdict
import math
from itertools import groupby


In [22]:
import math
from collections import defaultdict
from itertools import groupby

# Function to estimate transition parameters using MLE
# Function to estimate transition parameters using MLE
def estimate_transition_parameters_train(train_data):
    transition_counts = defaultdict(lambda: defaultdict(int))
    state_counts = defaultdict(int)

    for sentence in train_data:
        prev_state = 'START'
        for word, state in sentence:
            transition_counts[prev_state][state] += 1
            state_counts[prev_state] += 1
            prev_state = state
        transition_counts[prev_state]['END'] += 1

    # Calculate transition probabilities
    transition_params = defaultdict(dict)
    for from_state, to_states in transition_counts.items():
        for to_state, count in to_states.items():
            transition_params[from_state][to_state] = count / state_counts[from_state]
            
    return transition_params


# Function to estimate emission parameters using MLE
def estimate_emission_params(train_data):
    word_sentiment_counts = defaultdict(lambda: defaultdict(int))
    sentiment_counts = defaultdict(int)
    
    for sentence in train_data:
        for word, label in sentence:
            sentiment_counts[label] += 1
            word_sentiment_counts[label][word] += 1
            
    emission_params = {}
    for label, word_counts in word_sentiment_counts.items():
        total_label_count = sentiment_counts[label]
        for word, count in word_counts.items():
            emission_params[(word, label)] = count / total_label_count
            
    return emission_params

# Viterbi Algorithm Implementation
def viterbi_algorithm(sentence, transition_params, emission_params, states):
    n = len(sentence)
    num_states = len(states)
    viterbi = [{} for _ in range(n)]
    backpointers = [{} for _ in range(n)]

    # Initialization at time step 0
    for state in states:
        emission_prob = emission_params.get((sentence[0][0], state), 1e-10)
        viterbi[0][state] = math.log(transition_params['START'].get(state, 1e-10)) + math.log(emission_prob)
        backpointers[0][state] = 'START'

    # Forward pass
    for t in range(1, n):
        for state in states:
            max_prob = float('-inf')
            prev_state = None
            for prev_state in states:
                transition_prob = transition_params[prev_state].get(state, 1e-10)
                emission_prob = emission_params.get((sentence[t][0], state), 1e-10)
                prob = viterbi[t - 1].get(prev_state, 1e-10) + math.log(transition_prob) + math.log(emission_prob)
                if prob > max_prob:
                    max_prob = prob
                    backpointers[t][state] = prev_state
            viterbi[t][state] = max_prob

    # Termination step
    max_prob = float('-inf')
    final_state = None
    for state in states:
        transition_prob = transition_params[state].get('END', 1e-10)
        prob = viterbi[n - 1][state] + math.log(transition_prob)
        if prob > max_prob:
            max_prob = prob
            final_state = state

    # Backtracking step
    best_path = [final_state]
    for t in range(n - 1, 0, -1):
        best_path.insert(0, backpointers[t][best_path[0]])

    return best_path

# Load and preprocess the data
def load_data(filename):
    with open(filename, encoding='utf-8') as f:
        data = f.read().splitlines()
    return data

def preprocess_data(data):
    processed_data = []
    sentence = []
    for line in data:
        if line.strip(".r"):  # Skip empty lines
            parts = line.split(" ")
            if len(parts) != 2:
                print("Invalid line format:", line)
                continue  # Skip this line
            word, label = parts
            sentence.append((word, label))
        else:
            processed_data.append(sentence)
            sentence = []
    return processed_data

def write_output(filename, predicted_tags):
    with open(filename, "w", encoding="utf-8") as f:
        for tags in predicted_tags:
            f.write(" ".join(tags) + "\n")

# Load and preprocess the training data
train_data_es = load_data("Data/ES/train")
train_data_ru = load_data("Data/RU/train")
train_data_es = preprocess_data(train_data_es)
train_data_ru = preprocess_data(train_data_ru)

# Estimate model parameters using MLE
transition_params_es = estimate_transition_parameters_train(train_data_es)
transition_params_ru = estimate_transition_parameters_train(train_data_ru)
emission_params_es = estimate_emission_params(train_data_es)
emission_params_ru = estimate_emission_params(train_data_ru)

# Load and preprocess the development data
dev_data_in_es = load_data("Data/ES/dev.in")
dev_data_in_es = preprocess_data(dev_data_in_es)

# Define the states (tags)
states_es = list(emission_params_es.keys())
states_ru = list(emission_params_ru.keys())

# Run Viterbi algorithm on the development set using the learned models
predicted_tags_es = []
for sentence in dev_data_in_es:
    predicted_tags_es.append(viterbi_algorithm(sentence, transition_params_es, emission_params_es, states_es))

# Write the output to a file (dev.p2.out)
write_output("dev.p2.out", predicted_tags_es)

# Define a function to calculate precision, recall, and F-score
def calculate_metrics(true_tags, predicted_tags, states):
    tp = 0
    fp = 0
    fn = 0

    for true_seq, pred_seq in zip(true_tags, predicted_tags):
        for true, pred in zip(true_seq, pred_seq):
            if true == pred and true != 'O':
                tp += 1
            elif true != pred and true != 'O' and pred != 'O':
                fp += 1
                fn += 1

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f_score = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0

    return precision, recall, f_score

# Calculate and print precision, recall, and F-score for ES
true_tags_es = load_data("Data/ES/dev.out")
true_tags_es = preprocess_data(true_tags_es)
precision_es, recall_es, f_score_es = calculate_metrics(true_tags_es, predicted_tags_es, states_es)
print("ES Precision:", precision_es)
print("ES Recall:", recall_es)
print("ES F-score:", f_score_es)


SyntaxError: invalid syntax (3966961907.py, line 100)