In [None]:
import re

class CRFSentimentAnalyzer:
    def __init__(self, learning_rate=0.01, max_iterations=1000):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.weights = {}

    def train(self, dataset):
        for sentence, labels in dataset:
            for i in range(len(sentence)):
                features = self.extract_features(sentence, i)
                predicted_label = self.predict(features, labels)
                true_label = labels[i]
                if predicted_label != true_label:
                    self.update_weights(features, predicted_label, true_label)

    def extract_features(self, sentence, i):
        features = {}
        word = sentence[i]
        features[f'word:{word}'] = 1
        # Add more feature extraction logic here if needed
        for j in range(i - 1, i + 2):
            if j >= 0 and j < len(sentence):
                features[f'word-{j}:{sentence[j]}'] = 1
        features[f'is_capitalized:{word[0].isupper()}'] = 1
        features[f'is_punctuation:{re.match(r"^[,.!?;]", word) != None}'] = 1
        return features

    def predict(self, features, labels):
        scores = {label: sum(features.get(feature, 0) * weight for feature, weight in self.weights.get(label, {}).items())
                  for label in labels}
        return max(scores, key=scores.get)

    def update_weights(self, features, predicted_label, true_label):
        if true_label not in self.weights:
            self.weights[true_label] = {}
        if predicted_label not in self.weights:
            self.weights[predicted_label] = {}

        for feature, value in features.items():
            self.weights[true_label][feature] = self.weights[true_label].get(feature, 0) + self.learning_rate * value
            self.weights[predicted_label][feature] = self.weights[predicted_label].get(feature, 0) - self.learning_rate * value

    def classify(self, sentence):
        predicted_labels = []
        for i in range(len(sentence)):
            features = self.extract_features(sentence, i)
            predicted_label = self.predict(features, list(self.weights.keys()))
            predicted_labels.append(predicted_label)
        return predicted_labels


def extract_data(data):
    # data1 = data.split("")
    words = []
    labels = []
    l=[]
    for sentence in data:
        if sentence=="":
            obj=(words,labels)
            l.append((obj))
            labels=[]
            words=[]
            obj=()
        else:
            a=sentence.split(" ")

            words.append(a[0])
            labels.append(a[1])
    return l

def extract_data_test(data):
    # data1 = data.split("")

    L=[]
    l = []
    for sentence in data:
        if sentence=="":
            L.append(l)
            l = []
        else:
            l.append(sentence)
    return L

# Given dataset
with open(r'Data\ES\train') as f:
    data = f.read().splitlines()

dataset = extract_data(data)

# Initialize and train the sentiment analyzer
analyzer = CRFSentimentAnalyzer()
analyzer.train(dataset)

# Test sentences
with open(r'Data\ES\dev.in') as f:
    test_data = f.read().splitlines()

test_sentences = extract_data_test(test_data)

# Perform sentiment analysis on test sentences
with open(r'Data\ES\dev.p4.out', "w+") as f:
    for sentence in test_sentences:
        predicted_labels = analyzer.classify(sentence)
        for j in range(len(predicted_labels)):
            f.write(sentence[j] + " " + predicted_labels[j] + "\n")
        else:
            f.write("\n")

In [None]:
class NaiveBayes:
    def __init__(self):
        self.word_count = {}
        self.class_count = {}
        self.total_words = set()

    def fit(self, data):
        lines = data.split('\n')
        for line in lines:
            if not line.strip():
                continue

            parts = line.split()
            if len(parts) != 2:
                continue

            word, tag = parts
            self.total_words.add(word)
            
            if tag not in self.class_count:
                self.class_count[tag] = 0
            self.class_count[tag] += 1
            
            if tag not in self.word_count:
                self.word_count[tag] = {}
            if word not in self.word_count[tag]:
                self.word_count[tag][word] = 0
            self.word_count[tag][word] += 1

    def predict(self, sentence):
        sentiments = []
        for word in sentence.split():
            max_prob = float('-inf')
            best_tag = 'Outside'
            for tag in self.class_count:
                prob = self.class_count[tag]
                # Use Laplace smoothing for word probabilities
                word_prob = (self.word_count[tag].get(word, 0) + 1) / (self.class_count[tag] + len(self.total_words))
                prob *= word_prob
                if prob > max_prob:
                    max_prob = prob
                    best_tag = tag
            sentiments.append(best_tag)
        return sentiments

# Example Usage:
with open(r'Data\\ES\\train') as f:
    data = f.read()

classifier = NaiveBayes()
classifier.fit(data)

print(classifier.predict("Risotto"))  # Expected: negative (based on training data)


In [3]:
#FK U


class CRFSentimentAnalyzer:
    def __init__(self, learning_rate=0.01, max_iterations=1000, softening=0.01):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.softening = softening  # Add softening parameter
        self.weights = {}

    def train(self, dataset):
        for sentence, labels in dataset:
            for i in range(len(sentence)):
                features = self.extract_features(sentence, i)
                predicted_label = self.predict(features, labels)
                true_label = labels[i]
                if predicted_label != true_label:
                    self.update_weights(features, predicted_label, true_label)

    def extract_features(self, sentence, i):
        features = {}
        word = sentence[i]
        features[f'word:{word}'] = 1
        # Add more feature extraction logic here if needed
        return features

    def predict(self, features, labels):
        scores = {label: sum(features.get(feature, 0) * weight for feature, weight in self.weights.get(label, {}).items())
                  for label in labels}
        return max(scores, key=scores.get)

    def update_weights(self, features, predicted_label, true_label):
        if true_label not in self.weights:
            self.weights[true_label] = {}
        if predicted_label not in self.weights:
            self.weights[predicted_label] = {}

        for feature, value in features.items():
            # Adding softening to the weight updates
            self.weights[true_label][feature] = (1 - self.softening) * self.weights[true_label].get(feature, 0) + self.learning_rate * value
            self.weights[predicted_label][feature] = (1 - self.softening) * self.weights[predicted_label].get(feature, 0) - self.learning_rate * value

    def classify(self, sentence):
        predicted_labels = []
        for i in range(len(sentence)):
            features = self.extract_features(sentence, i)
            predicted_label = self.predict(features, list(self.weights.keys()))
            predicted_labels.append(predicted_label)
        return predicted_labels


def extract_data(data):
    # data1 = data.split("")
    words = []
    labels = []
    l=[]
    for sentence in data:
        if sentence=="":
            obj=(words,labels)
            l.append((obj))
            labels=[]
            words=[]
            obj=()
        else:
            a=sentence.split(" ")

            words.append(a[0])
            labels.append(a[1])
    return l

def extract_data_test(data):
    # data1 = data.split("")

    L=[]
    l = []
    for sentence in data:
        if sentence=="":
            L.append(l)
            l = []
        else:
            l.append(sentence)
    return L

# Load training data
with open(r'Data\\ES\\train') as f:
    data = f.read().splitlines()

dataset = extract_data(data)

# Initialize and train the sentiment analyzer with softening
analyzer = CRFSentimentAnalyzer(learning_rate=0.01, max_iterations=200, softening=0.0001)
analyzer.train(dataset)

# Load test data
with open(r'Data\\ES\\dev.in') as f:
    test_data = f.read().splitlines()

test_sentences = extract_data_test(test_data)

# Perform sentiment analysis on test sentences
with open(r'Data\\ES\\dev.p4.out', "w+") as f:
    for sentence in test_sentences:
        predicted_labels = analyzer.classify(sentence)
        for j in range(len(predicted_labels)):
            f.write(sentence[j] + " " + predicted_labels[j] + "\n")
        else:
            f.write("\n")

In [5]:
class ImprovedCRFSentimentAnalyzer:
    def __init__(self, learning_rate=0.01, max_iterations=1000, regularization=0.01, softening=0.001):
        self.learning_rate = learning_rate
        self.max_iterations = max_iterations
        self.regularization = regularization
        self.softening = softening
        self.weights = {}

    def train(self, dataset):
        for sentence, labels in dataset:
            for i in range(len(sentence)):
                features = self.extract_features(sentence, i)
                predicted_label = self.predict(features, labels)
                true_label = labels[i]
                if predicted_label != true_label:
                    self.update_weights(features, predicted_label, true_label)

    def extract_features(self, sentence, i):
        features = {}
        word = sentence[i]
        features[f'word:{word}'] = 1
        # Add more feature extraction logic here if needed
        return features

    def predict(self, features, labels):
        scores = {label: sum(features.get(feature, 0) * weight for feature, weight in self.weights.get(label, {}).items())
                  for label in labels}
        return max(scores, key=scores.get)

    def update_weights(self, features, predicted_label, true_label):
        if true_label not in self.weights:
            self.weights[true_label] = {}
        if predicted_label not in self.weights:
            self.weights[predicted_label] = {}

        for feature, value in features.items():
            # Adding regularization and softening to the weight updates
            self.weights[true_label][feature] = (1 - self.regularization) * (1 - self.softening) * self.weights[true_label].get(feature, 0) + self.learning_rate * value
            self.weights[predicted_label][feature] = (1 - self.regularization) * (1 - self.softening) * self.weights[predicted_label].get(feature, 0) - self.learning_rate * value

    def classify(self, sentence):
        predicted_labels = []
        for i in range(len(sentence)):
            features = self.extract_features(sentence, i)
            predicted_label = self.predict(features, list(self.weights.keys()))
            predicted_labels.append(predicted_label)
        return predicted_labels

def extract_data(data):
    # data1 = data.split("")
    words = []
    labels = []
    l=[]
    for sentence in data:
        if sentence=="":
            obj=(words,labels)
            l.append((obj))
            labels=[]
            words=[]
            obj=()
        else:
            a=sentence.split(" ")

            words.append(a[0])
            labels.append(a[1])
    return l

def extract_data_test(data):
    # data1 = data.split("")

    L=[]
    l = []
    for sentence in data:
        if sentence=="":
            L.append(l)
            l = []
        else:
            l.append(sentence)
    return L
# Load training data
with open(r'Data\\ES\\train') as f:
    data = f.read().splitlines()

dataset = extract_data(data)  # You should define extract_data function as in your previous code

# Initialize and train the improved sentiment analyzer
improved_analyzer = ImprovedCRFSentimentAnalyzer(learning_rate=0.01, max_iterations=200, regularization=0.001, softening=0.0001)
improved_analyzer.train(dataset)

# Load test data
with open(r'Data\\ES\\dev.in') as f:
    test_data = f.read().splitlines()

test_sentences = extract_data_test(test_data)  # You should define extract_data_test function as in your previous code

# Perform sentiment analysis on test sentences
with open(r'Data\\ES\\dev.p4.out', "w+") as f:
    for sentence in test_sentences:
        predicted_labels = improved_analyzer.classify(sentence)
        for j in range(len(predicted_labels)):
            f.write(sentence[j] + " " + predicted_labels[j] + "\n")
        else:
            f.write("\n")

In [10]:
class HMM:
    def __init__(self, states, observations):
        self.states = states
        self.observations = observations
        self.initial_prob = {}
        self.transition_prob = {}
        self.emission_prob = {}

    def train(self, labeled_data):
        # Count occurrences of initial states
        for sentence, _ in labeled_data:
            initial_state = _[0]
            self.initial_prob.setdefault(initial_state, 0)
            self.initial_prob[initial_state] += 1

        # Normalize initial probabilities
        total_sentences = len(labeled_data)
        for state, count in self.initial_prob.items():
            self.initial_prob[state] = count / total_sentences

        # Count occurrences of state transitions and emissions
        for sentence, labels in labeled_data:
            prev_state = labels[0]
            for current_state, word in zip(labels[1:], sentence[1:]):
                self.transition_prob.setdefault(prev_state, {}).setdefault(current_state, 0)
                self.transition_prob[prev_state][current_state] += 1

                self.emission_prob.setdefault(current_state, {}).setdefault(word, 0)
                self.emission_prob[current_state][word] += 1

                prev_state = current_state

        # Normalize transition and emission probabilities
        for prev_state in self.transition_prob:
            total_transitions = sum(self.transition_prob[prev_state].values())
            for current_state in self.transition_prob[prev_state]:
                self.transition_prob[prev_state][current_state] /= total_transitions

        for state in self.emission_prob:
            total_emissions = sum(self.emission_prob[state].values())
            for word in self.emission_prob[state]:
                self.emission_prob[state][word] /= total_emissions

    def predict(self, sentence):
        n = len(sentence)
        dp = [{} for _ in range(n)]

        for state in self.states:
            dp[0][state] = self.initial_prob.get(state, 0) * self.emission_prob.get(state, {}).get(sentence[0], 0)

        for t in range(1, n):
            for current_state in self.states:
                max_prob = 0
                for prev_state in self.states:
                    transition_prob = self.transition_prob.get(prev_state, {}).get(current_state, 0)
                    emission_prob = self.emission_prob.get(current_state, {}).get(sentence[t], 0)
                    prob = dp[t - 1][prev_state] * transition_prob * emission_prob

                    if prob > max_prob:
                        max_prob = prob
                        dp[t][current_state] = prob

        # Backtrack to find the best path
        best_path = []
        max_prob = 0
        for state in self.states:
            if dp[n - 1][state] > max_prob:
                max_prob = dp[n - 1][state]
                best_path = [state]

        for t in range(n - 2, -1, -1):
            next_state = best_path[0]
            for state in self.states:
                transition_prob = self.transition_prob.get(state, {}).get(next_state, 0)
                emission_prob = self.emission_prob.get(next_state, {}).get(sentence[t + 1], 0)
                prob = dp[t][state] * transition_prob * emission_prob

                if prob > max_prob:
                    max_prob = prob
                    best_path.insert(0, state)

        return best_path
# Load and preprocess training data
def load_data(file_path):
    with open(file_path, 'r') as f:
        data = f.read().split('\n\n')
    return [sentence.split('\n') for sentence in data]

# Extract words and labels from labeled data
def extract_words_and_labels(labeled_sentence):
    words = []
    labels = []
    for line in labeled_sentence:
        if line:
            word, label = line.split(' ')
            words.append(word)
            labels.append(label)
    print(words)
    return words, labels

# Load and preprocess labeled training data
es_train_data = load_data('Data/ES/train')
ru_train_data = load_data('Data/RU/train')

es_labeled_sentences = [extract_words_and_labels(sentence) for sentence in es_train_data]
ru_labeled_sentences = [extract_words_and_labels(sentence) for sentence in ru_train_data]

# Initialize and train HMMs
states = ['B-positive', 'I-positive', 'B-negative', 'I-negative', 'B-neutral', 'I-neutral', 'O']
observations = set([word for sentence, _ in es_labeled_sentences + ru_labeled_sentences for word in sentence])

es_hmm = HMM(states, observations)
ru_hmm = HMM(states, observations)

es_hmm.train(es_labeled_sentences)
ru_hmm.train(ru_labeled_sentences)

# Load and preprocess development data
def load_unlabeled_data(file_path):
    with open(file_path, 'r') as f:
        data = f.read().split('\n')
    return [sentence.split() for sentence in data]

es_dev_data = load_unlabeled_data('Data/ES/dev.in')
ru_dev_data = load_unlabeled_data('Data/RU/dev.in')

# Predict sentiment using trained HMMs
es_predictions = [es_hmm.predict(sentence) for sentence in es_dev_data]
ru_predictions = [ru_hmm.predict(sentence) for sentence in ru_dev_data]

# Write predictions to output files
def write_predictions_to_file(predictions, output_file):
    with open(output_file, 'w') as f:
        for sentence_predictions in predictions:
            f.write('\n'.join(sentence_predictions))
            f.write('\n\n')

write_predictions_to_file(es_predictions, 'Data/ES/dev.p4.out')
write_predictions_to_file(ru_predictions, 'Data/RU/dev.p4.out')

# Calculate precision, recall, and F-scores
def evaluate(predictions, true_labels):
    # TODO: Implement evaluation logic
    pass

# Evaluate the models
es_true_labels = load_data('Data/ES/dev.out')
ru_true_labels = load_data('Data/RU/dev.out')

es_precision, es_recall, es_fscore = evaluate(es_predictions, es_true_labels)
ru_precision, ru_recall, ru_fscore = evaluate(ru_predictions, ru_true_labels)

print(f"ES Precision: {es_precision:.4f}, Recall: {es_recall:.4f}, F-score: {es_fscore:.4f}")
print(f"RU Precision: {ru_precision:.4f}, Recall: {ru_recall:.4f}, F-score: {ru_fscore:.4f}")

['Estuvimos', 'hace', 'poco', 'mi', 'pareja', 'y', 'yo', 'comiendo', 'y', 'resultó', 'todo', 'muy', 'bien', ',', 'tanto', 'la', 'comida', ',', 'el', 'vino', ',', 'el', 'trato', ',', 'la', 'decoración', '…', 'nos', 'gustó', 'todo', 'mucho', '.']
['Por', 'poner', 'algún', 'pero', ',', 'quizá', 'el', 'jamón', 'no', 'era', 'todo', 'lo', '"', 'ibérico', '"', 'que', 'cabía', 'esperar', '.']
['Bien', 'lo', 'sabe', 'el', 'autor', 'del', 'blog', '.', ')']
['Comida', 'exquisita', '.']
['Restaurante', 'diferente', ',', 'creativo', 'y', 'agradable', '.']
['Si', 'no', 'has', 'probado', 'sus', 'carnes', 'te', 'estas', 'perdiendo', 'algo', 'muy', 'grande', '!']
['En', 'resumen', ',', 'comida', 'bien-muy', 'bien', ',', 'servicio', 'correcto', 'y', 'profesional']
['02-12-', '2012', 'elegimos', 'este', 'restaurante', 'por', 'los', 'comentarios', ',', 'pero', 'ha', 'sido', 'una', 'Grandisima', 'Decepción', '.']
['Salimos', 'encantadas', 'del', 'restaurante', '.']
['Ubicación']
['Comimos', 'muy', 'bien', 

ValueError: too many values to unpack (expected 2)