In [5]:
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from collections import Counter

# Load the word vectors saved in 'embs_train.kv'. The resulting `wv` is a
# module-level global that the functions below read directly (membership
# tests, vector lookup and `vector_size`).
wv = KeyedVectors.load('embs_train.kv')

# Function to compute the sentence embedding
def sentence_embedding(sentence, word_counts):
    """Return the mean word vector of the usable tokens in `sentence`.

    The sentence is split on whitespace.  A token is used only when it is
    present in the global `wv` and its entry in `word_counts` is greater
    than 1.  When no token qualifies, a zero vector of length
    `wv.vector_size` is returned instead.
    """
    vectors = [
        wv[word]
        for word in sentence.split()
        if word in wv and word_counts[word] > 1
    ]
    # Nothing usable in this sentence: fall back to the all-zero vector.
    if len(vectors) == 0:
        return np.zeros(wv.vector_size)
    # Element-wise average over the collected word vectors.
    return np.mean(vectors, axis=0)

# Function to process data row by row for training/testing
def read_from(dataframe, word_counts):
    """Yield one `(label, embedding)` pair per row of `dataframe`.

    The label is 1 when the row's 'target' value equals '+', and -1 for
    anything else.  The embedding is `sentence_embedding` applied to the
    row's 'sentence' value.
    """
    for _, record in dataframe.iterrows():
        # Map the textual class onto the +1 / -1 encoding.
        if record['target'] == '+':
            sign = 1
        else:
            sign = -1
        vector = sentence_embedding(record['sentence'], word_counts)
        yield (sign, vector)

# Test the model
def test(data, model, word_counts):
    """Return the fraction of rows in `data` that `model` misclassifies.

    Each row is converted to a `(label, embedding)` pair by `read_from`.
    A row counts as an error when `label * dot(model, embedding)` is not
    strictly positive, so a score of exactly zero is always an error.

    Args:
        data: DataFrame with 'sentence' and 'target' columns.
        model: Weight vector with the same length as the embeddings.
        word_counts: Token frequencies forwarded to `read_from`.

    Returns:
        The error rate as a float in [0, 1]; 0.0 when `data` has no rows.
    """
    tot, err = 0, 0
    for label, emb in read_from(data, word_counts):
        tot += 1
        # Explicit branch instead of adding a NumPy bool to the counter.
        if label * np.dot(model, emb) <= 0:
            err += 1
    # Avoid ZeroDivisionError on an empty evaluation set.
    return err / tot if tot else 0.0

# Train the model using a perceptron-like algorithm
def averaged_perceptron(train_data, dev_data, word_counts, epochs=10):
    """Train an averaged perceptron on sentence embeddings.

    For every training example the current weights are updated when the
    example is not classified with a strictly positive margin, and the
    current weights are then added to a running sum.  After each epoch the
    running sum (not yet divided by the step count) is scored on
    `dev_data` with `test`, and a progress line is printed.

    Args:
        train_data: DataFrame with 'sentence' and 'target' columns.
        dev_data: DataFrame of the same layout used for evaluation.
        word_counts: Token frequencies forwarded to `read_from` / `test`.
        epochs: Number of passes over `train_data`.

    Returns:
        The running sum of weights divided by `epochs * len(train_data)`,
        taken after the final epoch.
    """
    started = time.time()
    dim = wv.vector_size
    n_train = len(train_data)

    weights = np.zeros(dim)      # current perceptron weights
    weight_sum = np.zeros(dim)   # sum of the weights after every step
    lowest_dev_err = 1.0

    for epoch in range(1, epochs + 1):
        mistakes = 0
        for label, emb in read_from(train_data, word_counts):
            margin = label * np.dot(weights, emb)
            # Non-positive margin: move the weights towards the example.
            if margin <= 0:
                mistakes += 1
                weights += label * emb
            weight_sum += weights

        dev_err = test(dev_data, weight_sum, word_counts)
        if dev_err < lowest_dev_err:
            lowest_dev_err = dev_err
        update_pct = mistakes / n_train * 100
        print(f"Epoch {epoch}, Updates: {update_pct:.1f}%, Dev Error: {dev_err * 100:.1f}%")

    # Turn the accumulated sum into an average over all training steps.
    weight_sum /= (epochs * n_train)
    elapsed = time.time() - started
    print(f"Best Dev Error: {lowest_dev_err * 100:.1f}%, Time: {elapsed:.1f} secs")
    return weight_sum

# Predict labels for the test set and save to file
def predict_test(test_data, model, word_counts, output_file="test.predicted.csv"):
    """Label every row of `test_data` with `model` and write the result as CSV.

    A row is labelled '+' when the dot product of `model` with the row's
    sentence embedding is greater than or equal to zero, otherwise '-'.
    The labels are stored in the 'target' column of `test_data` itself
    (the passed DataFrame is modified), which is then written to
    `output_file` without the index.
    """
    def _label_for(sentence):
        # Score one sentence and map the sign of the score to a class symbol.
        score = np.dot(model, sentence_embedding(sentence, word_counts))
        return '+' if score >= 0 else '-'

    labels = [_label_for(record['sentence']) for _, record in test_data.iterrows()]

    test_data['target'] = labels
    test_data.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

if __name__ == "__main__":
    # Read the training, development and test splits, in that order.
    train_data, dev_data, test_data = (
        pd.read_csv(path) for path in ('train.csv', 'dev.csv', 'test.csv')
    )

    # Token frequencies over the whitespace-split training sentences.
    word_counts = Counter(
        token
        for review in train_data['sentence']
        for token in review.split()
    )

    # Fit the averaged perceptron, reporting dev error after each epoch.
    trained_model = averaged_perceptron(train_data, dev_data, word_counts, epochs=10)

    # Label the test split and write the predictions to disk.
    predict_test(
        test_data,
        trained_model,
        word_counts,
        "C:/Users/badhe/Downloads/badhe_HW4_ML/test3.predicted.csv",
    )


Epoch 1, Updates: 32.0%, Dev Error: 25.0%
Epoch 2, Updates: 30.4%, Dev Error: 25.1%
Epoch 3, Updates: 30.3%, Dev Error: 24.4%
Epoch 4, Updates: 29.9%, Dev Error: 24.2%
Epoch 5, Updates: 30.4%, Dev Error: 24.2%
Epoch 6, Updates: 30.7%, Dev Error: 24.5%
Epoch 7, Updates: 30.4%, Dev Error: 24.3%
Epoch 8, Updates: 30.1%, Dev Error: 24.5%
Epoch 9, Updates: 29.9%, Dev Error: 24.7%
Epoch 10, Updates: 30.2%, Dev Error: 24.7%
Best Dev Error: 24.2%, Time: 14.7 secs
Predictions saved to C:/Users/badhe/Downloads/badhe_HW4_ML/test3.predicted.csv
