In [5]:
import time
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

# Load the saved word vectors from 'embs_train.kv'. `wv` is a module-level
# global: sentence_embedding() looks tokens up in it, and its `vector_size`
# sets the dimensionality of the perceptron weight vectors.
wv = KeyedVectors.load('embs_train.kv')

# Function to compute the sentence embedding
def sentence_embedding(sentence):
    """Return the mean word vector of a whitespace-tokenised sentence.

    Tokens that are not present in the module-level ``wv`` vocabulary are
    skipped. When no token is known, a zero vector of length
    ``wv.vector_size`` is returned instead.
    """
    known_vectors = []
    for word in sentence.split():
        if word in wv:
            known_vectors.append(wv[word])

    # Guard clause: nothing to average when every token was out of vocabulary.
    if not known_vectors:
        return np.zeros(wv.vector_size)
    return np.mean(known_vectors, axis=0)

# Function to process data row by row for training
def read_from(dataframe):
    """Lazily yield one ``(label, embedding)`` pair per row of *dataframe*.

    The label is ``1`` when the row's ``'target'`` value equals ``'+'`` and
    ``-1`` for anything else. The embedding is ``sentence_embedding`` applied
    to the row's ``'sentence'`` value.
    """
    for _index, record in dataframe.iterrows():
        if record['target'] == '+':
            sign = 1
        else:
            sign = -1
        yield (sign, sentence_embedding(record['sentence']))

# Test the model
def test(data, model):
    """Return the fraction of rows in *data* that *model* misclassifies.

    A row counts as an error when ``label * dot(model, emb) <= 0``, so a
    score of exactly zero is an error regardless of the true label.

    Args:
        data: DataFrame consumed through ``read_from`` (needs ``'sentence'``
            and ``'target'`` columns).
        model: Weight vector with the same length as the sentence embeddings.

    Returns:
        The error rate as a float in ``[0, 1]``. An empty *data* yields
        ``0.0`` (no examples, hence no mistakes) instead of dividing by zero.
    """
    total = len(data)
    if total == 0:
        return 0.0

    errors = sum(
        1 for label, emb in read_from(data) if label * np.dot(model, emb) <= 0
    )
    return errors / total

# Train the model using a perceptron-like algorithm
def averaged_perceptron(train_data, dev_data, epochs=10):
    """Train an averaged perceptron on sentence embeddings.

    Each epoch makes one pass over *train_data*. The weight vector is updated
    on every example whose margin ``label * dot(model, emb)`` is ``<= 0``,
    and the running sum of weight vectors is accumulated after every example.
    After each epoch the (un-normalised) sum is evaluated on *dev_data* and a
    progress line is printed.

    Args:
        train_data: DataFrame with ``'sentence'`` and ``'target'`` columns.
        dev_data: DataFrame with the same columns, used only for evaluation.
        epochs: Number of passes over the training data; must be >= 1.

    Returns:
        The averaged weight vector after the final epoch, i.e. the running
        sum divided by ``epochs * len(train_data)``. Note that this is the
        last-epoch model, not the one that achieved the printed best dev
        error.

    Raises:
        ValueError: If *train_data* is empty or *epochs* is less than 1.
            Without these checks an empty training set crashes inside the
            progress message and ``epochs < 1`` yields an all-NaN model.
    """
    num_examples = len(train_data)
    if num_examples == 0:
        raise ValueError("train_data must contain at least one example")
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    t = time.time()
    best_err = 1.0
    model = np.zeros(wv.vector_size)  # Current perceptron weights
    avg_model = np.zeros(wv.vector_size)  # Running sum of weights over all steps

    for it in range(1, epochs + 1):
        updates = 0
        for label, emb in read_from(train_data):
            if label * np.dot(model, emb) <= 0:  # Perceptron update rule
                updates += 1
                model += label * emb
            avg_model += model

        # The un-normalised sum has the same sign pattern as the average, so
        # it can be evaluated directly without dividing first.
        dev_err = test(dev_data, avg_model)
        best_err = min(best_err, dev_err)
        print(f"Epoch {it}, Updates: {updates / num_examples * 100:.1f}%, Dev Error: {dev_err * 100:.1f}%")

    avg_model /= (epochs * num_examples)
    print(f"Best Dev Error: {best_err * 100:.1f}%, Time: {time.time() - t:.1f} secs")
    return avg_model

# Predict labels for test set and save to file
def predict_test(test_data, model, output_file="test.predicted.csv"):
    """Label every row of *test_data* with *model* and write the result as CSV.

    A row is labelled ``'+'`` when ``dot(model, embedding) >= 0`` and ``'-'``
    otherwise. The labels are stored in a ``'target'`` column that is added
    to (or overwritten on) the caller's DataFrame in place, and the frame is
    then written to *output_file* without the index column.
    """
    predicted_labels = [
        '+' if np.dot(model, sentence_embedding(record['sentence'])) >= 0 else '-'
        for _index, record in test_data.iterrows()
    ]

    test_data['target'] = predicted_labels
    test_data.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

# Script entry point: load the three CSV splits, train, then write predictions.
if __name__ == "__main__":
    # Load training, development, and test data from the working directory.
    # train/dev rows are read through read_from(), which needs 'sentence' and
    # 'target' columns; predict_test() reads only 'sentence' from the test split.
    train_data = pd.read_csv('train.csv')  
    dev_data = pd.read_csv('dev.csv')
    test_data = pd.read_csv('test.csv')

    # Train the model (10 passes over the training data; dev error is printed
    # after every pass).
    trained_model = averaged_perceptron(train_data, dev_data, epochs=10)

    # Predict the test set and save the results.
    # NOTE(review): the output location is a hard-coded absolute path, so this
    # line only works where that directory exists.
    predict_test(test_data, trained_model, "C:/Users/badhe/Downloads/badhe_HW4_ML/test.predicted.csv")

Epoch 1, Updates: 31.1%, Dev Error: 24.9%
Epoch 2, Updates: 29.5%, Dev Error: 23.9%
Epoch 3, Updates: 29.8%, Dev Error: 24.3%
Epoch 4, Updates: 29.1%, Dev Error: 24.1%
Epoch 5, Updates: 29.7%, Dev Error: 24.2%
Epoch 6, Updates: 29.4%, Dev Error: 23.9%
Epoch 7, Updates: 29.4%, Dev Error: 23.6%
Epoch 8, Updates: 29.4%, Dev Error: 23.8%
Epoch 9, Updates: 29.1%, Dev Error: 24.1%
Epoch 10, Updates: 29.1%, Dev Error: 24.4%
Best Dev Error: 23.6%, Time: 14.9 secs
Predictions saved to C:/Users/badhe/Downloads/badhe_HW4_ML/test2.predicted.csv
