In [8]:
import numpy as np
import pandas as pd
import time
from gensim.models import KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Load word vectors
wv = KeyedVectors.load('embs_train.kv')

# Function to compute the sentence embedding with TF-IDF weighting
def sentence_embedding(sentence, word_counts, tfidf_matrix, feature_names):
    tokens = sentence.split()
    embeddings = []
    for token in tokens:
        if token in wv and word_counts[token] >= 2:  # Prune words with low counts
            if token in feature_names:
                index = np.where(feature_names == token)[0][0]  # Get the index of the token
                tfidf_value = tfidf_matrix[0, index]  # Get the TF-IDF value
                embeddings.append(wv[token] * tfidf_value)  # Weight by TF-IDF
    if len(embeddings) > 0:
        return np.sum(embeddings, axis=0)  # Use sum instead of mean
    else:
        return np.zeros(wv.vector_size)

# Prepare data
def prepare_data(data, word_counts, tfidf_matrix, feature_names):
    X = np.array([sentence_embedding(sentence, word_counts, tfidf_matrix[i], feature_names) for i, sentence in enumerate(data['sentence'])])
    y = np.array([1 if label == '+' else 0 for label in data['target']])  # Binary labels for sklearn
    return X, y

if __name__ == "__main__":
    # Measure total runtime
    start_time = time.time()
    
    # Load data
    load_start = time.time()
    train_data = pd.read_csv('train.csv')  
    dev_data = pd.read_csv('dev.csv')
    test_data = pd.read_csv('test.csv')

    # Count word frequencies in the training set
    freq_start = time.time()
    word_counts = Counter()
    for review in train_data['sentence']:
        word_counts.update(review.split())

    # Create a TF-IDF vectorizer
    tfidf_start = time.time()
    tfidf_vectorizer = TfidfVectorizer(vocabulary=word_counts.keys())
    tfidf_matrix = tfidf_vectorizer.fit_transform(train_data['sentence'])
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Prepare training and development data
    prep_start = time.time()
    X_train, y_train = prepare_data(train_data, word_counts, tfidf_matrix, feature_names)
    X_dev_matrix = tfidf_vectorizer.transform(dev_data['sentence'])
    X_dev, y_dev = prepare_data(dev_data, word_counts, X_dev_matrix, feature_names)

    # Train Logistic Regression model
    train_start = time.time()
    model = LogisticRegression(max_iter=1000, C=1.0, solver='liblinear')
    model.fit(X_train, y_train)

    # Evaluate on development data
    eval_start = time.time()
    y_dev_pred = model.predict(X_dev)
    dev_accuracy = accuracy_score(y_dev, y_dev_pred)
    dev_error_rate = 1 - dev_accuracy
    print(f"Development Accuracy: {dev_accuracy * 100:.2f}%")
    print(f"Development Error Rate: {dev_error_rate * 100:.2f}%")

    # Predict on test data and save results
    test_start = time.time()
    X_test_matrix = tfidf_vectorizer.transform(test_data['sentence'])
    X_test, _ = prepare_data(test_data, word_counts, X_test_matrix, feature_names)  # No target column for prediction
    test_predictions = model.predict(X_test)
    test_data['target'] = ['+' if pred == 1 else '-' for pred in test_predictions]
    test_data.to_csv("C:/Users/badhe/Downloads/badhe_HW4_ML/test4.predicted.csv", index=False)
    print("Test predictions saved to test.predicted.csv")

    # Total runtime
    print(f"Total runtime: {time.time() - start_time:.2f} seconds")


Data loading time: 0.02 seconds
Word frequency computation time: 0.03 seconds
TF-IDF computation time: 0.09 seconds
Data preparation time: 48.47 seconds
Model training time: 0.39 seconds
Development Accuracy: 76.60%
Development Error Rate: 23.40%
Development evaluation time: 0.01 seconds
Test prediction time: 5.23 seconds
Test predictions saved to test.predicted.csv
Total runtime: 54.25 seconds
