# Comparing Word Embeddings for Sentiment Analysis

In [21]:
import gensim
import json
import numpy as np
from gensim.models import Word2Vec, FastText, KeyedVectors
import os

# Resolve the project base directory: two levels above the current
# working directory.
current_dir = os.getcwd()
base_dir = os.path.dirname(os.path.dirname(current_dir))

In [22]:
# Load the pre-trained embedding models from the local 'Models' directory.
# Word2Vec and FastText are loaded with their own .load(); the GloVe
# vectors are read as a header-less word2vec-format text file.
word2vec_model = Word2Vec.load('Models/word2vec.model')
fasttext_model = FastText.load('Models/fasttext.model')
glove_model = KeyedVectors.load_word2vec_format(
    'Models/glove/vectors.txt',
    binary=False,
    no_header=True,
)

# Seed vocabularies that anchor the positive and negative sentiment
# directions. The two lists are index-aligned antonym pairs.
positive_words = [
    "good", "great", "excellent",
    "positive", "fortunate", "correct",
    "superior", "happy", "beneficial",
]
negative_words = [
    "bad", "terrible", "poor",
    "negative", "unfortunate", "wrong",
    "inferior", "sad", "harmful",
]

def compute_average_vector(words, model):
    """Return the element-wise mean of the vectors found for ``words``.

    ``model`` may either expose its vectors through a ``.wv`` attribute or
    support ``in`` / ``[]`` lookups itself. Words the model does not
    contain are skipped. Returns None when no word is found.
    """
    # Pick the object that actually answers lookups.
    lookup = model.wv if hasattr(model, 'wv') else model
    found = [lookup[w] for w in words if w in lookup]
    if not found:
        return None
    return np.mean(found, axis=0)

# Precompute the (positive, negative) seed vectors once per model so that
# scoring an article does not have to re-average the seed words.
pos_vec_word2vec, neg_vec_word2vec = (
    compute_average_vector(seed, word2vec_model)
    for seed in (positive_words, negative_words)
)

pos_vec_fasttext, neg_vec_fasttext = (
    compute_average_vector(seed, fasttext_model)
    for seed in (positive_words, negative_words)
)

pos_vec_glove, neg_vec_glove = (
    compute_average_vector(seed, glove_model)
    for seed in (positive_words, negative_words)
)

def cosine_similarity(v1, v2):
    """Return the cosine of the angle between ``v1`` and ``v2``.

    Returns 0 when either argument is None or has zero length, so callers
    never divide by zero.
    """
    if v1 is None or v2 is None:
        return 0
    magnitudes = (np.linalg.norm(v1), np.linalg.norm(v2))
    if any(m == 0 for m in magnitudes):
        return 0
    len_a, len_b = magnitudes
    return np.dot(v1, v2) / (len_a * len_b)

def get_sentence_vector(text, model):
    """Return the mean embedding of the tokens in ``text``, or None.

    The text is lower-cased and split on whitespace, and the characters
    ``.,!?";:`` are stripped from both ends of each token. Tokens that are
    empty after stripping, or that the model does not contain, are ignored.

    Args:
        text: String to embed.
        model: Either an object exposing its vectors through ``.wv`` or an
            object that supports ``in`` and ``[]`` lookups directly.

    Returns:
        The element-wise mean of the vectors found (``np.mean(..., axis=0)``),
        or None when no token yields a vector.
    """
    # Resolve the lookup table once instead of re-checking for every token.
    lookup = model.wv if hasattr(model, 'wv') else model

    vectors = []
    # Simple whitespace tokenization (can be improved with nltk or regex).
    for raw_token in text.lower().split():
        # Remove common punctuation from both ends of the token.
        token = raw_token.strip('.,!?";:')
        # A token made only of punctuation (e.g. "..." or "!") strips down
        # to "". Skip it: a model that reports every string as present
        # (gensim's FastText vectors are documented to do so) would
        # otherwise contribute a meaningless vector for "" to the average.
        if not token:
            continue
        if token in lookup:
            vectors.append(lookup[token])

    if not vectors:
        return None
    return np.mean(vectors, axis=0)

def sentiment_score(text, model, pos_seed, neg_seed):
    """
    Score ``text`` by how much closer it lies to the positive seed than to
    the negative seed.

    The text is embedded with ``get_sentence_vector``; the result is its
    cosine similarity to ``pos_seed`` minus its cosine similarity to
    ``neg_seed``. Returns 0 when the text yields no vector.
    """
    text_vec = get_sentence_vector(text, model)
    if text_vec is not None:
        toward_positive = cosine_similarity(text_vec, pos_seed)
        toward_negative = cosine_similarity(text_vec, neg_seed)
        return toward_positive - toward_negative
    return 0


In [23]:
def evaluate_dataset(dataset_file, dataset_name):
    """Score every article in a JSON dataset with each model and print a summary.

    Args:
        dataset_file: Path to a UTF-8 JSON file. It is iterated article by
            article, and each article is read with ``.get("title")`` and
            ``.get("content")`` (presumably a list of dicts; confirm against
            the scraper output).
        dataset_name: Label used in the printed summary header.

    Prints, for each of the word2vec, fasttext and glove models, the mean
    sentiment score and the number of articles scoring above and below zero.
    Returns None.
    """
    with open(dataset_file, 'r', encoding='utf-8') as f:
        articles = json.load(f)

    # One entry per embedding: (model, positive seed vector, negative seed vector).
    scorers = {
        "word2vec": (word2vec_model, pos_vec_word2vec, neg_vec_word2vec),
        "fasttext": (fasttext_model, pos_vec_fasttext, neg_vec_fasttext),
        "glove": (glove_model, pos_vec_glove, neg_vec_glove),
    }
    results = {model_name: [] for model_name in scorers}

    # Process each article.
    for article in articles:
        # A JSON null decodes to None, which .get()'s default does not
        # replace; fall back to "" so the concatenation cannot raise.
        title = article.get("title") or ""
        content = article.get("content") or ""
        full_text = title + " " + content

        for model_name, (model, pos_seed, neg_seed) in scorers.items():
            results[model_name].append(
                sentiment_score(full_text, model, pos_seed, neg_seed)
            )

    # Compute and print summary statistics.
    print(f"--- Sentiment Analysis Summary for {dataset_name} ---")
    for model_name, scores in results.items():
        if scores:
            avg_score = np.mean(scores)
            positive_count = sum(1 for s in scores if s > 0)
            negative_count = sum(1 for s in scores if s < 0)
            total = len(scores)
            print(f"{model_name} - Average Score: {avg_score:.4f} | Positive: {positive_count} | Negative: {negative_count} | Total Articles: {total}")
        else:
            print(f"{model_name} - No articles processed.")
    print("\n")

# Each dataset is a (path, display name) pair; all files live in
# <base_dir>/Training Data/News Articles/Output.
datasets = [
    (os.path.join(base_dir, "Training Data", "News Articles", "Output", filename), label)
    for filename, label in (
        ("data_nyt.json", "NYT Articles"),
        ("data_usn.json", "US News Articles"),
        ("data_wp.json", "WP Articles"),
    )
]

# Run the sentiment evaluation over every dataset in turn.
for file_path, name in datasets:
    evaluate_dataset(file_path, name)

--- Sentiment Analysis Summary for NYT Articles ---
word2vec - Average Score: -0.1353 | Positive: 1 | Negative: 9 | Total Articles: 10
fasttext - Average Score: -0.0355 | Positive: 4 | Negative: 6 | Total Articles: 10
glove - Average Score: 0.0250 | Positive: 6 | Negative: 4 | Total Articles: 10


--- Sentiment Analysis Summary for US News Articles ---
word2vec - Average Score: -0.1077 | Positive: 3 | Negative: 6 | Total Articles: 9
fasttext - Average Score: -0.0752 | Positive: 1 | Negative: 8 | Total Articles: 9
glove - Average Score: 0.0319 | Positive: 5 | Negative: 4 | Total Articles: 9


--- Sentiment Analysis Summary for WP Articles ---
word2vec - Average Score: -0.1035 | Positive: 5 | Negative: 11 | Total Articles: 16
fasttext - Average Score: -0.0387 | Positive: 7 | Negative: 9 | Total Articles: 16
glove - Average Score: 0.0256 | Positive: 9 | Negative: 7 | Total Articles: 16


