WORD2VEC - SENTIMENT ANALYSIS WITH COSINE **SIMILARITY**

In [12]:
import numpy as np
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize

# Download the 'punkt_tab' resource
nltk.download('punkt_tab')

# Sample corpus to train Word2Vec
sentences = [
    "This is a positive sentence",
    "This is another positive sentence",
    "This is a negative sentence",
    "I love this product",
    "The movie was terrible"
]

# Tokenize the sentences
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

# Train Word2Vec model
word2vec_model = Word2Vec(tokenized_sentences, vector_size=50, window=5, min_count=1, workers=4)

# Function to get sentence embeddings by averaging word embeddings
def get_sentence_embedding(sentence, model):
    words = word_tokenize(sentence.lower())
    word_embeddings = [model.wv[word] for word in words if word in model.wv]
    if len(word_embeddings) > 0:
        return np.mean(word_embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Function to calculate cosine similarity
def calculate_similarity(text1, text2):
    embedding1 = get_sentence_embedding(text1, word2vec_model)
    embedding2 = get_sentence_embedding(text2, word2vec_model)
    return cosine_similarity([embedding1], [embedding2])[0][0]

# Example sentences
text1 = "This is a positive sentence"
text2 = "This is another positive sentence"
text3 = "This is a negative sentence"

# Calculate similarities
similarity_1_2 = calculate_similarity(text1, text2)
similarity_1_3 = calculate_similarity(text1, text3)

# Print results
print(f"Similarity between '{text1}' and '{text2}': {similarity_1_2:.4f}")
print(f"Similarity between '{text1}' and '{text3}': {similarity_1_3:.4f}")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Similarity between 'This is a positive sentence' and 'This is another positive sentence': 0.8277
Similarity between 'This is a positive sentence' and 'This is a negative sentence': 0.8526


BERT - SENTIMENT ANALYSIS WITH COSINE **SIMILARITY**

In [11]:
#!pip install transformers sentence-transformers

from transformers import PreTrainedTokenizerFast, BertModel
from sentence_transformers import SentenceTransformer, util
import torch

# Load pre-trained ModernBERT model and tokenizer
model_name = 'answerdotai/ModernBERT-base'  # Changed to ModernBERT
tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Or use a sentence transformer model directly for better performance
# model = SentenceTransformer('answerdotai/ModernBERT-base') # Example: ModernBERT-base


def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling of token embeddings
    return embeddings


def calculate_similarity(text1, text2):
    embedding1 = get_bert_embedding(text1)
    embedding2 = get_bert_embedding(text2)

    # Calculate cosine similarity
    similarity_score = util.cos_sim(embedding1, embedding2).item()
    return similarity_score


# Example usage
text1 = "This is a positive sentence."
text2 = "This is another positive sentence."
text3 = "This is a negative sentence."

similarity_1_2 = calculate_similarity(text1, text2)
similarity_1_3 = calculate_similarity(text1, text3)

print(f"Similarity between '{text1}' and '{text2}': {similarity_1_2}")
print(f"Similarity between '{text1}' and '{text3}': {similarity_1_3}")

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

You are using a model of type modernbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of BertModel were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.layer.0.output.dense.bias', 'encod

Similarity between 'This is a positive sentence.' and 'This is another positive sentence.': 0.983720064163208
Similarity between 'This is a positive sentence.' and 'This is a negative sentence.': 0.9839354753494263
