In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from safetensors.torch import load_file
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
import torch.nn as nn
import math
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources used by preprocess_text: the English stop-word
# list and the Punkt data behind word_tokenize. nltk.download reports the
# package as up to date instead of re-fetching it when it is already present.
# NOTE(review): recent NLTK releases look up 'punkt_tab' rather than 'punkt'
# when tokenizing — confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

# Loading the vocabulary
def load_vocabulary(vocab_path):
    """Read a one-token-per-line vocabulary file into a token -> index dict.

    Args:
        vocab_path: Path to a UTF-8 text file with one token per line.

    Returns:
        dict mapping each whitespace-stripped line to the size the dictionary
        had when that line was read (0, 1, 2, ... for distinct lines).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    vocab = {}
    # Pin the encoding: the platform default is not UTF-8 everywhere, and a
    # word-vector vocabulary routinely contains non-ASCII tokens.
    with open(vocab_path, 'r', encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            # NOTE(review): a repeated (or blank) line re-assigns the existing
            # key to the current size, so two tokens can end up sharing an
            # index. Left unchanged because a trained checkpoint presumably
            # relies on this exact mapping — confirm before altering.
            vocab[word] = len(vocab)
    return vocab

# Location of the one-token-per-line vocabulary file.
vocab_path = '../../vocab/glove_model_vocab.txt'
vocab = load_vocabulary(vocab_path)
# Map '[CLS]' to the current vocabulary size. tokenize_and_pad_text also uses
# this id as the fallback for words that are missing from the vocabulary.
vocab['[CLS]'] = len(vocab)
# Number of vocabulary entries; passed to SentimentAnalysisModel as the
# embedding-table size.
vocab_size = len(vocab)

# Loading the embedding matrix
def load_npy_embeddings(embedding_path, vocab):
    """Build an embedding matrix with one row per vocabulary entry.

    Row ``i`` of the result is row ``i`` of the saved array for every index
    ``i`` that appears in ``vocab`` and exists in the saved array; all other
    rows stay zero.

    Args:
        embedding_path: Path to a ``.npy`` file holding a 2-D array of
            vectors (rows x embedding_dim).
        vocab: Mapping of token -> integer row index.

    Returns:
        numpy.ndarray of shape ``(len(vocab), embedding_dim)`` (float64).
    """
    embeddings = np.load(embedding_path)
    embedding_dim = embeddings.shape[1]
    # Size the table from the argument, not from a module-level variable, so
    # the function works for whatever vocabulary it is handed.
    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    indices = np.fromiter(vocab.values(), dtype=np.int64, count=len(vocab))
    # Indices past the end of the saved array have no vector; leave them zero.
    covered = indices[indices < embeddings.shape[0]]
    embedding_matrix[covered] = embeddings[covered]
    return embedding_matrix

# Saved embedding array (.npy); the file name suggests 300-dimensional GloVe
# vectors — TODO confirm against the file itself.
embedding_path = '../../embeddings/glove_model_embeddings_300_d.npy'
# Matrix aligned to vocab indices. NOTE(review): nothing in the visible code
# passes this matrix to a model; SentimentAnalysisModel takes its embedding
# weights from the checkpoint instead.
embedding_matrix = load_npy_embeddings(embedding_path, vocab)

# Define the necessary model classes
class EmbeddingLayer(nn.Module):
    """Token-embedding lookup, optionally initialised from pretrained vectors.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Width of each embedding vector.
        pretrained_embeddings: Optional array-like or tensor of initial
            weights. When omitted, the default ``nn.Embedding`` random
            initialisation is kept.
    """

    def __init__(self, vocab_size, embed_size, pretrained_embeddings=None):
        super(EmbeddingLayer, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # The parameter defaults to None, so only override the weights when
        # vectors were actually supplied.
        if pretrained_embeddings is not None:
            if isinstance(pretrained_embeddings, torch.Tensor):
                # Copy so training never mutates the caller's tensor, and
                # avoid the warning torch.tensor() emits for tensor input.
                weights = pretrained_embeddings.detach().clone().to(torch.float32)
            else:
                weights = torch.tensor(pretrained_embeddings, dtype=torch.float32)
            self.embedding.weight = nn.Parameter(weights)

    def forward(self, x):
        """Look up embeddings for the integer token ids in ``x``."""
        return self.embedding(x)

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to batch-first embeddings.

    Args:
        embed_size: Width of the embedding vectors (even or odd).
        max_len: Number of positions to precompute; the table is regrown on
            demand if a longer sequence arrives.
    """

    def __init__(self, embed_size, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.embed_size = embed_size
        # Non-persistent buffer: it moves with the module on .to()/.cuda()
        # but stays out of state_dict, so existing checkpoints load unchanged.
        self.register_buffer(
            'encoding',
            self.get_positional_encoding(max_len, embed_size),
            persistent=False,
        )

    @staticmethod
    def get_positional_encoding(max_len, embed_size):
        """Return the sinusoid table with shape (max_len, 1, embed_size)."""
        encoding = torch.zeros(max_len, embed_size)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, embed_size, 2).float() * (-math.log(10000.0) / embed_size))
        angles = position * div_term
        encoding[:, 0::2] = torch.sin(angles)
        # With an odd embed_size there is one fewer odd column than even
        # column, so trim the angles to the number of cosine slots.
        encoding[:, 1::2] = torch.cos(angles[:, :embed_size // 2])
        return encoding.unsqueeze(0).transpose(0, 1)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, seq, embed): dimension 1 is taken as the
        sequence length.
        """
        seq_len = x.size(1)
        if seq_len > self.encoding.size(0):
            # Sequence is longer than the cached table: rebuild it to fit.
            self.encoding = self.get_positional_encoding(seq_len, self.embed_size).to(x.device)
        # No-op when the buffer already lives on x's device.
        encoding = self.encoding[:seq_len, :].to(x.device)
        # (seq, 1, embed) -> (1, seq, embed) so it broadcasts over the batch.
        return x + encoding.transpose(0, 1)

class EncoderBlock(nn.Module):
    """Single encoder layer: self-attention, then a two-layer feed-forward net.

    Each sub-layer is wrapped as ``LayerNorm(input + Dropout(sublayer(input)))``.

    Args:
        embed_size: Model width (must be divisible by ``num_heads``).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        dropout_rate: Dropout probability, used inside attention and on both
            sub-layer outputs.
    """

    def __init__(self, embed_size, num_heads, ff_dim, dropout_rate=0.1):
        super().__init__()
        # Attribute names double as state_dict key prefixes; keep them stable.
        self.attention = nn.MultiheadAttention(
            embed_size,
            num_heads,
            dropout=dropout_rate,
        )
        expand = nn.Linear(embed_size, ff_dim)
        activation = nn.ReLU()
        contract = nn.Linear(ff_dim, embed_size)
        self.ffn = nn.Sequential(expand, activation, contract)
        self.layernorm1 = nn.LayerNorm(embed_size)
        self.layernorm2 = nn.LayerNorm(embed_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x, mask):
        """Run the block on ``x``.

        Args:
            x: Input passed as query, key and value. The attention module is
                built without ``batch_first``, so the expected layout is
                (seq, batch, embed).
            mask: Forwarded as ``attn_mask``; may be ``None``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # MultiheadAttention returns (output, weights); only the output is used.
        attended = self.attention(x, x, x, attn_mask=mask)[0]
        attended = self.dropout(attended)
        x = self.layernorm1(x + attended)
        transformed = self.dropout(self.ffn(x))
        return self.layernorm2(x + transformed)

class SentimentAnalysisModel(nn.Module):
    """Transformer-encoder classifier that reads the first sequence position.

    Args:
        embed_size: Width of token embeddings and of every encoder layer.
        num_layers: Number of stacked ``EncoderBlock`` layers.
        num_heads: Attention heads per encoder layer.
        ff_dim: Hidden width of each encoder feed-forward network.
        vocab_size: Number of rows in the embedding table.
        max_len: Number of positions precomputed by the positional encoding.
        num_classes: Size of the output logit vector.
        dropout_rate: Dropout probability used in the encoder layers and
            before the output projection.
    """

    def __init__(self, embed_size, num_layers, num_heads, ff_dim, vocab_size, max_len, num_classes, dropout_rate=0.1):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.positional_encoding = PositionalEncoding(embed_size, max_len)
        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderBlock(embed_size, num_heads, ff_dim, dropout_rate))
        self.encoder_layers = nn.ModuleList(blocks)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc_out = nn.Linear(embed_size, num_classes)

    def forward(self, x, mask=None):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer token ids; dimension 1 is treated as the sequence
                axis by the positional encoding.
            mask: Optional mask handed to every encoder layer.

        Returns:
            Tensor of logits with ``num_classes`` entries per sequence.
        """
        hidden = self.positional_encoding(self.embedding(x))
        # The encoder layers take sequence-first input.
        hidden = hidden.transpose(0, 1)
        for encoder in self.encoder_layers:
            hidden = encoder(hidden, mask)
        # Back to batch-first, then keep only position 0 of each sequence.
        first_position = hidden.transpose(0, 1)[:, 0, :]
        return self.fc_out(self.dropout(first_position))

# Checkpoint (state dict) of the from-scratch transformer classifier.
pt_model_path = "../../models/from_scratch_model/final_model_from_scratch.pth"

# Rebuild the from-scratch model and restore its weights. The constructor
# arguments must describe the same architecture the checkpoint was saved
# from, otherwise load_state_dict rejects mismatched keys or shapes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_pt = SentimentAnalysisModel(embed_size=300, num_layers=6, num_heads=12, ff_dim=1024, vocab_size=vocab_size, max_len=250, num_classes=2, dropout_rate=0.2)
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source, and consider weights_only=True if the installed PyTorch
# supports it — confirm.
model_pt.load_state_dict(torch.load(pt_model_path, map_location=device))
model_pt.eval()  # evaluation mode, so the dropout layers are inactive
model_pt.to(device)

# Load the fine-tuned Hugging Face classifier: the bert-base-uncased
# architecture and tokenizer, with weights taken from the local safetensors
# file instead of the hub checkpoint.
# NOTE(review): model_hf is not moved to `device`; the tokenizer output used
# with it below is not moved either, so both stay on the CPU.
hf_model_path = "../../models/fine_tuned_model/model.safetensors"
tokenizer_hf = AutoTokenizer.from_pretrained("bert-base-uncased")
state_dict_hf = load_file(hf_model_path)
model_hf = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", state_dict=state_dict_hf)

# Preprocess text
# Built once: neither the punctuation table nor the stop-word set changes
# between calls, so there is no reason to rebuild them per review.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Normalise a review and split it into word tokens.

    Steps, in order: lowercase, delete every ``string.punctuation``
    character, drop whitespace-separated words found in NLTK's English
    stop-word list, then tokenize what remains with ``word_tokenize``.

    Args:
        text: Raw review string.

    Returns:
        List of token strings.
    """
    text = text.lower()
    text = text.translate(_PUNCTUATION_TABLE)
    # NOTE(review): punctuation is removed before the stop-word filter, so
    # contractions lose their apostrophe first ("don't" -> "dont"), and the
    # stop-word list presumably drops negations such as "not". The order is
    # kept because the trained model presumably saw text prepared this way —
    # confirm against the training pipeline.
    stop_words = _english_stop_words()
    text = ' '.join(word for word in text.split() if word not in stop_words)
    return word_tokenize(text)

# Tokenize and pad text
def tokenize_and_pad_text(text, vocab, max_len=250):
    """Convert raw text into a fixed-length tensor of vocabulary ids.

    Args:
        text: Raw review string; it is run through ``preprocess_text``.
        vocab: Mapping of token -> id. It must contain ``'[CLS]'``, whose id
            stands in for any token missing from the mapping.
        max_len: Output length; longer inputs are cut, shorter ones are
            right-padded with id 0.

    Returns:
        ``torch.long`` tensor of shape ``(1, max_len)``.
    """
    token_ids = []
    for word in preprocess_text(text):
        token_ids.append(vocab.get(word, vocab['[CLS]']))
    # Truncate first, then top up with zeros; the pad list is empty when the
    # sequence is already long enough.
    token_ids = token_ids[:max_len]
    token_ids.extend([0] * (max_len - len(token_ids)))
    batch = torch.tensor(token_ids, dtype=torch.long)
    return batch.unsqueeze(0)  # leading batch dimension of size 1

# Function to run inference
def predict_sentiment(text, model, vocab, max_len=250):
    """Classify one review with ``model`` and return the winning class index.

    Args:
        text: Raw review string.
        model: Callable taking ``(token_ids, mask)`` and returning logits
            with one row per sequence.
        vocab: Token -> id mapping passed to ``tokenize_and_pad_text``.
        max_len: Padded sequence length.

    Returns:
        int: index of the largest logit.
    """
    batch = tokenize_and_pad_text(text, vocab, max_len)
    # The input goes to the module-level `device`; the model is expected to
    # be there already.
    batch = batch.to(device)
    attention_mask = None  # no mask is applied at inference
    with torch.no_grad():
        logits = model(batch, attention_mask)
        # torch.max over dim 1 yields (values, indices); keep the indices.
        winner = torch.max(logits.data, 1)[1]
    return winner.item()

# Re-create the label encoder by fitting it on the label strings; nothing is
# loaded from disk here.
# NOTE(review): LabelEncoder sorts its classes, so this presumably yields
# negative -> 0 and positive -> 1 — confirm it matches the training setup.
# The encoder is not referenced again in the visible code, which uses its own
# `labels` dict instead.
label_encoder = LabelEncoder()
label_encoder.fit(['negative', 'positive'])  # Ensure this matches your original training labels

# Reviews for testing
# Positive set 1: ten very short reviews (one to three words each).
# Not referenced by the evaluation code visible in this cell.
positive_reviews_1 = [
    "Great product!",
    "Loved it!",
    "Highly recommend.",
    "Very satisfied.",
    "Excellent quality.",
    "Will buy again.",
    "Perfect fit!",
    "Amazing service.",
    "Top-notch!",
    "Exceeded expectations."
]
# Positive set 2: ten single-sentence reviews.
# Not referenced by the evaluation code visible in this cell.
positive_reviews_2 = [
    "The delivery was prompt and the item was exactly as described.",
    "Fantastic customer service, they resolved my issue quickly.",
    "The sound quality of these headphones is phenomenal.",
    "The fabric is soft and comfortable, great for everyday wear.",
    "This book kept me engaged from start to finish, a must-read.",
    "The hiking trail was well-marked and had breathtaking views.",
    "The event was well-organized and very enjoyable.",
    "The coffee maker brews perfect coffee every time.",
    "The battery life on this laptop is impressive, lasts all day.",
    "The tutorial videos were easy to follow and very informative."
]
# Positive set 3: five multi-sentence reviews. This is the positive half of
# the evaluation batch built further down in this cell.
positive_reviews_3 = [
    "The smartphone's camera capabilities are truly remarkable, capturing vivid colors and sharp details even in low-light conditions. The user interface is intuitive and the battery life exceeds my expectations, lasting well into the next day with heavy usage.",
    "I attended a weekend workshop on digital marketing, and it was a game-changer for my business. The speakers were industry experts who shared invaluable insights and practical strategies that I could implement immediately.",
    "The cruise vacation was an unforgettable experience. The ship was luxurious, with a variety of dining options, entertainment, and activities for all ages. The staff were incredibly attentive, and the shore excursions were well-planned and enriching.",
    "The custom-built gaming PC I ordered exceeded my expectations in every way. The build quality is impeccable, with meticulously routed cables and high-end components that deliver top-tier performance in every game I've played so far.",
    "The electric car not only provides a smooth and quiet ride but also offers impressive acceleration and a range that easily meets my daily commuting needs. The advanced driver-assistance features enhance safety, making every journey a pleasure."
]

# Negative set 1: ten very short reviews (two to three words each).
# Not referenced by the evaluation code visible in this cell.
negative_reviews_1 = [
    "Terrible experience.",
    "Not as described.",
    "Very disappointed.",
    "Won't buy again.",
    "Poor quality.",
    "Too expensive.",
    "Didn't like it.",
    "Waste of money.",
    "Not worth it.",
    "Bad service."
]
# Negative set 2: ten single-sentence reviews.
# Not referenced by the evaluation code visible in this cell.
negative_reviews_2 = [
    "The software is slow and crashes frequently, making it very frustrating to use.",
    "The hotel's location is inconvenient, far from major attractions and public transportation.",
    "The jacket started falling apart after just a few wears, very disappointed with the quality.",
    "The concert venue was overcrowded and the acoustics were terrible.",
    "The restaurant service was extremely slow, and the food was bland and overpriced.",
    "The vacuum cleaner is bulky and difficult to maneuver, not practical for daily use.",
    "The movie was poorly written and the acting was subpar, not worth the ticket price.",
    "The customer service hotline is always busy, and it takes forever to get a response.",
    "The app is filled with bugs and the user interface is not intuitive at all.",
    "The subscription box had repetitive items and didn't match my preferences."
]

# Negative set 3: five multi-sentence reviews. This is the negative half of
# the evaluation batch built further down in this cell.
negative_reviews_3 = [
    "The new version of the software is riddled with bugs that significantly hamper productivity. It crashes frequently, and the user interface changes have made navigation cumbersome. The promised new features are either missing or don't work as advertised.",
    "I recently stayed at a highly-rated resort, but my experience was far from pleasant. The room was not cleaned properly, with visible dust and stains on the sheets. The food at the on-site restaurant was mediocre, and the staff seemed indifferent to guest needs.",
    "The expensive DSLR camera failed to meet my expectations. The autofocus is slow and inaccurate, resulting in many blurry shots. The battery life is disappointing, and the camera frequently overheats during extended use, making it unreliable for professional photography.",
    "The online course on data science was a huge letdown. The lectures were poorly structured, with outdated content that didn't align with current industry standards. The assignments were vague, and the feedback from instructors was minimal and unhelpful.",
    "The high-end blender I purchased started emitting a burnt smell after a few uses. Despite its hefty price tag, it struggles to blend even soft fruits smoothly. The customer service was unresponsive to my complaints, leaving me with a defective product and no resolution."
]




# Evaluation batch: the long positive reviews followed by the long negative
# ones, with the matching expected label for each.
all_reviews = [*positive_reviews_3, *negative_reviews_3]
expected_sentiments = ["Positive" for _ in positive_reviews_3] + ["Negative" for _ in negative_reviews_3]

# Fine-tuned BERT: encode the whole batch at once (padded to the longest
# review, truncated if needed) and take the arg-max logit per review.
inputs_hf = tokenizer_hf(all_reviews, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    outputs_hf = model_hf(**inputs_hf)
    predictions_hf = outputs_hf.logits.argmax(dim=-1)

# From-scratch model: predict_sentiment handles one review per call.
predictions_pt = [predict_sentiment(text, model_pt, vocab) for text in all_reviews]

# Class index -> display name, shared by both models' predictions.
labels = {0: "Negative", 1: "Positive"}
results = {
    "Review": all_reviews,
    "Expected Sentiment": expected_sentiments,
    "Fine-tuned BERT": [labels[class_id] for class_id in predictions_hf.tolist()],
    "From-scratch Model": [labels[class_id] for class_id in predictions_pt],
}
df_results = pd.DataFrame(results)

# Table styling: centred bold headers and centred cells, plus a per-cell
# 'text-align: left' property.
header_css = {'selector': 'th', 'props': [('text-align', 'center'), ('font-weight', 'bold')]}
cell_css = {'selector': 'td', 'props': [('text-align', 'center')]}
styled_results = df_results.style.set_table_styles([header_css, cell_css])
styled_results = styled_results.set_properties(**{'text-align': 'left'})
# Bare expression so the notebook renders the styled table as the cell output.
styled_results


[nltk_data] Downloading package stopwords to /Users/jayyy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/jayyy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,Review,Expected Sentiment,Fine-tuned BERT,From-scratch Model
0,"The smartphone's camera capabilities are truly remarkable, capturing vivid colors and sharp details even in low-light conditions. The user interface is intuitive and the battery life exceeds my expectations, lasting well into the next day with heavy usage.",Positive,Positive,Negative
1,"I attended a weekend workshop on digital marketing, and it was a game-changer for my business. The speakers were industry experts who shared invaluable insights and practical strategies that I could implement immediately.",Positive,Positive,Negative
2,"The cruise vacation was an unforgettable experience. The ship was luxurious, with a variety of dining options, entertainment, and activities for all ages. The staff were incredibly attentive, and the shore excursions were well-planned and enriching.",Positive,Positive,Positive
3,"The custom-built gaming PC I ordered exceeded my expectations in every way. The build quality is impeccable, with meticulously routed cables and high-end components that deliver top-tier performance in every game I've played so far.",Positive,Positive,Negative
4,"The electric car not only provides a smooth and quiet ride but also offers impressive acceleration and a range that easily meets my daily commuting needs. The advanced driver-assistance features enhance safety, making every journey a pleasure.",Positive,Positive,Positive
5,"The new version of the software is riddled with bugs that significantly hamper productivity. It crashes frequently, and the user interface changes have made navigation cumbersome. The promised new features are either missing or don't work as advertised.",Negative,Negative,Negative
6,"I recently stayed at a highly-rated resort, but my experience was far from pleasant. The room was not cleaned properly, with visible dust and stains on the sheets. The food at the on-site restaurant was mediocre, and the staff seemed indifferent to guest needs.",Negative,Negative,Positive
7,"The expensive DSLR camera failed to meet my expectations. The autofocus is slow and inaccurate, resulting in many blurry shots. The battery life is disappointing, and the camera frequently overheats during extended use, making it unreliable for professional photography.",Negative,Negative,Negative
8,"The online course on data science was a huge letdown. The lectures were poorly structured, with outdated content that didn't align with current industry standards. The assignments were vague, and the feedback from instructors was minimal and unhelpful.",Negative,Negative,Negative
9,"The high-end blender I purchased started emitting a burnt smell after a few uses. Despite its hefty price tag, it struggles to blend even soft fruits smoothly. The customer service was unresponsive to my complaints, leaving me with a defective product and no resolution.",Negative,Negative,Negative
