# Sentiment Analysis — Comparison of Different Methods
#### Mateusz Kantorski

####  Data Loading

In [1]:
from datasets import load_dataset


# Fetch the IMDB reviews dataset; downloads are cached under ./dataset/.
dataset = load_dataset("imdb", cache_dir="./dataset/")

# Pull the review texts and their labels out of each split.
train_texts, train_labels = dataset['train']['text'], dataset['train']['label']
test_texts, test_labels = dataset['test']['text'], dataset['test']['label']

# Sanity check: show the first test review followed by its label.
print(test_texts[0], test_labels[0], sep="\n")

  from .autonotebook import tqdm as notebook_tqdm


I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to alway

#### Naive Bayes Classifier with TF-IDF

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report


# TF-IDF features over unigrams and bigrams, capped at 10000 features.
# The vocabulary is learned from the training texts only and then reused
# unchanged to transform the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=10000)
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Fit the multinomial Naive Bayes classifier (fit() returns the estimator).
nb_model = MultinomialNB().fit(X_train, train_labels)

# Score the test split and print the per-class report under a header line.
nb_preds = nb_model.predict(X_test)
print("Naive Bayes:", classification_report(test_labels, nb_preds), sep="\n")

Naive Bayes:
              precision    recall  f1-score   support

           0       0.86      0.86      0.86     12500
           1       0.86      0.86      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000



#### LSTM with Pretrained GloVe Embeddings

In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence
import numpy as np
from tqdm import tqdm


def load_glove_embeddings(file_path):
    """Load word vectors from a whitespace-separated text file.

    Every usable line holds a token followed by the components of its
    vector. Blank lines and lines that contain only a token (no vector
    components) are skipped rather than raising or storing an empty vector.

    Args:
        file_path: Path to the embeddings text file; it is read as UTF-8.

    Returns:
        A dict mapping each token to a 1-D ``float32`` numpy array.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            # Need at least a token and one component; this also guards
            # against a trailing empty line at the end of the file.
            if len(values) < 2:
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

# The GloVe vector files are published at https://nlp.stanford.edu/projects/glove/
# The file must already be present locally at the path below.
glove_path = "./dataset/glove.6B.300d.txt"  # path to the text file with the embeddings
# Parse the whole file into a {word: vector} dict.
embeddings_index = load_glove_embeddings(glove_path)


class SentimentDataset(Dataset):
    """Dataset that converts raw texts into sequences of word vectors.

    Texts are lower-cased and split on whitespace; each of the first
    ``max_len`` tokens is replaced by its vector from ``embeddings``, or by
    a zero vector when the token is not present.

    Args:
        texts: Indexable collection of strings.
        labels: Indexable collection of integer class labels, aligned with
            ``texts``.
        embeddings: Dict mapping a token to its vector.
        max_len: Maximum number of tokens kept per text.
        embedding_dim: Width of the zero vectors used for unknown tokens and
            for empty texts. When ``None`` it is inferred from the first
            vector stored in ``embeddings``; an empty ``embeddings`` falls
            back to 300.
    """

    def __init__(self, texts, labels, embeddings, max_len=100, embedding_dim=None):
        self.texts = texts
        self.labels = labels
        self.embeddings = embeddings
        self.max_len = max_len
        if embedding_dim is None:
            # Zero vectors must match the real vectors' width, otherwise
            # torch.stack fails on any text containing an unknown token.
            sample = next(iter(embeddings.values()), None) if embeddings else None
            embedding_dim = len(sample) if sample is not None else 300
        self.embedding_dim = embedding_dim

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def text_to_embedding(self, text):
        """Return a ``(num_tokens, embedding_dim)`` tensor for ``text``.

        A text with no tokens yields a single zero row so that the result
        always has at least one timestep.
        """
        tokens = text.lower().split()[:self.max_len]
        if not tokens:
            return torch.zeros((1, self.embedding_dim))
        return torch.stack([
            torch.tensor(self.embeddings[token])
            if token in self.embeddings
            else torch.zeros(self.embedding_dim)
            for token in tokens
        ])

    def __getitem__(self, idx):
        """Return ``(embedding_sequence, label)`` for the sample at ``idx``."""
        emb = self.text_to_embedding(self.texts[idx])
        label = torch.tensor(self.labels[idx], dtype=torch.long)
        return emb, label

def collate_batch(batch):
    """Combine ``(sequence, label)`` samples into one padded batch.

    Sequences are zero-padded at the end to the length of the longest one
    in the batch, with the batch dimension first; labels are stacked into a
    single tensor in the same order.

    Returns:
        Tuple ``(padded_sequences, labels)``.
    """
    sequences, targets = zip(*batch)
    return pad_sequence(sequences, batch_first=True), torch.stack(targets)



# Training split: wrap the texts, then batch them with shuffling enabled.
train_dataset = SentimentDataset(train_texts, train_labels, embeddings_index)
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

# Test split: same batching, but the sample order is kept fixed.
test_dataset = SentimentDataset(test_texts, test_labels, embeddings_index)
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)


class LSTMSentiment(nn.Module):
    """LSTM classifier over sequences of pre-computed word vectors.

    The input is run through a batch-first LSTM and the final hidden state
    of its last layer is projected to ``output_dim`` scores by a linear
    layer.

    Note: the LSTM consumes every timestep of ``x``, so any padding rows in
    a batch are processed like real tokens before the final state is taken.

    Args:
        embedding_dim: Size of each input vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of output scores per sample.
    """

    def __init__(self, embedding_dim=300, hidden_dim=128, output_dim=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, x):
        """Return the ``(batch, output_dim)`` scores for input batch ``x``."""
        _outputs, (final_hidden, _final_cell) = self.lstm(x)
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the classifier with its default sizes and move it to the device
# (Module.to moves the parameters in place).
model = LSTMSentiment()
model.to(device)

# Cross-entropy loss on the class scores, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)



def train_epoch(model, dataloader):
    """Train ``model`` for one pass over ``dataloader``.

    Uses the module-level ``device``, ``optimizer`` and ``criterion``.
    Progress is displayed through ``tqdm``.

    Returns:
        The sum of the per-batch losses divided by the number of batches.
    """
    model.train()
    batch_losses = []
    for inputs, targets in tqdm(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(dataloader)



def evaluate(model, dataloader):
    """Predict every batch in ``dataloader`` and summarise the results.

    The model is put in eval mode and gradients are disabled. The predicted
    class is the index of the highest score for each sample. Uses the
    module-level ``device``.

    Returns:
        The text produced by ``classification_report`` with 4 digits.
    """
    model.eval()
    predicted, expected = [], []
    with torch.no_grad():
        for inputs, targets in dataloader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            scores = model(inputs)
            predicted.extend(scores.argmax(dim=1).cpu().numpy())
            expected.extend(targets.cpu().numpy())
    return classification_report(expected, predicted, digits=4)



# Train for 4 passes over the training loader; train_epoch returns the
# average per-batch loss, which is printed with a 1-based epoch number.
for epoch in range(4):
    loss = train_epoch(model, train_loader)
    print(f"Epoch {epoch+1} loss: {loss:.4f}")

# After training, print the classification report for the test loader.
print("LSTM on embeddings:")
print(evaluate(model, test_loader))


100%|██████████| 782/782 [00:32<00:00, 23.89it/s]


Epoch 1 loss: 0.6878


100%|██████████| 782/782 [00:33<00:00, 23.04it/s]


Epoch 2 loss: 0.5353


100%|██████████| 782/782 [00:34<00:00, 22.63it/s]


Epoch 3 loss: 0.4504


100%|██████████| 782/782 [00:34<00:00, 22.83it/s]


Epoch 4 loss: 0.4114
LSTM on embeddings:
              precision    recall  f1-score   support

           0     0.8088    0.8026    0.8057     12500
           1     0.8041    0.8102    0.8072     12500

    accuracy                         0.8064     25000
   macro avg     0.8065    0.8064    0.8064     25000
weighted avg     0.8065    0.8064    0.8064     25000



## Transformer (DistilBERT) with Hugging Face

In [None]:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments



# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint. Note that `model` rebinds the notebook-level name
# that previously referred to the LSTM classifier.
tokenizer = AutoTokenizer.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english"
)


def tokenize_function(examples):
    """Tokenize the "text" field of ``examples`` with the global tokenizer.

    Inputs are truncated and padded with ``padding="max_length"``, so every
    encoded example comes back with the same length.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")

# Tokenize the training split in batches and expose only the listed columns,
# as torch tensors.
encoded_train = dataset['train'].map(tokenize_function, batched=True)
encoded_train.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)

# Same preparation for the test split.
encoded_test = dataset['test'].map(tokenize_function, batched=True)
encoded_test.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"],
)



# Fine-tuning configuration: 3 epochs, batch size 16 for both training and
# evaluation, evaluation at the end of every epoch, no checkpoints saved.
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_strategy="no",
    logging_dir="./logs",
)

# The test split is also used as the per-epoch evaluation set here.
# `processing_class` replaces the deprecated `tokenizer` argument of Trainer,
# which otherwise emits a deprecation warning when the Trainer is built.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_train,
    eval_dataset=encoded_test,
    processing_class=tokenizer,
)

trainer.train()


# Predict the test split, take the highest-scoring class for each example,
# and print the per-class report.
preds_output = trainer.predict(encoded_test)
pred_labels = preds_output.predictions.argmax(-1)
print(classification_report(encoded_test["label"], pred_labels))


Map: 100%|██████████| 25000/25000 [00:09<00:00, 2723.63 examples/s]
Map: 100%|██████████| 25000/25000 [00:09<00:00, 2678.68 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.2229,0.192411
2,0.1222,0.26233
3,0.0531,0.30831


              precision    recall  f1-score   support

           0       0.93      0.94      0.93     12500
           1       0.93      0.93      0.93     12500

    accuracy                           0.93     25000
   macro avg       0.93      0.93      0.93     25000
weighted avg       0.93      0.93      0.93     25000

