# Sentiment Analysis with TorchText

In [1]:
pip install  -q torch==2.1.0 torchtext==0.16.0 portalocker>=2.0.0

Note: you may need to restart the kernel to use updated packages.


In [2]:
import random
import torch
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torch.utils.data import DataLoader, random_split
from torchtext.vocab import build_vocab_from_iterator, GloVe
from collections import Counter

SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
random.seed(SEED)

In [3]:
def generate_bigrams(x):
    n_grams = zip(*[x[i:] for i in range(2)])
    for n_gram in n_grams:
        x.append(" ".join(n_gram))
    return x


train_iter, test_iter = IMDB(split=("train", "test"))
train_data = list(train_iter)
test_data = list(test_iter)

In [4]:
num_train = int(len(train_data) * 0.7)
train_data, valid_data = random_split(
    train_data, [num_train, len(train_data) - num_train]
)

print(f"Train: {len(train_data)}, Valid: {len(valid_data)}, Test: {len(test_data)}")

tokenizer = get_tokenizer("spacy")


def yield_tokens(data_iter):
    for _, text in data_iter:
        yield generate_bigrams(tokenizer(text))

Train: 17500, Valid: 7500, Test: 25000




In [5]:
UNK_TOKEN = "<unk>"
PAD_TOKEN = "<pad>"

vocab = build_vocab_from_iterator(
    yield_tokens(train_data), max_tokens=25000, specials=[UNK_TOKEN, PAD_TOKEN]
)
vocab.set_default_index(vocab[UNK_TOKEN])  # Set default index for unknown tokens

glove_vectors = GloVe(name="6B", dim=100)
# vocab.load_vectors(glove_vectors)

text_pipeline = lambda x: [vocab[token] for token in generate_bigrams(tokenizer(x))]
label_pipeline = lambda x: 1 if x == "pos" else 0

.vector_cache/glove.6B.zip: 862MB [02:41, 5.34MB/s]                               
100%|█████████▉| 399999/400000 [00:26<00:00, 14873.65it/s]


In [6]:
embedding_matrix = torch.zeros(len(vocab), glove_vectors.dim)
for i, token in enumerate(vocab.get_itos()):
    embedding_matrix[i] = glove_vectors[token]

In [7]:
from torch.nn.utils.rnn import pad_sequence


def collate_batch(batch):
    label_list, text_list = [], []
    for _label, _text in batch:
        label_list.append(label_pipeline(_label))
        processed_text = torch.tensor(text_pipeline(_text))
        text_list.append(processed_text)
    return (
        torch.tensor(label_list, dtype=torch.float64).to(device),
        pad_sequence(text_list, padding_value=1.0).to(device),
    )


batch_size = 64


def batch_sampler():
    indices = [(i, len(tokenizer(s[1]))) for i, s in enumerate(train_data)]
    random.shuffle(indices)
    pooled_indices = []

    for i in range(0, len(indices), batch_size * CODING_SZ):
        pooled_indices.extend(
            sorted(indices[i : i + batch_size * CODING_SZ], key=lambda x: x[1])
        )
        pooled_indices = [x[0] for x in pooled_indices]

        for i in range(0, len(pooled_indices), batch_size):
            yield pooled_indices[i : i + batch_size]

In [8]:
BATCH_SIZE = 64
CODING_SZ = 100
train_dataloader = DataLoader(
    train_data,
    #                            batch_sampler=batch_sampler(),
    collate_fn=collate_batch,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

valid_dataloader = DataLoader(
    valid_data, collate_fn=collate_batch, batch_size=BATCH_SIZE, shuffle=True
)
test_dataloader = DataLoader(
    test_data, collate_fn=collate_batch, batch_size=BATCH_SIZE, shuffle=True
)

In [9]:
import torch.nn as nn
import torch.nn.functional as F


class FastText(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, pad_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.embedding.weight.data.copy_(embedding_matrix)
        self.fc = nn.Linear(embedding_dim, output_dim)

    def forward(self, text):
        embedded = self.embedding(text).permute(1, 0, 2)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(1)
        return self.fc(pooled)


model = FastText(
    vocab_size=len(vocab),
    embedding_dim=CODING_SZ,
    output_dim=1,
    pad_idx=vocab[PAD_TOKEN],
)
model.embedding.weight.data[vocab[UNK_TOKEN]] = torch.zeros(CODING_SZ)
model.embedding.weight.data[vocab[UNK_TOKEN]] = torch.zeros(CODING_SZ)

In [10]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
criterion = criterion.to(device)

In [11]:
EPOCHS = 3
from tqdm import tqdm

for epoch in range(EPOCHS):
    epoch_loss = 0
    epoch_acc = 0

    model.train()
    for label, text in tqdm(train_dataloader, total=len(train_dataloader)):
        optimizer.zero_grad()
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == label).float()
        acc = correct.sum() / len(correct)

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(
        "Epoch %d Train: Loss: %.4f Acc: %.4f"
        % (epoch, epoch_loss / len(train_dataloader), epoch_acc / len(train_dataloader))
    )

    epoch_loss = 0
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for label, text in tqdm(valid_dataloader, total=len(valid_dataloader)):
            predictions = model(text).squeeze(1)
            loss = criterion(predictions, label)

            rounded_preds = torch.round(torch.sigmoid(predictions))
            correct = (rounded_preds == label).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            epoch_acc += acc.item()
    print(
        "Epoch %d Valid: Loss: %.4f Acc: %.4f"
        % (epoch, epoch_loss / len(valid_dataloader), epoch_acc / len(valid_dataloader))
    )

100%|██████████| 274/274 [00:32<00:00,  8.49it/s]


Epoch 0 Train: Loss: 0.4072 Acc: 0.9378


100%|██████████| 118/118 [00:14<00:00,  8.07it/s]


Epoch 0 Valid: Loss: 0.1661 Acc: 1.0000


100%|██████████| 274/274 [00:31<00:00,  8.68it/s]


Epoch 1 Train: Loss: 0.0926 Acc: 1.0000


100%|██████████| 118/118 [00:12<00:00,  9.23it/s]


Epoch 1 Valid: Loss: 0.0535 Acc: 1.0000


100%|██████████| 274/274 [00:31<00:00,  8.66it/s]


Epoch 2 Train: Loss: 0.0360 Acc: 1.0000


100%|██████████| 118/118 [00:12<00:00,  9.28it/s]

Epoch 2 Valid: Loss: 0.0261 Acc: 1.0000





In [12]:
test_loss = 0
test_acc = 0
model.eval()
with torch.no_grad():
    for label, text in tqdm(test_dataloader):
        predictions = model(text).squeeze(1)
        loss = criterion(predictions, label)

        rounded_preds = torch.round(torch.sigmoid(predictions))
        correct = (rounded_preds == label).float()
        acc = correct.sum() / len(correct)

        test_loss += loss.item()
        test_acc += acc.item()

print(
    "Test: Loss: %.4f Acc: %.4f"
    % (test_loss / len(test_dataloader), test_acc / len(test_dataloader))
)

100%|██████████| 391/391 [00:46<00:00,  8.44it/s]

Test: Loss: 0.0247 Acc: 1.0000





In [13]:
criterion(predictions, label)

tensor(0.0344, device='cuda:0', dtype=torch.float64)

In [14]:
rounded_preds

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')

In [15]:
label

tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0', dtype=torch.float64)

In [16]:
(text).shape

torch.Size([2331, 40])