In [21]:
#!pip install torch==2.0.1 torchtext==0.15.2 torchdata==0.6.1 --force-reinstall --no-cache-dir
#!pip install portalocker>=2.0.0


#import sys
#!{sys.executable} -m pip install "portalocker>=2.0.0"


In [22]:
#!pip install torch==2.1.0 torchtext==0.14.1
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import time
import random
import numpy as np 

In [23]:
from torchtext.datasets import AG_NEWS
train_iter = AG_NEWS(split='train')
print(next(iter(train_iter)))


(3, "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")


In [24]:
SEED = 1234
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

device = torch.device ("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device}')

Using cpu


In [25]:
train_iter = AG_NEWS(split='train')
test_iter = AG_NEWS(split='test')
for i, (label, line) in zip(range(5), train_iter):
    print(f'Label {label}, Text: {line}')


Label 3, Text: Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Label 3, Text: Carlyle Looks Toward Commercial Aerospace (Reuters) Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
Label 3, Text: Oil and Economy Cloud Stocks' Outlook (Reuters) Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
Label 3, Text: Iraq Halts Oil Exports from Main Southern Pipeline (Reuters) Reuters - Authorities have halted oil export\flows from the main pipeline in southern Iraq after\intelligence showed a rebel militia could strike\infrastructure, an oil official said on Saturday.
Label 3, Text: Oil prices soar to all-

🧱 Step 4: Tokenize the Text and Build the Vocabulary
🧠 What’s happening here?
To prepare text for model training, we must:
Tokenize the text (split into words).
Build a vocabulary from all tokens (map words to integer indices).
This is necessary so we can convert raw text → tensors.
We’ll use torchtext's built-in tokenizer and vocabulary builder.

In [26]:
#Setup tokenizer
tokenizer = get_tokenizer('basic_english')
#Build vocab from training dataset
def yield_tokens(data_iter):
    for label, text in data_iter:
        yield tokenizer(text)

train_iter = AG_NEWS(split='train')

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])

print(f'Vocab Size: {len(vocab)}')
print(f"Example tokens: {tokenizer('U.S. Stock rally to new highs after trade deals')}")
print(f"Token indices: {[vocab[token] for token in tokenizer('U.S. Stock rally to new highs after trade deals')]}")

Vocab Size: 95811
Example tokens: ['u', '.', 's', '.', 'stock', 'rally', 'to', 'new', 'highs', 'after', 'trade', 'deals']
Token indices: [51, 1, 9, 1, 294, 688, 4, 23, 1723, 34, 268, 1809]


🔢 Step 5: Encode Text and Labels into Tensors
🧠 Why this matters:
Before passing data into a model, we must:
Convert each text sample into a list of integer token indices.
Convert each label into an integer (already done in AG_NEWS, but can be mapped manually for custom data).
Wrap this logic in a function for DataLoader use later.

In [27]:
# Converts a raw string into a list of token indices using vocab
def text_pipeline(x):
    return vocab(tokenizer(x))

# Labels in AG_NEWS are already 1-based: [1, 2, 3, 4]
# We'll subtract 1 to make them 0-based: [0, 1, 2, 3]
def label_pipeline(x):
    return int(x) - 1

# Test it out
sample_text = "U.S. Stock rally to new highs after trade deals"
sample_label = 3  # Sci/Tech (just an example)

print(f"Text: {sample_text}")
print(f"Token indices: {text_pipeline(sample_text)}")
print(f"Label index: {label_pipeline(sample_label)}")


Text: U.S. Stock rally to new highs after trade deals
Token indices: [51, 1, 9, 1, 294, 688, 4, 23, 1723, 34, 268, 1809]
Label index: 2


In [28]:
sample_text = "U.S. Stock rally to new highs after trade deals"
tokens = tokenizer(sample_text)
print(tokens)

[token for token in tokens if token not in vocab.get_stoi()]

[vocab[token] for token in tokens]

text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]
tokens = tokenizer(sample_text)
indices = text_pipeline(sample_text)

print(f"Tokens: {tokens}")
print(f"Indices: {indices}")
print(f"Length match? {len(tokens) == len(indices)}")


['u', '.', 's', '.', 'stock', 'rally', 'to', 'new', 'highs', 'after', 'trade', 'deals']
Tokens: ['u', '.', 's', '.', 'stock', 'rally', 'to', 'new', 'highs', 'after', 'trade', 'deals']
Indices: [51, 1, 9, 1, 294, 688, 4, 23, 1723, 34, 268, 1809]
Length match? True


🧰 Step 6: Define Collate Function and Set Up DataLoader
🧠 Why this step matters:
PyTorch's DataLoader batches raw data.
But with text data, each sample has a different length.
So we need a custom collate function to:
Apply token and label pipelines
Pad or merge sequences
Return batch tensors

In [29]:
from torch.nn.utils.rnn import pad_sequence

# Collate function to process a batch of (label, text) tuples
def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    
    for label, text in batch:
        label_list.append(label_pipeline(label))
        processed_text = torch.tensor(text_pipeline(text), dtype=torch.int64)
        text_list.append(processed_text)
        offsets.append(processed_text.size(0))

    # Convert lists to tensors
    label_tensor = torch.tensor(label_list, dtype=torch.int64)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # Running total offsets
    text_tensor = torch.cat(text_list)
    
    return label_tensor.to(device), text_tensor.to(device), offsets.to(device)


In [30]:
import torch.nn as nn 

class TextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super(TextClassificationModel, self).__intit__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, num_classes)

In [None]:
vocab_size = len(vocab)
embed_dim = 64           # You can use 100, 128, etc. if needed
num_classes = len(set(['label']))  # or 2 for binary classification

model = TextClassificationModel(vocab_size, embed_dim, num_classes).to(device)


NameError: name 'df' is not defined

In [31]:
def forward(self, text, offsets):
    embedded = self.embedding(text, offsets)
    return self.fc(embedded)

Train the Model

In [32]:
def train_model(dataloader):
    model.train()
    total_acc, total_count = 0, 0

    for label, text, offsets in dataloader:
        optimizer.zero_grad()
        output = model(text, offsets)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_acc += (output.argmax(1) == label).sum().item()
        total_count += label.size(0)
    return total_acc / total_count


Check how well it learned

In [33]:
def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            output = model(text, offsets)
            total_acc += (output.argmax(1) == label).sum().item()
            total_count += label.size(0)

    return total_acc / total_count


Run the model for rounds (Epochs)

In [34]:
for epoch in range(5):
    train_acc = train_model(train_iter)
    val_acc = evaluate(test_iter)
    print(f"Epoch {epoch+1}: Train Accuracy = {train_acc:.4f}, Valiation Accuracy = {val_acc:.4f}")

NameError: name 'model' is not defined