# Text Classification Pipeline

This notebook demonstrates a **text classification pipeline** using deep learning (LSTM):
1. Data loading & preprocessing
2. Vocabulary & embeddings
3. Model definition (LSTM)
4. Training & evaluation
5. Prediction on new text

We’ll use the **IMDB dataset** for sentiment classification.

## 1. Imports and Setup

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# Seed PyTorch's global RNG up front so that weight initialisation, dropout
# masks and DataLoader shuffling repeat across "Restart Kernel -> Run All".
# NOTE: some GPU kernels are non-deterministic, so CUDA runs may still differ
# slightly between executions.
SEED = 42
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## 2. Load and Preprocess Data

In [None]:
# Single shared tokenizer: the same callable is used to build the vocabulary
# and to encode text for the model, so both see identical tokens.
tokenizer = get_tokenizer("basic_english")

def yield_tokens(data_iter):
    """Lazily yield the token list for every sample in ``data_iter``.

    Args:
        data_iter: Iterable of ``(label, text)`` pairs. The label is ignored.

    Yields:
        The result of calling the module-level ``tokenizer`` on each text.
    """
    yield from (tokenizer(sample_text) for _, sample_text in data_iter)

# Build the vocabulary from the training split only; "<unk>" is registered as
# the sole special token.
train_iter, test_iter = IMDB()
vocab = build_vocab_from_iterator(
    yield_tokens(train_iter),
    specials=["<unk>"],
)

# Any token that is not in the vocabulary is looked up as "<unk>".
unk_index = vocab["<unk>"]
vocab.set_default_index(unk_index)

print("Vocab size:", len(vocab))

## 3. Data Processing Functions

In [None]:
def text_pipeline(x):
    """Encode raw text as a list of vocabulary indices.

    The text is tokenized with the module-level ``tokenizer`` and the tokens
    are then looked up in the module-level ``vocab``.
    """
    tokens = tokenizer(x)
    return vocab(tokens)
def label_pipeline(x):
    """Map a raw IMDB label to a class index (1 = positive, 0 = negative).

    torchtext has shipped two label encodings for IMDB: older releases yield
    the strings ``'neg'`` / ``'pos'``, newer releases yield the integers
    ``1`` (negative) / ``2`` (positive). Comparing only against ``'pos'``
    turns every integer label into class 0, so both spellings of "positive"
    are accepted here.

    Args:
        x: Raw label as produced by the dataset.

    Returns:
        ``1`` when ``x`` is ``'pos'`` or ``2``; ``0`` for anything else.
    """
    return 1 if x in ("pos", 2) else 0

def collate_batch(batch):
    """Collate ``(label, text)`` samples into a padded batch on ``device``.

    Args:
        batch: Sequence of ``(label, text)`` pairs.

    Returns:
        A ``(texts, labels)`` tuple. ``texts`` is a ``torch.long`` tensor of
        token ids, right-padded by ``pad_sequence`` (default padding value)
        with the batch dimension first. ``labels`` is a ``torch.long`` tensor
        of class indices. Both are moved to ``device``.
    """
    class_ids = [label_pipeline(raw_label) for raw_label, _ in batch]
    sequences = [
        torch.tensor(text_pipeline(raw_text), dtype=torch.long)
        for _, raw_text in batch
    ]

    padded_texts = pad_sequence(sequences, batch_first=True)
    targets = torch.tensor(class_ids, dtype=torch.long)
    return padded_texts.to(device), targets.to(device)

# Reload both splits and materialise them as lists so the DataLoaders get
# map-style datasets (needed for shuffling the training data).
train_iter, test_iter = IMDB()
train_samples = list(train_iter)
test_samples = list(test_iter)

train_loader = DataLoader(
    train_samples,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)
test_loader = DataLoader(
    test_samples,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_batch,
)

## 4. Define LSTM Model

In [None]:
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> dropout -> linear classifier over token-id sequences.

    The classifier reads the LSTM's final hidden state. If a right-padded batch
    is fed to the LSTM as-is, that state is taken *after* the padding steps, so
    the logits for a short sequence depend on how much padding its batch added.
    To avoid this, trailing ``pad_idx`` tokens are excluded by packing the
    batch before it reaches the LSTM.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Size of each token embedding.
        hidden_dim: Size of the LSTM hidden state.
        num_classes: Number of output logits.
        pad_idx: Token id used for right-padding. Defaults to 0, which is
            ``pad_sequence``'s default padding value. Pass ``None`` to disable
            length handling and run the LSTM over the full padded sequence.
            NOTE: if a real token shares this id, trailing occurrences of that
            token are treated as padding.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes, pad_idx=0):
        super().__init__()
        self.pad_idx = pad_idx
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return class logits for a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, laid out ``(batch, seq_len)`` for
                the padding-aware path.

        Returns:
            Tensor of logits with one row per input sequence.
        """
        embedded = self.embedding(x)

        # Inputs that are not a non-empty 2-D batch take the plain LSTM path.
        use_lengths = self.pad_idx is not None and x.dim() == 2 and x.numel() > 0
        if use_lengths:
            # Length of a row = 1-based position of its last non-padding token.
            # Clamp to 1 because packing rejects zero-length sequences
            # (this happens for rows that consist only of padding).
            positions = torch.arange(1, x.size(1) + 1, device=x.device)
            lengths = ((x != self.pad_idx) * positions).max(dim=1).values.clamp(min=1)
            # pack_padded_sequence expects the lengths on the CPU.
            # enforce_sorted=False keeps the original batch order.
            lstm_input = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
            )
        else:
            lstm_input = embedded

        _, (h_n, _) = self.lstm(lstm_input)
        # h_n[-1] is the final hidden state of the last LSTM layer.
        out = self.dropout(h_n[-1])
        return self.fc(out)

# Two-class sentiment model sized to the vocabulary, moved to the chosen device.
model = LSTMClassifier(
    len(vocab),
    embed_dim=64,
    hidden_dim=128,
    num_classes=2,
)
model = model.to(device)

## 5. Training Setup

In [None]:
# Cross-entropy over the model's raw logits, with integer class targets.
criterion = nn.CrossEntropyLoss()

# Adam over all model parameters.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

## 6. Training Loop

In [None]:
def train_model(model, loader, criterion, optimizer, epochs=2):
    """Train ``model`` in place and print the mean batch loss after each epoch.

    Args:
        model: Module to optimise; it is switched to training mode.
        loader: Iterable of ``(texts, labels)`` batches that supports ``len()``.
        criterion: Loss callable taking ``(outputs, labels)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        epochs: Number of full passes over ``loader``.
    """

    def optimisation_step(inputs, targets):
        """Run one forward/backward/update step and return the loss as a float."""
        optimizer.zero_grad()
        batch_loss = criterion(model(inputs), targets)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets in loader:
            total_loss += optimisation_step(inputs, targets)
        # The reported value is the per-batch average for this epoch.
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(loader):.4f}")

# Fit the classifier on the training split for two epochs.
train_model(
    model=model,
    loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    epochs=2,
)

## 7. Evaluation

In [None]:
def evaluate(model, loader):
    """Print the classification accuracy of ``model`` over ``loader``.

    The model is switched to evaluation mode and left in it. Gradients are
    disabled while the batches are scored.

    Args:
        model: Module that returns per-class scores for a batch.
        loader: Iterable of ``(texts, labels)`` batches.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, targets in loader:
            # The predicted class is the index of the highest score per row.
            guesses = model(inputs).max(dim=1).indices
            correct += (guesses == targets).sum().item()
            total += targets.size(0)
    print(f"Accuracy: {100 * correct / total:.2f}%")

# Report accuracy on the held-out test split.
evaluate(model=model, loader=test_loader)

## 8. Inference on New Sentences

In [None]:
def predict_sentiment(text, model):
    """Classify a single piece of text as ``"Positive"`` or ``"Negative"``.

    The model is switched to evaluation mode and left in it.

    Args:
        text: Raw text to classify.
        model: Classifier returning one row of class logits per input
            sequence, where class 1 means positive.

    Returns:
        ``"Positive"`` if the highest-scoring class is 1, else ``"Negative"``.

    Raises:
        ValueError: If ``text`` produces no tokens (for example an empty or
            whitespace-only string). Without this check the failure surfaces
            as an obscure dtype or sequence-length error inside the model.
    """
    token_ids = text_pipeline(text)
    if not token_ids:
        raise ValueError("text must contain at least one token to classify")

    model.eval()
    # An explicit dtype keeps the ids integral regardless of the input list;
    # unsqueeze(0) adds the batch dimension the model expects.
    tokens = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(tokens)
        prediction = torch.argmax(output, dim=1).item()
    return "Positive" if prediction == 1 else "Negative"

# Smoke-test the trained classifier on two hand-written reviews.
for sample_review in (
    "This movie was fantastic!",
    "The plot was boring and predictable.",
):
    print(predict_sentiment(sample_review, model))

## Summary
- Built a **text classification pipeline** for the IMDB sentiment dataset.
- Preprocessed text → built a vocabulary → trained an LSTM classifier.
- Evaluated accuracy on the test split and ran single-sentence predictions on custom text.