# 1. CONFIG 
Define paths, model filenames, and all hyper‑parameters in one place:
- Data path, output weight file  
- Embedding/LSTM dimensions, layers, bidirectionality, dropout  
- Training settings (max length, batch size, epochs, learning rate)  
- Vocabulary minimum frequency & random seed  


In [12]:
# 1. CONFIG
# --- paths ---
DATA_PATH = "/kaggle/input/sentim/processed_sentiment_data.csv"  # input CSV
MODEL_WEIGHTS = "./bilstm_sentiment.pt"  # where the best state_dict is saved

# --- model architecture ---
EMBED_DIM = 128
HIDDEN_DIM = 256
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.3

# --- training ---
MAX_LEN = 128  # tokens kept per text; longer texts are truncated
BATCH_SIZE = 64
EPOCHS = 6
LR = 3e-4

# --- vocabulary & reproducibility ---
MIN_FREQ = 2  # minimum token count for a token to enter the vocabulary
SEED = 42


# 2. IMPORTS & REPRODUCIBILITY
- Standard libs (regex, random, time, warnings)  
- NumPy, pandas for data handling  
- PyTorch APIs for model, training, device  
- sklearn metrics, DataLoader for batching  
- Set global seeds and deterministic behavior  


In [13]:
# 2. IMPORTS & REPRODUCIBILITY
import re, random, time, collections, warnings, sys
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, classification_report
from torch.utils.data import Dataset, DataLoader

# reproducibility: seed Python, NumPy and PyTorch with the same value
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True  # request deterministic cuDNN algorithms

# run on the GPU when CUDA is available, otherwise on the CPU
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# hide UserWarning messages that originate from sklearn modules
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")


# 3. TOKENIZER & VOCAB
- Simple regex‑based tokenizer  
- `Vocab` class to build token ↔ index maps, keeping only tokens whose frequency is ≥ `MIN_FREQ`  
- Special tokens `<pad>` and `<unk>`  


In [14]:
# 3. TOKENIZER & VOCAB
# A token is either a run of word characters or one non-word, non-space char.
_token_re = re.compile(r"\w+|[^\w\s]", re.UNICODE)


def tokenize(text: str):
    """Lower-case ``text`` and split it into word and punctuation tokens.

    Whitespace is discarded; every punctuation/symbol character becomes its
    own token. Returns a list of strings (empty for blank input).
    """
    lowered = text.lower()
    return _token_re.findall(lowered)

class Vocab:
    """Bidirectional token <-> index mapping.

    Attributes:
        itos: list of tokens; the position in the list is the token's index.
        stoi: dict mapping each token to its index.
    """

    def __init__(self, counter, min_freq=1, specials=None):
        """Build the mapping from a token counter.

        Args:
            counter: mapping of token -> occurrence count.
            min_freq: tokens with a count below this value are left out.
            specials: tokens placed first (in the given order) regardless of
                their count; repeated entries are kept only once.
        """
        # dict.fromkeys de-duplicates while preserving order, so every special
        # has exactly one index.
        specials = list(dict.fromkeys(specials or []))
        reserved = set(specials)  # O(1) membership test inside the filter
        # include specials first, then tokens >= min_freq
        self.itos = specials + [
            w for w, c in counter.items()
            if c >= min_freq and w not in reserved
        ]
        self.stoi = {w: i for i, w in enumerate(self.itos)}

    def __len__(self):
        """Return the number of tokens, specials included."""
        return len(self.itos)

    def __getitem__(self, token):
        """Return the index of ``token``, falling back to ``<unk>``.

        Raises:
            KeyError: if ``token`` is unknown and the vocabulary was built
                without an ``<unk>`` special.
        """
        idx = self.stoi.get(token)
        if idx is not None:
            return idx
        # Resolve <unk> only when it is actually needed, so a vocabulary
        # without <unk> still serves the tokens it does contain.
        try:
            return self.stoi["<unk>"]
        except KeyError:
            raise KeyError(
                f"token {token!r} is not in the vocabulary and no '<unk>' "
                "special token was defined"
            ) from None


# 4. DATASET
- `SentimentDS` wraps a DataFrame of `text` & `sentiment`  
- Builds its own vocab when `build_vocab=True`; otherwise reuses the `vocab` passed in  
- Encodes text → list of token‑IDs, truncated to `MAX_LEN`  
- Maps sentiment strings → integer labels  


In [15]:
# 4. DATASET
class SentimentDS(Dataset):
    """Dataset over a DataFrame with ``text`` and ``sentiment`` columns.

    Each item is ``(token_id_tensor, label_id)``. Sequences are truncated to
    ``MAX_LEN`` tokens and are not padded here.

    Args:
        df: frame providing the ``text`` and ``sentiment`` columns.
        vocab: vocabulary to reuse when ``build_vocab`` is false.
        build_vocab: when true, build a new ``Vocab`` from this frame's texts
            (tokens seen fewer than ``MIN_FREQ`` times are left out).
        label2id: optional label -> id mapping to reuse, e.g. the training
            set's mapping for a validation split, so that ids mean the same
            class in every split. When omitted, the mapping is derived from
            the sorted distinct labels of ``df``.

    Raises:
        ValueError: if ``label2id`` is given and ``df`` contains a label that
            is missing from it.
    """

    def __init__(self, df, vocab=None, build_vocab=False, label2id=None):
        # Missing texts become ""; astype(str) keeps tokenize() safe if the
        # column holds non-string values.
        self.texts = df["text"].fillna("").astype(str).tolist()
        self.orig_labels = df["sentiment"].tolist()

        # label mappings
        if label2id is None:
            self.label2id = {
                l: i for i, l in enumerate(sorted(set(self.orig_labels)))
            }
        else:
            unknown = set(self.orig_labels) - set(label2id)
            if unknown:
                raise ValueError(
                    f"labels missing from the supplied label2id: {unknown!r}"
                )
            self.label2id = dict(label2id)
        self.id2label = {i: l for l, i in self.label2id.items()}
        self.labels = [self.label2id[l] for l in self.orig_labels]

        # build or reuse vocab
        if build_vocab:
            counter = collections.Counter()
            for t in self.texts:
                counter.update(tokenize(t))
            self.vocab = Vocab(counter, MIN_FREQ, specials=["<pad>", "<unk>"])
        else:
            self.vocab = vocab

    def encode(self, text):
        """Return the token ids of ``text``, truncated to ``MAX_LEN`` tokens."""
        ids = [self.vocab[t] for t in tokenize(text)[:MAX_LEN]]
        if not ids:  # protect against blank text: never return an empty list
            ids = [self.vocab["<unk>"]]
        return ids

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(LongTensor of token ids, integer label id)`` for ``idx``."""
        seq = torch.tensor(self.encode(self.texts[idx]), dtype=torch.long)
        label = self.labels[idx]
        return seq, label


# 5. DATA PREPARATION
- Read full CSV, split into train/val (80/20)  
- Build vocab on training set  
- Create PyTorch `DataLoader` with padding collate fn  


In [16]:
# 5. DATA PREPARATION
# Load the full dataset and make a seeded 80/20 random split.
df_all   = pd.read_csv(DATA_PATH)
df_train = df_all.sample(frac=0.8, random_state=SEED)
df_val   = df_all.drop(df_train.index)  # every row that was not sampled

# datasets: the vocabulary is built from the training split only
train_ds = SentimentDS(df_train, build_vocab=True)
VOCAB    = train_ds.vocab
val_ds   = SentimentDS(df_val, vocab=VOCAB)

# SentimentDS derives label ids from the labels of the frame it receives, so
# a validation split that lacks a class would number the remaining classes
# differently from training. Re-encode the validation labels with the
# training mapping so that ids mean the same class in both splits.
_unseen_labels = set(val_ds.orig_labels) - set(train_ds.label2id)
if _unseen_labels:
    raise ValueError(
        f"validation labels never seen in training: {_unseen_labels!r}"
    )
val_ds.label2id = train_ds.label2id
val_ds.id2label = train_ds.id2label
val_ds.labels   = [train_ds.label2id[l] for l in val_ds.orig_labels]

PAD_IDX   = VOCAB["<pad>"]
N_CLASSES = len(train_ds.label2id)

# collate_fn for padding & lengths
def collate(batch):
    """Turn a list of ``(sequence, label)`` pairs into batch tensors on DEVICE.

    Returns ``(padded, lengths, labels)``: the sequences right-padded with
    ``PAD_IDX`` into one batch-first tensor, the original sequence lengths,
    and the labels.
    """
    sequences, targets = zip(*batch)
    seq_lengths = [len(seq) for seq in sequences]
    batch_x = nn.utils.rnn.pad_sequence(
        sequences, batch_first=True, padding_value=PAD_IDX
    ).to(DEVICE)
    batch_lengths = torch.tensor(seq_lengths, device=DEVICE)
    batch_y = torch.tensor(targets, device=DEVICE)
    return batch_x, batch_lengths, batch_y

# Training batches are reshuffled every epoch; validation order stays fixed.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate,
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate,
)

# Summary goes to stderr.
print(f"Vocabulary size: {len(VOCAB)}    Classes: {N_CLASSES}", file=sys.stderr)


Vocabulary size: 4330    Classes: 3


# 6. MODEL
- `BiLSTMClassifier` with embedding, packed LSTM, dropout, and final linear layer  
- Supports bidirectional LSTM and variable number of layers  


In [17]:
# 6. MODEL
class BiLSTMClassifier(nn.Module):
    """Embedding -> (optionally bidirectional) LSTM -> dropout -> linear head.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: embedding size, which is also the LSTM input size.
        hidden_dim: LSTM hidden size per direction.
        n_layers: number of stacked LSTM layers.
        n_classes: number of output logits.
        bidir: run the LSTM in both directions when true.
        dropout: dropout probability applied to the embeddings and to the
            final hidden state; also used between LSTM layers when
            ``n_layers > 1``.
        pad_idx: padding index passed to the embedding layer.
    """

    def __init__(
        self, vocab_size, embed_dim, hidden_dim, n_layers,
        n_classes, bidir=True, dropout=0.3, pad_idx=0
    ):
        super().__init__()
        self.bidir = bidir

        # nn.LSTM applies dropout between layers only, so pass 0.0 for a
        # single-layer network.
        inter_layer_dropout = dropout if n_layers > 1 else 0.0
        n_directions = 2 if bidir else 1

        self.embedding = nn.Embedding(
            vocab_size, embed_dim, padding_idx=pad_idx
        )
        self.lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidir,
            dropout=inter_layer_dropout,
        )
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim * n_directions, n_classes)

    def forward(self, x, lengths):
        """Return class logits for a padded batch.

        Args:
            x: batch-first tensor of token ids.
            lengths: unpadded length of each sequence in ``x``.
        """
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)
        # Packing makes the LSTM skip the padded positions; the lengths must
        # live on the CPU for pack_padded_sequence.
        packed_seq = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed_seq)
        if not self.bidir:
            features = hidden[-1]
        else:
            # concatenate the last hidden states of both directions
            features = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(self.dropout(features))

# instantiate, loss & optimizer
# Build the network from the CONFIG values and move it to the target device.
model = BiLSTMClassifier(
    vocab_size=len(VOCAB),
    embed_dim=EMBED_DIM,
    hidden_dim=HIDDEN_DIM,
    n_layers=N_LAYERS,
    n_classes=N_CLASSES,
    bidir=BIDIRECTIONAL,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)
model = model.to(DEVICE)

# Cross-entropy over the class logits, optimised with AdamW.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)


# 7. TRAIN / EVAL FUNCTIONS
- `run_epoch` handles one pass (train or eval):  
  - loops batches, computes loss  
  - backprop + step if training  
  - collects preds & ground truths  
  - returns avg loss, accuracy, predictions & ground-truth labels  


In [18]:
# 7. TRAIN / EVAL FUNCTIONS
def run_epoch(loader, train=True):
    """Run one full pass over ``loader`` with the module-level model.

    Args:
        loader: iterable of ``(x, lengths, y)`` batches; must expose
            ``.dataset`` so the loss can be averaged per example.
        train: when true, put the model in training mode and take an
            optimizer step per batch; otherwise evaluate only.

    Returns:
        ``(avg_loss, acc, preds, gts)``: the loss averaged over
        ``len(loader.dataset)``, the accuracy, and the flat lists of
        predicted and ground-truth class ids.
    """
    if train:
        model.train()
    else:
        model.eval()
    total_loss, preds, gts = 0.0, [], []
    # Gradients are only needed when training; turning autograd off during
    # evaluation avoids building a graph for every validation batch.
    with torch.set_grad_enabled(bool(train)):
        for x, lengths, y in loader:
            if train:
                optimizer.zero_grad()
            logits = model(x, lengths)
            loss   = criterion(logits, y)
            if train:
                loss.backward()
                optimizer.step()
            # weight the batch loss by the batch size so the final division
            # yields a per-example average even when the last batch is short
            total_loss += loss.item() * y.size(0)
            preds.extend(logits.argmax(1).tolist())
            gts.extend(y.tolist())
    avg_loss = total_loss / len(loader.dataset)
    acc = accuracy_score(gts, preds)
    return avg_loss, acc, preds, gts


# 8. TRAINING LOOP
- Loop over epochs, track train/val loss & validation accuracy  
- Save best model weights when validation accuracy improves  


In [19]:
# 8. TRAINING LOOP
metrics = []  # one dict per epoch, later rendered as a table
# Start below any reachable accuracy so the first epoch always writes a
# checkpoint. Starting at 0.0 would skip saving when validation accuracy
# stays at 0.0, leaving the later load step with a missing or stale file.
best_acc = float("-inf")

for epoch in range(1, EPOCHS + 1):
    start = time.time()
    train_loss, train_acc, _, _ = run_epoch(train_loader, train=True)
    val_loss, val_acc, _, _     = run_epoch(val_loader, train=False)
    metrics.append({
        "epoch": epoch,
        "train_loss": round(train_loss, 4),
        "val_loss":   round(val_loss,   4),
        "val_acc":    round(val_acc,    3)
    })
    print(
        f"Epoch {epoch:02d}/{EPOCHS}  "
        f"train_loss {train_loss:.4f}  val_loss {val_loss:.4f}  "
        f"val_acc {val_acc:.3f}  time {time.time()-start:.1f}s",
        file=sys.stderr
    )
    # Strict comparison: on a tie the earlier checkpoint is kept.
    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), MODEL_WEIGHTS)


Epoch 01/6  train_loss 0.9429  val_loss 0.6570  val_acc 0.848  time 0.7s
Epoch 02/6  train_loss 0.5873  val_loss 0.4700  val_acc 0.848  time 0.6s
Epoch 03/6  train_loss 0.5200  val_loss 0.4712  val_acc 0.848  time 0.6s
Epoch 04/6  train_loss 0.5114  val_loss 0.4479  val_acc 0.848  time 0.6s
Epoch 05/6  train_loss 0.4925  val_loss 0.4402  val_acc 0.891  time 0.6s
Epoch 06/6  train_loss 0.4750  val_loss 0.4331  val_acc 0.891  time 0.6s


# 9. EPOCH METRICS TABLE
Display per‑epoch losses & validation accuracy as a pandas table.


In [20]:
# 9. EPOCH METRICS TABLE
# Heading, then one row per epoch rendered without the index column.
print(
    "\nEpoch metrics",
    pd.DataFrame(metrics).to_string(index=False),
    sep="\n",
)



Epoch metrics
 epoch  train_loss  val_loss  val_acc
     1      0.9429    0.6570    0.848
     2      0.5873    0.4700    0.848
     3      0.5200    0.4712    0.848
     4      0.5114    0.4479    0.848
     5      0.4925    0.4402    0.891
     6      0.4750    0.4331    0.891


# 10. CLASSIFICATION REPORT
- Load the best saved weights  
- Run one final evaluation on validation set  
- Print sklearn’s classification report + best val accuracy


In [21]:
# 10. CLASSIFICATION REPORT

# Load the best checkpoint onto the active device. The fallback presumably
# targets torch releases whose torch.load has no `weights_only` argument.
# Only torch.load sits inside the try so that a TypeError raised while
# applying the weights is not mistaken for that case.
try:
    _best_state = torch.load(
        MODEL_WEIGHTS, map_location=DEVICE, weights_only=True
    )
except TypeError:
    _best_state = torch.load(MODEL_WEIGHTS, map_location=DEVICE)
model.load_state_dict(_best_state)

# get predictions
_, _, y_pred, y_true = run_epoch(val_loader, train=False)
print("\nClassification report")
# `labels` pins the report to every class id so that `target_names` always
# lines up, even if a class occurs in neither y_true nor y_pred.
print(classification_report(
    y_true, y_pred,
    labels=list(range(N_CLASSES)),
    target_names=[train_ds.id2label[i] for i in range(N_CLASSES)],
    zero_division=0
))
print(f"\nBest validation accuracy: {best_acc:.3f}")
print(f"Model weights saved to {MODEL_WEIGHTS}")



Classification report
              precision    recall  f1-score   support

     LABEL_0       0.89      1.00      0.94       140
     LABEL_1       1.00      0.41      0.58        17
     LABEL_2       0.00      0.00      0.00         8

    accuracy                           0.89       165
   macro avg       0.63      0.47      0.51       165
weighted avg       0.85      0.89      0.86       165


Best validation accuracy: 0.891
Model weights saved to ./bilstm_sentiment.pt


# 11. INFERENCE FUNCTIONS
- `predict(text)` tokenizes & runs the model to return label + confidence  
- `demo(lines)` prints a few example inferences  


In [22]:
# 11. INFERENCE FUNCTIONS
def predict(text):
    """Classify one text with the module-level model.

    Args:
        text: raw input string; blank input is scored as a single ``<unk>``.

    Returns:
        ``(label, confidence)``: the predicted label from
        ``train_ds.id2label`` and its softmax probability.
    """
    ids = [VOCAB[t] for t in tokenize(text)[:MAX_LEN]] or [VOCAB["<unk>"]]
    tensor = torch.tensor(ids, device=DEVICE).unsqueeze(0)  # batch of one
    length = torch.tensor([len(ids)], device=DEVICE)
    # Dropout must be off for inference; remember the current mode so a call
    # made in the middle of training does not leave the model in eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(tensor, length)
    finally:
        model.train(was_training)
    idx  = logits.argmax(1).item()
    conf = torch.softmax(logits, 1).max().item()
    return train_ds.id2label[idx], conf

def demo(lines):
    """Print the predicted label and confidence for each text in ``lines``."""

    def _render(s):
        # one output line: quoted input, predicted label, confidence as a %
        label, conf = predict(s)
        return f"'{s}' -> {label}  ({conf:.2%})"

    print("\nInference examples")
    for sentence in lines:
        print(_render(sentence))

# run demo: a positive text, a negative text, and the empty string
demo(
    [
        "I absolutely loved this product!",
        "This is the worst experience I've ever had.",
        "",
    ]
)



Inference examples
'I absolutely loved this product!' -> LABEL_0  (82.02%)
'This is the worst experience I've ever had.' -> LABEL_0  (91.46%)
'' -> LABEL_1  (39.25%)
