<a href="https://colab.research.google.com/github/Nithya07shree/colab-notes-aiml/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

TOKENIZATION

In [3]:
# sentence tokenizer
import nltk.data
text = 'Tokenization is a fundamental step in Natural Language Processing (NLP). It involves dividing a Textual input into smaller units known as tokens. These tokens can be in the form of words, characters, sub-words, or sentences. It helps in improving interpretability of text by different models. '
tokenizer = nltk.data.load('tokenizers/punkt/PY3/english.pickle')
tokenizer.tokenize(text)

['Tokenization is a fundamental step in Natural Language Processing (NLP).',
 'It involves dividing a Textual input into smaller units known as tokens.',
 'These tokens can be in the form of words, characters, sub-words, or sentences.',
 'It helps in improving interpretability of text by different models.']

In [4]:
# word tokenizer
from nltk.tokenize import word_tokenize
words = word_tokenize(text)

In [5]:
dictionary_text = {word: i+1 for i, word in enumerate(set(words))}
print(dictionary_text)

{'known': 1, 'helps': 2, 'sub-words': 3, 'NLP': 4, 'or': 5, 'as': 6, 'text': 7, 'smaller': 8, 'different': 9, 'Textual': 10, 'by': 11, 'the': 12, 'input': 13, 'fundamental': 14, ')': 15, 'form': 16, 'models': 17, '.': 18, 'dividing': 19, 'These': 20, 'improving': 21, 'into': 22, 'interpretability': 23, 'a': 24, ',': 25, 'sentences': 26, 'units': 27, 'characters': 28, 'Language': 29, 'tokens': 30, 'of': 31, 'step': 32, 'Processing': 33, 'It': 34, 'Tokenization': 35, 'in': 36, 'words': 37, 'involves': 38, 'can': 39, '(': 40, 'be': 41, 'Natural': 42, 'is': 43}


LSTM

In [7]:
import torch
import torch.nn as nn
# load lstm model
lstm = nn.LSTM(input_size=50, hidden_size=128, num_layers=2, batch_first=True)

# input: (batch size, sequence length, embedding dim) = (32 sentences, 10 words per sentence, 50 dim vectors)
input_data = torch.randn(32, 10, 50)

# op: hidden state for every time step
# (hn, cn) : final hidden state and ce;; state (summary of the sentence)
output, (hn, cn) = lstm(input_data)


IMDB Movie Reviews dataset (Sentiment Analysis).

1. get IMDB Movie reviews dataset
2. preprocess text: tokenize nd use an nn.Embedding layer
3. Pass embeddings into an nn.lstm
4. take final hidden state adn pass it to a nn.Linear layer to predict positive or negative review

In [1]:
import torch
import torch.nn as nn

In [2]:
class LstmSentimentAnalysis(nn.Module):
  def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
    super(LstmSentimentAnalysis, self).__init__()
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
    self.fc = nn.Linear(hidden_dim, output_dim)
    self.sigmoid = nn.Sigmoid()

  def forward(self, x):
    embeddings = self.embedding(x)
    output, (hn,cn) = self.lstm(embeddings)
    lastHiddenState = hn[-1]
    out = self.fc(lastHiddenState)
    return self.sigmoid(out)

In [4]:
# get datasets
!pip install datasets torchtext

Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.12.0
    Uninstalling fsspec-2025.12.0:
      Successfully uninstalled fsspec-2025.12.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gradio 5.50.0 requires pillow<12.0,>=8.0, but you have pillow 12.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed fsspec-2025.3.0


In [5]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from collections import Counter

# load dataset
dataset = load_dataset("imdb")

# build vocabulary
word_count = Counter()
for text in dataset['train']['text']:
  word_count.update(text.lower().split())

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

plain_text/unsupervised-00000-of-00001.p(…):   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
# creating dictionary
# 1. shifts every index up by two slots. 1st 2 indices (0 & 1) for 2 special 'utility' tokens
# at index 0, we add padding. LSTMs require all sequences in a batch to be the same length. If one review is 10 words and another is 8, we add two 0s to the short one so they match.
# at index 1, we add unknown variable . If a user types a word that wasn't in your top 5,000 (like "supercalifragilistic"), the model replaces it with 1 instead of crashing.
vocab = {word: i+2 for i, (word, _) in enumerate(word_count.most_common(5000))}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1 # For words not in our top 5000

In [9]:
def preprocess(text, max_len=100):
  tokens = text.lower().split()
  # convert tokens to indices from vocabulary
  indices = [vocab.get(token,1) for token in tokens]
  # apply padding
  if len(indices) > max_len:
    indices = indices[:max_len]
  else:
    indices += [0]*(max_len-len(indices))
  return torch.tensor(indices)

In [10]:
class IMDBDataset(torch.utils.data.Dataset):
    def __init__(self, split, max_len=100):
        self.data = dataset[split]
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]['text']
        label = self.data[idx]['label']
        return preprocess(text, self.max_len), label

train_loader = DataLoader(IMDBDataset('train'), batch_size=32, shuffle=True)
test_loader = DataLoader(IMDBDataset('test'), batch_size=32)

In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LstmSentimentAnalysis(len(vocab), 64, 128, 1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

def train(epochs =5):
  model.train()
  for epoch in range(epochs):
    total_loss = 0
    for texts, labels in train_loader:
      texts, labels = texts.to(device), labels.to(device).float().view(-1, 1)
      preds = model(texts)
      loss = criterion(preds, labels)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()
      total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader)}")

def test():
  model.eval()
  correct = 0
  total = 0
  with torch.no_grad():
    for texts, labels in test_loader:
      texts, labels = texts.to(device), labels.to(device).float().view(-1, 1)
      outputs = model(texts)
      predictions = torch.round(outputs)
      correct += (predictions == labels).sum().item()
      total += labels.size(0)
  print(f"Test Accuracy: {100. * correct / total:.2f}%")

In [16]:
train(3)

Epoch 1/3, Loss: 0.687254319524826
Epoch 2/3, Loss: 0.6547619149736736
Epoch 3/3, Loss: 0.6087956449107441


In [20]:
test()

Test Accuracy: 49.59%


In [21]:
train(7)

Epoch 1/7, Loss: 0.6926792861555543
Epoch 2/7, Loss: 0.6047315318185045
Epoch 3/7, Loss: 0.43651833371890475
Epoch 4/7, Loss: 0.36574342017016753
Epoch 5/7, Loss: 0.30951332285657257
Epoch 6/7, Loss: 0.2560028614538252
Epoch 7/7, Loss: 0.19709958940687233


In [22]:
test()

Test Accuracy: 79.34%
