<a href="https://colab.research.google.com/github/Rahulrama6705/perplexity/blob/main/perplexity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LAB 2 - Write a Python program to compute BLEU and perplexity scores for n-gram and RNN language models.

In [None]:
!pip install nltk torch torchtext

import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.util import ngrams
from collections import Counter
import math
import torch
import torch.nn as nn
import torch.optim as optim

# nltk.word_tokenize needs the Punkt tokenizer data. Recent NLTK releases (3.9+)
# load it from the 'punkt_tab' resource while older releases use 'punkt', so
# both are requested; a resource name the installed NLTK does not know is
# reported by the downloader instead of stopping the cell.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)


Collecting torchtext
  Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl.metadata (7.9 kB)
Downloading torchtext-0.18.0-cp312-cp312-manylinux1_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m35.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.18.0


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Toy corpus: four six-word sentences used by every model in this notebook.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the mat",
    "the cat lay on the rug",
    "the dog lay on the rug",
]

# One list of word tokens per sentence; displayed as the cell's output.
tokenized_corpus = list(map(nltk.word_tokenize, corpus))
tokenized_corpus


[['the', 'cat', 'sat', 'on', 'the', 'mat'],
 ['the', 'dog', 'sat', 'on', 'the', 'mat'],
 ['the', 'cat', 'lay', 'on', 'the', 'rug'],
 ['the', 'dog', 'lay', 'on', 'the', 'rug']]

In [None]:
def build_ngram_model(tokenized_texts, n=2):
    """Count every n-gram occurring in a collection of tokenized sentences.

    Args:
        tokenized_texts: Iterable of token sequences, one per sentence.
        n: Order of the n-grams to count (2 = bigrams, 3 = trigrams, ...).

    Returns:
        A ``(counts, total)`` pair: ``counts`` is a ``Counter`` keyed by n-gram
        tuple and ``total`` is the number of n-gram occurrences counted.
    """
    gram_counts = Counter(
        gram
        for tokens in tokenized_texts
        for gram in ngrams(tokens, n)
    )
    # Every occurrence added exactly one to some entry, so the sum of the
    # counts is the number of n-grams seen.
    return gram_counts, sum(gram_counts.values())

# Build the bigram and trigram count tables from the same tokenized corpus.
(bigram_model, bigram_total), (trigram_model, trigram_total) = (
    build_ngram_model(tokenized_corpus, n=order) for order in (2, 3)
)


In [None]:
def ngram_perplexity(test_sentence, model, total_count, n=2):
    """Return the add-one-smoothed n-gram perplexity of ``test_sentence``.

    Each n-gram of the sentence is scored as
    ``(count + 1) / (total_count + len(model))``: a smoothed relative frequency
    of the whole n-gram (it is not conditioned on the preceding words), where
    the term added to the denominator is the number of distinct n-grams stored
    in ``model``.

    Args:
        test_sentence: Raw sentence string; tokenized with ``nltk.word_tokenize``.
        model: Mapping from n-gram tuple to its count (needs ``.get`` and ``len``).
        total_count: Total number of n-gram occurrences counted in ``model``.
        n: Order of the n-grams extracted from the sentence.

    Returns:
        ``exp`` of the negative mean log-probability over the sentence's n-grams.

    Raises:
        ValueError: If the sentence has fewer than ``n`` tokens, so there is no
            n-gram to average over.
    """
    tokens = nltk.word_tokenize(test_sentence)
    grams = list(ngrams(tokens, n))
    if not grams:
        # Without this guard the final division by len(grams) fails with an
        # uninformative ZeroDivisionError.
        raise ValueError(
            f"cannot compute {n}-gram perplexity: sentence has only "
            f"{len(tokens)} token(s)"
        )

    # The denominator is the same for every n-gram, so compute it once.
    denominator = total_count + len(model)

    log_prob_sum = 0
    for gram in grams:
        prob = (model.get(gram, 0) + 1) / denominator   # Add-1 smoothing
        log_prob_sum += math.log(prob)

    return math.exp(-log_prob_sum / len(grams))

test_sentence = "the cat lay on the mat"

# Score the same sentence with each n-gram order; one output line per model.
for label, counts, total, order in (
    ("Bigram PP:", bigram_model, bigram_total, 2),
    ("Trigram PP:", trigram_model, trigram_total, 3),
):
    print(label, ngram_perplexity(test_sentence, counts, total, n=order))


Bigram PP: 10.117866411063428
Trigram PP: 11.430952132988166


In [None]:
# The candidate differs from the reference in a single word ("lay" vs "sat").
candidate = nltk.word_tokenize("the cat lay on the mat")
# A single tokenized reference, wrapped in a list before being passed on.
reference = [nltk.word_tokenize("the cat sat on the mat")]

# Smoothing function handed to sentence_bleu below.
smooth = SmoothingFunction().method1
bleu_score = sentence_bleu(
    reference,
    candidate,
    smoothing_function=smooth,
)

print("BLEU Score:", bleu_score)


BLEU Score: 0.25406637407730737


In [None]:
class RNNLanguageModel(nn.Module):
    """Next-token model: embedding -> ``nn.RNN`` -> linear layer over the vocabulary."""

    def __init__(self, vocab_size, embed_dim=32, hidden_dim=64):
        """Create the embedding, recurrent and output layers.

        Args:
            vocab_size: Number of distinct token ids; also the size of the
                output logits' last dimension.
            embed_dim: Width of each token embedding.
            hidden_dim: Width of the RNN hidden state.
        """
        super().__init__()
        # Layers are created in the same order as before so that a seeded run
        # draws identical initial weights.
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn = nn.RNN(input_size=embed_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return one vector of vocabulary logits per input position.

        Args:
            x: Tensor of token ids; the RNN is built with ``batch_first=True``,
                so the leading dimension is the batch.

        Returns:
            The linear layer applied to the RNN's per-step outputs.
        """
        # Only the per-step outputs are used; the final hidden state is dropped.
        step_outputs, _last_hidden = self.rnn(self.embed(x))
        return self.fc(step_outputs)


In [None]:
# Build vocab
all_tokens = [word for sent in tokenized_corpus for word in sent]
# Sort the unique tokens: the iteration order of a set of strings can differ
# from one interpreter session to the next, which would give each word a
# different index on every run.
vocab = sorted(set(all_tokens))
word2idx = {w: i for i, w in enumerate(vocab)}   # token -> integer id
idx2word = {i: w for w, i in word2idx.items()}   # integer id -> token

def encode(sentence):
    """Convert a sentence string into a 1-D tensor of vocabulary indices.

    Args:
        sentence: Raw sentence; tokenized with ``nltk.word_tokenize``.

    Returns:
        A ``torch.long`` tensor holding ``word2idx[token]`` for each token.

    Raises:
        KeyError: If any token is missing from ``word2idx``; the message lists
            the offending tokens.
    """
    tokens = nltk.word_tokenize(sentence)
    unknown = [tok for tok in tokens if tok not in word2idx]
    if unknown:
        raise KeyError(f"token(s) not in vocabulary: {unknown}")
    # Explicit dtype: an empty list would otherwise produce a float tensor,
    # which cannot be used as embedding indices.
    return torch.tensor([word2idx[tok] for tok in tokens], dtype=torch.long)

# Training data: next-word prediction
# Each sentence is shifted by one position: the input drops the last token and
# the target drops the first, so position t is paired with token t + 1.
encoded_sentences = [encode(" ".join(sent)) for sent in tokenized_corpus]

# pad_sequence fills shorter rows with its default value 0, which is also a
# real word index in word2idx. The four sample sentences all have the same
# length, so no padding is inserted for this corpus.
inputs = nn.utils.rnn.pad_sequence(
    [ids[:-1] for ids in encoded_sentences], batch_first=True
)
targets = nn.utils.rnn.pad_sequence(
    [ids[1:] for ids in encoded_sentences], batch_first=True
)


In [None]:
# Seed PyTorch's random number generator before the model is created so the
# random weight initialisation is the same each time this cell is run.
torch.manual_seed(42)

model = RNNLanguageModel(len(vocab))
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

epochs = 50
model.train()  # make training mode explicit
for epoch in range(epochs):
    optimizer.zero_grad()
    logits = model(inputs)  # the whole corpus is one batch
    # Flatten positions so each row of logits lines up with one target id.
    loss = criterion(logits.reshape(-1, len(vocab)), targets.reshape(-1))
    loss.backward()
    optimizer.step()

print("Training Complete — Final Loss:", loss.item())


Training Complete — Final Loss: 0.27784213423728943


In [None]:
def rnn_perplexity(model, sentence):
    """Return the perplexity that ``model`` assigns to ``sentence``.

    The sentence is encoded with ``encode``; all tokens but the last are fed to
    the model and all tokens but the first are the prediction targets. The
    result is ``exp`` of the loss computed by the module-level ``criterion``.

    Args:
        model: Module mapping a batch of token ids to per-position logits.
        sentence: Raw sentence string made of in-vocabulary words.

    Returns:
        The perplexity as a Python float.

    Raises:
        ValueError: If the sentence has fewer than two tokens, leaving no
            (input, next-token) pair to score.
    """
    encoded = encode(sentence)
    if encoded.numel() < 2:
        raise ValueError(
            "need at least two tokens to compute perplexity, got "
            f"{encoded.numel()}"
        )

    input_seq = encoded[:-1].unsqueeze(0)   # add a batch dimension of size 1
    target_seq = encoded[1:].unsqueeze(0)

    # Score in eval mode, then put the model back in whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(input_seq)
            # The class count comes from the logits themselves, so the
            # function does not depend on the global vocabulary list.
            loss = criterion(
                logits.reshape(-1, logits.size(-1)), target_seq.reshape(-1)
            )
    finally:
        model.train(was_training)

    return math.exp(loss.item())

# Evaluate the trained RNN on the same sentence used for the n-gram models.
rnn_pp = rnn_perplexity(model, test_sentence)
print("RNN Perplexity:", rnn_pp)


RNN Perplexity: 5.968755708905813
