<a href="https://colab.research.google.com/github/ShraddhaSharma24/Natural-Language-Processing/blob/main/text_summarization_using_seq2seq_mpdel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import random
import string

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tiny dataset of English → Pig Latin words
data = [
    ("hello", "ellohay"),
    ("world", "orldway"),
    ("python", "ythonpay"),
    ("chat", "hatchay"),
    ("bot", "otbay")
]


In [2]:
all_chars = string.ascii_lowercase + " "  # 26 letters + space
char2idx = {ch: i for i, ch in enumerate(all_chars)}
idx2char = {i: ch for ch, i in char2idx.items()}

def word_to_tensor(word):
    tensor = torch.zeros(len(word), 1, len(all_chars))
    for li, letter in enumerate(word):
        tensor[li][0][char2idx[letter]] = 1
    return tensor.to(device)

def tensor_to_word(tensor):
    return ''.join([idx2char[i] for i in tensor])


In [3]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size)

    def forward(self, input_seq):
        hidden = torch.zeros(1, 1, self.hidden_size).to(device)
        for i in range(input_seq.size(0)):
            _, hidden = self.rnn(input_seq[i].unsqueeze(0), hidden)
        return hidden


In [4]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(output_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, hidden, target_length):
        input = torch.zeros(1, 1, len(all_chars)).to(device)
        decoded_indices = []

        for _ in range(target_length):
            output, hidden = self.rnn(input, hidden)
            output = self.softmax(self.out(output[0]))
            topi = output.argmax(1)
            decoded_indices.append(topi.item())

            input = torch.zeros(1, 1, len(all_chars)).to(device)
            input[0][0][topi.item()] = 1

        return decoded_indices


In [5]:
encoder = EncoderRNN(len(all_chars), 128).to(device)
decoder = DecoderRNN(128, len(all_chars)).to(device)

criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=0.01)

# Train loop
for epoch in range(500):
    pair = random.choice(data)
    input_tensor = word_to_tensor(pair[0])
    target_tensor = torch.tensor([char2idx[c] for c in pair[1]], dtype=torch.long).to(device)

    encoder.zero_grad()
    decoder.zero_grad()

    hidden = encoder(input_tensor)
    decoded = decoder(hidden, target_tensor.size(0))

    loss = 0
    for i in range(len(decoded)):
        loss += criterion(torch.log_softmax(decoder.out.weight[decoded[i]].unsqueeze(0), dim=1), target_tensor[i].unsqueeze(0))

    loss.backward()
    optimizer.step()

    if epoch % 100 == 0:
        print(f"[{epoch}] Loss: {loss.item():.4f}")


[0] Loss: 24.3709
[100] Loss: 29.3675
[200] Loss: 15.6784
[300] Loss: 20.1264
[400] Loss: 21.8515


In [6]:
def translate(word):
    input_tensor = word_to_tensor(word)
    hidden = encoder(input_tensor)
    decoded_indices = decoder(hidden, 10)
    return tensor_to_word(decoded_indices)

print("\n🔄 Translations:")
for eng, pig in data:
    print(f"{eng} → {translate(eng)}")



🔄 Translations:
hello → hhhhhhhhhh
world → chhhhhhhhh
python → tuhhhhhhhh
chat → whhhhhhhhh
bot → thhhhhhhhh


In [1]:
!pip install datasets --quiet
from datasets import load_dataset

dataset = load_dataset("opus_books", "en-fr")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Peek at the Data


In [2]:
# Take a small subset for training
train_data = dataset['train']
print(train_data[0])


{'id': '0', 'translation': {'en': 'The Wanderer', 'fr': 'Le grand Meaulnes'}}


In [3]:
# Extract sentence pairs
pairs = [(item['translation']['en'], item['translation']['fr']) for item in train_data]
pairs = [pair for pair in pairs if pair[0] and pair[1]]  # Remove empty ones
pairs = pairs[:10000]  # We'll use only 10k for quick training


In [4]:
!pip uninstall -y torch torchtext
!pip install torch==2.0.1 torchtext==0.15.2 --quiet



Found existing installation: torch 2.0.1
Uninstalling torch-2.0.1:
  Successfully uninstalled torch-2.0.1
Found existing installation: torchtext 0.15.2
Uninstalling torchtext-0.15.2:
  Successfully uninstalled torchtext-0.15.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchaudio 2.6.0+cu124 requires torch==2.6.0, but you have torch 2.0.1 which is incompatible.
torchvision 0.21.0+cu124 requires torch==2.6.0, but you have torch 2.0.1 which is incompatible.[0m[31m
[0m

 Tokenize and Prepare Sequences

In [5]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def tokenize(sentence):
    return sentence.lower().strip().split()

def build_vocab(sentences):
    vocab = build_vocab_from_iterator([tokenize(s) for s in sentences], specials=["<pad>", "<sos>", "<eos>", "<unk>"])
    vocab.set_default_index(vocab["<unk>"])
    return vocab

src_sentences = [src for src, tgt in pairs]
tgt_sentences = [tgt for src, tgt in pairs]

src_vocab = build_vocab(src_sentences)
tgt_vocab = build_vocab(tgt_sentences)


In [20]:
from torchtext.vocab import build_vocab_from_iterator

# Dummy English tokenized data (you can use real English-French pairs later)
tokenized_en = [["hello", "world"], ["good", "morning"], ["how", "are", "you"]]

SRC_vocab = build_vocab_from_iterator(tokenized_en, specials=["<pad>", "<sos>", "<eos>", "<unk>"])
SRC_vocab.set_default_index(SRC_vocab["<unk>"])

INPUT_DIM = len(SRC_vocab)         # Now this will work
ENC_EMB_DIM = 256
HIDDEN_DIM = 512

# Example tokenized French sentences corresponding to the English ones
tokenized_fr = [["bonjour", "le", "monde"], ["bon", "matin"], ["comment", "ça", "va"]]
from torchtext.vocab import build_vocab_from_iterator

# Add special tokens
TRG_vocab = build_vocab_from_iterator(tokenized_fr, specials=["<pad>", "<sos>", "<eos>", "<unk>"])
TRG_vocab.set_default_index(TRG_vocab["<unk>"])

# Define output dimension
OUTPUT_DIM = len(TRG_vocab)

print(f"TRG vocab size: {OUTPUT_DIM}")



TRG vocab size: 12


In [21]:
INPUT_DIM = len(SRC_vocab)      # Number of tokens in English vocab
OUTPUT_DIM = len(TRG_vocab)     # Number of tokens in French vocab


In [22]:
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM)


Prepare Tensors

In [6]:
def numericalize(sentence, vocab):
    tokens = ["<sos>"] + tokenize(sentence) + ["<eos>"]
    return torch.tensor([vocab[token] for token in tokens], dtype=torch.long)

def collate_batch(batch):
    src_batch, tgt_batch = [], []
    for src, tgt in batch:
        src_tensor = numericalize(src, src_vocab)
        tgt_tensor = numericalize(tgt, tgt_vocab)
        src_batch.append(src_tensor)
        tgt_batch.append(tgt_tensor)
    src_batch = pad_sequence(src_batch, padding_value=src_vocab["<pad>"])
    tgt_batch = pad_sequence(tgt_batch, padding_value=tgt_vocab["<pad>"])
    return src_batch, tgt_batch

train_dataloader = DataLoader(pairs, batch_size=32, shuffle=True, collate_fn=collate_batch)


In [8]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)

    def forward(self, src):
        embedded = self.embedding(src)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


In [9]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hidden_dim)
        self.fc_out = nn.Linear(hidden_dim, output_dim)

    def forward(self, input, hidden, cell):
        input = input.unsqueeze(0)  # (1, batch_size)
        embedded = self.embedding(input)
        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(0))  # (batch_size, output_dim)
        return prediction, hidden, cell


In [10]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = trg.shape[1]
        trg_len = trg.shape[0]
        trg_vocab_size = self.decoder.fc_out.out_features

        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size).to(self.device)

        hidden, cell = self.encoder(src)

        input = trg[0, :]  # <sos> token

        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output

            teacher_force = torch.rand(1).item() < teacher_forcing_ratio
            top1 = output.argmax(1)

            input = trg[t] if teacher_force else top1

        return outputs


In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

INPUT_DIM = SRC_vocab
OUTPUT_DIM = TRG_vocab
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HIDDEN_DIM = 512

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HIDDEN_DIM)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HIDDEN_DIM)

model = Seq2Seq(enc, dec, device).to(device)

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)


TypeError: empty() received an invalid combination of arguments - got (tuple, dtype=NoneType, device=NoneType), but expected one of:
 * (tuple of ints size, *, tuple of names names, torch.memory_format memory_format, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)
 * (tuple of ints size, *, torch.memory_format memory_format, Tensor out, torch.dtype dtype, torch.layout layout, torch.device device, bool pin_memory, bool requires_grad)


In [24]:
print(f"INPUT_DIM = {INPUT_DIM}")
print(f"ENC_EMB_DIM = {ENC_EMB_DIM}")
print(f"HIDDEN_DIM = {HIDDEN_DIM}")


INPUT_DIM = Vocab()
ENC_EMB_DIM = 256
HIDDEN_DIM = 512
