In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import BartTokenizer
from datasets import load_dataset

# Load dataset
dataset = load_dataset("gopalkalpande/bbc-news-summary")
train_data = dataset["train"]

In [None]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np

# Define a custom Dataset class
class TextSummarizationDataset(Dataset):
    def __init__(self, data, max_input_length=100, max_target_length=30):
        self.data = data
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        self.input_texts = []
        self.target_texts = []
        self.build_dataset()

    def build_dataset(self):
        for example in self.data:
            input_text = example["Articles"]
            target_text = example["Summaries"]
            # Truncate or pad input text to max_input_length
            input_text = input_text[:self.max_input_length]
            input_text = input_text + ' ' * (self.max_input_length - len(input_text))
            # Truncate target text to max_target_length
            target_text = target_text[:self.max_target_length]
            self.input_texts.append(input_text)
            self.target_texts.append(target_text)

    def __len__(self):
        return len(self.input_texts)

    def __getitem__(self, idx):
        return {"input_text": self.input_texts[idx], "target_text": self.target_texts[idx]}

# Tokenization function
def tokenize_text(text):
    tokens = text.split()
    token_ids = [vocab_to_idx[token] if token in vocab_to_idx else vocab_to_idx['<UNK>'] for token in tokens]
    return token_ids

# Build vocabulary
all_texts = [example["Articles"] for example in train_data] + [example["Summaries"] for example in train_data]
all_tokens = ' '.join(all_texts).split()
vocab_counter = Counter(all_tokens)
vocab = [token for token, count in vocab_counter.items() if count > 5]  # Filter out tokens with count <= 5
vocab_to_idx = {token: idx for idx, token in enumerate(vocab)}
vocab_to_idx['<PAD>'] = len(vocab_to_idx)
vocab_to_idx['<UNK>'] = len(vocab_to_idx)

# Define collate function for DataLoader
def collate_fn(batch):
    input_texts = [item["input_text"] for item in batch]
    target_texts = [item["target_text"] for item in batch]
    input_token_ids = [tokenize_text(text) for text in input_texts]
    target_token_ids = [tokenize_text(text) for text in target_texts]

    # Pad sequences to maximum length
    input_tensor = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(ids) for ids in input_token_ids], batch_first=True, padding_value=vocab_to_idx['<PAD>'])
    target_tensor = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(ids) for ids in target_token_ids], batch_first=True, padding_value=vocab_to_idx['<PAD>'])

    return {"input_tensor": input_tensor, "target_tensor": target_tensor}


# Create dataset instance
max_input_length = 100
max_target_length = 30
custom_dataset = TextSummarizationDataset(train_data, max_input_length, max_target_length)

# Create DataLoader
BATCH_SIZE = 32
dataloader = DataLoader(custom_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# Example usage
for batch in dataloader:
    input_tensor = batch["input_tensor"]  # Shape: (batch_size, max_input_length)
    target_tensor = batch["target_tensor"]  # Shape: (batch_size, max_target_length)
    break

In [None]:
# Print lengths of tokenized sequences for input and target texts
input_token_lengths = [len(tokenize_text(text)) for text in custom_dataset.input_texts]
target_token_lengths = [len(tokenize_text(text)) for text in custom_dataset.target_texts]

print("Input token length statistics:")
print("Mean:", np.mean(input_token_lengths))
print("Max:", np.max(input_token_lengths))
print("Min:", np.min(input_token_lengths))

print("\nTarget token length statistics:")
print("Mean:", np.mean(target_token_lengths))
print("Max:", np.max(target_token_lengths))
print("Min:", np.min(target_token_lengths))


Input token length statistics:
Mean: 15.759442446043165
Max: 22
Min: 11

Target token length statistics:
Mean: 5.60521582733813
Max: 9
Min: 3


In [None]:
# Print tokenized sequences and their lengths before converting to tensors
for idx in range(5):  # Print information for the first 5 samples
    print(f"Sample {idx + 1}:")
    print("Input text:", custom_dataset.input_texts[idx])
    print("Tokenized input:", tokenize_text(custom_dataset.input_texts[idx]))
    print("Length of tokenized input:", len(tokenize_text(custom_dataset.input_texts[idx])))
    print("Target text:", custom_dataset.target_texts[idx])
    print("Tokenized target:", tokenize_text(custom_dataset.target_texts[idx]))
    print("Length of tokenized target:", len(tokenize_text(custom_dataset.target_texts[idx])))
    print()


Sample 1:
Input text: Budget to set scene for election..Gordon Brown will seek to put the economy at the centre of Labour'
Tokenized input: [0, 1, 2, 3, 4, 15375, 5, 6, 7, 1, 8, 9, 10, 11, 9, 12, 13, 15375]
Length of tokenized input: 18
Target text: - Increase in the stamp duty t
Tokenized target: [72, 15375, 19, 9, 46, 44, 15375]
Length of tokenized target: 7

Sample 2:
Input text: Army chiefs in regiments decision..Military chiefs are expected to meet to make a final decision on 
Tokenized input: [278, 279, 19, 280, 15375, 279, 139, 29, 1, 218, 1, 281, 16, 282, 283, 74]
Length of tokenized input: 16
Target text: "They are very much not for th
Tokenized target: [386, 139, 180, 101, 64, 4, 15375]
Length of tokenized target: 7

Sample 3:
Input text: Howard denies split over ID cards..Michael Howard has denied his shadow cabinet was split over its d
Tokenized input: [446, 447, 448, 237, 449, 15375, 446, 123, 450, 24, 143, 451, 357, 448, 237, 452, 15375]
Length of tokenized input: 17
Targ

In [None]:
# Example usage
for batch_idx, batch in enumerate(dataloader):
    input_tensor = batch["input_tensor"]  # Shape: (batch_size, max_input_length)
    target_tensor = batch["target_tensor"]  # Shape: (batch_size, max_target_length)

    print(f"Batch {batch_idx + 1}:")
    print("Input tensor shape:", input_tensor.shape)
    print("Target tensor shape:", target_tensor.shape)
    print()

    # Print the first sequence in the batch to verify padding
    print("First sequence in the batch:")
    print("Input sequence:", input_tensor[0])
    print("Target sequence:", target_tensor[0])

    # Break after printing the first batch
    break


Batch 1:
Input tensor shape: torch.Size([32, 19])
Target tensor shape: torch.Size([32, 8])

First sequence in the batch:
Input sequence: tensor([  960,  4414,   345, 14948, 15375, 15375,   960,  1639,   687,    28,
            1,  2410,     1,  3906,    16,   676,  3292, 15375, 15374])
Target sequence: tensor([ 1823,  3276,    62,   367, 15375, 15374, 15374, 15374])


In [None]:
import torch
import torch.nn as nn

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)

    def forward(self, input_seq):
        embedded = self.embedding(input_seq)
        outputs, (hidden, cell) = self.lstm(embedded)
        return hidden, cell


In [None]:
class Decoder(nn.Module):
    def __init__(self, output_size, hidden_size, num_layers=1):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_seq, hidden, cell):
        input_seq = input_seq.unsqueeze(1)
        embedded = self.embedding(input_seq)
        outputs, (hidden, cell) = self.lstm(embedded, (hidden, cell))
        predictions = self.softmax(self.fc(outputs.squeeze(1)))
        return predictions, hidden, cell


In [None]:
import random

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, teacher_forcing_ratio=0.5):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.teacher_forcing_ratio = teacher_forcing_ratio

    def forward(self, input_seq, target_seq):
        batch_size = input_seq.size(0)
        target_len = target_seq.size(1)
        target_vocab_size = self.decoder.fc.out_features

        # Initialize outputs and hidden states
        outputs = torch.zeros(batch_size, target_len, target_vocab_size).to(input_seq.device)
        hidden, cell = self.encoder(input_seq)

        # First input to the decoder is the < SOS > token
        input_seq = target_seq[:, 0]

        # Iterate over sequence length
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(input_seq, hidden, cell)
            outputs[:, t] = output
            # Determine whether to use teacher forcing
            teacher_force = random.random() < self.teacher_forcing_ratio
            top1 = output.argmax(1)
            input_seq = target_seq[:, t] if teacher_force else top1

        return outputs

In [None]:
TEACHER_FORCING_RATIO = 0.5

In [None]:
import torch.optim as optim

# Define loss function
criterion = nn.CrossEntropyLoss()

# Define optimizer
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [None]:
# Define model
encoder = Encoder(input_size=len(vocab_to_idx), hidden_size=256)
decoder = Decoder(output_size=len(vocab_to_idx), hidden_size=256)
model = Seq2Seq(encoder, decoder, teacher_forcing_ratio=TEACHER_FORCING_RATIO)

In [None]:
import random

# Set device
device = "cpu"

def train_epoch(model, dataloader, criterion, optimizer, device):
    model.train()
    total_loss = 0.0

    for batch in dataloader:
        input_tensor = batch["input_tensor"].to(device)
        target_tensor = batch["target_tensor"].to(device)

        optimizer.zero_grad()

        # Forward pass
        output = model(input_tensor, target_tensor)

        # Calculate loss
        loss = criterion(output.permute(0, 2, 1), target_tensor)

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

# Function to evaluate the model on validation data
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0.0

    with torch.no_grad():
        for batch in dataloader:
            input_tensor = batch["input_tensor"].to(device)
            target_tensor = batch["target_tensor"].to(device)

            output = model(input_tensor, target_tensor)
            loss = criterion(output.permute(0, 2, 1), target_tensor)

            total_loss += loss.item()

    return total_loss / len(dataloader)

# Training parameters
NUM_EPOCHS = 100


# Move model to device
model.to(device)

# Training loop
for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, dataloader, criterion, optimizer, device)
    val_loss = evaluate(model, dataloader, criterion, device)
    print(f"Epoch {epoch}: Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")


Epoch 1: Train Loss: 4.6748, Val Loss: 4.5465
Epoch 2: Train Loss: 4.5780, Val Loss: 4.4261
Epoch 3: Train Loss: 4.4682, Val Loss: 4.2985
Epoch 4: Train Loss: 4.3348, Val Loss: 4.2328
Epoch 5: Train Loss: 4.2071, Val Loss: 4.0916
Epoch 6: Train Loss: 4.1352, Val Loss: 3.9374
Epoch 7: Train Loss: 3.9319, Val Loss: 3.7868
Epoch 8: Train Loss: 3.8751, Val Loss: 3.6800
Epoch 9: Train Loss: 3.6972, Val Loss: 3.5617
Epoch 10: Train Loss: 3.5951, Val Loss: 3.4500
Epoch 11: Train Loss: 3.5166, Val Loss: 3.3046
Epoch 12: Train Loss: 3.3930, Val Loss: 3.1709
Epoch 13: Train Loss: 3.2231, Val Loss: 3.0589
Epoch 14: Train Loss: 3.1668, Val Loss: 2.9687
Epoch 15: Train Loss: 3.0355, Val Loss: 2.8253
Epoch 16: Train Loss: 2.9112, Val Loss: 2.7632
Epoch 17: Train Loss: 2.8092, Val Loss: 2.6206
Epoch 18: Train Loss: 2.7055, Val Loss: 2.5034
Epoch 19: Train Loss: 2.6156, Val Loss: 2.4438
Epoch 20: Train Loss: 2.5570, Val Loss: 2.4008
Epoch 21: Train Loss: 2.4717, Val Loss: 2.3072
Epoch 22: Train Loss: 

In [None]:
model

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(15376, 256)
    (lstm): LSTM(256, 256, batch_first=True)
  )
  (decoder): Decoder(
    (embedding): Embedding(15376, 256)
    (lstm): LSTM(256, 256, batch_first=True)
    (fc): Linear(in_features=256, out_features=15376, bias=True)
    (softmax): LogSoftmax(dim=1)
  )
)

In [None]:
def generate_summary(input_text, model, tokenizer, device, max_summary_length=50):
    # Tokenize the input text
    print("Input Text:", input_text)
    inputs = tokenizer([input_text], max_length=1024, return_tensors="pt", truncation=True).to(device)
    print("Input Tensor:", inputs)

    # Generate the summary
    with torch.no_grad():
        target_seq = torch.zeros((inputs.input_ids.size(0), max_summary_length)).long().to(device)
        output_ids = model(inputs.input_ids, target_seq=target_seq)

    # Decode the generated summary
    summary_ids = torch.argmax(output_ids, dim=-1)
    print("Shape of summary_ids:", summary_ids.shape)
    print("Type of summary_ids:", type(summary_ids))
    print("Summary IDs:", summary_ids)
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    return summary


In [None]:
from transformers import BartTokenizer

# Initialize the tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")

# Example usage
device = "cpu"
input_text = "enter text here to get a summary. this model is trained to make a summary of the text you enter here."
summary = generate_summary(input_text, model, tokenizer, device)
print("Generated Summary:", summary)


Input Text: enter text here to get a summary. this model is trained to make a summary of the text you enter here.
Input Tensor: {'input_ids': tensor([[    0, 11798,  2788,   259,     7,   120,    10,  4819,     4,    42,
          1421,    16,  5389,     7,   146,    10,  4819,     9,     5,  2788,
            47,  2914,   259,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}
Shape of summary_ids: torch.Size([1, 50])
Type of summary_ids: <class 'torch.Tensor'>
Summary IDs: tensor([[    0,   216,  1418,   792, 15375, 15375, 15375, 15374, 15374, 15374,
         15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374,
         15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374,
         15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374,
         15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374, 15374]])
Generated Summary:  know district board���a