## Load Data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
import re
from torch.nn.utils.rnn import pad_sequence
import random

# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing Function (using GPU where applicable)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text


In [2]:
import pandas as pd
import random

# Load and preprocess datasets
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['article'] = df['article'].apply(preprocess_text)
    df['highlights'] = df['highlights'].apply(preprocess_text)
    return df['article'].tolist(), df['highlights'].tolist()

train_texts, train_summaries = load_data("resources/train.csv")
val_texts, val_summaries = load_data("resources/validation.csv")

SAMPLE_SIZE = 15000  # Adjust based on available GPU memory

# Ensure we don't sample more than available data
train_sample_size = min(SAMPLE_SIZE, len(train_texts))
val_sample_size = min(SAMPLE_SIZE // 4, len(val_texts))  # Use smaller validation set

# Randomly sample data
train_sample_indices = random.sample(range(len(train_texts)), train_sample_size)
val_sample_indices = random.sample(range(len(val_texts)), val_sample_size)

# Subset the dataset
train_texts = [train_texts[i] for i in train_sample_indices]
train_summaries = [train_summaries[i] for i in train_sample_indices]

val_texts = [val_texts[i] for i in val_sample_indices]
val_summaries = [val_summaries[i] for i in val_sample_indices]


### Build vocabulary

In [None]:
from collections import Counter

# Flatten lists into a single Counter object
word_counts = Counter(word for text in train_texts + train_summaries for word in text.split())
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab["<SOS>"] = len(vocab)
vocab["<EOS>"] = len(vocab) + 1
rev_vocab = {idx: word for word, idx in vocab.items()}
VOCAB_SIZE = len(vocab)

def load_glove_embeddings(glove_path, vocab, embed_dim=100):
    embeddings = np.random.uniform(-0.1, 0.1, (len(vocab), embed_dim))  # Random init
    embeddings[0] = np.zeros(embed_dim)  # <PAD> is zero vector
    
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            word, vector = split_line[0], np.array(split_line[1:], dtype=np.float32)
            if word in vocab:
                embeddings[vocab[word]] = vector

    return torch.tensor(embeddings, dtype=torch.float32)

# Load GloVe embeddings
glove_path = "resources/glove.6B.100d.txt"
EMBED_SIZE = 100
embedding_matrix = load_glove_embeddings(glove_path, vocab, EMBED_SIZE)

## Dataset Class

In [17]:
class NewsDataset(Dataset):
    def __init__(self, texts, summaries, vocab, max_len=100):
        self.texts = [torch.tensor([vocab.get(word, 1) for word in text.split()], dtype=torch.long) for text in texts]
        self.summaries = [torch.tensor([vocab.get(word, 1) for word in summary.split()], dtype=torch.long) for summary in summaries]
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx][:self.max_len]
        summary = self.summaries[idx][:self.max_len]
        return text, summary

### Collate Function for Padding

In [35]:
MAX_LEN = 128  # Set a reasonable sequence length

def collate_fn(batch):
    text, summary = zip(*batch)  # Unpack batch
    
    text = [torch.as_tensor(t[:MAX_LEN], dtype=torch.long).clone().detach() for t in text]
    summary = [torch.as_tensor(s[:MAX_LEN], dtype=torch.long).clone().detach() for s in summary]

    text = pad_sequence(text, batch_first=True, padding_value=0)  # Pad sequences
    summary = pad_sequence(summary, batch_first=True, padding_value=0)

    return text, summary



## Define LSTM Model

In [48]:
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, embedding_matrix, dropout=0.5):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, tgt, teacher_forcing_ratio=0.8):
        embed_src = self.embedding(src)
        embed_tgt = self.embedding(tgt)
        
        _, (hidden, cell) = self.encoder(embed_src)
        
        outputs = []
        decoder_input = embed_tgt[:, 0].unsqueeze(1)  # Start token
        
        for t in range(tgt.shape[1] - 1):
            output, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            output = self.fc(output)
            outputs.append(output)

            # Teacher forcing: Use true target word some of the time
            if random.random() < teacher_forcing_ratio:
                decoder_input = self.embedding(tgt[:, t + 1]).unsqueeze(1)
            else:
                decoder_input = self.embedding(torch.argmax(output, dim=-1)).detach()

        return torch.cat(outputs, dim=1)


##  Function to generate summary

In [49]:
import torch
import heapq
import numpy as np

def beam_search(model, text, vocab, rev_vocab, beam_width=3, max_len=50):
    model.eval()
    device = next(model.parameters()).device
    
    text_tensor = torch.tensor([vocab.get(word, vocab["<UNK>"]) for word in text.split()], dtype=torch.long).unsqueeze(0).to(device)
    
    with torch.no_grad():
        _, (hidden, cell) = model.encoder(model.embedding(text_tensor))

    sequences = [([], 0, hidden, cell)]
    
    for _ in range(max_len):
        new_sequences = []
        for seq, score, hidden, cell in sequences:
            last_word_idx = seq[-1] if seq else vocab["<SOS>"]
            last_word_tensor = torch.tensor([[last_word_idx]], dtype=torch.long).to(device)

            embed_tgt = model.embedding(last_word_tensor)
            output, (new_hidden, new_cell) = model.decoder(embed_tgt, (hidden, cell))
            output_probs = torch.softmax(model.fc(output[:, -1, :]), dim=-1).cpu().numpy()

            top_indices = np.argsort(output_probs[0])[-beam_width:]
            for idx in top_indices:
                new_sequences.append((seq + [idx], score + np.log(output_probs[0][idx]), new_hidden, new_cell))

        sequences = heapq.nlargest(beam_width, new_sequences, key=lambda x: x[1])

    best_sequence = sequences[0][0]
    summary = " ".join([rev_vocab.get(idx, "<UNK>") for idx in best_sequence if idx not in {vocab["<PAD>"], vocab["<SOS>"], vocab["<EOS>"]}])
    return summary

In [50]:
HIDDEN_SIZE = 64
BATCH_SIZE = 16
EPOCHS = 20
LR = 0.001

### Create DataLoaders

In [51]:
from torch.utils.data import DataLoader, SubsetRandomSampler

subset_size = 10000
train_indices = list(range(min(subset_size, len(train_texts))))
val_indices = list(range(min(subset_size, len(val_texts))))

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_dataset = NewsDataset(train_texts, train_summaries, vocab)
val_dataset = NewsDataset(val_texts, val_summaries, vocab)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn, pin_memory=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn, pin_memory=True)

In [52]:
model = LSTMSeq2Seq(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=2)



### Training loop

In [53]:
import torch
import gc

gc.collect()  
torch.cuda.empty_cache()
model = torch.compile(model)  # JIT Compilation for speedup



In [54]:
from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=20, patience=3):
    scaler = GradScaler()
    best_val_loss = float("inf")
    counter = 0

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for text, summary in train_loader:
            text, summary = text.to(device), summary.to(device)
            optimizer.zero_grad()

            with autocast(device_type="cuda", dtype=torch.float16):
                output = model(text, summary)
                loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            train_loss += loss.item()

        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for text, summary in val_loader:
                text, summary = text.to(device), summary.to(device)
                with autocast(device_type="cuda", dtype=torch.float16):
                    output = model(text, summary)
                    loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))
                val_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        scheduler.step(avg_val_loss)

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered. Loading best model.")
                model.load_state_dict(torch.load("best_model.pth"))
                break


In [55]:
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, EPOCHS)

Epoch 1, Train Loss: 8.2608, Val Loss: 7.8390
Epoch 2, Train Loss: 7.7774, Val Loss: 7.6463
Epoch 3, Train Loss: 7.5800, Val Loss: 7.4855
Epoch 4, Train Loss: 7.4157, Val Loss: 7.3792
Epoch 5, Train Loss: 7.2843, Val Loss: 7.3024
Epoch 6, Train Loss: 7.1773, Val Loss: 7.2414
Epoch 7, Train Loss: 7.0863, Val Loss: 7.2019
Epoch 8, Train Loss: 7.0082, Val Loss: 7.1691
Epoch 9, Train Loss: 6.9409, Val Loss: 7.1496
Epoch 10, Train Loss: 6.8811, Val Loss: 7.1258
Epoch 11, Train Loss: 6.8223, Val Loss: 7.1040
Epoch 12, Train Loss: 6.7625, Val Loss: 7.0789
Epoch 13, Train Loss: 6.7048, Val Loss: 7.0543
Epoch 14, Train Loss: 6.6510, Val Loss: 7.0408
Epoch 15, Train Loss: 6.6010, Val Loss: 7.0281
Epoch 16, Train Loss: 6.5557, Val Loss: 7.0126
Epoch 17, Train Loss: 6.5131, Val Loss: 7.0041
Epoch 18, Train Loss: 6.4723, Val Loss: 7.0011
Epoch 19, Train Loss: 6.4351, Val Loss: 7.0004
Epoch 20, Train Loss: 6.3979, Val Loss: 6.9902


In [47]:
summary = beam_search(model, train_texts[0], vocab, rev_vocab)
print("Generated Summary:", summary)


'a cameron of a and and is was is the and on is was is year and in the year in the the been the years were the people were services and the uk have it is us is in after the and week to the new government in the'