## Load Data

In [60]:
import torch
import torch.nn as nn
import torch.optim as optim
from bs4 import BeautifulSoup
from spacy.lang.en import stop_words
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
import re
from torch.nn.utils.rnn import pad_sequence
import random

# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




In [61]:
import pandas as pd
import random

# Load and preprocess datasets
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['article'] = df['article'].apply(preprocess_text)
    df['highlights'] = df['highlights'].apply(preprocess_text)
    return df['article'].tolist(), df['highlights'].tolist()

train_texts, train_summaries = load_data("resources/train.csv")
val_texts, val_summaries = load_data("resources/validation.csv")

SAMPLE_SIZE = 10000  # Adjust based on available GPU memory

# Ensure we don't sample more than available data
train_sample_size = min(SAMPLE_SIZE, len(train_texts))
val_sample_size = min(SAMPLE_SIZE // 4, len(val_texts))  # Use smaller validation set

# Randomly sample data
train_sample_indices = random.sample(range(len(train_texts)), train_sample_size)
val_sample_indices = random.sample(range(len(val_texts)), val_sample_size)

# Subset the dataset
train_texts = [train_texts[i] for i in train_sample_indices]
train_summaries = [train_summaries[i] for i in train_sample_indices]

val_texts = [val_texts[i] for i in val_sample_indices]
val_summaries = [val_summaries[i] for i in val_sample_indices]


### Build vocabulary

In [62]:
from collections import Counter

# Flatten lists into a single Counter object
word_counts = Counter(word for text in train_texts + train_summaries for word in text.split())
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab["<SOS>"] = len(vocab)
vocab["<EOS>"] = len(vocab) + 1
rev_vocab = {idx: word for word, idx in vocab.items()}
VOCAB_SIZE = len(vocab)

def load_glove_embeddings(glove_path, vocab, embed_dim=100):
    embeddings = np.random.uniform(-0.1, 0.1, (len(vocab), embed_dim))  # Random init
    embeddings[0] = np.zeros(embed_dim)  # <PAD> is zero vector
    
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            word, vector = split_line[0], np.array(split_line[1:], dtype=np.float32)
            if word in vocab:
                embeddings[vocab[word]] = vector

    return torch.tensor(embeddings, dtype=torch.float32)

# Load GloVe embeddings
glove_path = "resources/glove.6B.100d.txt"
EMBED_SIZE = 100
embedding_matrix = load_glove_embeddings(glove_path, vocab, EMBED_SIZE)

In [63]:
# Preprocessing Function (using GPU where applicable)
def preprocess_text(text,num):
    newString = text.lower()
    newString = BeautifulSoup(newString, "lxml").text
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([embedding_matrix[t] if t in embedding_matrix else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    newString = re.sub('[m]{2,}', 'mm', newString)
    if(num==0):
        tokens = [w for w in newString.split() if not w in stop_words]
    else:
        tokens=newString.split()
    long_words=[]
    for i in tokens:
        if len(i)>1:                                                 #removing short word
            long_words.append(i)   
    return (" ".join(long_words)).strip()

## Dataset Class

In [64]:
class NewsDataset(Dataset):
    def __init__(self, texts, summaries, vocab, max_len=100):
        self.texts = [torch.tensor([vocab.get(word, 1) for word in text.split()], dtype=torch.long) for text in texts]
        self.summaries = [torch.tensor([vocab.get(word, 1) for word in summary.split()], dtype=torch.long) for summary in summaries]
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx][:self.max_len]
        summary = self.summaries[idx][:self.max_len]
        return text, summary

### Collate Function for Padding

In [65]:
MAX_LEN = 75  # Set a reasonable sequence length

def collate_fn(batch):
    text, summary = zip(*batch)  # Unpack batch
    
    text = [torch.as_tensor(t[:MAX_LEN], dtype=torch.long).clone().detach() for t in text]
    summary = [torch.as_tensor(s[:MAX_LEN], dtype=torch.long).clone().detach() for s in summary]

    text = pad_sequence(text, batch_first=True, padding_value=0)  # Pad sequences
    summary = pad_sequence(summary, batch_first=True, padding_value=0)

    return text, summary



## Define LSTM Model

In [67]:
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, embedding_matrix, dropout=0.3):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        embed_src = self.embedding(src)
        embed_tgt = self.embedding(tgt)
        
        _, (hidden, cell) = self.encoder(embed_src)
        
        outputs = []
        decoder_input = embed_tgt[:, 0].unsqueeze(1)  # Start token
        
        for t in range(tgt.shape[1] - 1):
            output, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            output = self.fc(output)
            outputs.append(output)

            # Teacher forcing: Use true target word some of the time
            if random.random() < teacher_forcing_ratio:
                decoder_input = self.embedding(tgt[:, t + 1]).unsqueeze(1)
            else:
                decoder_input = self.embedding(torch.argmax(output, dim=-1)).detach()

        return torch.cat(outputs, dim=1)


##  Function to generate summary

In [75]:
import torch
import heapq
import numpy as np

def beam_search(model, text, vocab, rev_vocab, beam_width=3, max_len=50):
    model.eval()
    with torch.no_grad():
        # Convert text to indices
        text_indices = [vocab.get(word, vocab["<UNK>"]) for word in text.split()]
        text_tensor = torch.tensor(text_indices, dtype=torch.long).unsqueeze(0).to(device)  # (1, seq_len)

        # Encode input text
        embedded_text = model.embedding(text_tensor)
        _, (hidden, cell) = model.encoder(embedded_text)  # Encoder output states

        # Initialize beam search: (score, sequence, hidden state, cell state)
        sequences = [(0, [vocab["<SOS>"]], hidden, cell)]

        for _ in range(max_len):
            all_candidates = []
            for score, seq, h, c in sequences:
                last_word = seq[-1]

                # Stop expanding sequences that already reached <EOS>
                if last_word == vocab["<EOS>"]:
                    all_candidates.append((score, seq, h, c))
                    continue

                # Convert last word to tensor
                last_word_tensor = torch.tensor([last_word], dtype=torch.long).to(device)
                embedded_input = model.embedding(last_word_tensor).unsqueeze(1)  # (1, 1, embed_size)

                # Decoder step
                output, (new_h, new_c) = model.decoder(embedded_input, (h, c))

                # Compute probabilities and get top-k words
                output_probs = torch.softmax(model.fc(output.squeeze(1)), dim=-1)  # (1, vocab_size)
                topk_probs, topk_indices = torch.topk(output_probs, beam_width, dim=-1)  # (1, beam_width)

                # Add new candidates to the list
                for i in range(beam_width):
                    word_idx = topk_indices[0][i].item()
                    word_prob = topk_probs[0][i].item()

                    candidate = (score + np.log(word_prob), seq + [word_idx], new_h, new_c)
                    all_candidates.append(candidate)

            # Keep top `beam_width` sequences
            sequences = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_width]

        # Select the best sequence
        best_seq = sequences[0][1]

        # Convert indices to words
        summary = " ".join([rev_vocab[idx] for idx in best_seq if idx not in {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"]}])
        return summary


In [69]:
HIDDEN_SIZE = 128
BATCH_SIZE = 32
EPOCHS = 10
LR = 0.001

### Create DataLoaders

In [70]:
from torch.utils.data import DataLoader, SubsetRandomSampler

subset_size = 5000
train_indices = list(range(min(subset_size, len(train_texts))))
val_indices = list(range(min(subset_size, len(val_texts))))

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_dataset = NewsDataset(train_texts, train_summaries, vocab)
val_dataset = NewsDataset(val_texts, val_summaries, vocab)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn, pin_memory=True, num_workers=4, prefetch_factor=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn, pin_memory=True, num_workers=4, prefetch_factor=2)

In [71]:
model = LSTMSeq2Seq(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=2)



### Training loop

In [72]:
import torch
import gc

gc.collect()  
torch.cuda.empty_cache()
model = torch.compile(model)  # JIT Compilation for speedup



In [73]:
from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=20, patience=3):
    scaler = torch.amp.GradScaler()
    best_val_loss = float("inf")
    counter = 0
    accumulation_steps = 4  # Accumulate gradients over 4 batches before updating

    for epoch in range(epochs):
        model.train()
        train_loss = 0

        for i, (text, summary) in enumerate(train_loader):  # Use enumerate() here!
            text, summary = text.to(device), summary.to(device)
            optimizer.zero_grad()

            with torch.amp.autocast("cuda", dtype=torch.float16):
                output = model(text, summary)
                loss = criterion(output.view(-1, VOCAB_SIZE).to(device), summary[:, 1:].reshape(-1).to(device))

            scaler.scale(loss).backward()

            if (i + 1) % accumulation_steps == 0:  # Only update every N steps
                scaler.step(optimizer)
                scaler.update()
            
            train_loss += loss.item()

        # Validation Step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for text, summary in val_loader:
                text, summary = text.to(device), summary.to(device)
                with torch.amp.autocast("cuda", dtype=torch.float16):
                    output = model(text, summary)
                    loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))
                val_loss += loss.item()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        scheduler.step(avg_val_loss)

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered. Loading best model.")
                model.load_state_dict(torch.load("best_model.pth"))
                break


In [74]:
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, EPOCHS)

Epoch 1, Train Loss: 10.2410, Val Loss: 8.2275
Epoch 2, Train Loss: 8.1067, Val Loss: 8.0849
Epoch 3, Train Loss: 8.0522, Val Loss: 8.0270
Epoch 4, Train Loss: 7.9972, Val Loss: 7.9976
Epoch 5, Train Loss: 7.9738, Val Loss: 7.9823
Epoch 6, Train Loss: 7.9549, Val Loss: 7.9756
Epoch 7, Train Loss: 7.9407, Val Loss: 7.9726
Epoch 8, Train Loss: 7.9311, Val Loss: 7.9713
Epoch 9, Train Loss: 7.9202, Val Loss: 7.9627
Epoch 10, Train Loss: 7.9135, Val Loss: 7.9643


In [76]:
summary = beam_search(model, train_texts[0], vocab, rev_vocab, beam_width=5, max_len=75)
print("Generated Summary:", summary)


Generated Summary: was to to the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the the
