## Load Data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
import re
from torch.nn.utils.rnn import pad_sequence
import random

# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing Function (using GPU where applicable)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text


In [2]:
import pandas as pd
import random

# Load and preprocess datasets
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['article'] = df['article'].apply(preprocess_text)
    df['highlights'] = df['highlights'].apply(preprocess_text)
    return df['article'].tolist(), df['highlights'].tolist()

train_texts, train_summaries = load_data("resources/train.csv")
val_texts, val_summaries = load_data("resources/validation.csv")

SAMPLE_SIZE = 15000  # Adjust based on available GPU memory

# Ensure we don't sample more than available data
train_sample_size = min(SAMPLE_SIZE, len(train_texts))
val_sample_size = min(SAMPLE_SIZE // 4, len(val_texts))  # Use smaller validation set

# Randomly sample data
train_sample_indices = random.sample(range(len(train_texts)), train_sample_size)
val_sample_indices = random.sample(range(len(val_texts)), val_sample_size)

# Subset the dataset
train_texts = [train_texts[i] for i in train_sample_indices]
train_summaries = [train_summaries[i] for i in train_sample_indices]

val_texts = [val_texts[i] for i in val_sample_indices]
val_summaries = [val_summaries[i] for i in val_sample_indices]


### Build vocabulary

In [16]:
from collections import Counter

# Flatten lists into a single Counter object
word_counts = Counter(word for text in train_texts + train_summaries for word in text.split())

# Build vocabulary
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
VOCAB_SIZE = len(vocab)

def load_glove_embeddings(glove_path, vocab, embed_dim=100):
    embeddings = np.random.uniform(-0.1, 0.1, (len(vocab), embed_dim))  # Random init
    embeddings[0] = np.zeros(embed_dim)  # <PAD> is zero vector
    
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            split_line = line.split()
            word, vector = split_line[0], np.array(split_line[1:], dtype=np.float32)
            if word in vocab:
                embeddings[vocab[word]] = vector

    return torch.tensor(embeddings, dtype=torch.float32)

# Load GloVe embeddings
glove_path = "resources/glove.6B.100d.txt"
EMBED_SIZE = 100
embedding_matrix = load_glove_embeddings(glove_path, vocab, EMBED_SIZE)

## Dataset Class

In [17]:
class NewsDataset(Dataset):
    def __init__(self, texts, summaries, vocab, max_len=100):
        self.texts = [torch.tensor([vocab.get(word, 1) for word in text.split()], dtype=torch.long) for text in texts]
        self.summaries = [torch.tensor([vocab.get(word, 1) for word in summary.split()], dtype=torch.long) for summary in summaries]
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx][:self.max_len]
        summary = self.summaries[idx][:self.max_len]
        return text, summary

### Collate Function for Padding

In [35]:
MAX_LEN = 128  # Set a reasonable sequence length

def collate_fn(batch):
    text, summary = zip(*batch)  # Unpack batch
    
    text = [torch.as_tensor(t[:MAX_LEN], dtype=torch.long).clone().detach() for t in text]
    summary = [torch.as_tensor(s[:MAX_LEN], dtype=torch.long).clone().detach() for s in summary]

    text = pad_sequence(text, batch_first=True, padding_value=0)  # Pad sequences
    summary = pad_sequence(summary, batch_first=True, padding_value=0)

    return text, summary



## Define LSTM Model

In [36]:
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, embedding_matrix, dropout=0.3):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)  # Allows fine-tuning
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, src, tgt):
        embed_src = self.embedding(src)
        embed_tgt = self.embedding(tgt)
        _, (hidden, cell) = self.encoder(embed_src)
        output, _ = self.decoder(embed_tgt, (hidden, cell))
        output = self.fc(output)
        return output


##  Function to generate summary

In [37]:
import heapq

def beam_search(model, text, vocab, beam_width=3, max_len=50):
    model.eval()
    text_tensor = torch.tensor([vocab.get(word, 1) for word in text.split()], dtype=torch.long).unsqueeze(0).to(device)
    
    with torch.no_grad():
        output = model(text_tensor, text_tensor)
    
    sequences = [([], 0)]  # (Generated sequence, score)
    
    for _ in range(max_len):
        new_sequences = []
        for seq, score in sequences:
            last_word_idx = seq[-1] if seq else vocab["<PAD>"]
            output_probs = torch.softmax(output[0, len(seq)], dim=-1).cpu().numpy()

            top_indices = np.argsort(output_probs)[-beam_width:]
            for idx in top_indices:
                new_sequences.append((seq + [idx], score + np.log(output_probs[idx])))

        sequences = heapq.nlargest(beam_width, new_sequences, key=lambda x: x[1])
    
    best_sequence = sequences[0][0]
    summary = " ".join([list(vocab.keys())[list(vocab.values()).index(idx)] for idx in best_sequence if idx in vocab.values()])
    return summary


In [39]:
EMBED_SIZE = 100
HIDDEN_SIZE = 64
BATCH_SIZE = 16
EPOCHS = 20
LR = 0.001

### Create DataLoaders

In [40]:

from torch.utils.data import DataLoader, SubsetRandomSampler

# Define subset size (e.g., 50,000 samples)
subset_size = 10000  
train_indices = list(range(min(subset_size, len(train_texts))))
val_indices = list(range(min(subset_size, len(val_texts))))

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_dataset = NewsDataset(train_texts, train_summaries, vocab)
val_dataset = NewsDataset(val_texts, val_summaries, vocab)

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn, pin_memory=True
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn, pin_memory=True
)




In [41]:
model = LSTMSeq2Seq(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)



### Training loop

In [42]:
import torch
import gc

gc.collect()  
torch.cuda.empty_cache()
model = torch.compile(model)  # JIT Compilation for speedup



In [43]:
import torch
from torch.amp import autocast, GradScaler  # Ensure correct import

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    ACCUMULATION_STEPS = 2
    scaler = GradScaler()  # ✅ No need for `device="cuda"`

    best_val_loss = float("inf")
    patience = 3  # Number of epochs to wait before stopping
    counter = 0  
    
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for text, summary in train_loader:
            text, summary = text.to(device), summary.to(device)
            optimizer.zero_grad()
    
            with torch.autocast("cuda", dtype=torch.float16):
                output = model(text, summary[:, :-1])
                loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))
    
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
    
            train_loss += loss.item()
    
        # Validation phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for text, summary in val_loader:
                text, summary = text.to(device), summary.to(device)
                with torch.autocast("cuda", dtype=torch.float16):
                    output = model(text, summary[:, :-1])
                    loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))
                val_loss += loss.item()
        scheduler.step(avg_val_loss)
        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
    
        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")
    
        # Early stopping check
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0  # Reset patience counter
            torch.save(model.state_dict(), "best_model.pth")  # Save best model
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered. Loading best model.")
                model.load_state_dict(torch.load("best_model.pth"))
                break  # Stop training


In [None]:
train_model(model, train_loader, val_loader, criterion, optimizer, EPOCHS)

In [33]:
beam_search(model, train_texts[0], vocab)

'the cantlie of the in and of 26 was report was and was 26 is new and was the new in the is been in people in a men were services in the us the he he incident was in after the and morning he the local government he the'