## Load Data

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import pandas as pd
import numpy as np
import re
from torch.nn.utils.rnn import pad_sequence
import random

# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Preprocessing Function (using GPU where applicable)
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z0-9\s]', '', text)
    return text


In [2]:
import pandas as pd
import random

# Load and preprocess datasets
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['article'] = df['article'].apply(preprocess_text)
    df['highlights'] = df['highlights'].apply(preprocess_text)
    return df['article'].tolist(), df['highlights'].tolist()

train_texts, train_summaries = load_data("resources/train.csv")
val_texts, val_summaries = load_data("resources/validation.csv")

SAMPLE_SIZE = 15000  # Adjust based on available GPU memory

# Ensure we don't sample more than available data
train_sample_size = min(SAMPLE_SIZE, len(train_texts))
val_sample_size = min(SAMPLE_SIZE // 4, len(val_texts))  # Use smaller validation set

# Randomly sample data
train_sample_indices = random.sample(range(len(train_texts)), train_sample_size)
val_sample_indices = random.sample(range(len(val_texts)), val_sample_size)

# Subset the dataset
train_texts = [train_texts[i] for i in train_sample_indices]
train_summaries = [train_summaries[i] for i in train_sample_indices]

val_texts = [val_texts[i] for i in val_sample_indices]
val_summaries = [val_summaries[i] for i in val_sample_indices]


### Build vocabulary

In [3]:
from collections import Counter

# Flatten lists into a single Counter object
word_counts = Counter(word for text in train_texts + train_summaries for word in text.split())

# Build vocabulary
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
VOCAB_SIZE = len(vocab)


## Dataset Class

In [4]:
class NewsDataset(Dataset):
    def __init__(self, texts, summaries, vocab, max_len=100):
        self.texts = [torch.tensor([vocab.get(word, 1) for word in text.split()], dtype=torch.long) for text in texts]
        self.summaries = [torch.tensor([vocab.get(word, 1) for word in summary.split()], dtype=torch.long) for summary in summaries]
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx][:self.max_len]
        summary = self.summaries[idx][:self.max_len]
        return text, summary

### Collate Function for Padding

In [5]:
MAX_LEN = 128  # Set a reasonable sequence length

def collate_fn(batch):
    text, summary = zip(*batch)  # Unpack batch
    
    text = [torch.as_tensor(t[:MAX_LEN], dtype=torch.long).clone().detach() for t in text]
    summary = [torch.as_tensor(s[:MAX_LEN], dtype=torch.long).clone().detach() for s in summary]

    text = pad_sequence(text, batch_first=True, padding_value=0)  # Pad sequences
    summary = pad_sequence(summary, batch_first=True, padding_value=0)

    return text, summary



## Define LSTM Model

In [6]:
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, src, tgt):
        embed_src = self.embedding(src)
        embed_tgt = self.embedding(tgt)
        _, (hidden, cell) = self.encoder(embed_src)
        output, _ = self.decoder(embed_tgt, (hidden, cell))
        output = self.fc(output)
        return output

##  Function to generate summary

In [7]:
def generate_summary(model, text, vocab, max_len=50):
    model.eval()
    text_tensor = torch.tensor([vocab.get(word, 1) for word in text.split()], dtype=torch.long).unsqueeze(0).to(device)
    with torch.no_grad():
        output = model(text_tensor, text_tensor)
    summary_indices = torch.argmax(output, dim=-1).squeeze(0).tolist()
    summary = " ".join([list(vocab.keys())[list(vocab.values()).index(idx)] for idx in summary_indices if idx in vocab.values()])
    return summary

In [8]:
EMBED_SIZE = 64
HIDDEN_SIZE = 128
BATCH_SIZE = 16
EPOCHS = 20
LR = 0.001

### Create DataLoaders

In [9]:

from torch.utils.data import DataLoader, SubsetRandomSampler

# Define subset size (e.g., 50,000 samples)
subset_size = 10000  
train_indices = list(range(min(subset_size, len(train_texts))))
val_indices = list(range(min(subset_size, len(val_texts))))

train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_dataset = NewsDataset(train_texts, train_summaries, vocab)
val_dataset = NewsDataset(val_texts, val_summaries, vocab)

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn, pin_memory=True
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn, pin_memory=True
)




In [10]:
model = LSTMSeq2Seq(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters(), lr=LR)

### Training loop

In [11]:
import torch
import gc

gc.collect()  
torch.cuda.empty_cache()
model = torch.compile(model)  # JIT Compilation for speedup



In [12]:
import torch
from torch.amp import autocast, GradScaler  # Ensure correct import

def train_model(model, train_loader, val_loader, criterion, optimizer, epochs):
    ACCUMULATION_STEPS = 2
    scaler = GradScaler()  # ✅ No need for `device="cuda"`

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        optimizer.zero_grad()
    
        for i, (text, summary) in enumerate(train_loader):
            text = text.to(device, non_blocking=True)  
            summary = summary.to(device, non_blocking=True)

            with torch.autocast("cuda", dtype=torch.float16):  # ✅ Fixed!
                output = model(text, summary[:, :-1])
                loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))

            scaler.scale(loss).backward()  # ✅ Ensure AMP scaling
        
            if (i + 1) % ACCUMULATION_STEPS == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()
    
            train_loss += loss.item()

        # Validation Phase
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for text, summary in val_loader:
                text = text.to(device, non_blocking=True)
                summary = summary.to(device, non_blocking=True)
        
                with autocast(device_type="cuda", dtype=torch.float16):  # ✅ Fixed!
                    output = model(text, summary[:, :-1])
                    loss = criterion(output.view(-1, VOCAB_SIZE), summary[:, 1:].reshape(-1))
                
                val_loss += loss.item()
        
        print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader):.4f}, Val Loss: {val_loss/len(val_loader):.4f}")


In [13]:
train_model(model, train_loader, val_loader, criterion, optimizer, EPOCHS)

Epoch 1, Train Loss: 8.3354, Val Loss: 7.8288
Epoch 2, Train Loss: 7.6266, Val Loss: 7.6198
Epoch 3, Train Loss: 7.3651, Val Loss: 7.4777
Epoch 4, Train Loss: 7.1631, Val Loss: 7.3809
Epoch 5, Train Loss: 6.9952, Val Loss: 7.3168
Epoch 6, Train Loss: 6.8389, Val Loss: 7.2571
Epoch 7, Train Loss: 6.6927, Val Loss: 7.2142
Epoch 8, Train Loss: 6.5580, Val Loss: 7.1885
Epoch 9, Train Loss: 6.4293, Val Loss: 7.1717
Epoch 10, Train Loss: 6.3066, Val Loss: 7.1592
Epoch 11, Train Loss: 6.1868, Val Loss: 7.1557
Epoch 12, Train Loss: 6.0714, Val Loss: 7.1575
Epoch 13, Train Loss: 5.9590, Val Loss: 7.1617
Epoch 14, Train Loss: 5.8501, Val Loss: 7.1688
Epoch 15, Train Loss: 5.7444, Val Loss: 7.1822
Epoch 16, Train Loss: 5.6417, Val Loss: 7.1957
Epoch 17, Train Loss: 5.5431, Val Loss: 7.2187
Epoch 18, Train Loss: 5.4481, Val Loss: 7.2410
Epoch 19, Train Loss: 5.3591, Val Loss: 7.2639
Epoch 20, Train Loss: 5.2728, Val Loss: 7.2936


In [14]:
generate_summary(model, train_texts[0], vocab)

'zomato terry of a people are is the in in of in in and as year and was the new in the and been by people and the years and services and the uk and it is us is on after being and week in the new waiter in the the were the uk the is the the years than were from splits in in is the new ignition and in the us and have a be the a us in a new in the to the of the world the the new station is in for the the of in samaritans was on after the and week he the hospital waiter in the the in the uk the is was the years and in the one years the in were mers day of of to he the one years people were investigating he be been in a he have to be at the he the money and to the in and to and laden mayor in in of the health forces he was to her of is killed the incident in the are was a good happy bang of the from and was couple was he of a people and from was he woman overacted of to he the one years people were investigating he be been in a he have to be at the he the money and to the in and of the mini