## Load Data

In [1]:
import torch
import torch.nn as nn
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import numpy as np
from collections import Counter
import re
from torch.nn.utils.rnn import pad_sequence
import random
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"




# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




## Load Glove Embedding

In [2]:
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

## Preprocessing Functions

In [3]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [4]:
def preprocess_text(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"', '', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub(r"'s\b", "", newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    newString = re.sub('[m]{2,}', 'mm', newString)
    
    tokens = [w for w in newString.split() if w not in STOP_WORDS]
    long_words = [i for i in tokens if len(i) > 1]  # Remove short words
    
    return " ".join(long_words).strip()

## Load & Preprocess Data

In [5]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['article'] = df['article'].apply(preprocess_text)
    df['highlights'] = df['highlights'].apply(preprocess_text)
    return df['article'].tolist(), df['highlights'].tolist()

train_texts, train_summaries = load_data("resources/train.csv")
val_texts, val_summaries = load_data("resources/validation.csv")

SAMPLE_SIZE = 10000  # Adjust based on available GPU memory

# Ensure we don't sample more than available data
train_sample_size = min(SAMPLE_SIZE, len(train_texts))
val_sample_size = min(SAMPLE_SIZE // 4, len(val_texts))  # Use smaller validation set

# Randomly sample data
train_sample_indices = random.sample(range(len(train_texts)), train_sample_size)
val_sample_indices = random.sample(range(len(val_texts)), val_sample_size)

# Subset the dataset
train_texts = [train_texts[i] for i in train_sample_indices]
train_summaries = [train_summaries[i] for i in train_sample_indices]

val_texts = [val_texts[i] for i in val_sample_indices]
val_summaries = [val_summaries[i] for i in val_sample_indices]

## Build Vocab

In [6]:
word_counts = Counter(word for text in train_texts + train_summaries for word in text.split())
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab["<SOS>"] = len(vocab)
vocab["<EOS>"] = len(vocab) + 1
rev_vocab = {idx: word for word, idx in vocab.items()}
VOCAB_SIZE = len(vocab)

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(vec1, vec2):
    # Move tensors to CPU and convert to NumPy
    vec1 = vec1.cpu().numpy().reshape(1, -1)
    vec2 = vec2.cpu().numpy().reshape(1, -1)
    
    return cosine_similarity(vec1, vec2)[0][0]  # Extract scalar similarity value



In [8]:
import torch.nn.functional as F

def find_closest_word(word, embeddings, vocab):
    word_idx = vocab[word]  # Get index
    word_embedding = embeddings[word_idx].unsqueeze(0)  # Shape: (1, EMBED_SIZE)

    # Compute cosine similarity in one step
    similarities = F.cosine_similarity(word_embedding, embeddings, dim=1)

    # Exclude the word itself by setting its similarity to -inf
    similarities[word_idx] = float('-inf')

    # Find the closest word index
    closest_idx = torch.argmax(similarities).item()

    # Ensure index is valid
    if closest_idx < 0 or closest_idx >= len(vocab):
        return "<UNK>"  # Return unknown token if something goes wrong

    return list(vocab.keys())[closest_idx]  # Convert index to word


In [9]:
def precompute_closest_words(vocab, embeddings):
    closest_words = {}
    for i, word in enumerate(vocab):
        if i % 100 == 0:  # Print only every 100th word
            print(f"Checking word: {word} (Type: {type(word)})")
        closest_words[word] = find_closest_word(word, embeddings, vocab)
    return closest_words


## load GloVe Embeddings

In [10]:
glove_path = "resources/glove.6B.100d.txt"
EMBED_SIZE = 100
embeddings = load_glove_embeddings(glove_path)


In [11]:
# Assume `vocab` and `EMBED_SIZE` are already defined
weights_matrix = np.zeros((max(vocab.values()) + 1, EMBED_SIZE))

for word, i in vocab.items():
    weights_matrix[i] = embeddings.get(word, np.random.randn(EMBED_SIZE))
    

In [12]:
embedding_matrix = torch.tensor(weights_matrix, dtype=torch.float32).to(device)

## Find Closest Word Function

In [13]:
closest_word = precompute_closest_words(vocab, embedding_matrix)

Checking word: ian (Type: <class 'str'>)
Checking word: brave (Type: <class 'str'>)
Checking word: palace (Type: <class 'str'>)
Checking word: thursday (Type: <class 'str'>)
Checking word: recession (Type: <class 'str'>)
Checking word: unemployed (Type: <class 'str'>)
Checking word: connection (Type: <class 'str'>)
Checking word: waiting (Type: <class 'str'>)
Checking word: eighty (Type: <class 'str'>)
Checking word: bull (Type: <class 'str'>)
Checking word: information (Type: <class 'str'>)
Checking word: aid (Type: <class 'str'>)
Checking word: pay (Type: <class 'str'>)
Checking word: recall (Type: <class 'str'>)
Checking word: bothered (Type: <class 'str'>)
Checking word: mayer (Type: <class 'str'>)
Checking word: favourite (Type: <class 'str'>)
Checking word: score (Type: <class 'str'>)
Checking word: roshan (Type: <class 'str'>)
Checking word: friedman (Type: <class 'str'>)
Checking word: occurring (Type: <class 'str'>)
Checking word: hyperlipidemia (Type: <class 'str'>)
Checking 

## Dataset Class

In [58]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, texts, summaries, vocab, embedding_matrix, device, max_len=100):
        self.texts = texts
        self.summaries = summaries
        self.vocab = vocab
        self.embedding_matrix = embedding_matrix
        self.device = device
        self.max_len = max_len  
        self.closest_words = {}  # ✅ Fix: Initialize this attribute

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        text_tensor = torch.tensor(tokenizer.encode(text, max_length=self.max_len, truncation=True), dtype=torch.long)
        summary_tensor = torch.tensor(tokenizer.encode(summary, max_length=self.max_len, truncation=True), dtype=torch.long)

        return text_tensor, summary_tensor

    def __len__(self):
        return len(self.texts)




### Collate Function for Padding

In [59]:
MAX_LEN = 50  # Set a reasonable sequence length

def collate_fn(batch):
    text, summary = zip(*batch)
    text = [torch.as_tensor(t, dtype=torch.long).clone().detach() for t in text]
    summary = [torch.as_tensor(s, dtype=torch.long).clone().detach() for s in summary]

    text = pad_sequence(text, batch_first=True, padding_value=0)
    summary = pad_sequence(summary, batch_first=True, padding_value=0)

    return text.to(device), summary.to(device)



## Define LSTM Model

In [60]:
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, embedding_matrix, dropout=0.3):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        embed_src = self.embedding(src)
        embed_tgt = self.embedding(tgt)

        _, (hidden, cell) = self.encoder(embed_src)
        
        outputs = []
        decoder_input = embed_tgt[:, 0].unsqueeze(1)  # Start token
        
        for t in range(tgt.shape[1] - 1):  # Ensure same length as target
            output, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            output = self.fc(output)
            outputs.append(output)

            # Teacher forcing: Use true target word some of the time
            if random.random() < teacher_forcing_ratio:
                decoder_input = self.embedding(tgt[:, t + 1]).unsqueeze(1)
            else:
                decoder_input = self.embedding(torch.argmax(output, dim=-1)).detach()

        return torch.cat(outputs, dim=1)


##  Function to generate summary

In [61]:
import torch
import heapq
import numpy as np

def beam_search(model, text, vocab, rev_vocab, beam_width=3, max_len=50):
    model.eval()
    with torch.no_grad():
        # Convert text to indices
        text_indices = [vocab.get(word, vocab["<UNK>"]) for word in text.split()]
        text_tensor = torch.tensor(text_indices, dtype=torch.long).unsqueeze(0).to(device)  # (1, seq_len)

        # Encode input text
        embedded_text = model.embedding(text_tensor)
        _, (hidden, cell) = model.encoder(embedded_text)  # Encoder output states

        # Initialize beam search: (score, sequence, hidden state, cell state)
        sequences = [(0, [vocab["<SOS>"]], hidden, cell)]

        for _ in range(max_len):
            all_candidates = []
            for score, seq, h, c in sequences:
                last_word = seq[-1]

                # Stop expanding sequences that already reached <EOS>
                if last_word == vocab["<EOS>"]:
                    all_candidates.append((score, seq, h, c))
                    continue

                # Convert last word to tensor
                last_word_tensor = torch.tensor([last_word], dtype=torch.long).to(device)
                embedded_input = model.embedding(last_word_tensor).unsqueeze(1)  # (1, 1, embed_size)

                # Decoder step
                output, (new_h, new_c) = model.decoder(embedded_input, (h, c))

                # Compute probabilities and get top-k words
                output_probs = torch.softmax(model.fc(output.squeeze(1)), dim=-1)  # (1, vocab_size)
                topk_probs, topk_indices = torch.topk(output_probs, beam_width, dim=-1)  # (1, beam_width)

                # Add new candidates to the list
                for i in range(beam_width):
                    word_idx = topk_indices[0][i].item()
                    word_prob = topk_probs[0][i].item()

                    candidate = ((score + np.log(word_prob)) / len(seq), seq + [word_idx], new_h, new_c)
                    all_candidates.append(candidate)

            # Keep top `beam_width` sequences
            sequences = sorted(all_candidates, key=lambda x: x[0], reverse=True)[:beam_width]

        # Select the best sequence
        best_seq = sequences[0][1]

        # Convert indices to words
        summary = " ".join([find_closest_word(rev_vocab.get(idx, "<UNK>"), vocab, embedding_matrix) if idx == vocab["<UNK>"] else rev_vocab[idx] for idx in best_seq if idx not in {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"]}])


        return summary


In [62]:
HIDDEN_SIZE = 256
BATCH_SIZE = 32

### Create DataLoaders

In [63]:
from torch.utils.data import DataLoader, SubsetRandomSampler

subset_size = 5000
train_indices = list(range(min(subset_size, len(train_texts))))
val_indices = list(range(min(subset_size, len(val_texts))))

In [64]:
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

In [65]:
train_dataset = NewsDataset(train_texts, train_summaries, vocab, embedding_matrix, max_len=100, device=device)
val_dataset = NewsDataset(val_texts, val_summaries, vocab, embedding_matrix, max_len=100, device=device)

In [66]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn, num_workers=0, pin_memory=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn, num_workers=0, pin_memory=False)

In [67]:
model = LSTMSeq2Seq(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)



### Training loop

In [68]:
import torch
import gc

gc.collect()  
torch.cuda.empty_cache()
model = torch.compile(model)  # JIT Compilation for speedup



In [69]:
from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=15, patience=5):
    scaler = torch.amp.GradScaler()
    best_val_loss = float("inf")
    counter = 0
    accumulation_steps = 4  # Accumulate gradients over multiple batches

    for epoch in range(epochs):
        teacher_forcing_ratio = max(0.5 * (0.99 ** epoch), 0.1)
        model.train()
        train_loss = 0

        for step, (text, summary) in enumerate(train_loader):
            text, summary = text.to(device), summary.to(device)
            optimizer.zero_grad()  # Reset gradients for each batch

            with torch.amp.autocast("cuda"):
                output = model(text, summary, teacher_forcing_ratio)
                loss = criterion(output.reshape(-1, VOCAB_SIZE), summary[:, :output.shape[1]].reshape(-1))
            
            scaler.scale(loss).backward()  # Accumulate gradients
            scaler.step(optimizer)
            scaler.update()
            
            train_loss += loss.item()

        # Validation Step (DO NOT BACKPROPAGATE HERE)
        model.eval()
        val_loss = 0
        with torch.no_grad():  # No gradients needed in validation
            for text, summary in val_loader:
                text, summary = text.to(device), summary.to(device)

                with torch.amp.autocast("cuda", dtype=torch.float16):
                    output = model(text, summary, teacher_forcing_ratio)
                    loss = criterion(output.reshape(-1, VOCAB_SIZE), summary[:, :output.shape[1]].reshape(-1))
                
                val_loss += loss.item()  # Just store the loss, no .backward()

        avg_train_loss = train_loss / len(train_loader)
        avg_val_loss = val_loss / len(val_loader)
        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

        scheduler.step(avg_val_loss)

        # Early Stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            counter = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered. Loading best model.")
                model.load_state_dict(torch.load("best_model.pth"))
                break


In [70]:
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler)

Epoch 1, Train Loss: 8.2455, Val Loss: 7.4650
Epoch 2, Train Loss: 7.0569, Val Loss: 6.7234
Epoch 3, Train Loss: 6.4081, Val Loss: 6.2186
Epoch 4, Train Loss: 5.9428, Val Loss: 5.8698
Epoch 5, Train Loss: 5.6113, Val Loss: 5.6365
Epoch 6, Train Loss: 5.3823, Val Loss: 5.4843
Epoch 7, Train Loss: 5.1864, Val Loss: 5.3060
Epoch 8, Train Loss: 5.0724, Val Loss: 5.3626
Epoch 9, Train Loss: 5.0221, Val Loss: 5.1182
Epoch 10, Train Loss: 4.9495, Val Loss: 5.1127
Epoch 11, Train Loss: 4.9096, Val Loss: 5.1515
Epoch 12, Train Loss: 4.8748, Val Loss: 5.2128
Epoch 13, Train Loss: 4.8752, Val Loss: 5.1168
Epoch 14, Train Loss: 4.7171, Val Loss: 5.0484
Epoch 15, Train Loss: 4.6849, Val Loss: 5.0647


In [71]:
summary = beam_search(model, train_texts[0], vocab, rev_vocab, beam_width=5, max_len=50)
print("Generated Summary:", summary)


Generated Summary: sgt foe foe transnistria transnistria epidurals epidurals epidurals ringing ringing fairview fairview glaciers glaciers vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh vh pegg pegg
