## Load Data

In [1]:
import torch
import torch.nn as nn
from spacy.lang.en.stop_words import STOP_WORDS
import pandas as pd
import numpy as np
from collections import Counter
import re
from torch.nn.utils.rnn import pad_sequence
import random
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"




# Check CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")




## Load Glove Embedding

In [2]:
def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

## Preprocessing Functions

In [3]:
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
                           "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
                           "you're": "you are", "you've": "you have"}

In [4]:
def preprocess_text(text):
    newString = text.lower()
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"', '', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])
    newString = re.sub(r"'s\b", "", newString)
    newString = re.sub("[^a-zA-Z]", " ", newString)
    newString = re.sub('[m]{2,}', 'mm', newString)
    
    tokens = [w for w in newString.split() if w not in STOP_WORDS]
    long_words = [i for i in tokens if len(i) > 1]  # Remove short words
    
    return " ".join(long_words).strip()

## Load & Preprocess Data

In [5]:
def load_data(file_path):
    df = pd.read_csv(file_path)
    df['article'] = df['article'].apply(preprocess_text)
    df['highlights'] = df['highlights'].apply(preprocess_text)
    return df['article'].tolist(), df['highlights'].tolist()

train_texts, train_summaries = load_data("resources/train.csv")
val_texts, val_summaries = load_data("resources/validation.csv")

SAMPLE_SIZE = 10000  # Adjust based on available GPU memory

# Ensure we don't sample more than available data
train_sample_size = min(SAMPLE_SIZE, len(train_texts))
val_sample_size = min(SAMPLE_SIZE // 4, len(val_texts))  # Use smaller validation set

# Randomly sample data
train_sample_indices = random.sample(range(len(train_texts)), train_sample_size)
val_sample_indices = random.sample(range(len(val_texts)), val_sample_size)

# Subset the dataset
train_texts = [train_texts[i] for i in train_sample_indices]
train_summaries = [train_summaries[i] for i in train_sample_indices]

val_texts = [val_texts[i] for i in val_sample_indices]
val_summaries = [val_summaries[i] for i in val_sample_indices]

## Build Vocab

In [6]:
word_counts = Counter(word for text in train_texts + train_summaries for word in text.split())
vocab = {word: i+2 for i, (word, _) in enumerate(word_counts.items())}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
vocab["<SOS>"] = len(vocab)
vocab["<EOS>"] = len(vocab) + 1
rev_vocab = {idx: word for word, idx in vocab.items()}
VOCAB_SIZE = len(vocab)

In [7]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def compute_cosine_similarity(vec1, vec2):
    # Move tensors to CPU and convert to NumPy
    vec1 = vec1.cpu().numpy().reshape(1, -1)
    vec2 = vec2.cpu().numpy().reshape(1, -1)
    
    return cosine_similarity(vec1, vec2)[0][0]  # Extract scalar similarity value



### Finding the closet word

In [8]:
import torch.nn.functional as F
def find_closest_word(word, embeddings, vocab):
    word_idx = vocab[word]  # Get index
    word_embedding = embeddings[word_idx].unsqueeze(0)  # Shape: (1, EMBED_SIZE)

    # Compute cosine similarity in one step
    similarities = F.cosine_similarity(word_embedding, embeddings, dim=1)

    # Exclude the word itself by setting its similarity to -inf
    similarities[word_idx] = float('-inf')

    # Find the closest word index
    closest_idx = torch.argmax(similarities).item()

    # Ensure index is valid
    if closest_idx < 0 or closest_idx >= len(vocab):
        return "<UNK>"  # Return unknown token if something goes wrong

    return list(vocab.keys())[closest_idx]  # Convert index to word


In [9]:
def precompute_closest_words(vocab, embeddings):
    closest_words = {}
    for i, word in enumerate(vocab):
        if i % 100 == 0:  # Print only every 100th word
            print(f"Checking word: {word} (Type: {type(word)})")
        closest_words[word] = find_closest_word(word, embeddings, vocab)
    return closest_words


## load GloVe Embeddings

GloVe (Global Vectors for Word Representation) is a word embedding technique that helps convert words into numerical vectors in a way that captures their semantic meaning. It was developed by researchers at Stanford University and differs from other word embedding methods like Word2Vec by using co-occurrence statistics from a large corpus.

In [10]:
glove_path = "resources/glove.6B.100d.txt"
EMBED_SIZE = 100
embeddings = load_glove_embeddings(glove_path)


In [11]:
# Assume `vocab` and `EMBED_SIZE` are already defined
weights_matrix = np.zeros((max(vocab.values()) + 1, EMBED_SIZE))

for word, i in vocab.items():
    weights_matrix[i] = embeddings.get(word, np.random.np.random.uniform(-0.05, 0.05, EMBED_SIZE))
    

In [12]:
embedding_matrix = torch.tensor(weights_matrix, dtype=torch.float32).to(device)

## Find Closest Word Function

In [13]:
closest_word = precompute_closest_words(vocab, embedding_matrix)

Checking word: linguists (Type: <class 'str'>)
Checking word: linguistic (Type: <class 'str'>)
Checking word: traders (Type: <class 'str'>)
Checking word: preferred (Type: <class 'str'>)
Checking word: ford (Type: <class 'str'>)
Checking word: stage (Type: <class 'str'>)
Checking word: evening (Type: <class 'str'>)
Checking word: vinall (Type: <class 'str'>)
Checking word: ending (Type: <class 'str'>)
Checking word: sense (Type: <class 'str'>)
Checking word: risk (Type: <class 'str'>)
Checking word: liberia (Type: <class 'str'>)
Checking word: true (Type: <class 'str'>)
Checking word: retirement (Type: <class 'str'>)
Checking word: global (Type: <class 'str'>)
Checking word: consulted (Type: <class 'str'>)
Checking word: finisher (Type: <class 'str'>)
Checking word: fee (Type: <class 'str'>)
Checking word: childhood (Type: <class 'str'>)
Checking word: message (Type: <class 'str'>)
Checking word: serbia (Type: <class 'str'>)
Checking word: refuse (Type: <class 'str'>)
Checking word: im

## Dataset Class

In [14]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

class NewsDataset(torch.utils.data.Dataset):
    def __init__(self, texts, summaries, vocab, embedding_matrix, device, max_len=100):
        self.texts = texts
        self.summaries = summaries
        self.vocab = vocab
        self.embedding_matrix = embedding_matrix
        self.device = device
        self.max_len = max_len  
        self.closest_words = {}  # ✅ Fix: Initialize this attribute

    def __getitem__(self, idx):
        text = self.texts[idx]
        summary = self.summaries[idx]

        text_tensor = torch.tensor(tokenizer.encode(text, max_length=self.max_len, truncation=True), dtype=torch.long)
        summary_tensor = torch.tensor(tokenizer.encode(summary, max_length=self.max_len, truncation=True), dtype=torch.long)

        return text_tensor, summary_tensor

    def __len__(self):
        return len(self.texts)




### Collate Function for Padding

In [96]:
MAX_LEN = 100  # Set a reasonable sequence length

def collate_fn(batch):
    text, summary = zip(*batch)
    text = [torch.as_tensor(t, dtype=torch.long).clone().detach() for t in text]
    summary = [torch.as_tensor(s, dtype=torch.long).clone().detach() for s in summary]

    text = pad_sequence(text, batch_first=True, padding_value=0)
    summary = pad_sequence(summary, batch_first=True, padding_value=0)

    return text.to(device), summary.to(device)



## Define LSTM Model

In [97]:
class LSTMSeq2Seq(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, embedding_matrix, dropout=0.3):
        super(LSTMSeq2Seq, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)
        self.encoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.decoder = nn.LSTM(embed_size, hidden_size, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, src, tgt, teacher_forcing_ratio=0.5):
        embed_src = self.embedding(src)
        embed_tgt = self.embedding(tgt)

        _, (hidden, cell) = self.encoder(embed_src)
        
        outputs = []
        decoder_input = embed_tgt[:, 0].unsqueeze(1)  # Start token
        
        for t in range(tgt.shape[1] - 1):  # Ensure same length as target
            output, (hidden, cell) = self.decoder(decoder_input, (hidden, cell))
            output = self.fc(output)
            outputs.append(output)

            # Teacher forcing: Use true target word some of the time
            if random.random() < teacher_forcing_ratio:
                decoder_input = self.embedding(tgt[:, t + 1]).unsqueeze(1)
            else:
                decoder_input = self.embedding(torch.argmax(output, dim=-1)).detach()

        return torch.cat(outputs, dim=1)


##  Function to generate summary

### For sampling generating the summary output

In [118]:
import torch
import numpy as np

def top_p_sampling(model, text, vocab, rev_vocab, top_p=0.9, temperature=1.0, max_len=100):
    model.eval()
    with torch.no_grad():
        text_indices = [vocab.get(word, vocab["<UNK>"]) for word in text.split()]
        text_tensor = torch.tensor(text_indices, dtype=torch.long).unsqueeze(0).to(device)

        embedded_text = model.embedding(text_tensor)
        _, (hidden, cell) = model.encoder(embedded_text)

        generated_seq = [vocab["<SOS>"]]

        for _ in range(max_len):
            last_word = generated_seq[-1]
            if last_word == vocab["<EOS>"]:
                break

            last_word_tensor = torch.tensor([last_word], dtype=torch.long).to(device)
            embedded_input = model.embedding(last_word_tensor).unsqueeze(1)

            output, (hidden, cell) = model.decoder(embedded_input, (hidden, cell))
            output_probs = torch.softmax(model.fc(output.squeeze(1)) / temperature, dim=-1)

            # Top-P (Nucleus Sampling)
            sorted_probs, sorted_indices = torch.sort(output_probs, descending=True)
            cumulative_probs = torch.cumsum(sorted_probs, dim=-1)

            # Keep only words that form 90% probability mass
            top_p_mask = cumulative_probs <= top_p

            # Ensure at least one token is kept
            if not top_p_mask.any():
                top_p_mask[0] = True  

            filtered_probs = sorted_probs[top_p_mask]
            filtered_indices = sorted_indices[top_p_mask]

            # Normalize probabilities (only if sum > 0)
            if filtered_probs.sum() > 0:
                filtered_probs = filtered_probs / filtered_probs.sum()
            else:
                filtered_probs = torch.softmax(output_probs, dim=-1)  # Fallback to original probs

            if len(filtered_indices) == 0:
                next_word_idx = sorted_indices[0].item()  # Pick most probable word
            else:
                next_word_idx = np.random.choice(filtered_indices.cpu().numpy(), p=filtered_probs.cpu().numpy())

            generated_seq.append(next_word_idx)
            if next_word_idx == vocab["<EOS>"]:
                break

        summary = " ".join([rev_vocab[idx] for idx in generated_seq if idx not in {vocab["<SOS>"], vocab["<EOS>"], vocab["<PAD>"], vocab["<UNK>"]}])

        return summary


In [157]:
HIDDEN_SIZE = 216
BATCH_SIZE = 32

### Create DataLoaders

In [158]:
from torch.utils.data import DataLoader, SubsetRandomSampler

subset_size = 5000
train_indices = list(range(min(subset_size, len(train_texts))))
val_indices = list(range(min(subset_size, len(val_texts))))

In [159]:
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

In [160]:
train_dataset = NewsDataset(train_texts, train_summaries, vocab, embedding_matrix, max_len=100, device=device)
val_dataset = NewsDataset(val_texts, val_summaries, vocab, embedding_matrix, max_len=100, device=device)

In [161]:
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, sampler=train_sampler, collate_fn=collate_fn, num_workers=0, pin_memory=False)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, sampler=val_sampler, collate_fn=collate_fn, num_workers=0, pin_memory=False)

This is where you need to optimize the model to better its performance 

In [175]:
model = LSTMSeq2Seq(VOCAB_SIZE, EMBED_SIZE, HIDDEN_SIZE, embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0005, weight_decay=5e-5)  # Lower LR
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)

### Training loop

In [176]:
import torch
import gc

gc.collect()  
torch.cuda.empty_cache()
model = torch.compile(model)  # JIT Compilation for speedup



In [177]:
from torch.cuda.amp import autocast, GradScaler

def train_model(model, train_loader, val_loader, criterion, optimizer, scheduler, epochs=30, patience=5):
    scaler = torch.amp.GradScaler()
    best_val_loss = float("inf")
    counter = 0
    accumulation_steps = 4  # Accumulate gradients over multiple batches

    for epoch in range(epochs):
        teacher_forcing_ratio = max(0.8 * (0.98 ** epoch), 0.2)
        model.train()
        train_loss = 0

        for step, (text, summary) in enumerate(train_loader):
            text, summary = text.to(device), summary.to(device)
            optimizer.zero_grad()  # Reset gradients for each batch

            with torch.amp.autocast("cuda"):
                output = model(text, summary, teacher_forcing_ratio)
                loss = criterion(output.reshape(-1, VOCAB_SIZE), summary[:, :output.shape[1]].reshape(-1))
            
            scaler.scale(loss).backward()  # Accumulate gradients
            
            if (step + 1) % accumulation_steps == 0:
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad()

            
            train_loss += loss.item()

        # Validation Step (DO NOT BACKPROPAGATE HERE)
        model.eval()
        val_loss = 0
        with torch.no_grad():  # No gradients needed in validation
            for text, summary in val_loader:
                text, summary = text.to(device), summary.to(device)

                with torch.amp.autocast("cuda"):
                    output = model(text, summary, teacher_forcing_ratio=0)  # No teacher forcing in validation
                    loss = criterion(output.reshape(-1, VOCAB_SIZE), summary[:, :output.shape[1]].reshape(-1))

                val_loss += loss.item()

        train_loss /= len(train_loader)
        val_loss /= len(val_loader)

        # Step scheduler
        scheduler.step(val_loss)

        print(f"Epoch [{epoch+1}/{epochs}] | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            torch.save(model.state_dict(), "best_model.pth")  # Save best model
        else:
            counter += 1
            if counter >= patience:
                print("Early stopping triggered.")
                break
    torch.save(model.state_dict(), "NLPMLSTMPTextSuM/model.pth")
    print("The model is saved")



#### The goals here is to have a train and val loss around 0-3

In [178]:
train_model(model, train_loader, val_loader, criterion, optimizer, scheduler)

Epoch [1/30] | Train Loss: 10.4150 | Val Loss: 8.5559
Epoch [2/30] | Train Loss: 8.4073 | Val Loss: 8.4779
Epoch [3/30] | Train Loss: 8.3173 | Val Loss: 8.4141
Epoch [4/30] | Train Loss: 8.2430 | Val Loss: 8.3686
Epoch [5/30] | Train Loss: 8.1747 | Val Loss: 8.3214
Epoch [6/30] | Train Loss: 8.1176 | Val Loss: 8.2872
Epoch [7/30] | Train Loss: 8.0732 | Val Loss: 8.2587
Epoch [8/30] | Train Loss: 8.0318 | Val Loss: 8.2354
Epoch [9/30] | Train Loss: 8.0320 | Val Loss: 8.2124
Epoch [10/30] | Train Loss: 8.0294 | Val Loss: 8.1991
Epoch [11/30] | Train Loss: 7.9971 | Val Loss: 8.1911
Epoch [12/30] | Train Loss: 7.9611 | Val Loss: 8.1802
Epoch [13/30] | Train Loss: 7.9211 | Val Loss: 8.1772
Epoch [14/30] | Train Loss: 7.8876 | Val Loss: 8.2050
Epoch [15/30] | Train Loss: 7.8595 | Val Loss: 8.2014
Epoch [16/30] | Train Loss: 7.8402 | Val Loss: 8.1968
Epoch [17/30] | Train Loss: 7.8213 | Val Loss: 8.1925
Epoch [18/30] | Train Loss: 7.8062 | Val Loss: 8.1945
Early stopping triggered.
The model 

In [179]:
# Generate summary using your trained LSTM model

summary = top_p_sampling(model, train_texts[0], vocab, rev_vocab, top_p=0.9, temperature=0.7, max_len=100)
print("LSTM Model Summary:")
print(summary)


LSTM Model Summary:
