### Importing Libraries

In [None]:
import os
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import nltk
import torch.nn.functional as F
from collections import Counter
import torch.optim as optim
from rouge import Rouge
import seaborn as sns
import matplotlib.pyplot as plt
from itertools import product

import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm


### Loading Data

In [None]:
# ===================== Load Data =====================
# Read the three CSV splits from the same folder; missing cells are replaced
# by empty strings so every column value is a string.
dir = 'part_A_outputs'
df_train, df_val, df_test = (
    pd.read_csv(dir + split_file).fillna('')
    for split_file in ('/train.csv', '/validation.csv', '/test.csv')
)



# ===================== Vocabulary =====================
def build_vocab(texts, min_freq=0.01):
    """Build a token -> index mapping from whitespace-tokenised texts.

    The four special tokens always occupy indices 0-3, in the order
    ``<pad>``, ``<bos>``, ``<eos>``, ``<unk>``. A corpus token is kept when
    its total count is at least ``len(texts) * min_freq``.

    Args:
        texts: Sized iterable of strings (``len(texts)`` must work).
        min_freq: Minimum count, expressed as a fraction of the number of texts.

    Returns:
        dict mapping each token to a unique, contiguous integer index.
    """
    special_tokens = ['<pad>', '<bos>', '<eos>', '<unk>']
    counter = Counter()
    for text in texts:
        counter.update(text.split())
    threshold = len(texts) * min_freq

    # A corpus token that literally equals a special token (e.g. "<unk>")
    # must not be added again: the duplicate key would overwrite the reserved
    # index and leave len(vocab) smaller than the largest index.
    reserved = set(special_tokens)
    tokens = [
        tok for tok, freq in counter.items()
        if freq >= threshold and tok not in reserved
    ]
    return {word: idx for idx, word in enumerate(special_tokens + tokens)}

# Vocabulary is built from the training articles only; its size is printed.
vocab = build_vocab(df_train['text'])
print(len(vocab))
# Reverse lookup (index -> token) for turning predicted ids back into words.
inv_vocab = dict(zip(vocab.values(), vocab.keys()))

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

### Dataset Class


In [None]:
# ===================== Dataset =====================
class TitleDataset(Dataset):
    """(text, title) pairs encoded as fixed-length tensors of vocabulary indices.

    Each item is ``(encoded_text, encoded_title)``. Both tensors have length
    ``max_len``; the title is additionally wrapped in ``<bos>`` / ``<eos>``.
    """

    def __init__(self, texts, titles, vocab, max_len):
        """
        Args:
            texts: Iterable of source strings.
            titles: Iterable of target strings, aligned with ``texts``.
            vocab: dict token -> index containing ``<pad>``, ``<bos>``,
                ``<eos>`` and ``<unk>``.
            max_len: Length of every encoded tensor.
        """
        # Store plain lists so __getitem__ indexes by position. A pandas
        # Series with a non-default index would be looked up by label and
        # raise KeyError for the positions a DataLoader asks for.
        self.texts = list(texts)
        self.titles = list(titles)
        self.vocab = vocab
        self.max_len = max_len

    def encode(self, sentence, add_tokens=False):
        """Turn a whitespace-tokenised sentence into a LongTensor of length ``max_len``.

        Unknown tokens map to ``<unk>``. With ``add_tokens=True`` the sequence
        is wrapped in ``<bos>`` / ``<eos>``, and the content is shortened first
        so that ``<eos>`` survives truncation. Short sequences are right-padded
        with ``<pad>``.
        """
        unk = self.vocab['<unk>']
        idxs = [self.vocab.get(tok, unk) for tok in sentence.split()]
        if add_tokens:
            # Reserve two slots for the markers; truncating after adding them
            # would cut <eos> off every long title.
            idxs = idxs[:max(self.max_len - 2, 0)]
            idxs = [self.vocab['<bos>']] + idxs + [self.vocab['<eos>']]
        idxs = idxs[:self.max_len]
        padding = [self.vocab['<pad>']] * (self.max_len - len(idxs))
        return torch.tensor(idxs + padding, dtype=torch.long)

    def __getitem__(self, idx):
        """Return ``(encoded_text, encoded_title)`` for position ``idx``."""
        return self.encode(self.texts[idx]), self.encode(self.titles[idx], add_tokens=True)

    def __len__(self):
        """Number of (text, title) pairs."""
        return len(self.texts)

### Encoder

In [None]:
class EncoderRNN(nn.Module):
    """Single-layer bidirectional GRU encoder over token indices."""

    def __init__(self, vocab, emb_dim=300, hidden_dim=300):
        super().__init__()
        vocab_size = len(vocab)
        # Index 0 is treated as padding by the embedding table.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(
            input_size=emb_dim,
            hidden_size=hidden_dim,
            batch_first=True,
            bidirectional=True,
        )

    def load_embeddings(self, weight_matrix):
        """Copy ``weight_matrix`` into the embedding table and freeze it."""
        table = self.embedding.weight
        table.data.copy_(weight_matrix)
        table.requires_grad = False  # Optional: Freeze embeddings

    def forward(self, x):
        """Encode a batch of index sequences.

        Returns:
            ``(outputs, hidden)`` where ``outputs`` is the GRU output for every
            position and ``hidden`` is the final forward and backward states
            concatenated on the feature axis, with a leading axis of size 1.
        """
        outputs, final_states = self.gru(self.embedding(x))
        forward_state = final_states[-2]
        backward_state = final_states[-1]
        merged = torch.cat([forward_state, backward_state], dim=1)
        return outputs, merged.unsqueeze(0)


### Hierarchical Encoder

In [None]:
import torch
import torch.nn as nn

class HierarchicalEncoderRNN(nn.Module):
    """Two-level bidirectional GRU encoder.

    The token sequence is cut into fixed-size chunks of ``sent_len`` tokens
    ("sentences"). A word-level GRU summarises each chunk, and a
    sentence-level GRU runs over the chunk summaries.
    """

    def __init__(self, vocab, emb_dim=300, hidden_dim=300, sent_len=30):
        super().__init__()
        self.emb_dim = emb_dim
        self.hidden_dim = hidden_dim
        self.sent_len = sent_len
        vocab_size = len(vocab)

        # Index 0 is treated as padding by the embedding table.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)

        # Word-level GRU (process each sentence)
        self.word_gru = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=True)

        # Sentence-level GRU (process each sentence representation)
        self.sent_gru = nn.GRU(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)

    def load_embeddings(self, weight_matrix):
        """Copy ``weight_matrix`` into the embedding table and freeze it."""
        self.embedding.weight.data.copy_(weight_matrix)
        self.embedding.weight.requires_grad = False  # Optional: Freeze embeddings

    def forward(self, x):
        """Encode ``x`` of shape ``[batch_size, total_seq_len]``.

        The sequence is right-padded with index 0 up to the next multiple of
        ``sent_len`` (at least one chunk), so no trailing tokens are dropped
        when the length is not an exact multiple.

        Returns:
            ``(sent_outputs, sent_hidden)``: ``sent_outputs`` is
            ``[B, num_sents, hidden_dim * 2]`` and ``sent_hidden`` is
            ``[1, B, hidden_dim * 2]``.
        """
        batch_size, total_seq_len = x.shape

        # Ceil-divide so a partial trailing chunk is kept, not truncated.
        num_sents = max(1, -(-total_seq_len // self.sent_len))
        pad_len = num_sents * self.sent_len - total_seq_len
        if pad_len > 0:
            x = torch.cat([x, x.new_zeros(batch_size, pad_len)], dim=1)

        # Reshape to [B, num_sents, sent_len]
        x = x.reshape(batch_size, num_sents, self.sent_len)

        # Embed: [B, num_sents, sent_len, emb_dim]
        embedded = self.embedding(x)

        # Merge sentences into the batch axis: [B * num_sents, sent_len, emb_dim]
        embedded = embedded.view(batch_size * num_sents, self.sent_len, self.emb_dim)

        # Word-level GRU; only the final hidden states are used.
        _, word_hidden = self.word_gru(embedded)
        # Concatenate forward and backward hidden states: [B * num_sents, hidden_dim * 2]
        word_hidden = torch.cat((word_hidden[-2], word_hidden[-1]), dim=1)

        # Reshape back to sentence level: [B, num_sents, hidden_dim * 2]
        sentence_reps = word_hidden.view(batch_size, num_sents, self.hidden_dim * 2)

        # Sentence-level GRU
        sent_outputs, sent_hidden = self.sent_gru(sentence_reps)

        # Combine last forward and backward hidden states: [1, B, H*2]
        sent_hidden = torch.cat((sent_hidden[-2], sent_hidden[-1]), dim=1).unsqueeze(0)

        return sent_outputs, sent_hidden


### Decoder

In [None]:
class DecoderRNN(nn.Module):
    """Single-layer GRU decoder that emits vocabulary logits one step at a time."""

    def __init__(self, vocab, emb_dim=300, hidden_dim=600):
        super().__init__()
        vocab_size = len(vocab)
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: Token indices for the current step, one per batch element.
            hidden: GRU hidden state carried over from the previous step.

        Returns:
            ``(logits, hidden)``: vocabulary scores for this step and the
            updated hidden state.
        """
        # A length-1 time axis is inserted so the GRU sees a one-step sequence.
        step_input = self.embedding(x).unsqueeze(1)
        step_output, next_hidden = self.gru(step_input, hidden)
        step_features = step_output.squeeze(1)
        return self.fc(step_features), next_hidden

### Decoder2RNN

In [None]:
class Decoder2RNN(nn.Module):
    """Two stacked GRU layers followed by a linear projection to the vocabulary.

    Each layer keeps its own recurrent state. The state passed between steps
    is the two per-layer states stacked on axis 0 (``[2, B, H]``). A
    single-layer state ``[1, B, H]``, as produced by the encoders, is also
    accepted and is used to seed both layers.
    """

    def __init__(self, vocab, emb_dim=300, hidden_dim=600):
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), emb_dim, padding_idx=0)
        self.gru1 = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, len(vocab))

    def forward(self, x, hidden):
        """Run one decoding step.

        Args:
            x: Token indices for the current step, one per batch element.
            hidden: ``[1, B, H]`` (initial state, shared by both layers) or
                ``[2, B, H]`` (the state returned by a previous call).

        Returns:
            ``(logits, hidden)`` where ``hidden`` is ``[2, B, H]``: layer-1
            state at index 0 and layer-2 state at index 1.
        """
        if hidden.size(0) == 1:
            # First step: both layers start from the encoder summary.
            h1_prev = h2_prev = hidden
        else:
            h1_prev, h2_prev = hidden[0:1], hidden[1:2]

        x = self.embedding(x.unsqueeze(1))
        out1, hidden1 = self.gru1(x, h1_prev)
        out2, hidden2 = self.gru2(out1, h2_prev)
        logits = self.fc(out2.squeeze(1))
        # Return both states; dropping hidden1 would reset layer 1 to layer 2's
        # state on every step.
        return logits, torch.cat((hidden1, hidden2), dim=0)


### Seq2Seq class with beam search

In [None]:
class Seq2SeqRNN(nn.Module):
    """Encoder-decoder title generator with greedy and beam-search decoding.

    The encoder is either ``EncoderRNN`` or ``HierarchicalEncoderRNN`` and the
    decoder either ``DecoderRNN`` or ``Decoder2RNN``. The decoder hidden size
    is ``hidden_dim * 2`` because the encoders return the forward and backward
    states concatenated.
    """

    def __init__(self, vocab, max_len, emb_dim=300, hidden_dim=300,
                 use_hier=False, use_dec2=False, glove_path=None):
        """
        Args:
            vocab: dict token -> index containing ``<bos>`` and ``<eos>``.
            max_len: Number of decoding steps (also bounds beam search).
            emb_dim: Embedding width for encoder and decoder.
            hidden_dim: Per-direction hidden size of the encoder GRUs.
            use_hier: Use the hierarchical encoder instead of the flat one.
            use_dec2: Use the two-layer decoder instead of the single-layer one.
            glove_path: Optional path to a GloVe text file; when given, the
                encoder embeddings are initialised from it and frozen.
        """
        super().__init__()
        self.vocab = vocab
        self.max_len = max_len

        # Encoder
        if use_hier:
            self.encoder = HierarchicalEncoderRNN(vocab, emb_dim, hidden_dim)
        else:
            self.encoder = EncoderRNN(vocab, emb_dim, hidden_dim)

        # Load GloVe if path given
        if glove_path:
            glove_matrix = self.load_glove_weights(glove_path, emb_dim)
            self.encoder.load_embeddings(glove_matrix)

        # Decoder
        if use_dec2:
            self.decoder = Decoder2RNN(vocab, emb_dim, hidden_dim * 2)
        else:
            self.decoder = DecoderRNN(vocab, emb_dim, hidden_dim * 2)

    def load_glove_weights(self, path, emb_dim):
        """Build a ``[len(vocab), emb_dim]`` matrix from a GloVe text file.

        Rows for words missing from the file keep their random
        (standard-normal) initialisation. The ``<pad>`` row, when the vocab
        has one, is zeroed so padding keeps a zero embedding after the matrix
        is copied into an ``nn.Embedding``. Blank lines and lines whose vector
        width is not ``emb_dim`` are skipped.

        The matrix is built and returned on the CPU; ``load_embeddings``
        copies it into the embedding table wherever that lives.
        """
        weights = torch.randn(len(self.vocab), emb_dim)
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                # Skip blank or malformed lines instead of crashing mid-file.
                if len(parts) != emb_dim + 1:
                    continue
                word = parts[0]
                if word not in self.vocab:
                    continue
                try:
                    vector = torch.tensor([float(v) for v in parts[1:]])
                except ValueError:
                    continue
                weights[self.vocab[word]] = vector

        pad_idx = self.vocab.get('<pad>')
        if pad_idx is not None:
            weights[pad_idx] = 0.0
        return weights

    def forward(self, src, tgt=None, teacher_forcing=True, beam_search=False, beam_width=3):
        """Dispatch to beam search or greedy decoding.

        Returns per-step vocabulary scores: logits from greedy decoding, or a
        one-hot encoding of the best sequence from beam search.
        """
        if beam_search:
            return self.beam_decode(src, beam_width)
        else:
            return self.greedy_decode(src, tgt, teacher_forcing)

    def greedy_decode(self, src, tgt=None, teacher_forcing=True):
        """Decode ``max_len`` steps and return logits ``[B, max_len, vocab]``.

        With ``teacher_forcing`` and a target, the next input at step ``t`` is
        ``tgt[:, t]``; otherwise it is the arg-max of the current logits.
        """
        _, hidden = self.encoder(src)
        input_token = torch.full((src.size(0),), self.vocab['<bos>'], dtype=torch.long, device=src.device)
        outputs = []

        for t in range(self.max_len):
            out, hidden = self.decoder(input_token, hidden)
            outputs.append(out.unsqueeze(1))
            if teacher_forcing and tgt is not None and t < tgt.size(1):
                input_token = tgt[:, t]
            else:
                input_token = out.argmax(1)

        return torch.cat(outputs, dim=1)

    def beam_decode(self, src, beam_width):
        """Beam search for a single source sequence.

        Scores are summed log-probabilities with no length normalisation.

        Returns:
            A one-hot tensor ``[1, seq_len, vocab]`` for the best-scoring
            sequence, including its leading ``<bos>``.
        """
        batch_size = src.size(0)
        assert batch_size == 1, "Beam search supports batch size 1 for now."

        bos_id = self.vocab['<bos>']
        eos_id = self.vocab['<eos>']
        vocab_size = len(self.vocab)
        # topk raises if k exceeds the number of classes.
        k = min(beam_width, vocab_size)

        # Get encoder output
        _, hidden = self.encoder(src)

        # Each beam entry is [token_ids, score, hidden_state].
        sequences = [[[bos_id], 0.0, hidden]]
        finished_sequences = []

        for _ in range(self.max_len - 1):  # -1 because <bos> is already in place
            all_candidates = []

            for seq, score, h in sequences:
                last_token = seq[-1]
                if last_token == eos_id:
                    # Ended sequences leave the beam and wait for final ranking.
                    finished_sequences.append([seq, score, h])
                    continue

                input_token = torch.tensor([last_token], device=src.device)
                out, h_new = self.decoder(input_token, h)
                log_probs = torch.nn.functional.log_softmax(out, dim=1)
                topk_log_probs, topk_idxs = torch.topk(log_probs, k)

                for i in range(k):
                    next_token = topk_idxs[0][i].item()
                    next_score = score + topk_log_probs[0][i].item()
                    all_candidates.append([seq + [next_token], next_score, h_new])

            if not all_candidates:
                # Ended beams were already moved to finished_sequences above;
                # keep only the rest so the final extend adds no duplicates.
                sequences = [s for s in sequences if s[0][-1] != eos_id]
                break

            # Select top beam_width sequences
            ordered = sorted(all_candidates, key=lambda tup: tup[1], reverse=True)
            sequences = ordered[:beam_width]

            # Early stop once every beam has produced <eos>; they are
            # collected once by the extend after the loop.
            if all(seq[-1] == eos_id for seq, _, _ in sequences):
                break

        # Beams still live, or finished in the last iteration, are ranked too.
        finished_sequences.extend(sequences)
        finished_sequences = sorted(finished_sequences, key=lambda tup: tup[1], reverse=True)
        best_seq = finished_sequences[0][0]

        # One-hot encode the best sequence so callers can arg-max it like logits.
        output_tensor = torch.zeros(1, len(best_seq), vocab_size, device=src.device)
        for t, token_id in enumerate(best_seq):
            output_tensor[0, t, token_id] = 1.0

        return output_tensor

### Training 


In [None]:
# Training configuration: compute device, GloVe file location, and the grid
# of (max_len, batch_size) values to train over.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
glove_path = os.path.join('.', 'glove_vectors', 'glove.6B.300d.txt')
max_lens, batch_sizes = [8], [32]

# To enable the hierarchical encoder, the two-layer decoder, or GloVe
# initialisation, change use_hier / use_dec2 / glove_path below.
def get_model(vocab, max_len, emb_dim=300, hidden_dim=300):
    """Create a Seq2SeqRNN on ``device`` and an Adam optimiser (lr 1e-3) for it.

    Returns:
        ``(model, optimizer)``.
    """
    seq2seq = Seq2SeqRNN(
        vocab,
        max_len=max_len,
        emb_dim=emb_dim,
        hidden_dim=hidden_dim,
        use_hier=False,
        use_dec2=False,
        glove_path=None,
    )
    seq2seq = seq2seq.to(device)
    adam = optim.Adam(seq2seq.parameters(), lr=1e-3)
    return seq2seq, adam

def evaluate(model, data_loader, criterion):
    """Return the mean per-batch loss of ``model`` over ``data_loader``.

    The model is put in eval mode and decoded without teacher forcing, with
    gradients disabled. Logits are cropped to the target length before the
    loss is computed.
    """
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for src, tgt in data_loader:
            src = src.to(device)
            tgt = tgt.to(device)
            logits = model(src, tgt, teacher_forcing=False)
            logits = logits[:, :tgt.shape[1], :]  # Ensure match
            vocab_dim = logits.shape[-1]
            batch_loss = criterion(logits.reshape(-1, vocab_dim), tgt.reshape(-1))
            running_loss += batch_loss.item()
    num_batches = len(data_loader)
    return running_loss / num_batches

# Train one model per (max_len, batch_size) configuration with early stopping
# on validation loss. The weights saved as "best" are a snapshot taken at the
# best validation epoch of the best configuration.
best_overall_loss = float('inf')
best_model_state_dict = None  # Snapshot of the best weights seen so far
results = []

for max_len_val, batch_size_val in product(max_lens, batch_sizes):
    print(f"\n🟦 Training for max_len={max_len_val}, batch_size={batch_size_val}")

    train_loader = DataLoader(TitleDataset(df_train['text'], df_train['title'], vocab, max_len_val),
                              batch_size=batch_size_val, shuffle=True)
    val_loader = DataLoader(TitleDataset(df_val['text'], df_val['title'], vocab, max_len_val),
                            batch_size=batch_size_val)

    model, optimizer = get_model(vocab, max_len_val)
    # Padding positions in the target do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])
    # Early stopping
    max_patience = 3
    patience_counter = 0
    best_val_loss = float('inf')
    best_epoch = 0
    best_config_state = None  # Weights at this configuration's best epoch

    for epoch in range(10):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch}"):
            x, y = x.to(device), y.to(device)
            out = model(x, y, teacher_forcing=True)
            out = out[:, :y.shape[1], :]
            loss = criterion(out.reshape(-1, out.shape[-1]), y.reshape(-1))
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        avg_val_loss = evaluate(model, val_loader, criterion)
        print(f"Epoch {epoch}: Train Loss = {avg_train_loss:.4f}, Val Loss = {avg_val_loss:.4f}")

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_epoch = epoch
            patience_counter = 0
            # state_dict() returns references to the live parameters, so clone
            # them now; later epochs would otherwise overwrite the "best"
            # weights in place.
            best_config_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= max_patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Save the best model across ALL configs
    if best_val_loss < best_overall_loss and best_config_state is not None:
        best_overall_loss = best_val_loss
        best_model_state_dict = best_config_state

    results.append({
        'max_len': max_len_val,
        'batch_size': batch_size_val,
        'val_loss': best_val_loss,
        'epoch': best_epoch
    })

    print(f"✅ Finished: max_len={max_len_val}, batch_size={batch_size_val}, best_val_loss={best_val_loss:.4f} at epoch {best_epoch}")

# Save only the best model globally
if best_model_state_dict is not None:
    torch.save(best_model_state_dict, 'best_seq2seq.pt')
    print("🟢 Best model saved as 'best_seq2seq.pt'")

# ==== Plot Heatmap ====
df_results = pd.DataFrame(results)
heatmap_data = df_results.pivot(index='max_len', columns='batch_size', values='val_loss')

plt.figure(figsize=(8, 6))
sns.heatmap(heatmap_data, annot=True, fmt=".4f", cmap="viridis")
plt.title("Validation Loss Heatmap")
plt.ylabel("Max Sequence Length")
plt.xlabel("Batch Size")
plt.tight_layout()
plt.show()


### Generating Titles

In [None]:
def clean_text(text):
    """Lower-case ``text`` and trim leading/trailing whitespace."""
    lowered = text.lower()
    return lowered.strip()

def generate_title(model, src_tensor, beam=False, beam_width=3):
    """Decode a title string for ``src_tensor`` using ``model``.

    Args:
        model: Callable accepting ``(src_tensor, beam_search=..., beam_width=...)``
            and exposing ``eval()``.
        src_tensor: Source token indices passed straight to the model.
        beam: Use beam search instead of greedy decoding.
        beam_width: Beam size when ``beam`` is True.

    Returns:
        The decoded words of the first sequence in the batch, joined by
        spaces. Decoding stops at the first ``<eos>``; ``<bos>`` and ``<pad>``
        are dropped. A non-tensor model output is returned unchanged.
    """
    model.eval()
    with torch.no_grad():
        output = model(src_tensor, beam_search=beam, beam_width=beam_width)

    if not isinstance(output, torch.Tensor):
        # If output is already a string, use it directly
        return output

    # Per-step scores -> token ids: [batch_size, seq_len]
    output_ids = output.argmax(dim=-1)
    if output_ids.size(0) == 0:
        return ''

    eos_id = vocab['<eos>']
    skipped = {vocab['<bos>'], vocab['<pad>']}
    words = []
    for token_id in output_ids[0].tolist():
        if token_id == eos_id:
            # Greedy decoding always runs max_len steps, so anything after
            # <eos> is not part of the title.
            break
        if token_id in skipped:
            continue
        words.append(inv_vocab.get(token_id, '[UNK]'))
    return ' '.join(words)

### Printing ROUGE Scores

In [None]:
# Load the best model. map_location lets a checkpoint saved on a GPU be
# loaded on a CPU-only machine.
model.load_state_dict(torch.load('best_seq2seq.pt', map_location=device))

generated_titles = []
# References are collected next to the hypotheses so the two lists stay
# aligned when rows are skipped.
reference_titles = []

# Track ROUGE scores for each example
individual_rouge_scores = []

for i, row in df_test.iterrows():
    input_text = row['text']
    reference_title = row['title']

    # Whitespace tokenisation, matching how the vocabulary was built.
    tokens = input_text.split()

    # Skip rows with nothing to encode or nothing to compare against
    # (missing cells were filled with '' when the data was loaded).
    if not tokens or not reference_title.strip():
        continue

    token_ids = [vocab.get(token, vocab['<unk>']) for token in tokens]
    src_tensor = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)  # shape: [1, seq_len]

    # Greedy decoding; pass beam=True to use beam search with beam_width.
    generated = generate_title(model, src_tensor, beam=False, beam_width=3)

    # Use a placeholder when nothing was generated, so the hypothesis is never empty.
    if not generated or generated.strip() == '':
        generated = 'EMPTY'

    generated_titles.append(generated)
    reference_titles.append(reference_title)

# Evaluate using ROUGE
rouge = Rouge()
scores = rouge.get_scores(generated_titles, reference_titles, avg=True)
print("\nROUGE Evaluation:")
print(scores)