# In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from collections import Counter
import random
import time
import math
import matplotlib.pyplot as plt
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence
import torch.nn.init as init
import os

# Ensure NLTK resources are downloaded.
# Recent NLTK releases make word_tokenize load the 'punkt_tab' tables, while
# older releases load 'punkt'; both are requested so either version works.
# nltk.download reports a package it cannot fetch instead of raising.
for _resource_path, _package_name in (
    ('tokenizers/punkt', 'punkt'),
    ('tokenizers/punkt_tab', 'punkt_tab'),
    ('corpora/stopwords', 'stopwords'),
):
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        nltk.download(_package_name)

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)  # covers every visible GPU, not only the current one
torch.backends.cudnn.deterministic = True
# The cuDNN autotuner can select different kernels from run to run, which
# defeats the deterministic flag above, so it is switched off as well.
torch.backends.cudnn.benchmark = False

# --- Step 0: Create Sample Data ---
print("=" * 70)
print("Step 0: Creating Sample Data")
print("=" * 70)

def create_sample_data(filename='news_summary.csv', num_samples=5000):
    """Write a toy summarization corpus to ``filename`` and return that path.

    Ten fixed (article, summary) pairs are drawn with replacement
    ``num_samples`` times using the global ``random`` state. The rows are
    saved as a CSV with a ``text`` and a ``summary`` column.
    """
    # Every entry keeps an article next to its reference summary.
    corpus = (
        ("This is a sample article about the weather. It's sunny today with a high of 25 degrees.",
         "Sunny weather expected today."),
        ("The stock market had a good day with all the major indices up more than one percent.",
         "Stock market jumps."),
        ("Scientists have made a new discovery in cancer research that could lead to new treatments.",
         "Cancer research breakthrough."),
        ("The football team won their game last night in a close match.",
         "Football team wins close game."),
        ("There's a new movie coming out that is generating a lot of buzz.",
         "New movie generating buzz."),
        ("Global warming is a serious threat and needs to be addressed by governments.",
         "Global warming a serious issue."),
        ("The new study has shown that most people enjoy pizza.",
         "Study shows people like pizza."),
        ("The local news has announced there will be heavy snow tomorrow.",
         "Heavy snow expected tomorrow."),
        ("A new software has been released with significant bug fixes.",
         "New software with bug fixes."),
        ("The national park has been shut down due to a hurricane.",
         "National park shut down for hurricane."),
    )

    rows = []
    for _ in range(num_samples):
        article, headline = corpus[random.randint(0, len(corpus) - 1)]
        rows.append({'text': article, 'summary': headline})

    pd.DataFrame(rows).to_csv(filename, index=False)
    print(f"  Sample data created and saved to '{filename}'.")
    return filename

# Write the demo CSV to disk and remember where it was saved
file_path = create_sample_data()


# --- Step 1: Data Preprocessing ---
print("\n" + "=" * 70, "Step 1: Data Preprocessing", "=" * 70, sep="\n")
def load_data(filepath, sample_size=None):
    """
    Loads a summarization CSV and normalises its columns to 'text' and 'summary'.

    Args:
        filepath: Path of the CSV file to read.
        sample_size: Optional number of rows to draw at random (seeded with
            SEED). A falsy value disables sampling; a value larger than the
            file is capped at the number of rows available.

    Returns:
        The loaded DataFrame, with columns renamed to 'text'/'summary' when
        they were not already present under those names.

    Raises:
        ValueError: if the frame has fewer than two columns and no
            recognised column names.
    """
    df = pd.read_csv(filepath)
    if sample_size:
        # Sampling without replacement raises when n exceeds the row count,
        # so the request is capped instead of failing on small files.
        df = df.sample(n=min(sample_size, len(df)), random_state=SEED)

    # Ensure consistent column names for text and summary
    if "text" not in df.columns or "summary" not in df.columns:
        if "article" in df.columns and "highlights" in df.columns:
            df = df.rename(columns={"article": "text", "highlights": "summary"})
        elif df.shape[1] >= 2:
            # With two or more columns, assume the first two are text and summary
            df = df.rename(columns={df.columns[0]: "text", df.columns[1]: "summary"})
        else:
            raise ValueError("Dataframe must have at least two columns, or columns labeled 'text', 'summary', 'article', 'highlights'")

    return df


# Filled on first use so the NLTK stop-word corpus is read once, not per call.
_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Returns the English stop-word set, loading it from NLTK only once."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """
    Preprocesses the text by lowercasing, removing special characters,
    tokenizing, and dropping English stop words.

    Args:
        text: The raw string. None or a float NaN (e.g. an empty CSV cell)
            yields an empty token list; any other non-string is converted
            with str() first.

    Returns:
        A list of lowercase tokens with stop words removed.
    """
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and math.isnan(text)):
            return []
        text = str(text)
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]


def build_vocabulary(texts, min_freq=2):
    """
    Maps every sufficiently frequent token to an integer index.

    ``texts`` is an iterable of token sequences. Tokens seen at least
    ``min_freq`` times are kept in first-seen order, after the reserved
    entries '<pad>' (index 0) and '<unk>' (index 1).
    """
    frequencies = Counter()
    for sequence in texts:
        frequencies.update(sequence)

    # Reserved entries come first so padding is 0 and unknown is 1
    ordered_tokens = ['<pad>', '<unk>']
    for token, seen in frequencies.items():
        if seen >= min_freq:
            ordered_tokens.append(token)

    return dict(zip(ordered_tokens, range(len(ordered_tokens))))


def numericalize_text(tokens, vocabulary):
    """Translate a token sequence into vocabulary indices.

    Tokens missing from ``vocabulary`` are replaced by index 1, the slot
    used for the unknown token.
    """
    unknown_index = 1
    indices = []
    for token in tokens:
        indices.append(vocabulary.get(token, unknown_index))
    return indices


def preprocess_data(df, min_freq=2):
    """
    Tokenizes and numericalizes the 'text' and 'summary' columns.

    Adds 'text_tokens', 'summary_tokens', 'text_numerical' and
    'summary_numerical' columns to ``df`` in place.

    Args:
        df: DataFrame with 'text' and 'summary' columns.
        min_freq: Minimum number of occurrences for a token to enter the
            vocabulary.

    Returns:
        A tuple (df, vocabulary) where vocabulary maps token -> index.
    """
    df['text_tokens'] = df['text'].apply(preprocess_text)
    df['summary_tokens'] = df['summary'].apply(preprocess_text)

    # One vocabulary is shared by articles and summaries. build_vocabulary
    # takes a list of token lists and flattens it itself; flattening here
    # too would make it count the characters of each token, not the tokens.
    token_sequences = df['text_tokens'].tolist() + df['summary_tokens'].tolist()
    vocabulary = build_vocabulary(token_sequences, min_freq)

    df['text_numerical'] = df['text_tokens'].apply(lambda tokens: numericalize_text(tokens, vocabulary))
    df['summary_numerical'] = df['summary_tokens'].apply(lambda tokens: numericalize_text(tokens, vocabulary))
    return df, vocabulary

# Read the generated CSV back (5000 sampled rows) and run the preprocessing
df = load_data(file_path, sample_size=5000)
# min_freq is 3 here (reduced from 5) to keep a larger vocabulary
processed_df, vocabulary = preprocess_data(df, min_freq=3)
vocab_size = len(vocabulary)
print(f"   Vocabulary size: {vocab_size}")
print("   Data Preprocessing Complete.")
print(processed_df.head())

# --- Step 2: Dataset and DataLoader ---
print("\n" + "=" * 70, "Step 2: Dataset and DataLoader", "=" * 70, sep="\n")

class TextSummaryDataset(Dataset):
    """
    Dataset class for text summarization.

    Wraps a DataFrame whose 'text_numerical' and 'summary_numerical'
    columns hold lists of token indices.
    """
    def __init__(self, df):
        self.df = df

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Returns the (text, summary) index tensors of the row at position ``idx``."""
        text_numerical = self.df['text_numerical'].iloc[idx]
        summary_numerical = self.df['summary_numerical'].iloc[idx]
        # An explicit integer dtype is required: torch.tensor([]) defaults to
        # float32, which the embedding layers and the loss cannot consume.
        return (
            torch.tensor(text_numerical, dtype=torch.long),
            torch.tensor(summary_numerical, dtype=torch.long),
        )

def collate_batch(batch):
    """
    Turns a list of (text, summary) tensor pairs into two padded batches.

    Each side is right-padded with 0 (the '<pad>' index) up to the longest
    sequence on that side, giving tensors laid out as (batch, length).
    """
    sources, targets = [], []
    for pair in batch:
        sources.append(pair[0])
        targets.append(pair[1])

    pad_index = 0  # index of the '<pad>' token
    return (
        pad_sequence(sources, batch_first=True, padding_value=pad_index),
        pad_sequence(targets, batch_first=True, padding_value=pad_index),
    )

# Hold out 20% of the processed rows for validation
train_df, val_df = train_test_split(processed_df, test_size=0.2, random_state=SEED)

train_dataset, val_dataset = TextSummaryDataset(train_df), TextSummaryDataset(val_df)

BATCH_SIZE = 64
# Only the training loader shuffles; validation keeps the frame order
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch
)

print("  Dataset and DataLoader setup complete")


# --- Step 3: Model Architecture ---
print("\n" + "=" * 70, "Step 3: Model Architecture", "=" * 70, sep="\n")

class PositionalEncoding(nn.Module):
    """
    Fixed sinusoidal positional encoding.

    Args:
        d_model: Width of the embeddings the encoding is added to. Odd
            widths are supported.
        max_len: Number of positions pre-computed in the lookup table.
    """
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one cosine column fewer than sine
        # columns, so the angle table is trimmed to fit the odd slots.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        pe = pe.unsqueeze(0)
        # Registered as a buffer: follows .to(device) but is not trained
        self.register_buffer('pe', pe)

    def forward(self, x):
        """
        Adds the positional encoding to the input.

        ``x`` is indexed as (batch, seq_len, d_model); the first ``seq_len``
        rows of the table are broadcast over the batch dimension.
        """
        return x + self.pe[:, :x.size(1), :]

class MultiHeadAttention(nn.Module):
    """
    Multi-head scaled dot-product attention with learned projections.

    ``d_model`` is split evenly across ``num_heads`` heads of width
    ``d_head``; the constructor asserts that the division is exact.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model, self.num_heads = d_model, num_heads
        self.d_head = d_model // num_heads

        # Query, key, value and output projections, all d_model -> d_model
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Weights ``v`` by softmax(q k^T / sqrt(d_head)), honouring ``mask``."""
        scale = math.sqrt(self.d_head)
        scores = torch.matmul(q, k.transpose(-2, -1)) / scale
        if mask is not None:
            # Positions where the mask is 0 receive a large negative score,
            # so they get (almost) no weight after the softmax
            scores = scores.masked_fill(mask == 0, -1e9)
        weights = F.softmax(scores, dim=-1)
        return torch.matmul(weights, v)

    def split_heads(self, x):
        """Reshapes (batch, seq, d_model) into (batch, heads, seq, d_head)."""
        batch_size, seq_length, _ = x.size()
        per_head = x.view(batch_size, seq_length, self.num_heads, self.d_head)
        return per_head.transpose(1, 2)

    def combine_heads(self, x):
        """Inverse of split_heads: back to (batch, seq, d_model)."""
        batch_size, _, seq_length, _ = x.size()
        merged = x.transpose(1, 2).contiguous()
        return merged.view(batch_size, seq_length, self.d_model)

    def forward(self, q, k, v, mask=None):
        """Projects the inputs, attends per head, and merges the heads."""
        projected = (self.W_q(q), self.W_k(k), self.W_v(v))
        q_heads, k_heads, v_heads = [self.split_heads(t) for t in projected]

        context = self.scaled_dot_product_attention(q_heads, k_heads, v_heads, mask)
        return self.W_o(self.combine_heads(context))

class FeedForward(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Linear."""

    def __init__(self, d_model, d_ff):
        super().__init__()
        # Expand to the hidden width d_ff, then project back to d_model
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Applies the two linear layers with a ReLU in between."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward, each followed by
    dropout, a residual connection and layer normalisation."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Runs the attention sub-layer, then the feed-forward sub-layer."""
        # Sub-layer 1: self-attention (queries, keys and values are all x)
        attended = self.dropout(self.mha(x, x, x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: position-wise feed-forward network
        transformed = self.dropout(self.ff(x))
        return self.norm2(x + transformed)


class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder-decoder attention
    and feed-forward, each followed by dropout, a residual connection and
    layer normalisation."""

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.masked_mha = MultiHeadAttention(d_model, num_heads)  # self-attention over the target
        self.mha = MultiHeadAttention(d_model, num_heads)  # attention over the encoder output
        self.ff = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask = None, tgt_mask=None):
        """Applies the three decoder sub-layers in order."""
        # Sub-layer 1: self-attention restricted by tgt_mask
        self_attended = self.dropout(self.masked_mha(x, x, x, tgt_mask))
        x = self.norm1(x + self_attended)

        # Sub-layer 2: queries from the decoder, keys/values from the encoder
        cross_attended = self.dropout(self.mha(x, enc_output, enc_output, src_mask))
        x = self.norm2(x + cross_attended)

        # Sub-layer 3: position-wise feed-forward network
        fed_forward = self.dropout(self.ff(x))
        return self.norm3(x + fed_forward)

class Encoder(nn.Module):
    """Token embedding plus positional encoding followed by a stack of
    ``num_layers`` encoder layers."""

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayer(d_model, num_heads, d_ff, dropout))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Embeds the token indices and passes them through every layer."""
        embedded = self.embedding(x)
        hidden = self.dropout(self.pos_encoder(embedded))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden

class Decoder(nn.Module):
    """Token embedding plus positional encoding, a stack of ``num_layers``
    decoder layers, and a final projection onto the vocabulary."""

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_len):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(DecoderLayer(d_model, num_heads, d_ff, dropout))
        self.fc = nn.Linear(d_model, vocab_size)  # maps hidden states to vocabulary logits
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Embeds the target indices, runs every layer, and returns logits."""
        embedded = self.embedding(x)
        hidden = self.dropout(self.pos_encoder(embedded))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, enc_output, src_mask, tgt_mask)
        return self.fc(hidden)

class Transformer(nn.Module):
    """
    Full encoder-decoder Transformer model.

    Args:
        src_vocab_size: Number of entries in the source embedding table.
        tgt_vocab_size: Number of entries in the target embedding table,
            also the width of the output logits.
        d_model, num_heads, d_ff, num_layers, dropout, max_len: Passed
            unchanged to both the Encoder and the Decoder.
    """
    def __init__(self, src_vocab_size, tgt_vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_len):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_len)
        self.decoder = Decoder(tgt_vocab_size, d_model, num_heads, d_ff, num_layers, dropout, max_len)

        # Initialize weights with Xavier initialization (matrices only;
        # biases and LayerNorm vectors keep their defaults)
        for p in self.parameters():
            if p.dim() > 1:
                init.xavier_uniform_(p)

    def generate_mask(self, seq, pad_token=0):
        """
        Creates a padding mask for ``seq`` of shape (batch, seq_len).

        Returns a boolean tensor of shape (batch, 1, 1, seq_len) that is True
        for real tokens and False for padding. The two singleton axes
        broadcast over heads and query positions of the attention scores.
        """
        return (seq != pad_token).unsqueeze(1).unsqueeze(2)

    def generate_tgt_mask(self, seq):
        """
        Creates a look-ahead mask so a position cannot attend to later ones.

        Returns a lower-triangular boolean (seq_len, seq_len) tensor on the
        same device as ``seq``.
        """
        seq_len = seq.size(1)
        return torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool, device=seq.device))

    def forward(self, src, tgt):
        """
        Forward pass with teacher forcing.

        The decoder is fed ``tgt`` without its last token, so the returned
        logits line up with ``tgt[:, 1:]``.
        """
        dec_input = tgt[:, :-1]
        src_mask = self.generate_mask(src)
        # The mask must match the decoder input, which is one token shorter
        tgt_mask = self.generate_tgt_mask(dec_input)
        enc_output = self.encoder(src, src_mask)
        return self.decoder(dec_input, enc_output, src_mask, tgt_mask)

# Model hyperparameters
d_model, num_heads, d_ff = 256, 8, 1024
num_layers = 4
dropout = 0.1
max_len = 512
learning_rate = 0.0005

# Build the model on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Transformer(
    src_vocab_size=vocab_size,
    tgt_vocab_size=vocab_size,
    d_model=d_model,
    num_heads=num_heads,
    d_ff=d_ff,
    num_layers=num_layers,
    dropout=dropout,
    max_len=max_len,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Index 0 is '<pad>', so padded target positions do not contribute to the loss
criterion = nn.CrossEntropyLoss(ignore_index=0)
print("  Model Architecture Setup Complete")

# --- Step 4: Training ---
print("\n" + "=" * 70, "Step 4: Training", "=" * 70, sep="\n")

def train_step(model, iterator, optimizer, criterion, clip = 1):
    """Runs one optimisation pass over ``iterator``.

    Returns the mean of the per-batch losses. Gradients are clipped to a
    norm of ``clip`` before every optimizer step.
    """
    model.train()
    running_loss = 0
    for batch_src, batch_tgt in iterator:
        batch_src, batch_tgt = batch_src.to(device), batch_tgt.to(device)

        optimizer.zero_grad()
        logits = model(batch_src, batch_tgt)

        # Flatten to (batch * seq_len, vocab_size) and (batch * seq_len,).
        # The targets drop their first token to line up with the logits.
        vocab_dim = logits.shape[-1]
        flat_logits = logits.contiguous().view(-1, vocab_dim)
        flat_targets = batch_tgt[:, 1:].contiguous().view(-1)

        batch_loss = criterion(flat_logits, flat_targets)
        batch_loss.backward()
        # Clipping guards against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

def evaluate_step(model, iterator, criterion):
    """Computes the mean per-batch loss over ``iterator`` without updating
    the model (eval mode, gradients disabled)."""
    model.eval()
    running_loss = 0
    with torch.no_grad():
        for batch_src, batch_tgt in iterator:
            batch_src, batch_tgt = batch_src.to(device), batch_tgt.to(device)
            logits = model(batch_src, batch_tgt)

            # Same flattening as in training: logits against the targets
            # shifted by one token
            vocab_dim = logits.shape[-1]
            flat_logits = logits.contiguous().view(-1, vocab_dim)
            flat_targets = batch_tgt[:, 1:].contiguous().view(-1)

            running_loss += criterion(flat_logits, flat_targets).item()
    return running_loss / len(iterator)

def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs = 10, clip=1):
    """Trains for ``num_epochs`` epochs, validating after each one.

    Prints the duration and both losses per epoch and returns the pair
    (train_losses, val_losses), each a list with one entry per epoch.
    """
    train_losses, val_losses = [], []

    for epoch in range(num_epochs):
        started = time.time()
        train_loss = train_step(model, train_loader, optimizer, criterion, clip)
        val_loss = evaluate_step(model, val_loader, criterion)
        elapsed = time.time() - started

        train_losses.append(train_loss)
        val_losses.append(val_loss)

        # Wall-clock time of the epoch as whole minutes and seconds
        epoch_mins = int(elapsed / 60)
        epoch_secs = int(elapsed % 60)
        print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Val Loss: {val_loss:.3f}')
    return train_losses, val_losses

num_epochs = 10
clip = 1
train_losses, val_losses = train_model(
    model, train_loader, val_loader, optimizer, criterion, num_epochs=num_epochs, clip=clip
)

# Plot both loss curves on one figure, one point per epoch
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Training and Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.show()

print("  Training Complete.")

# --- Step 5: Text Generation / Summarization ---
print("\n" + "=" * 70, "Step 5: Text Generation / Summarization", "=" * 70, sep="\n")

def generate_summary(model, src, vocabulary, max_len=150):
    """
    Generates a summary for ``src`` with greedy decoding.

    Args:
        model: Transformer exposing encoder, decoder, generate_mask and
            generate_tgt_mask.
        src: Raw input text.
        vocabulary: Mapping token -> index that contains '<pad>'.
        max_len: Maximum number of tokens to generate.

    Returns:
        The generated tokens joined by spaces, in generation order. An
        empty string is returned when preprocessing leaves no input tokens.
    """
    pad_idx = vocabulary['<pad>']
    # Reverse lookup so the generated indices can be decoded in order
    idx_to_token = {idx: token for token, idx in vocabulary.items()}

    model.eval()
    with torch.no_grad():
        src_ids = numericalize_text(preprocess_text(src), vocabulary)
        if not src_ids:
            # torch.tensor([]) would be a float tensor, which cannot be used
            # as embedding indices, and there is nothing to summarize anyway
            return ''
        src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0).to(device)
        src_mask = model.generate_mask(src_tensor)
        enc_output = model.encoder(src_tensor, src_mask)

        tgt_tokens = [pad_idx]  # the padding token doubles as the start symbol
        for _ in range(max_len):
            tgt_tensor = torch.tensor(tgt_tokens, dtype=torch.long).unsqueeze(0).to(device)
            tgt_mask = model.generate_tgt_mask(tgt_tensor)
            output = model.decoder(tgt_tensor, enc_output, src_mask, tgt_mask)
            # Greedy choice: most likely token at the last position
            next_token = output[:, -1].argmax(dim=-1).item()
            if next_token == pad_idx:
                break  # padding marks the end; keep it out of the output
            tgt_tokens.append(next_token)

    # Drop the start symbol and decode position by position, so word order
    # and repeated words are preserved
    generated_tokens = [idx_to_token.get(idx, '<unk>') for idx in tgt_tokens[1:]]
    return ' '.join(generated_tokens).strip()

def show_random_summarization(df, model, vocabulary):
    """Prints one randomly chosen article from ``df`` together with the
    model's predicted summary and the reference summary."""
    row = df.iloc[random.randint(0, len(df) - 1)]
    article, reference = row['text'], row['summary']
    prediction = generate_summary(model, article, vocabulary)

    report = (
        "\nExample Summarization",
        "-----------------------",
        f"  Original Text:\n{article}",
        f"\n  Predicted Summary:\n{prediction}",
        f"\n  Actual Summary:\n{reference}",
    )
    for line in report:
        print(line)

# Show one validation example next to the model's prediction
show_random_summarization(val_df, model, vocabulary)

print("Text Generation / Summarization Complete.")

# --- Step 6: Evaluation Metrics (BLEU) ---
print("\n" + "=" * 70, "Step 6: Evaluation Metrics (BLEU)", "=" * 70, sep="\n")
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def calculate_bleu(model, val_df, vocabulary):
    """
    Calculates the corpus BLEU score for the entire validation set.

    Args:
        model: Trained model passed on to generate_summary.
        val_df: DataFrame with 'text' and 'summary' columns.
        vocabulary: Mapping token -> index used for generation.

    Returns:
        The corpus-level BLEU score, which is also printed.
    """
    predicted_summaries = [
        generate_summary(model, src, vocabulary) for src in val_df['text'].tolist()
    ]

    # corpus_bleu takes, for every hypothesis, a LIST of reference token
    # lists. Each row has a single reference, so it is wrapped in a list.
    # Passing the bare token list would treat every token as a reference.
    references = [[preprocess_text(summary)] for summary in val_df['summary'].tolist()]
    hypotheses = [preprocess_text(summary) for summary in predicted_summaries]

    smoothing = SmoothingFunction().method4
    bleu_score = corpus_bleu(references, hypotheses, smoothing_function=smoothing)
    print(f"  BLEU Score: {bleu_score:.4f}")
    return bleu_score

# Score the whole validation split
calculate_bleu(model, val_df, vocabulary)

print("Evaluation Complete.")

# --- Step 7: Model Export ---
print("\n" + "=" * 70, "Step 7: Model Export", "=" * 70, sep="\n")
# Optional: persist the trained weights. A failure is reported, not raised.
export_path = 'transformer_summarization_model.pth'
try:
    torch.save(model.state_dict(), export_path)
    print(f"  Model successfully exported to {export_path}")
except Exception as e:
    print("Error saving model", e)

print("Data processing, training, and evaluation complete.")