# A. Dataset and Preprocessing

In [1]:
import os
import pandas as pd
import re
import string
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Define the text preprocessing function with stemming
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[{}]'.format(re.escape(string.punctuation)), '', text)
    text = text.encode("ascii", "ignore").decode()  # Remove non-ASCII characters
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Initialize the stemmer
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]  # Apply stemming to tokens
    
    processed_text = " ".join(tokens)
    return processed_text

# Load the datasets (assuming the downloaded files are in the current directory)
train_df = pd.read_csv('/kaggle/input/wiki-data/train.csv')
test_df = pd.read_csv('/kaggle/input/wiki-data/test.csv')

# Create a validation set by randomly sampling 500 articles from the training set
validation_df = train_df.sample(n=500, random_state=71)

# Remove the selected validation articles from the training set
train_df = train_df.drop(validation_df.index)

# Preprocess the 'text' column for both training and validation sets
train_df['processed_text'] = train_df['text'].apply(preprocess_text)
validation_df['processed_text'] = validation_df['text'].apply(preprocess_text)

# Saving the preprocessed datasets to new CSV files
train_df.to_csv('train_preprocessed.csv', index=False)
validation_df.to_csv('val_preprocessed.csv', index=False)

# Print the shape of the datasets
print("Training set shape:", train_df.shape)
print("Validation set shape:", validation_df.shape)

# Sample rows from the training set
print("Sample rows from the training set:")
print(train_df.sample(2))
print("Sample rows from the validation set:")
print(validation_df.sample(2))

print("Preprocessing complete. Training and validation sets are saved.")

Training set shape: (13379, 3)
Validation set shape: (500, 3)
Sample rows from the training set:
                   title                                               text  \
4314        Stefan Moore  Stefan Leroy Moore (born 28 September 1983) is...   
6214  Columbus, Nebraska  Columbus is a city in and the county seat of P...   

                                         processed_text  
4314  stefan leroy moor born 28 septemb 1983 english...  
6214  columbu citi counti seat platt counti state ne...  
Sample rows from the validation set:
                title                                               text  \
5646   FC Dynamo Kyiv  Football Club Dynamo Kyiv (, ) is a Ukrainian ...   
5676  Ottmar Hitzfeld  Ottmar Hitzfeld (; born 12 January 1949) is a ...   

                                         processed_text  
5646  footbal club dynamo kyiv ukrainian profession ...  
5676  ottmar hitzfeld born 12 januari 1949 german fo...  
Preprocessing complete. Training and validation set

# B1. Setting up a basic RNN Seq2seq model

### Imports, Package Installation, and Seed Setup

In [2]:
# Imports and package installation
import re
import random
import math
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Install ROUGE for evaluation
!pip install rouge-score -q
from rouge_score import rouge_scorer

# Set random seeds for reproducibility
random.seed(71)
np.random.seed(71)
torch.manual_seed(71)

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone


<torch._C.Generator at 0x79537d108410>

### Data Loading, Preprocessing, and Vocabulary Creation

In [3]:
# Load preprocessed training and validation data
# (Assuming 'train_preprocessed.csv' has columns 'processed_text' for article bodies and 'title' for target titles.)
train_df = pd.read_csv('train_preprocessed.csv')
validation_df = pd.read_csv('val_preprocessed.csv')

# Simple whitespace tokenizer function
def tokenize(text):
    return text.strip().split()

# Build vocabulary from training data based on tokens appearing in at least 1% of the documents.
min_occurrence = math.ceil(0.01 * len(train_df))

def build_vocab(texts, min_occurrence):
    counter = Counter()
    for text in texts:
        tokens = set(tokenize(text))  # count once per document
        counter.update(tokens)
    vocab = {token for token, count in counter.items() if count >= min_occurrence}
    return vocab

source_vocab = build_vocab(train_df['processed_text'], min_occurrence)
target_vocab = build_vocab(train_df['title'], min_occurrence)

print(len(source_vocab))
print(len(target_vocab))

# Create token-to-index mappings (reserve <pad>, <sos>, <eos>, <unk>)
def create_token2idx(vocab):
    token2idx = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    for token in sorted(vocab):
        token2idx[token] = len(token2idx)
    return token2idx

src_token2idx = create_token2idx(source_vocab)
tgt_token2idx = create_token2idx(target_vocab)

# For decoding, create an inverse mapping for target vocabulary
tgt_idx2token = {idx: token for token, idx in tgt_token2idx.items()}

# Define maximum sequence lengths for source and target
SRC_MAX_LEN = 300   # article body max tokens
TGT_MAX_LEN = 20    # title max tokens

# Function to convert text to list of indices
def numericalize(text, token2idx, add_sos_eos=False, max_len=None):
    tokens = tokenize(text)
    if add_sos_eos:
        tokens = ['<sos>'] + tokens + ['<eos>']
    indices = [token2idx.get(token, token2idx['<unk>']) for token in tokens]
    if max_len is not None:
        indices = indices[:max_len]
    return indices

8081
15


### Dataset and DataLoader Preparation

In [4]:
class WikiDataset(Dataset):
    def __init__(self, df, src_token2idx, tgt_token2idx, src_max_len=SRC_MAX_LEN, tgt_max_len=TGT_MAX_LEN):
        self.df = df
        self.src_token2idx = src_token2idx
        self.tgt_token2idx = tgt_token2idx
        self.src_max_len = src_max_len
        self.tgt_max_len = tgt_max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_text = self.df.iloc[idx]['processed_text']
        tgt_text = self.df.iloc[idx]['title']
        src_indices = numericalize(src_text, self.src_token2idx, add_sos_eos=False, max_len=self.src_max_len)
        tgt_indices = numericalize(tgt_text, self.tgt_token2idx, add_sos_eos=True, max_len=self.tgt_max_len)
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_lens = [len(x) for x in src_batch]
    tgt_lens = [len(x) for x in tgt_batch]
    src_pad = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=src_token2idx['<pad>'])
    tgt_pad = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=tgt_token2idx['<pad>'])
    return src_pad, torch.tensor(src_lens), tgt_pad, torch.tensor(tgt_lens)

BATCH_SIZE = 32
train_dataset = WikiDataset(train_df, src_token2idx, tgt_token2idx)
val_dataset = WikiDataset(validation_df, src_token2idx, tgt_token2idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

### Model Architecture – EncoderRNN, DecoderRNN, and Seq2seqRNN

In [5]:
HIDDEN_DIM = 300
EMB_DIM = 300

# 1. EncoderRNN class
class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, dropout=0.5):
        super(EncoderRNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=src_token2idx['<pad>'])
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_lengths):
        # src: [batch_size, src_len]
        embedded = self.dropout(self.embedding(src))
        # Pack padded sequence
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_outputs, hidden = self.gru(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
        # outputs: [batch_size, src_len, hidden_dim]
        # hidden: [1, batch_size, hidden_dim]
        return outputs, hidden

# 2. DecoderRNN class
class DecoderRNN(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, dropout=0.5):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=tgt_token2idx['<pad>'])
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
    def forward(self, input, hidden):
        # input: [batch_size] -> current token indices
        input = input.unsqueeze(1)  # [batch_size, 1]
        embedded = self.dropout(self.embedding(input))  # [batch_size, 1, emb_dim]
        output, hidden = self.gru(embedded, hidden)  # output: [batch_size, 1, hidden_dim]
        output = self.relu(output)
        prediction = self.fc_out(output.squeeze(1))  # [batch_size, output_dim]
        return prediction, hidden

# 3. Seq2seqRNN class
class Seq2seqRNN(nn.Module):
    def __init__(self, encoder, decoder, device):
        super(Seq2seqRNN, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, src_lengths, tgt, teacher_forcing_ratio=0.5):
        # src: [batch_size, src_len]
        # tgt: [batch_size, tgt_len] with <sos> as first token
        batch_size = src.size(0)
        tgt_len = tgt.size(1)
        tgt_vocab_size = self.decoder.embedding.num_embeddings
        
        # Tensor to store decoder outputs
        outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
        
        encoder_outputs, hidden = self.encoder(src, src_lengths)
        # First input token to decoder is <sos>
        input = tgt[:, 0]  # [batch_size]
        
        for t in range(1, tgt_len):
            output, hidden = self.decoder(input, hidden)
            outputs[:, t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.argmax(1)
            input = tgt[:, t] if teacher_force else top1
        return outputs

# Instantiate the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
INPUT_DIM = len(src_token2idx)
OUTPUT_DIM = len(tgt_token2idx)

encoder = EncoderRNN(INPUT_DIM, EMB_DIM, HIDDEN_DIM).to(device)
decoder = DecoderRNN(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM).to(device)
model = Seq2seqRNN(encoder, decoder, device).to(device)

### Training and Evaluation Functions

In [6]:
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_token2idx['<pad>'])

def train(model, loader, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    for src, src_lens, tgt, tgt_lens in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt)  # output: [batch_size, tgt_len, output_dim]
        output_dim = output.shape[-1]
        # Exclude the first token (<sos>) for loss computation
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, src_lens, tgt, tgt_lens in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, src_lens, tgt, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, clip=CLIP)
    valid_loss = evaluate(model, val_loader, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq_model.pt')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val Loss: {valid_loss:.3f}')

Epoch: 01 | Train Loss: 0.698 | Val Loss: 0.706
Epoch: 02 | Train Loss: 0.531 | Val Loss: 0.534
Epoch: 03 | Train Loss: 0.404 | Val Loss: 0.410
Epoch: 04 | Train Loss: 0.346 | Val Loss: 0.406
Epoch: 05 | Train Loss: 0.312 | Val Loss: 0.394
Epoch: 06 | Train Loss: 0.289 | Val Loss: 0.367
Epoch: 07 | Train Loss: 0.267 | Val Loss: 0.372
Epoch: 08 | Train Loss: 0.249 | Val Loss: 0.364
Epoch: 09 | Train Loss: 0.236 | Val Loss: 0.368
Epoch: 10 | Train Loss: 0.227 | Val Loss: 0.355


### Inference and ROUGE Evaluation

In [7]:
def generate_title(model, src_sequence, src_length, max_len=TGT_MAX_LEN):
    model.eval()
    src_sequence = src_sequence.unsqueeze(0).to(device)  # [1, src_len]
    src_length = torch.tensor([src_length]).to(device)
    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_sequence, src_length)
    # Start with <sos> token
    input_token = torch.tensor([tgt_token2idx['<sos>']]).to(device)
    generated_tokens = []
    for _ in range(max_len):
        with torch.no_grad():
            output, hidden = model.decoder(input_token, hidden)
            top1 = output.argmax(1)
        if top1.item() == tgt_token2idx['<eos>']:
            break
        generated_tokens.append(top1.item())
        input_token = top1
    title = " ".join([tgt_idx2token.get(idx, '<unk>') for idx in generated_tokens])
    return title

# Load test set (assuming a 'text' column exists)
test_df = pd.read_csv('/kaggle/input/wiki-data/test.csv')
# If not preprocessed, apply a simple preprocessing (e.g., lowercasing and punctuation removal)
if 'processed_text' not in test_df.columns:
    test_df['processed_text'] = test_df['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x.lower().encode('ascii', errors='ignore').decode()))

predictions = []
references = []  # Populate if test set includes reference titles

for idx, row in test_df.iterrows():
    src_indices = numericalize(row['processed_text'], src_token2idx, add_sos_eos=False, max_len=SRC_MAX_LEN)
    title_pred = generate_title(model, torch.tensor(src_indices), len(src_indices))
    predictions.append(title_pred)
    # If test set has reference titles in a 'title' column
    if 'title' in test_df.columns:
        references.append(row['title'])
    else:
        references.append("")

# Compute ROUGE scores if reference titles exist
if any(references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
    for ref, pred in zip(references, predictions):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    avg_rouge1 = np.mean(rouge1_scores)
    avg_rouge2 = np.mean(rouge2_scores)
    avg_rougeL = np.mean(rougeL_scores)
    print(f"ROUGE-1 F1: {avg_rouge1:.4f}")
    print(f"ROUGE-2 F1: {avg_rouge2:.4f}")
    print(f"ROUGE-L F1: {avg_rougeL:.4f}")
else:
    print("No reference titles available for ROUGE evaluation.")

print("Title generation complete.")

ROUGE-1 F1: 0.0327
ROUGE-2 F1: 0.0000
ROUGE-L F1: 0.0327
Title generation complete.


# B2. Improving the RNN model

### Imports

In [8]:
import os
import re
import random
import math
import numpy as np
from collections import Counter

import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Install ROUGE for evaluation
!pip install rouge-score -q
from rouge_score import rouge_scorer

# Set seeds for reproducibility
random.seed(71)
np.random.seed(71)
torch.manual_seed(71)

<torch._C.Generator at 0x79537d108410>

### Data Loading, Preprocessing, and Vocabulary Creation

In [9]:
# Assume train_preprocessed.csv and val_preprocessed.csv exist with columns 'processed_text' and 'title'
train_df = pd.read_csv('train_preprocessed.csv')
validation_df = pd.read_csv('val_preprocessed.csv')

def tokenize(text):
    return text.strip().split()

min_occurrence = math.ceil(0.01 * len(train_df))
def build_vocab(texts, min_occurrence):
    counter = Counter()
    for text in texts:
        tokens = set(tokenize(text))
        counter.update(tokens)
    vocab = {token for token, count in counter.items() if count >= min_occurrence}
    return vocab

source_vocab = build_vocab(train_df['processed_text'], min_occurrence)
target_vocab = build_vocab(train_df['title'], min_occurrence)

def create_token2idx(vocab):
    token2idx = {'<pad>': 0, '<sos>': 1, '<eos>': 2, '<unk>': 3}
    for token in sorted(vocab):
        token2idx[token] = len(token2idx)
    return token2idx

src_token2idx = create_token2idx(source_vocab)
tgt_token2idx = create_token2idx(target_vocab)
tgt_idx2token = {idx: token for token, idx in tgt_token2idx.items()}

SRC_MAX_LEN = 300  # maximum tokens for article body
TGT_MAX_LEN = 20   # maximum tokens for title

def numericalize(text, token2idx, add_sos_eos=False, max_len=None):
    tokens = tokenize(text)
    if add_sos_eos:
        tokens = ['<sos>'] + tokens + ['<eos>']
    indices = [token2idx.get(token, token2idx['<unk>']) for token in tokens]
    if max_len is not None:
        indices = indices[:max_len]
    return indices

### Dataset and DataLoader Preparation

In [10]:
class WikiDataset(Dataset):
    def __init__(self, df, src_token2idx, tgt_token2idx, src_max_len=SRC_MAX_LEN, tgt_max_len=TGT_MAX_LEN):
        self.df = df
        self.src_token2idx = src_token2idx
        self.tgt_token2idx = tgt_token2idx
        self.src_max_len = src_max_len
        self.tgt_max_len = tgt_max_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        src_text = self.df.iloc[idx]['processed_text']
        tgt_text = self.df.iloc[idx]['title']
        src_indices = numericalize(src_text, self.src_token2idx, add_sos_eos=False, max_len=self.src_max_len)
        tgt_indices = numericalize(tgt_text, self.tgt_token2idx, add_sos_eos=True, max_len=self.tgt_max_len)
        return torch.tensor(src_indices), torch.tensor(tgt_indices)

def collate_fn(batch):
    src_batch, tgt_batch = zip(*batch)
    src_lens = [len(x) for x in src_batch]
    tgt_lens = [len(x) for x in tgt_batch]
    src_pad = nn.utils.rnn.pad_sequence(src_batch, batch_first=True, padding_value=src_token2idx['<pad>'])
    tgt_pad = nn.utils.rnn.pad_sequence(tgt_batch, batch_first=True, padding_value=tgt_token2idx['<pad>'])
    return src_pad, torch.tensor(src_lens), tgt_pad, torch.tensor(tgt_lens)

BATCH_SIZE = 32
train_dataset = WikiDataset(train_df, src_token2idx, tgt_token2idx)
val_dataset = WikiDataset(validation_df, src_token2idx, tgt_token2idx)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)


### Model Architecture

#### EncoderRNN with GloVe Loader

In [11]:
HIDDEN_DIM = 300
EMB_DIM = 300

class EncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, dropout=0.5):
        super(EncoderRNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=src_token2idx['<pad>'])
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
    
    def load_embeddings(self, glove_path, token2idx):
        """
        glove_path: path to GloVe file (e.g., 'glove.6B.300d.txt')
        token2idx: vocabulary mapping used in this encoder.
        """
        print("Loading GloVe embeddings...")
        # Create a random embedding matrix
        embedding_matrix = np.random.normal(size=(len(token2idx), EMB_DIM))
        # Read the GloVe file and update the matrix for known tokens
        with open(glove_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split()
                word = parts[0]
                if word in token2idx:
                    vector = np.array(parts[1:], dtype='float32')
                    embedding_matrix[token2idx[word]] = vector
        # Set the embedding weights and freeze them if desired
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        print("GloVe embeddings loaded.")
    
    def forward(self, src, src_lengths):
        embedded = self.dropout(self.embedding(src))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_outputs, hidden = self.gru(packed_embedded)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs, batch_first=True)
        return outputs, hidden

# Define the expected path for the GloVe file.
glove_path = '/kaggle/working/glove.6B.300d.txt'

# If the GloVe file does not exist, download and unzip it.
if not os.path.exists(glove_path):
    print("GloVe file not found. Downloading and extracting...")
    # Download the GloVe zip file from the Stanford NLP website.
    !wget http://nlp.stanford.edu/data/glove.6B.zip -O /kaggle/working/glove.6B.zip
    # Unzip the downloaded file into the working directory.
    !unzip /kaggle/working/glove.6B.zip -d /kaggle/working/
    print("GloVe file downloaded and extracted.")
else:
    print("GloVe file found.")

# Instantiate the encoder (input_dim equals the size of src_token2idx)
encoder = EncoderRNN(input_dim=len(src_token2idx), emb_dim=EMB_DIM, hidden_dim=HIDDEN_DIM)

# Load GloVe embeddings into the encoder.
encoder.load_embeddings(glove_path, src_token2idx)

GloVe file not found. Downloading and extracting...
--2025-03-27 15:36:28--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-27 15:36:28--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-27 15:36:29--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822

#### HierEncoderRNN

In [12]:
class HierEncoderRNN(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, dropout=0.5):
        super(HierEncoderRNN, self).__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim, padding_idx=src_token2idx['<pad>'])
        self.sentence_gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.document_gru = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, src, src_lengths):
        # Here, we assume each article is a string of tokens that we split into sentences by period.
        # In practice, you should pre-segment sentences.
        batch_size = src.size(0)
        outputs_all = []
        hidden_sentences = []
        for i in range(batch_size):
            # Convert indices back to tokens (simple join, then split by period)
            tokens = [tgt_idx2token.get(idx.item(), '<unk>') for idx in src[i]]
            text = " ".join(tokens)
            sentences = text.split('.')
            sentence_embeddings = []
            for sent in sentences:
                sent_tokens = tokenize(sent.strip())
                if not sent_tokens:
                    continue
                # Convert sentence tokens to indices (ignoring OOV for simplicity)
                indices = [src_token2idx.get(token, src_token2idx['<unk>']) for token in sent_tokens]
                sent_tensor = torch.tensor(indices).unsqueeze(0).to(src.device)
                sent_emb = self.dropout(self.embedding(sent_tensor))
                _, sent_hidden = self.sentence_gru(sent_emb)
                sentence_embeddings.append(sent_hidden.squeeze(0))
            if len(sentence_embeddings) == 0:
                sentence_embeddings.append(torch.zeros(self.sentence_gru.hidden_size).to(src.device))
            sentence_embeddings = torch.stack(sentence_embeddings).unsqueeze(0)  # [1, num_sent, hidden_dim]
            _, doc_hidden = self.document_gru(sentence_embeddings)
            outputs_all.append(sentence_embeddings)
            hidden_sentences.append(doc_hidden)
        # Stack hidden states
        hidden = torch.stack(hidden_sentences, dim=1)  # [1, batch, hidden_dim]
        # Note: outputs are not used in our basic decoder; only final hidden state is returned.
        return None, hidden

#### DecoderRNN and Decoder2RNN

In [13]:
# Standard decoder (already defined before)
class DecoderRNN(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, dropout=0.5):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=tgt_token2idx['<pad>'])
        self.gru = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded, hidden)
        output = self.relu(output)
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden

# Alternative decoder with two GRU layers
class Decoder2RNN(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, dropout=0.5):
        super(Decoder2RNN, self).__init__()
        self.embedding = nn.Embedding(output_dim, emb_dim, padding_idx=tgt_token2idx['<pad>'])
        self.gru1 = nn.GRU(emb_dim, hidden_dim, batch_first=True)
        self.gru2 = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()
        
    def forward(self, input, hidden):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output1, hidden1 = self.gru1(embedded, hidden)
        output2, hidden2 = self.gru2(output1, hidden1)
        output2 = self.relu(output2)
        prediction = self.fc_out(output2.squeeze(1))
        return prediction, hidden2

#### Seq2seqRNN with Encoder/Decoder Choice and Beam Search

In [14]:
class Seq2seqRNN(nn.Module):
    def __init__(self, encoder, decoder, device, use_beam_search=False):
        """
        encoder: an instance of EncoderRNN or HierEncoderRNN
        decoder: an instance of DecoderRNN or Decoder2RNN
        use_beam_search: default decoding mode is greedy if False.
        """
        super(Seq2seqRNN, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.use_beam_search = use_beam_search

    def forward(self, src, src_lengths, tgt, teacher_forcing_ratio=0.5, beam_width=3):
        # If not using beam search, perform standard (greedy) decoding during training.
        if not self.use_beam_search:
            batch_size = src.size(0)
            tgt_len = tgt.size(1)
            tgt_vocab_size = self.decoder.embedding.num_embeddings
            outputs = torch.zeros(batch_size, tgt_len, tgt_vocab_size).to(self.device)
            encoder_outputs, hidden = self.encoder(src, src_lengths)
            input = tgt[:, 0]  # <sos>
            for t in range(1, tgt_len):
                output, hidden = self.decoder(input, hidden)
                outputs[:, t] = output
                teacher_force = random.random() < teacher_forcing_ratio
                top1 = output.argmax(1)
                input = tgt[:, t] if teacher_force else top1
            return outputs
        else:
            # Beam search decoding (applied at inference only)
            return self.beam_search_decode(src, src_lengths, beam_width=beam_width)

    def beam_search_decode(self, src, src_lengths, beam_width=3, max_len=TGT_MAX_LEN):
        self.encoder.eval()
        self.decoder.eval()
        batch_size = src.size(0)
        # We assume batch_size=1 for beam search decoding
        assert batch_size == 1, "Beam search decoding is implemented for batch_size=1."
        encoder_outputs, hidden = self.encoder(src, src_lengths)
        # Start with <sos>
        init_token = tgt_token2idx['<sos>']
        beams = [([init_token], hidden, 0)]  # (sequence, hidden state, cumulative log prob)

        for _ in range(max_len):
            new_beams = []
            for seq, hidden_state, score in beams:
                last_token = torch.tensor([seq[-1]]).to(self.device)
                # Stop if <eos> already generated
                if seq[-1] == tgt_token2idx['<eos>']:
                    new_beams.append((seq, hidden_state, score))
                    continue
                output, hidden_new = self.decoder(last_token, hidden_state)
                log_probs = torch.log_softmax(output, dim=1)
                top_log_probs, top_indices = log_probs.topk(beam_width)
                for log_prob, idx in zip(top_log_probs[0], top_indices[0]):
                    new_seq = seq + [idx.item()]
                    new_score = score + log_prob.item()
                    new_beams.append((new_seq, hidden_new, new_score))
            # Keep top beams
            beams = sorted(new_beams, key=lambda x: x[2], reverse=True)[:beam_width]
            # If all beams ended with <eos>, break early
            if all(seq[-1] == tgt_token2idx['<eos>'] for seq, _, _ in beams):
                break
        # Return the highest scoring sequence (excluding the <sos>)
        best_seq = beams[0][0][1:]
        return best_seq  # list of token indices

### Training and Evaluation Functions

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(ignore_index=tgt_token2idx['<pad>'])

def train(model, loader, optimizer, criterion, clip=1):
    model.train()
    epoch_loss = 0
    for src, src_lens, tgt, tgt_lens in loader:
        src, tgt = src.to(device), tgt.to(device)
        optimizer.zero_grad()
        output = model(src, src_lens, tgt, teacher_forcing_ratio=0.5)
        output_dim = output.shape[-1]
        output = output[:, 1:].reshape(-1, output_dim)
        tgt = tgt[:, 1:].reshape(-1)
        loss = criterion(output, tgt)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()
        epoch_loss += loss.item()
    return epoch_loss / len(loader)

def evaluate(model, loader, criterion):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for src, src_lens, tgt, tgt_lens in loader:
            src, tgt = src.to(device), tgt.to(device)
            output = model(src, src_lens, tgt, teacher_forcing_ratio=0)
            output_dim = output.shape[-1]
            output = output[:, 1:].reshape(-1, output_dim)
            tgt = tgt[:, 1:].reshape(-1)
            loss = criterion(output, tgt)
            epoch_loss += loss.item()
    return epoch_loss / len(loader)

N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')
for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, clip=CLIP)
    valid_loss = evaluate(model, val_loader, criterion)
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'seq2seq_model.pt')
    print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val Loss: {valid_loss:.3f}')

Epoch: 01 | Train Loss: 0.218 | Val Loss: 0.394
Epoch: 02 | Train Loss: 0.205 | Val Loss: 0.369
Epoch: 03 | Train Loss: 0.199 | Val Loss: 0.379
Epoch: 04 | Train Loss: 0.190 | Val Loss: 0.409
Epoch: 05 | Train Loss: 0.185 | Val Loss: 0.411
Epoch: 06 | Train Loss: 0.180 | Val Loss: 0.432
Epoch: 07 | Train Loss: 0.176 | Val Loss: 0.396
Epoch: 08 | Train Loss: 0.170 | Val Loss: 0.411
Epoch: 09 | Train Loss: 0.169 | Val Loss: 0.422
Epoch: 10 | Train Loss: 0.162 | Val Loss: 0.411


### Inference and ROUGE Evaluation

In [19]:
def generate_title(model, src_sequence, src_length, use_beam_search_flag=False, beam_width=3):
    model.eval()
    src_sequence = src_sequence.unsqueeze(0).to(device)
    src_length = torch.tensor([src_length]).to(device)
    
    if not use_beam_search_flag:
        with torch.no_grad():
            encoder_outputs, hidden = model.encoder(src_sequence, src_length)
        input_token = torch.tensor([tgt_token2idx['<sos>']]).to(device)
        generated_tokens = []
        for _ in range(TGT_MAX_LEN):
            with torch.no_grad():
                output, hidden = model.decoder(input_token, hidden)
                top1 = output.argmax(1)
            if top1.item() == tgt_token2idx['<eos>']:
                break
            generated_tokens.append(top1.item())
            input_token = top1
    else:
        with torch.no_grad():
            best_seq = model.beam_search_decode(src_sequence, src_length, beam_width=beam_width, max_len=TGT_MAX_LEN)
        generated_tokens = best_seq
    
    title = " ".join([tgt_idx2token.get(idx, '<unk>') for idx in generated_tokens])
    return title

# Load test set (assumes a 'text' column exists)
test_df = pd.read_csv('/kaggle/input/wiki-data/test.csv')
if 'processed_text' not in test_df.columns:
    test_df['processed_text'] = test_df['text'].apply(lambda x: re.sub(r'[^\w\s]', ' ', x.lower().encode('ascii', errors='ignore').decode()))

predictions = []
references = []  # populate if test set has reference titles

for idx, row in test_df.iterrows():
    src_indices = numericalize(row['processed_text'], src_token2idx, add_sos_eos=False, max_len=SRC_MAX_LEN)
    title_pred = generate_title(model, torch.tensor(src_indices), len(src_indices), use_beam_search_flag=False, beam_width=3)
    predictions.append(title_pred)
    if 'title' in test_df.columns:
        references.append(row['title'])
    else:
        references.append("")

if any(references):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores, rouge2_scores, rougeL_scores = [], [], []
    for ref, pred in zip(references, predictions):
        scores = scorer.score(ref, pred)
        rouge1_scores.append(scores['rouge1'].fmeasure)
        rouge2_scores.append(scores['rouge2'].fmeasure)
        rougeL_scores.append(scores['rougeL'].fmeasure)
    print(f"ROUGE-1 F1: {np.mean(rouge1_scores):.4f}")
    print(f"ROUGE-2 F1: {np.mean(rouge2_scores):.4f}")
    print(f"ROUGE-L F1: {np.mean(rougeL_scores):.4f}")
else:
    print("No reference titles available for ROUGE evaluation.")

print("Title generation complete.")

ROUGE-1 F1: 0.0397
ROUGE-2 F1: 0.0033
ROUGE-L F1: 0.0397
Title generation complete.


# C1. Attention is all you need!

### Imports and Environment Setup

In [21]:
import os
import numpy as np
import pandas as pd
!pip install evaluate
import evaluate
import nltk
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM, 
                          Seq2SeqTrainer, Seq2SeqTrainingArguments, DataCollatorForSeq2Seq)
import torch

# Set random seeds for reproducibility
np.random.seed(71)

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


### Load Pretrained T5 Model

In [22]:
model_name = "google-t5/t5-small"  # Pretrained T5-small
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Load Raw Dataset

In [23]:
# Load the training data
train_df = pd.read_csv('/kaggle/input/wiki-data/train.csv')

# Extract a validation set (e.g., 10% of the training data)
validation_ratio = 0.1
val_df = train_df.sample(frac=validation_ratio, random_state=42)
train_df = train_df.drop(val_df.index).reset_index(drop=True)
val_df = val_df.reset_index(drop=True)

print("Training set size:", len(train_df))
print("Validation set size:", len(validation_df))


test_df = pd.read_csv('/kaggle/input/wiki-data/test.csv')

Training set size: 12491
Validation set size: 500


### Prepare the Hugging Face Dataset and Tokenizer

In [24]:
def preprocess_function(example):
    # Create the input by prepending a task-specific prefix to the text.
    input_text = "generate title: " + example["text"]
    target_text = example["title"]
    
    # Tokenize inputs and targets; truncate inputs to max_length=512.
    model_inputs = tokenizer(input_text, max_length=512, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(target_text, max_length=64, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Convert dataframes to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

# Map preprocessing over the dataset
train_dataset = train_dataset.map(preprocess_function, batched=False)
val_dataset = val_dataset.map(preprocess_function, batched=False)
test_dataset = test_dataset.map(preprocess_function, batched=False)

datasets = DatasetDict({"train": train_dataset, "validation": val_dataset, "test": test_dataset})

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Map:   0%|          | 0/12491 [00:00<?, ? examples/s]



Map:   0%|          | 0/1388 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

### Set up Training Arguments & Trainer

In [25]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./t5-finetuned-titlegen",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=3,
    predict_with_generate=True,
    fp16=True,  # if using GPU with mixed precision
    logging_dir='./logs',
)

# ROUGE metric: using the evaluate library
rouge_metric = evaluate.load("rouge")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in labels so they can be decoded
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Compute ROUGE scores.
    result = rouge_metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Multiply each score by 100 to get percentages
    result = {key: value * 100 for key, value in result.items()}
    return result

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)



Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

  trainer = Seq2SeqTrainer(


### Generate Predictions and Evaluate ROUGE

In [26]:
def generate_predictions(dataset, decoding_method="greedy", beam_size=5):
    generated_titles = []
    references = []
    for example in dataset:
        input_ids = example["input_ids"]
        input_tensor = torch.tensor([input_ids]).to(model.device)
        if decoding_method == "greedy":
            outputs = model.generate(input_tensor, max_length=64)
        elif decoding_method == "beam":
            outputs = model.generate(input_tensor, max_length=64, num_beams=beam_size, early_stopping=True)
        else:
            raise ValueError("Unknown decoding method")
        pred = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_titles.append(pred)
        ref = tokenizer.decode(example["labels"], skip_special_tokens=True)
        references.append(ref)
    return generated_titles, references

sample_test = test_dataset.select(range(50))

greedy_preds, greedy_refs = generate_predictions(sample_test, decoding_method="greedy")
beam_preds, beam_refs = generate_predictions(sample_test, decoding_method="beam", beam_size=5)

rouge_scores_greedy = rouge_metric.compute(predictions=greedy_preds, references=greedy_refs, use_stemmer=True)
rouge_scores_beam = rouge_metric.compute(predictions=beam_preds, references=beam_refs, use_stemmer=True)

print("Greedy Decoding ROUGE Scores:")
for key, score in rouge_scores_greedy.items():
    print(f"{key}: {score*100:.2f}")

print("\nBeam Search Decoding ROUGE Scores:")
for key, score in rouge_scores_beam.items():
    print(f"{key}: {score*100:.2f}")

Greedy Decoding ROUGE Scores:
rouge1: 8.57
rouge2: 2.49
rougeL: 8.38
rougeLsum: 8.45

Beam Search Decoding ROUGE Scores:
rouge1: 8.08
rouge2: 2.33
rougeL: 7.77
rougeLsum: 7.81


# C2. Prompt engineering

### Imports

In [27]:
import evaluate
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

### Load sample dataset

In [28]:
# Load ROUGE metric
rouge_metric = evaluate.load("rouge")


# Assume test CSV has columns "text" (article body) and "title" (reference title)
test_df = pd.read_csv("/kaggle/input/wiki-data/test.csv")
# For demonstration, select a small sample (e.g., 10 examples)
sample_test = test_df.sample(n=10, random_state=71)

### Define Prompt variations

In [29]:
# Define two prompt variations
prompts = [
    "Generate a concise title for the following article: ",
    "Write an appropriate, short title for this article: "
]

def generate_titles(model, tokenizer, article_text, prompt):
    # Construct input by concatenating prompt and article text.
    # Optionally, you may truncate article_text to fit model's max_length.
    input_text = prompt + article_text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Generate title with greedy decoding
    outputs = model.generate(**inputs, max_length=64)
    title = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return title

def evaluate_titles(generated, references):
    result = rouge_metric.compute(predictions=generated, references=references, use_stemmer=True)
    # Multiply by 100 for percentages
    result = {key: value * 100 for key, value in result.items()}
    return result

### Load Flan-T5 models

In [30]:
#  (base and large) along with their tokenizers
model_names = {
    "flan-t5-base": "google/flan-t5-base",
    "flan-t5-large": "google/flan-t5-large"
}

results = {}

for model_key, model_id in model_names.items():
    print(f"\nLoading model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    for prompt in prompts:
        print(f"\nPrompt Variation: '{prompt}'")
        generated_titles = []
        reference_titles = []  # from dataset for evaluation
        for _, row in sample_test.iterrows():
            article_text = row["text"]
            ref_title = row["title"]
            gen_title = generate_titles(model, tokenizer, article_text, prompt)
            generated_titles.append(gen_title)
            reference_titles.append(ref_title)
            print(f"Article: {article_text[:100]}...")
            print(f"Generated Title: {gen_title}")
            print(f"Reference Title: {ref_title}")
            print("-"*50)
        
        rouge_scores = evaluate_titles(generated_titles, reference_titles)
        print("ROUGE Scores:")
        for key, score in rouge_scores.items():
            print(f"{key}: {score:.2f}")
        
        # Store results for later reference
        results[(model_key, prompt)] = {
            "generated_titles": generated_titles,
            "reference_titles": reference_titles,
            "rouge": rouge_scores
        }


Loading model: google/flan-t5-base


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Prompt Variation: 'Generate a concise title for the following article: '
Article: Gottschalk or Godescalc (Old High German) is a male German name that can be translated literally as ...
Generated Title: Gottschalk
Reference Title: Gottschalk
--------------------------------------------------
Article: Indiana Wesleyan University (IWU) is a private evangelical Christian university headquartered in Mar...
Generated Title: Indiana Wesleyan University
Reference Title: Indiana Wesleyan University
--------------------------------------------------
Article: Abia State () is a state in the South-East geopolitical zone of Nigeria, bordered to the north and n...
Generated Title: Abia State
Reference Title: Abia State
--------------------------------------------------
Article: Paul Thompson may refer to:

Education
Paul Thompson (professor) (born 1951), British management pro...
Generated Title: Paul Thompson may also refer to:
Reference Title: Paul Thompson
--------------------------------------

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]


Prompt Variation: 'Generate a concise title for the following article: '
Article: Gottschalk or Godescalc (Old High German) is a male German name that can be translated literally as ...
Generated Title: Gottschalk
Reference Title: Gottschalk
--------------------------------------------------
Article: Indiana Wesleyan University (IWU) is a private evangelical Christian university headquartered in Mar...
Generated Title: Indiana Wesleyan University
Reference Title: Indiana Wesleyan University
--------------------------------------------------
Article: Abia State () is a state in the South-East geopolitical zone of Nigeria, bordered to the north and n...
Generated Title: Abia State
Reference Title: Abia State
--------------------------------------------------
Article: Paul Thompson may refer to:

Education
Paul Thompson (professor) (born 1951), British management pro...
Generated Title: Paul Thompson
Reference Title: Paul Thompson
--------------------------------------------------
Articl