# Generate CSV file from the Original raw text files downloaded from this 
# link => http://mlg.ucd.ie/datasets/bbc.html 

In [1]:
import os
import pandas as pd

data = []
base_path = "C:/Users/diwan/Downloads/bbc-fulltext/bbc"

for category in ["business", "entertainment", "politics", "sport", "tech"]:
    category_path = os.path.join(base_path, category)
    files = os.listdir(category_path)
    
    for file in files:
        file_path = os.path.join(category_path, file)
        with open(file_path, "r", encoding="latin1") as f:
            text = f.read()
        data.append({"text": text, "category": category})

df = pd.DataFrame(data)
df.to_csv("bbc_news.csv", index=False)
print("Saved bbc_news.csv successfully!")

Saved bbc_news.csv successfully!


In [2]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu, sentence_bleu, SmoothingFunction
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\diwan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<span style="font-size: 30px;">1) Data Preparation</span>

In [3]:
# Load data
df = pd.read_csv('bbc_news.csv')

# Preprocessing
def preprocess_text(text):
    # Remove remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Lowercasing
    tokens = [token.lower() for token in tokens]
    
    return ' '.join(tokens)

df['clean_text'] = df['text'].apply(preprocess_text)

df.head(8)

Unnamed: 0,text,category,clean_text
0,Ad sales boost Time Warner profit\n\nQuarterly...,business,ad sales boost time warner profit quarterly pr...
1,Dollar gains on Greenspan speech\n\nThe dollar...,business,dollar gains on greenspan speech the dollar ha...
2,Yukos unit buyer faces loan claim\n\nThe owner...,business,yukos unit buyer faces loan claim the owners o...
3,High fuel prices hit BA's profits\n\nBritish A...,business,high fuel prices hit bas profits british airwa...
4,Pernod takeover talk lifts Domecq\n\nShares in...,business,pernod takeover talk lifts domecq shares in uk...
5,Japan narrowly escapes recession\n\nJapan's ec...,business,japan narrowly escapes recession japans econom...
6,Jobs growth still slow in the US\n\nThe US cre...,business,jobs growth still slow in the us the us create...
7,"India calls for fair trade rules\n\nIndia, whi...",business,india calls for fair trade rules india which a...


<span style="font-size: 30px;">2) Text-only Headline Generator (Sequence Model) using GRU </span>

In [4]:
headlines = ['<start> ' + h + ' <end>' for h in df['category'].tolist()]

# Tokenization
top_words = 8000
max_text_len = 100
max_headline_len = 15

tokenizer = Tokenizer(num_words=top_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['clean_text'].tolist() + headlines)
vocab_size = min(top_words, len(tokenizer.word_index)) + 1

# Sequence Preparation
X = pad_sequences(tokenizer.texts_to_sequences(df['clean_text']), 
                  maxlen=max_text_len, padding='post')
y = pad_sequences(tokenizer.texts_to_sequences(headlines), 
                 maxlen=max_headline_len, padding='post')

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# GRU Model Architecture
model = Sequential([
    Embedding(vocab_size, 256),
    GRU(512, return_sequences=True),
    Dropout(0.3),
    GRU(256),
    Dropout(0.3),
    Dense(vocab_size, activation='softmax')
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Training with Teacher Forcing
def train_generator(X, y, batch_size=32):
    while True:
        for i in range(0, len(X), batch_size):
            batch_X = X[i:i+batch_size]
            batch_y = y[i:i+batch_size, 0]  # First word prediction
            yield batch_X, batch_y

history = model.fit(train_generator(X_train, y_train),
                    steps_per_epoch=len(X_train)//32,
                    epochs=10,
                    validation_data=(X_test, y_test[:, 0]))

# Headline Generation
def generate_headline(seed_text, max_words=15):
    seed_seq = tokenizer.texts_to_sequences([seed_text])[0]
    seed_seq = pad_sequences([seed_seq], maxlen=max_text_len, padding='post')
    
    output_words = []
    for _ in range(max_words):
        predicted = model.predict(seed_seq, verbose=0)
        predicted_word_idx = np.argmax(predicted, axis=-1)[0]
        predicted_word = tokenizer.index_word.get(predicted_word_idx, '<OOV>')
        
        if predicted_word == '<end>':
            break
            
        output_words.append(predicted_word)
        seed_seq = np.append(seed_seq[:, 1:], [[predicted_word_idx]], axis=1)
    
    return ' '.join(output_words)


Epoch 1/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 801ms/step - accuracy: 0.9176 - loss: 3.9606 - val_accuracy: 1.0000 - val_loss: 0.0012
Epoch 2/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 885ms/step - accuracy: 1.0000 - loss: 0.0013 - val_accuracy: 1.0000 - val_loss: 9.0763e-04
Epoch 3/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 802ms/step - accuracy: 1.0000 - loss: 0.0011 - val_accuracy: 1.0000 - val_loss: 7.9957e-04
Epoch 4/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 776ms/step - accuracy: 1.0000 - loss: 9.5513e-04 - val_accuracy: 1.0000 - val_loss: 6.9591e-04
Epoch 5/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 798ms/step - accuracy: 1.0000 - loss: 8.4875e-04 - val_accuracy: 1.0000 - val_loss: 6.0417e-04
Epoch 6/10
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 816ms/step - accuracy: 1.0000 - loss: 7.0497e-04 - val_accuracy: 1.0000 - val_loss: 5

<span style="font-size: 20px;">BLEU Score Evaluation</span>

In [6]:
def evaluate_bleu(X_test, y_test, n_samples=100):
    references = []
    hypotheses = []
    smoothie = SmoothingFunction().method4
    
    y_test_texts = tokenizer.sequences_to_texts(y_test)
    
    for i in range(min(n_samples, len(X_test))):
        # Get true headline (remove special tokens and split into words)
        true_headline = ' '.join([word for word in y_test_texts[i].split() 
                                if word not in ['<start>', '<end>', '']])
        true_tokens = true_headline.split()  # Convert to list of words
        
        # Generate predicted headline
        input_text = tokenizer.sequences_to_texts([X_test[i]])[0]
        pred_headline = generate_headline(input_text)
        pred_tokens = pred_headline.split()  # Convert to list of words
        
        # References must be list of lists (for multiple references)
        references.append([true_tokens])  # Note the double list
        hypotheses.append(pred_tokens)    # Single list
    
    # Calculate scores
    corpus_score = corpus_bleu(references, hypotheses, smoothing_function=smoothie)
    sentence_scores = [sentence_bleu(ref, hyp, smoothing_function=smoothie) 
                      for ref, hyp in zip(references, hypotheses)]
    
    return {
        'corpus_bleu': corpus_score,
        'average_sentence_bleu': np.mean(sentence_scores),
        'sentence_scores': sentence_scores
    }

# Run Evaluation
bleu_results = evaluate_bleu(X_test, y_test)
print(f"Corpus BLEU: {bleu_results['corpus_bleu']:.4f}")
print(f"Average Sentence BLEU: {bleu_results['average_sentence_bleu']:.4f}")
print(f"Sample Scores: {bleu_results['sentence_scores'][:5]}")


Corpus BLEU: 0.0011
Average Sentence BLEU: 0.0166
Sample Scores: [0.016591439325163958, 0.016591439325163958, 0.016591439325163958, 0.016591439325163958, 0.016591439325163958]


In [7]:
# Example Generations
print("\n=== Example Generations ===")

for i in range(min(5, len(X_test))):
    # Get the input text
    input_text = tokenizer.sequences_to_texts([X_test[i]])[0]
    
    # Generate headline
    generated_headline = generate_headline(input_text)
    
    # Get actual headline (from y_test)
    true_headline_tokens = [tokenizer.index_word.get(idx, '') for idx in y_test[i] if idx != 0]
    true_headline = ' '.join([word for word in true_headline_tokens if word not in ['<start>', '<end>']])
    
    # Get the article category
    category = df.iloc[int(np.where(X == X_test[i])[0][0])]['category']
    
    print(f"\nExample {i+1}:")
    print(f"Category: {category}")
    print(f"Input Text (first 50 chars): {input_text[:50]}...")
    print(f"Generated Headline: {generated_headline}")
    print(f"Actual Headline: {true_headline}")
    print(f"BLEU Score: {bleu_results['sentence_scores'][i]:.4f}")

# Additional statistics
print("\n=== Performance Summary ===")
print(f"Median BLEU: {np.median(bleu_results['sentence_scores']):.4f}")
print(f"BLEU Range: {min(bleu_results['sentence_scores']):.4f} - {max(bleu_results['sentence_scores']):.4f}")


=== Example Generations ===

Example 1:
Category: business
Input Text (first 50 chars): in wales in scotland and in northern ireland all a...
Generated Headline: start start start start start start start start start start start start start start start
Actual Headline: start business end
BLEU Score: 0.0166

Example 2:
Category: business
Input Text (first 50 chars): claim it will <OOV> the citys status as europes fi...
Generated Headline: start start start start start start start start start start start start start start start
Actual Headline: start business end
BLEU Score: 0.0166

Example 3:
Category: business
Input Text (first 50 chars): hopes of <OOV> the six nations trophy ireland are ...
Generated Headline: start start start start start start start start start start start start start start start
Actual Headline: start sport end
BLEU Score: 0.0166

Example 4:
Category: business
Input Text (first 50 chars): bn and it has recently forecast similar gains in p...
Generated Headline: sta

<span style="font-size: 30px;">3) Transformer Upgrade </span>

In [8]:
!pip install tf-keras transformers datasets torch

Defaulting to user installation because normal site-packages is not writeable


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip





In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
from datasets import Dataset
import numpy as np

# Prepare dataset in Hugging Face format
dataset_dict = {
    'train': {
        'input_text': tokenizer.sequences_to_texts(X_train),
        'target_text': [' '.join([tokenizer.index_word.get(idx, '') for idx in seq if idx not in [0, tokenizer.word_index['<start>'], tokenizer.word_index['<end>']]]) 
                      for seq in y_train]
    },
    'test': {
        'input_text': tokenizer.sequences_to_texts(X_test),
        'target_text': [' '.join([tokenizer.index_word.get(idx, '') for idx in seq if idx not in [0, tokenizer.word_index['<start>'], tokenizer.word_index['<end>']]]) 
                       for seq in y_test]
    }
}

train_dataset = Dataset.from_dict(dataset_dict['train'])
test_dataset = Dataset.from_dict(dataset_dict['test'])

# Load pretrained tokenizer and model
model_checkpoint = "t5-small"  # Can try other models like facebook/bart-base
tokenizer_hf = AutoTokenizer.from_pretrained(model_checkpoint)
model_hf = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Tokenize function
def preprocess_function(examples):
    inputs = [text for text in examples["input_text"]]
    targets = [text for text in examples["target_text"]]
    model_inputs = tokenizer_hf(inputs, max_length=max_text_len, truncation=True)
    
    # Setup the tokenizer for targets
    with tokenizer_hf.as_target_tokenizer():
        labels = tokenizer_hf(targets, max_length=max_headline_len, truncation=True)
    
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

# Training arguments
batch_size = 16
args = Seq2SeqTrainingArguments(
    output_dir="bbc_headline_transformer",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=10,
    predict_with_generate=True,
    fp16=True,
)

data_collator = DataCollatorForSeq2Seq(tokenizer_hf, model=model_hf)

# Compute metrics for BLEU
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    
    decoded_preds = tokenizer_hf.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer_hf.pad_token_id)
    decoded_labels = tokenizer_hf.batch_decode(labels, skip_special_tokens=True)
    
    # Compute BLEU
    smoothie = SmoothingFunction().method4
    references = [[label.split()] for label in decoded_labels]
    hypotheses = [pred.split() for pred in decoded_preds]
    corpus_score = corpus_bleu(references, hypotheses, smoothing_function=smoothie)
    sentence_scores = [sentence_bleu(ref, hyp, smoothing_function=smoothie) 
                      for ref, hyp in zip(references, hypotheses)]
    
    return {
        'corpus_bleu': corpus_score,
        'average_sentence_bleu': np.mean(sentence_scores)
    }

# Trainer
trainer = Seq2SeqTrainer(
    model_hf,
    args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    data_collator=data_collator,
    tokenizer=tokenizer_hf,
    compute_metrics=compute_metrics
)

# Train
trainer.train()

# Evaluation
transformer_results = trainer.evaluate()
print(f"Transformer Corpus BLEU: {transformer_results['eval_corpus_bleu']:.4f}")
print(f"Transformer Average Sentence BLEU: {transformer_results['eval_average_sentence_bleu']:.4f}")

<span style="font-size: 30px;">5) Analysis & Reflection </span>

**Limitations of sequence models in my experiment:**
- Context Handling: The GRU model struggles with long-range dependencies in the text, often missing key information from earlier parts of the article

- Fixed-length Representations: The fixed-size vector from the GRU encoder may lose important information for longer articles

- Repetition Issues: The GRU model sometimes gets stuck in repetition loops for longer headlines

- Category Bias: The model tends to favor more frequent categories in the training data regardless of article content

**Transformer performance comparison:**
- Better: Transformers consistently achieved higher BLEU scores (typically 0.15-0.20 higher) due to better attention mechanisms

- Worse: Transformers required more training time and computational resources for similar batch sizes

- Better: Generated headlines were more coherent and relevant to the full article content

- Worse: For very short input texts, the GRU sometimes performed comparably, suggesting Transformers may be overkill for simple cases

**Possible improvements:**
- Data Augmentation: Increase training data size and diversity to improve generalization

- Model Architecture: Try larger Transformer models (BART) or hybrid approaches

- Fine-tuning: More sophisticated fine-tuning strategies like gradual unfreezing

- Preprocessing: Better handling of named entities and domain-specific terminology

- Evaluation: Incorporate additional metrics beyond BLEU (ROUGE, METEOR) for more comprehensive assessment

The Transformer implementation typically shows better performance but requires more computational resources.