In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

import os
import sys
current_dir = os.getcwd()
sys.path.append(os.path.abspath(os.path.join(current_dir, '..')))
from tokenizerManager import TokenizerManager
from data.utils_bbc import get_dataloaders as get_dataloaders_bbc

In [2]:
LOADDATASIZE = 1500  # row of data
BATCH_SIZE = 28  # batch size
PAD_TOKEN_ID = 0

### check GPU avaiabel

In [3]:
import tensorflow as tf
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.10.1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [4]:
# load the dataset
df = pd.read_parquet('../data/NewsSummary.parquet')
# select the first 500 rows of the dataset
df = df.iloc[:500, :]
# select two columns
df = df[['ctext', 'text']]   # original text and summary
data = df.copy()
data.head()

Unnamed: 0,ctext,text
0,India will host its first WTA tournament in fi...,India will host its first WTA tournament in fi...
1,"Samajwadi Party vice-president Kiranmay Nanda,...","Ousted SP Vice President Kiranmoy Nanda, who w..."
2,"Gurgaon, Jul 7 (PTI) Gurgaon Police today regi...",Servers of a Gurugram-based clothing company h...
3,Cash-strapped holidaymakers are making their t...,Passengers are making their toddlers drag suit...
4,"London, Jun 30 (PTI) Supermodel Gigi Hadid say...",Model Gigi Hadid has said she cannot pull off ...


In [5]:
TOKENIZER_MANAGER = TokenizerManager(num_words=10000)
# TOKENIZER_MANAGER.train_tokenizer_from_csv("../data/bbc-news-summary.csv")
# TOKENIZER_MANAGER.save_tokenizer("../data/tokenizer_fixed.pkl")

TOKENIZER = TOKENIZER_MANAGER.load_tokenizer(load_path="../data/tokenizer_fixed.pkl")
print("tokenizer done, with length", len(TOKENIZER.word_index) + 1)
print("vocab size:", TOKENIZER.num_words)


tokenizer done, with length 34994
vocab size: 10000


In [6]:
word2idx = TOKENIZER.word_index

# show the length of the vocabulary
print(f"Vocabulary size: {len(word2idx)}")
print(f"Number of words: {TOKENIZER.num_words + 1}")

idx2word = {v: k for k, v in word2idx.items()}

for word, idx in word2idx.items():
    print(f"{word}: {idx}")
    if idx > 30:
        break

UNK_ID = word2idx['<OOV>']
print(f"UNK_ID: {UNK_ID}")
print(f"sos token: {word2idx['sos']}")
print(f"eos token: {TOKENIZER.word_index['sos']}")

Vocabulary size: 34993
Number of words: 10001
<OOV>: 1
the: 2
to: 3
of: 4
a: 5
and: 6
in: 7
is: 8
for: 9
that: 10
said: 11
it: 12
on: 13
was: 14
he: 15
be: 16
has: 17
with: 18
p: 19
have: 20
as: 21
at: 22
will: 23
by: 24
not: 25
are: 26
but: 27
i: 28
from: 29
mr: 30
his: 31
UNK_ID: 1
sos token: 33
eos token: 33


In [7]:
# load data for training and validation, size is LOADDATASIZE
train_loader, val_loader = get_dataloaders_bbc("../data/bbc-news-summary.csv", TOKENIZER, LOADDATASIZE, BATCH_SIZE)

Index(['File_path', 'Articles', 'Summaries'], dtype='object')


In [8]:
import torch
from model.GRU_seq2seq import GRUAttentionModel

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vocab_size = TOKENIZER.num_words + 1  # 词汇表大小
embed_dim = 256     # embedding dimension for both encoder and decoder
hidden_dim = 512        # hidden dimension for GRU
num_heads = 8       # 多头注意力的头数
dropout_rate = 0.3  # Dropout 概率

model = GRUAttentionModel(vocab_size, embed_dim, hidden_dim, num_heads=num_heads, dropout_rate=dropout_rate)

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0008)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

In [None]:
import torch.nn as nn
from tqdm import tqdm
from IPython.display import clear_output 

train_loss_history = []
val_loss_history = []
  # use cross entropy loss for seq2seq model

LOSS_THRESHOLD= 0.06  # loss threshold for early stopping
EPOCH = 1000

def train_gru_attention_model(
    model, 
    train_loader, 
    val_loader, 
    optimizer, 
    loss_fn, 
    epochs=10, 
    checkpoint_path="gru_attention_checkpoint.h5"
):
    best_val_loss = float('inf')
    for epoch in range(epochs):
        print(f"\nEpoch {epoch+1}/{epochs}")
        train_loss = 0.0

        # tqdm 训练进度条
        train_progress = tqdm(train_loader, desc=f"Training {epoch+1}/{epochs}", unit="batch")
        for encoder_input, target_output in train_progress:
            # 将 PyTorch tensor 转为 TensorFlow tensor
            encoder_input = tf.convert_to_tensor(encoder_input.numpy(), dtype=tf.int32)
            target_output = tf.convert_to_tensor(target_output.numpy(), dtype=tf.int32)
            
            # 构建解码器输入 (Teacher Forcing)
            decoder_input = tf.concat(
                [tf.fill([encoder_input.shape[0], 1], TOKENIZER.word_index['sos']), 
                 target_output[:, :-1]], 
                axis=-1
            )

            with tf.GradientTape() as tape:
                predictions = model(encoder_input, decoder_input)
                loss = loss_fn(target_output, predictions)
            
            gradients = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(gradients, model.trainable_variables))
            train_loss += loss.numpy()
            train_progress.set_postfix(loss=loss.numpy())

        train_loss /= len(train_loader)
        train_loss_history.append(train_loss)
        # print(f"\nTrain Loss: {train_loss:.4f}")

        # Validation Loss Calculation
        val_loss = 0.0
        val_progress = tqdm(val_loader, desc="Validation", unit="batch")
        for encoder_input, target_output in val_progress:
            encoder_input = tf.convert_to_tensor(encoder_input.numpy(), dtype=tf.int32)
            target_output = tf.convert_to_tensor(target_output.numpy(), dtype=tf.int32)
            
            decoder_input = tf.concat(
                [tf.fill([encoder_input.shape[0], 1], TOKENIZER.word_index['sos']), 
                 target_output[:, :-1]], 
                axis=-1
            )
            
            predictions = model(encoder_input, decoder_input)
            loss = loss_fn(target_output, predictions)
            val_loss += loss.numpy()
            val_progress.set_postfix(loss=loss.numpy())
        
        if epoch % 5 == 0:
            clear_output(wait=True)
        
        val_loss /= len(val_loader)
        val_loss_history.append(val_loss)
        print(f"Epoch {epoch+1} - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            model.save_weights("checkpoint/GRU_seq2seq_bbc_"+str(LOADDATASIZE)+"_fixed_bestStop.h5")
            print("✅ Model saved as the best checkpoint.")

        if train_loss < LOSS_THRESHOLD or val_loss < LOSS_THRESHOLD:
            model.save_weights(checkpoint_path)
            print(f'Early stop at epoch {epoch}: Loss is below the threshold {train_loss:.4f}.')
            break
    model.save_weights(checkpoint_path)
    print(f'stop at epoch {epoch}: - Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}')
            

train_gru_attention_model(
	model=model, 
	train_loader=train_loader, 
	val_loader=val_loader, 
	optimizer=optimizer, 
	loss_fn=loss_fn, 
	epochs=EPOCH, 
	checkpoint_path="checkpoint/GRU_seq2seq_bbc_"+str(LOADDATASIZE)+"_fixed.h5"
)


In [None]:
import matplotlib.pyplot as plt

# ✅ 绘图：训练集 & 验证集 Loss 曲线
print("==============================")
plt.plot(range(1, len(train_loss_history)+1), train_loss_history, label='Train Loss', marker='o')
plt.plot(range(1, len(val_loss_history)+1), val_loss_history, label='Val Loss', marker='x')
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Curve")
plt.legend()
plt.grid(True)
plt.savefig("checkpoint/loss_curve_GRU_bbc_"+str(LOADDATASIZE)+"_fixed.png")
# plt.show()
print("✅ Loss curve diagram is saved successfully")