## Phase 1: Setup

In [2]:
import os
cwd = os.getcwd()
print(f"Current Working Directory: {cwd}")

Current Working Directory: /home/jupyter/V4_seq2seq/notebooks


In [5]:
# Phase 1: Setup (Revised Paths)

# --- Standard & ML Libraries ---
# (Keep all imports the same)
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, LSTM, GRU, Embedding, Dense, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import json
import re
import time
import pickle
import matplotlib.pyplot as plt
import sentencepiece as spm
from langdetect import detect, LangDetectException
import os
import traceback

print(f"--- Setup ---")
print(f"TensorFlow Version: {tf.__version__}")
print(f"SentencePiece Version: {spm.__version__}")

# -- GPU Check & Configuration --
# (Keep GPU check code the same)
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
if gpu_devices:
    print(f"Num GPUs Available: {len(gpu_devices)}")
    try:
        for gpu in gpu_devices: tf.config.experimental.set_memory_growth(gpu, True)
        print("GPU memory growth enabled.")
    except RuntimeError as e: print(f"Could not set memory growth: {e}")
else:
    print("WARNING: No GPU detected by TensorFlow. Training will proceed on CPU.")

# --- Parameters ---
# Project Base Directory (relative to notebook location)
BASE_DIR = ".." # Goes up one level from 'notebooks/' to 'pib_summarizer/'


# File Paths (relative to BASE_DIR now)
DATA_PATH = os.path.join(BASE_DIR, 'data', 'mergedt04.jsonl')
GLOVE_PATH = os.path.join(BASE_DIR, 'data', 'glove.6B', 'glove.6B.100d.txt') # <--- Verify filename and 100d

# SentencePiece Parameters
SP_MODEL_PREFIX = os.path.join(BASE_DIR, 'models', 'pib_spm_70k') # Save model in 'models' folder
SP_VOCAB_SIZE = 16000
SP_TEMP_INPUT_FILE = os.path.join(BASE_DIR, 'spm_training_data_70k.txt') # Temp file in base project dir

# Embedding Parameters
EMBEDDING_DIM = 100        # *** MUST MATCH the dimension of your GloVe file (e.g., 100 for 100d) ***

# Sequence Lengths
MAXLEN_INPUT = 1000
MAXLEN_OUTPUT = 200

# Model Hyperparameters
LSTM_UNITS = 256

# Training Parameters
BATCH_SIZE = 64
EPOCHS = 20
VALIDATION_SPLIT = 0.15
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 3

# Scheduled Sampling (Placeholder)
INITIAL_SCHEDULED_SAMPLING_PROB = 1.0
SCHEDULED_SAMPLING_K = 10.0

# Special tokens managed by SentencePiece: UNK_ID=0, BOS_ID=1 (start), EOS_ID=2 (end)
# Keras Padding ID will be 0.
START_TOKEN = '<s>'
END_TOKEN = '</s>'

# Word count limits for filtering AFTER cleaning
MIN_TEXT_WORDS = 15
MIN_SUMMARY_WORDS = 7

# File paths for saving artifacts (relative to BASE_DIR)
SP_MODEL_PATH = f'{SP_MODEL_PREFIX}.model' # Will be BASE_DIR/models/pib_spm_70k.model
EMBEDDING_MATRIX_PATH = os.path.join(BASE_DIR, 'models', f'embedding_matrix_{SP_VOCAB_SIZE}v_{EMBEDDING_DIM}d.npy')
MODEL_SAVE_PATH = os.path.join(BASE_DIR, 'models', f'adv_seq2seq_{SP_VOCAB_SIZE}v_{LSTM_UNITS}u_no_attention.keras')
HISTORY_PLOT_PATH = os.path.join(BASE_DIR, 'output', f'adv_training_history_{SP_VOCAB_SIZE}v.png') # Save plot in 'output'

print("Parameters defined using relative paths.")
# Check if paths exist now
print(f"Checking DATA_PATH: {DATA_PATH} - Exists: {os.path.exists(DATA_PATH)}")
print(f"Checking GLOVE_PATH: {GLOVE_PATH} - Exists: {os.path.exists(GLOVE_PATH)}")
if not os.path.exists(GLOVE_PATH):
     print(f"CRITICAL WARNING: GloVe file NOT FOUND at the specified path!")

--- Setup ---
TensorFlow Version: 2.11.0
SentencePiece Version: 0.2.0
Num GPUs Available: 1
GPU memory growth enabled.
Parameters defined using relative paths.
Checking DATA_PATH: ../data/mergedt04.jsonl - Exists: True
Checking GLOVE_PATH: ../data/glove.6B/glove.6B.100d.txt - Exists: True


## Phase 2: Data Loading & Initial Preprocessing (Adapted for SP)

In [6]:
# Phase 2: Data Loading & Initial Preprocessing (Corrected Final Selection)

print(f"\n--- Phase 2: Data Loading & Initial Preprocessing ---")

# --- Preprocessing Function (Simplified for SentencePiece Training) ---
def initial_preprocess(text):
    """Basic cleaning suitable BEFORE SentencePiece tokenization."""
    if not isinstance(text, str): return ""
    text = text.lower()
    text = text.replace('\u2018', "'").replace('\u2019', "'")
    text = text.replace('\u201c', '"').replace('\u201d', '"')
    text = re.sub(r'c:\\users\\.*?(\.jpg|\.png|\.doc|\.docx)\b', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)
    text = re.sub(r'\[\d+\]', ' ', text)
    # Keep headers/footers for now, let SP handle
    # text = re.sub(r'^ministry of [^\n]+\n?', '', text)
    # text = re.sub(r'\*+[\n\s]+[a-z]{2}/[a-z]{2,3}\s*$', '', text)
    text = text.replace('\n', ' ')
    text = re.sub(r'[^\w\s.,!?"\'\-]', ' ', text) # Keep word chars, whitespace, basic punc
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# --- Main Loading Logic ---
all_records = []
df_model = None
skipped_json_error = 0
skipped_missing_key = 0
skipped_invalid_type = 0
skipped_empty_content = 0
skipped_language = 0
skipped_short = 0
line_count = 0
sp_file_created = False # Flag to track if SP input file was created

try:
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"Input data file not found at {DATA_PATH}")

    print(f"Loading and processing data from {DATA_PATH}...")
    with open(DATA_PATH, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            line_count = i + 1
            if line_count % 5000 == 0: print(f"  Processed {line_count} lines...")

            record = None
            try:
                record = json.loads(line)
                if not isinstance(record, dict): raise ValueError("Line not dict")

                raw_text = record.get('extracted_text')
                raw_summary = record.get('gemini_summary')

                if raw_text is None or raw_summary is None: skipped_missing_key += 1; continue
                if not (isinstance(raw_text, str) and isinstance(raw_summary, str)): skipped_invalid_type += 1; continue
                if not raw_text.strip() or not raw_summary.strip(): skipped_empty_content += 1; continue

                # Language Detection
                try:
                    sample_text = raw_text[:1000]
                    if len(sample_text) < 50: lang = 'en'
                    else: lang = detect(sample_text)
                    if lang != 'en': skipped_language += 1; continue
                except: skipped_language += 1; continue

                cleaned_input_text = initial_preprocess(raw_text)
                cleaned_summary = initial_preprocess(raw_summary)

                input_word_count = len(cleaned_input_text.split())
                summary_word_count = len(cleaned_summary.split())
                if not (input_word_count >= MIN_TEXT_WORDS and summary_word_count >= MIN_SUMMARY_WORDS):
                    skipped_short += 1; continue

                all_records.append({
                     'encoder_input_text': cleaned_input_text, # Store cleaned text
                     'cleaned_summary': cleaned_summary,     # Store cleaned summary
                 })

            except json.JSONDecodeError: skipped_json_error += 1
            except Exception: skipped_invalid_type += 1 # Or a more general error counter

    print(f"\n--- Data Loading & Initial Cleaning Summary ---")
    # (Keep print statements for skipped counts)
    print(f"Total lines read: {line_count}")
    print(f"Valid English records loaded & cleaned: {len(all_records)}")
    print(f"Skipped due to JSON Error: {skipped_json_error}")
    print(f"Skipped due to Missing Keys ('extracted_text'/'gemini_summary'): {skipped_missing_key}")
    print(f"Skipped due to Invalid Data Types (not str): {skipped_invalid_type}")
    print(f"Skipped due to Empty Content (after strip): {skipped_empty_content}")
    print(f"Skipped non-English / lang detect fail: {skipped_language}")
    print(f"Skipped records failing length filter: {skipped_short}")


    if not all_records: raise ValueError("CRITICAL: No valid data records remain.")

    # --- Convert to DataFrame ---
    df = pd.DataFrame(all_records)
    del all_records # Free memory

    # --- Prepare text file for SentencePiece training ---
    # This needs the cleaned columns from the DataFrame 'df'
    print(f"\nPreparing text file '{SP_TEMP_INPUT_FILE}' for SentencePiece training...")
    lines_written_sp = 0
    try:
        with open(SP_TEMP_INPUT_FILE, 'w', encoding='utf-8') as f:
            # Write encoder input text
            for text in df['encoder_input_text']:
                if isinstance(text, str) and text.strip():
                     f.write(text + '\n')
                     lines_written_sp += 1
            # Write cleaned summary text
            for text in df['cleaned_summary']:
                if isinstance(text, str) and text.strip():
                     f.write(text + '\n')
                     lines_written_sp += 1
        print(f"SentencePiece training file created with approx {lines_written_sp} non-empty lines.")
        sp_file_created = True
    except Exception as e:
        print(f"ERROR creating SentencePiece training file: {e}")
        sp_file_created = False


    # --- Finalize DataFrame for Modeling ---
    # ** CORRECTED PART **
    # Create the tagged columns FIRST, using the 'cleaned_summary' column which exists in 'df'
    print("\nAdding start/end tokens to DataFrame...")
    # Make sure START_TOKEN and END_TOKEN are defined (they should be from Phase 1)
    df['decoder_input_text'] = START_TOKEN + ' ' + df['cleaned_summary']
    df['decoder_target_text'] = df['cleaned_summary'] + ' ' + END_TOKEN

    # NOW select the final columns needed for the next phases into df_model
    # We need 'encoder_input_text' for input tokenizer/padding (Phase 4)
    # We need 'decoder_input_text' and 'decoder_target_text' for output tokenizer/padding (Phase 4)
    # We do NOT need 'cleaned_summary' anymore in df_model itself after creating the tagged versions
    df_model = df[['encoder_input_text', 'decoder_input_text', 'decoder_target_text']].copy()
    print("Final DataFrame 'df_model' created for modeling.")
    # --- END CORRECTION ---

    del df # Free memory from the intermediate DataFrame

    print("\nInitial Preprocessing Complete.")
    print(f"Final DataFrame size for modeling: {len(df_model)}")
    if not df_model.empty:
        print("\nSample final data (first 3 rows):")
        pd.set_option('display.max_colwidth', 80)
        print(df_model.head(3))
    else:
        print("CRITICAL WARNING: DataFrame 'df_model' is empty after final selection!")


except FileNotFoundError: print(f"ERROR: Data file not found at {DATA_PATH}. Please check the path."); df_model = None
except ValueError as ve: print(ve); df_model = None
except Exception as e:
    print(f"An critical error occurred during data loading/preprocessing: {e}")
    traceback.print_exc(); df_model = None


--- Phase 2: Data Loading & Initial Preprocessing ---
Loading and processing data from ../data/mergedt04.jsonl...
  Processed 5000 lines...
  Processed 10000 lines...
  Processed 15000 lines...
  Processed 20000 lines...
  Processed 25000 lines...
  Processed 30000 lines...
  Processed 35000 lines...
  Processed 40000 lines...
  Processed 45000 lines...
  Processed 50000 lines...
  Processed 55000 lines...
  Processed 60000 lines...
  Processed 65000 lines...
  Processed 70000 lines...

--- Data Loading & Initial Cleaning Summary ---
Total lines read: 74128
Valid English records loaded & cleaned: 73935
Skipped due to JSON Error: 0
Skipped due to Missing Keys ('extracted_text'/'gemini_summary'): 44
Skipped due to Invalid Data Types (not str): 0
Skipped due to Empty Content (after strip): 0
Skipped non-English / lang detect fail: 17
Skipped records failing length filter: 132

Preparing text file '../spm_training_data_70k.txt' for SentencePiece training...
SentencePiece training file cre

## Phase 3: SentencePiece Tokenizer Training

In [14]:
# Phase 3: SentencePiece Tokenizer Training (Using Relative Paths)

print(f"\n--- Phase 3: Training SentencePiece Tokenizer ---")

sp_model_path_check = f"{SP_MODEL_PREFIX}.model" # Path includes directory e.g., ../models/pib_spm_70k.model

# Check if prerequisites are available
sp_file_created = 'sp_file_created' in locals() and sp_file_created
if sp_file_created and os.path.exists(SP_TEMP_INPUT_FILE):
    try:
        # Check if model already exists to avoid retraining unnecessarily
        if os.path.exists(sp_model_path_check):
             print(f"SentencePiece model '{sp_model_path_check}' already exists. Skipping training.")
        else:
            print(f"Training SentencePiece model...")
            print(f"  Input file: {SP_TEMP_INPUT_FILE}")
            print(f"  Model prefix: {SP_MODEL_PREFIX} (Output directory: {os.path.dirname(SP_MODEL_PREFIX)})")
            print(f"  Vocab size: {SP_VOCAB_SIZE}")

            # Ensure the output directory for the model exists
            os.makedirs(os.path.dirname(SP_MODEL_PREFIX), exist_ok=True)

            # Define SentencePiece training arguments using variables from Phase 1
            # START_TOKEN and END_TOKEN strings are used for bos/eos piece representation
            spm_command = (
                f'--input={SP_TEMP_INPUT_FILE} '
                f'--model_prefix={SP_MODEL_PREFIX} ' # Includes path to models dir
                f'--vocab_size={SP_VOCAB_SIZE} '
                f'--model_type=unigram '
                f'--character_coverage=1.0 '
                f'--unk_id=0 '
                f'--bos_id=1 ' # This ID will be used for START_TOKEN
                f'--eos_id=2 ' # This ID will be used for END_TOKEN
                f'--pad_id=-1 ' # SP internal ignore ID, Keras uses 0 for padding later
                f'--unk_piece=<unk> '
                f'--bos_piece={START_TOKEN} ' # Visual representation in vocab file
                f'--eos_piece={END_TOKEN} ' # Visual representation in vocab file
                f'--remove_extra_whitespaces=true '
                f'--normalization_rule_name=nmt_nfkc_cf'
            )

            # Train the model
            spm.SentencePieceTrainer.train(spm_command)

            print(f"\nSentencePiece training complete.")
            print(f"Model saved to: {SP_MODEL_PREFIX}.model")
            print(f"Vocabulary saved to: {SP_MODEL_PREFIX}.vocab")

        # Optional: Clean up the temporary training file now that SP model is trained
        # try:
        #     if os.path.exists(SP_TEMP_INPUT_FILE):
        #          os.remove(SP_TEMP_INPUT_FILE)
        #          print(f"Removed temporary file: {SP_TEMP_INPUT_FILE}")
        # except OSError as e:
        #     print(f"Warning: Could not remove temporary file {SP_TEMP_INPUT_FILE}: {e}")

    except Exception as e:
        print(f"An error occurred during SentencePiece training: {e}")
        traceback.print_exc()
else:
    if not sp_file_created:
         print("Skipping SentencePiece training because the temporary input file was not created successfully.")
    elif not os.path.exists(SP_TEMP_INPUT_FILE):
         print(f"Skipping SentencePiece training because the temporary input file '{SP_TEMP_INPUT_FILE}' does not exist.")
    else:
         print("Skipping SentencePiece training due to other missing prerequisites.")


--- Phase 3: Training SentencePiece Tokenizer ---
SentencePiece model '../models/pib_spm_70k.model' already exists. Skipping training.


## Phase 4: Tokenization & Embedding Matrix Preparation

In [15]:
# Phase 4: Tokenization using SentencePiece & Embedding Matrix Prep

print(f"\n--- Phase 4: Tokenization using SentencePiece & Embedding Matrix Prep ---")

# Initialize variables for this phase's outputs
encoder_input_padded, decoder_input_padded, decoder_target_padded = None, None, None
sp = None
embedding_matrix = None
actual_sp_vocab_size = 0 # Use this for Embedding layer dim later

# Check if prerequisites from previous phases are available
# SP_MODEL_PATH was defined in Phase 1 using SP_MODEL_PREFIX
if ('df_model' in locals() and df_model is not None and not df_model.empty and
    'SP_MODEL_PATH' in locals() and os.path.exists(SP_MODEL_PATH) and
    'GLOVE_PATH' in locals() and os.path.exists(GLOVE_PATH)):
    try:
        # --- 1. Load SentencePiece Model ---
        print(f"Loading SentencePiece model from {SP_MODEL_PATH}...")
        sp = spm.SentencePieceProcessor()
        sp.load(SP_MODEL_PATH)
        actual_sp_vocab_size = sp.get_piece_size() # Get actual vocab size from loaded model
        if actual_sp_vocab_size <= 0:
             raise ValueError("Loaded SentencePiece model has zero or negative vocabulary size!")
        print(f"SentencePiece model loaded. Actual Vocabulary size: {actual_sp_vocab_size}")
        print(f"  UNK ID: {sp.unk_id()}, BOS ID (start): {sp.bos_id()}, EOS ID (end): {sp.eos_id()}")

        # --- 2. Tokenize Data using loaded SP model ---
        print("\nTokenizing text data using loaded SP model...")
        # Check if the required columns exist in df_model (created at the end of Phase 2)
        required_cols = ['encoder_input_text', 'decoder_input_text', 'decoder_target_text']
        if not all(col in df_model.columns for col in required_cols):
             raise KeyError(f"Missing one or more required columns in df_model: {required_cols}")

        # Use SentencePiece to encode the text columns into integer IDs
        # Using list comprehension for potentially better performance than pandas apply
        print("  Tokenizing encoder input...")
        encoder_input_sequences = [sp.encode(text, out_type=int) for text in df_model['encoder_input_text']]

        print("  Tokenizing decoder input...")
        # The 'decoder_input_text' column already has START_TOKEN prepended as a string
        # SentencePiece encoding will handle converting START_TOKEN ('<s>') to its BOS ID (1)
        decoder_input_sequences = [sp.encode(text, out_type=int) for text in df_model['decoder_input_text']]

        print("  Tokenizing decoder target...")
        # The 'decoder_target_text' column already has END_TOKEN appended as a string
        # SentencePiece encoding will handle converting END_TOKEN ('</s>') to its EOS ID (2)
        decoder_target_sequences = [sp.encode(text, out_type=int) for text in df_model['decoder_target_text']]

        print("Tokenization complete.")
        # Print samples to verify tokenization, especially start/end IDs
        if encoder_input_sequences: print(f"  Sample encoder sequence[0]: {encoder_input_sequences[0][:15]}...")
        if decoder_input_sequences: print(f"  Sample decoder input sequence[0]: {decoder_input_sequences[0][:15]}... (Should start with BOS ID: {sp.bos_id()})")
        if decoder_target_sequences: print(f"  Sample decoder target sequence[0]: {decoder_target_sequences[0][:15]}... (Should end with EOS ID: {sp.eos_id()})")


        # --- 3. Padding ---
        print("\nPadding sequences...")
        # Pad sequences using 0. This is the standard value Keras Embedding expects for masking.
        # SentencePiece uses ID 0 for <unk>, ID 1 for <s> (BOS), ID 2 for </s> (EOS).
        # So, padding with 0 means we are padding with the <unk> ID, which is acceptable,
        # as the Embedding layer's mask_zero=True will ignore these 0s anyway.
        encoder_input_padded = pad_sequences(encoder_input_sequences, maxlen=MAXLEN_INPUT, padding='post', truncating='post', value=0)
        decoder_input_padded = pad_sequences(decoder_input_sequences, maxlen=MAXLEN_OUTPUT, padding='post', truncating='post', value=0)
        decoder_target_padded = pad_sequences(decoder_target_sequences, maxlen=MAXLEN_OUTPUT, padding='post', truncating='post', value=0)
        # Clear large intermediate lists to free memory
        del encoder_input_sequences, decoder_input_sequences, decoder_target_sequences
        print("Padding complete.")
        print(f"  Padded Encoder Input Shape: {encoder_input_padded.shape}")
        print(f"  Padded Decoder Input Shape: {decoder_input_padded.shape}")
        print(f"  Padded Decoder Target Shape: {decoder_target_padded.shape}")


        # --- 4. Prepare GloVe Embedding Matrix ---
        print(f"\nLoading GloVe embeddings from: {GLOVE_PATH}")
        embeddings_index = {}
        loaded_vector_count = 0
        print("  Reading GloVe file (can take a minute)...")
        try:
            with open(GLOVE_PATH, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f):
                     # Print progress occasionally for large GloVe files
                     if (line_num + 1) % 100000 == 0: print(f"    Processed {line_num+1} GloVe lines...")
                     values = line.split()
                     word = values[0]
                     try:
                         coefs = np.asarray(values[1:], dtype='float32')
                         # Check dimension ONLY if word is likely relevant
                         if len(coefs) == EMBEDDING_DIM:
                              embeddings_index[word] = coefs
                              loaded_vector_count += 1
                     except ValueError: pass # Ignore lines with parsing errors
            print(f"  Found {loaded_vector_count} word vectors of dimension {EMBEDDING_DIM} in GloVe file.")
            if loaded_vector_count == 0:
                 raise ValueError("No vectors loaded from GloVe file. Check file path, format, and EMBEDDING_DIM.")
        except Exception as e:
             print(f"ERROR loading or processing GloVe file: {e}")
             raise

        print(f"\nCreating embedding matrix with shape ({actual_sp_vocab_size}, {EMBEDDING_DIM})...")
        # Initialize matrix (e.g., with small random values)
        # Using float32 for compatibility with TensorFlow/Keras layers
        embedding_matrix = np.random.uniform(-0.05, 0.05, (actual_sp_vocab_size, EMBEDDING_DIM)).astype(np.float32)

        hits = 0
        misses = 0
        # Iterate through the SentencePiece vocabulary (IDs 0 to vocab_size-1)
        for i in range(actual_sp_vocab_size):
             piece = sp.id_to_piece(i) # Get the actual word/subword piece string
             embedding_vector = embeddings_index.get(piece) # Look it up in GloVe
             if embedding_vector is not None:
                 # If found in GloVe, use the GloVe vector
                 embedding_matrix[i] = embedding_vector
                 hits += 1
             else:
                 # If not found, keep the random initialization (model will learn these)
                 misses += 1

        # Crucial Step: Ensure the padding index (0) has a zero vector for Keras masking
        # Regardless of what SP assigns to ID 0 (<unk>), Keras mask_zero=True needs the
        # vector at index 0 to be all zeros when the input value is 0.
        embedding_matrix[0] = np.zeros((EMBEDDING_DIM,), dtype=np.float32)

        print("Embedding matrix created.")
        print(f"  GloVe vectors mapped (Hits): {hits}")
        print(f"  SP Tokens not in GloVe (Misses, incl <unk>,<s>,</s>): {misses}")
        print(f"  Padding index 0 vector explicitly zeroed out.")


        # Save the embedding matrix using the path defined in Phase 1
        print(f"\nSaving embedding matrix to '{EMBEDDING_MATRIX_PATH}'...")
        # Ensure directory exists
        os.makedirs(os.path.dirname(EMBEDDING_MATRIX_PATH), exist_ok=True)
        np.save(EMBEDDING_MATRIX_PATH, embedding_matrix)
        print("Embedding matrix saved.")


    except FileNotFoundError as e:
         print(f"ERROR: A required file was not found: {e}")
         encoder_input_padded, decoder_input_padded, decoder_target_padded = None, None, None
         sp = None; embedding_matrix = None; actual_sp_vocab_size = 0
    except KeyError as e:
         print(f"ERROR: Missing expected column in DataFrame during tokenization: {e}")
         encoder_input_padded, decoder_input_padded, decoder_target_padded = None, None, None
         sp = None; embedding_matrix = None; actual_sp_vocab_size = 0
    except Exception as e:
        print(f"An error occurred during Phase 4: {e}")
        traceback.print_exc()
        encoder_input_padded, decoder_input_padded, decoder_target_padded = None, None, None
        sp = None; embedding_matrix = None; actual_sp_vocab_size = 0
else:
    print("Skipping Phase 4 due to missing prerequisites (df_model, SP model file, or GloVe file).")


--- Phase 4: Tokenization using SentencePiece & Embedding Matrix Prep ---
Loading SentencePiece model from ../models/pib_spm_70k.model...
SentencePiece model loaded. Actual Vocabulary size: 16000
  UNK ID: 0, BOS ID (start): 1, EOS ID (end): 2

Tokenizing text data using loaded SP model...
  Tokenizing encoder input...
  Tokenizing decoder input...
  Tokenizing decoder target...
Tokenization complete.
  Sample encoder sequence[0]: [21, 5, 615, 239, 85, 76, 229, 11785, 3143, 176, 325, 5, 74, 250, 5]...
  Sample decoder input sequence[0]: [60, 0, 10, 0, 12, 96, 5, 3, 1159, 2660, 142, 637, 10820, 9, 3]... (Should start with BOS ID: 1)
  Sample decoder target sequence[0]: [12, 96, 5, 3, 1159, 2660, 142, 637, 10820, 9, 3, 3313, 2217, 239, 554]... (Should end with EOS ID: 2)

Padding sequences...
Padding complete.
  Padded Encoder Input Shape: (73935, 1000)
  Padded Decoder Input Shape: (73935, 200)
  Padded Decoder Target Shape: (73935, 200)

Loading GloVe embeddings from: ../data/glove.6B

In [16]:
# Phase 4a: Data Splitting

print(f"\n--- Phase 4a: Splitting Padded Data into Training and Validation sets ---")

# Initialize split variables to None
encoder_input_train, decoder_input_train, decoder_target_train = None, None, None
encoder_input_val, decoder_input_val, decoder_target_val = None, None, None

# Check if the padded data from Phase 4 exists
if ('encoder_input_padded' in locals() and encoder_input_padded is not None and
    'decoder_input_padded' in locals() and decoder_input_padded is not None and
    'decoder_target_padded' in locals() and decoder_target_padded is not None):

    print(f"Original dataset size before split: {len(encoder_input_padded)} samples")
    print(f"Using validation split: {VALIDATION_SPLIT*100:.1f}%")

    try:
        # Perform the split
        encoder_input_train, encoder_input_val, \
        decoder_input_train, decoder_input_val, \
        decoder_target_train, decoder_target_val = train_test_split(
            encoder_input_padded,      # Input texts for encoder
            decoder_input_padded,      # Input summaries for decoder (<start> + summary)
            decoder_target_padded,     # Target summaries for decoder (summary + <end>)
            test_size=VALIDATION_SPLIT, # Fraction for validation set
            random_state=42             # Reproducibility
        )

        print("\nData split successful. Shapes:")
        print(f"Encoder Train:      {encoder_input_train.shape}")
        print(f"Decoder Input Train:{decoder_input_train.shape}")
        print(f"Decoder Target Train:{decoder_target_train.shape}")
        print("---")
        print(f"Encoder Validation: {encoder_input_val.shape}")
        print(f"Decoder Input Val:  {decoder_input_val.shape}")
        print(f"Decoder Target Val: {decoder_target_val.shape}")

        # Sanity check sizes
        assert len(encoder_input_train) + len(encoder_input_val) == len(encoder_input_padded)
        print("\nSplit sizes verified.")

        # Optional: Delete the large padded arrays if memory is tight,
        # as they are now stored in the train/val splits
        # del encoder_input_padded, decoder_input_padded, decoder_target_padded
        # print("Original padded arrays deleted to free memory.")


    except Exception as e:
        print(f"An error occurred during data splitting: {e}")
        traceback.print_exc()
        # Ensure subsequent cells know the split failed
        encoder_input_train, decoder_input_train, decoder_target_train = None, None, None

else:
    print("Skipping data split due to missing padded data from Phase 4.")


--- Phase 4a: Splitting Padded Data into Training and Validation sets ---
Original dataset size before split: 73935 samples
Using validation split: 15.0%

Data split successful. Shapes:
Encoder Train:      (62844, 1000)
Decoder Input Train:(62844, 200)
Decoder Target Train:(62844, 200)
---
Encoder Validation: (11091, 1000)
Decoder Input Val:  (11091, 200)
Decoder Target Val: (11091, 200)

Split sizes verified.


## Phase 5: Model Building (BiLSTM Encoder, Pre-trained Embeddings)


In [17]:
# Cell 6: Phase 5 - Model Building (Revised for Full BiLSTM State Usage)

print(f"\n--- Phase 5: Building the Seq2Seq model (BiLSTM Encoder - Combined States, GloVe Embeddings) ---")

model = None # Initialize model variable

# Check prerequisites
if ('encoder_input_train' in locals() and encoder_input_train is not None and # Using existence of train data as proxy
    'embedding_matrix' in locals() and embedding_matrix is not None and
    'actual_sp_vocab_size' in locals() and actual_sp_vocab_size > 0):

    try:
        # --- Shared Embedding Layer ---
        # (Remains the same as before)
        shared_embedding_layer = Embedding(
            input_dim=actual_sp_vocab_size, output_dim=EMBEDDING_DIM,
            weights=[embedding_matrix], trainable=False, mask_zero=True,
            name='shared_glove_embedding'
        )
        print(f"Shared Embedding layer created. Trainable: {shared_embedding_layer.trainable}")

        # --- Encoder ---
        print("Building Encoder...")
        encoder_inputs = Input(shape=(MAXLEN_INPUT,), name='encoder_input')
        encoder_embedding = shared_embedding_layer(encoder_inputs)
        # Bidirectional LSTM Layer
        encoder_bilstm = Bidirectional(
            LSTM(LSTM_UNITS, return_state=True, name='encoder_lstm'), # Core LSTM
            name='bidirectional_encoder_lstm',
            merge_mode='concat' # Although merge_mode affects sequence output (which we discard),
                               # return_state=True always returns separate fwd/bwd states
        )
        # Output: [combined_sequence (discarded)], h_fwd, c_fwd, h_bwd, c_bwd
        _, state_h_fwd, state_c_fwd, state_h_bwd, state_c_bwd = encoder_bilstm(encoder_embedding)

        # --- Combine Forward and Backward States ---
        # Concatenate hidden states (axis=-1 merges the feature dimension)
        state_h_combined = tf.keras.layers.Concatenate(name='concat_h')([state_h_fwd, state_h_bwd])
        # Concatenate cell states
        state_c_combined = tf.keras.layers.Concatenate(name='concat_c')([state_c_fwd, state_c_bwd])

        # --- Project Combined States (Optional but Recommended) ---
        # Add Dense layers to map concatenated states (2*LSTM_UNITS) back to LSTM_UNITS
        # This allows the decoder LSTM to keep the original LSTM_UNITS size.
        # Use activation like 'tanh' which is common for hidden states
        decoder_init_state_h = Dense(LSTM_UNITS, activation='tanh', name='project_state_h')(state_h_combined)
        decoder_init_state_c = Dense(LSTM_UNITS, activation='tanh', name='project_state_c')(state_c_combined)

        # The final encoder states passed to the decoder
        encoder_states = [decoder_init_state_h, decoder_init_state_c]
        print(f"Encoder configured to pass COMBINED & PROJECTED states (h:{decoder_init_state_h.shape}, c:{decoder_init_state_c.shape}) to decoder.")


        # --- Decoder ---
        # (Decoder definition remains the same, but now receives the projected states)
        print("\nBuilding Decoder...")
        decoder_inputs = Input(shape=(MAXLEN_OUTPUT,), name='decoder_input')
        decoder_embedding = shared_embedding_layer(decoder_inputs) # Reuse shared embedding
        decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True, name='decoder_lstm')
        # Initialize with the NEW projected encoder_states
        decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
        decoder_dense = Dense(actual_sp_vocab_size, activation='softmax', name='output_dense')
        decoder_outputs = decoder_dense(decoder_outputs)

        # --- Define the Training Model ---
        print("\nDefining Keras Model...")
        model = Model(inputs=[encoder_inputs, decoder_inputs],
                      outputs=decoder_outputs,
                      name='seq2seq_bilstm_proj_glove_no_attention') # Updated model name

        print("\nModel Architecture Summary:")
        model.summary(line_length=120) # Slightly wider print for new layers

    except Exception as e:
        print(f"An error occurred during model building: {e}")
        traceback.print_exc()
        model = None
else:
    # Refine the prerequisite check message (same as before)
    missing_prereqs = []
    # ... (add checks as before) ...
    if 'embedding_matrix' not in locals() or embedding_matrix is None: missing_prereqs.append("Embedding Matrix")
    # ... etc ...
    print(f"Skipping model building due to missing prerequisites: {', '.join(missing_prereqs)}.")


--- Phase 5: Building the Seq2Seq model (BiLSTM Encoder - Combined States, GloVe Embeddings) ---
Shared Embedding layer created. Trainable: False
Building Encoder...
Encoder configured to pass COMBINED & PROJECTED states (h:(None, 256), c:(None, 256)) to decoder.

Building Decoder...

Defining Keras Model...

Model Architecture Summary:
Model: "seq2seq_bilstm_proj_glove_no_attention"
________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            
 decoder_input (InputLayer)            [(None, 200)]              0             []                                      
                                                                                                                        
 encoder_input (InputLayer)            [(None, 1000)]             0             []                                      
       

## Phase 6: Model Compilation

In [18]:
# Phase 6: Model Compilation

# Check if the model object exists from the previous phase
if 'model' in locals() and model is not None:
    print("\n--- Phase 6: Compiling the model ---")
    try:
        # Define the optimizer with the specified learning rate
        optimizer = Adam(learning_rate=LEARNING_RATE)

        # Configure the model for training
        model.compile(
            optimizer=optimizer,                   # Use Adam optimizer with specified LR
            loss='sparse_categorical_crossentropy', # Correct loss for integer targets
            metrics=['accuracy']                   # Monitor basic accuracy
            )
        print(f"Model compiled successfully with Adam optimizer (LR={LEARNING_RATE}) and sparse categorical crossentropy loss.")
        print("Accuracy will be monitored during training.")

    except Exception as e:
        print(f"An error occurred during model compilation: {e}")
        traceback.print_exc()
        # Invalidate model if compilation fails
        model = None
else:
    print("Skipping model compilation as the model object ('model') was not created successfully in Phase 5.")


--- Phase 6: Compiling the model ---
Model compiled successfully with Adam optimizer (LR=0.001) and sparse categorical crossentropy loss.
Accuracy will be monitored during training.


##  Phase 7: Model Training

In [21]:
# Cell 8: Phase 7 - Model Training (Custom Loop)

print(f"\n--- Phase 7: Custom Training Loop ---")

# Ensure rouge_scorer class exists (from Phase 1 import/install)
# We will instantiate the scorer object later inside the main check
rouge_scorer_class_available = 'rouge_scorer' in globals() and rouge_scorer is not None

history_custom = None # Initialize history

# Check main prerequisites: compiled model, split training data, SP processor
if ('model' in locals() and model is not None and
    'encoder_input_train' in locals() and encoder_input_train is not None and
    'decoder_input_train' in locals() and decoder_input_train is not None and
    'decoder_target_train' in locals() and decoder_target_train is not None and
    'encoder_input_val' in locals() and encoder_input_val is not None and
    'decoder_input_val' in locals() and decoder_input_val is not None and
    'decoder_target_val' in locals() and decoder_target_val is not None and
    'sp' in locals() and sp is not None ):

    print("Setting up training components...")

    # --- Optimizer, Loss, Metrics ---
    optimizer = Adam(learning_rate=LEARNING_RATE)
    loss_object = SparseCategoricalCrossentropy(from_logits=False, reduction='none')

    def masked_loss(real, pred):
        mask = tf.math.logical_not(tf.math.equal(real, 0))
        loss_ = loss_object(real, pred)
        mask = tf.cast(mask, dtype=loss_.dtype)
        loss_ *= mask
        return tf.reduce_sum(loss_) / (tf.reduce_sum(mask) + 1e-9)

    def masked_accuracy(real, pred):
        pred_ids = tf.argmax(pred, axis=-1, output_type=real.dtype)
        match = tf.cast(tf.equal(real, pred_ids), dtype=tf.float32)
        mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), dtype=tf.float32)
        match *= mask
        return tf.reduce_sum(match) / (tf.reduce_sum(mask) + 1e-9)

    train_loss_metric = tf.keras.metrics.Mean(name='train_loss')
    train_accuracy_metric = tf.keras.metrics.Mean(name='train_accuracy')
    val_loss_metric = tf.keras.metrics.Mean(name='val_loss')
    val_accuracy_metric = tf.keras.metrics.Mean(name='val_accuracy')

    # --- Scheduled Sampling Probability Function ---
    def get_sampling_prob(epoch, k=SCHEDULED_SAMPLING_K):
        # Inverse sigmoid decay: p decreases as epoch increases
        prob = k / (k + tf.exp(epoch / k))
        return float(prob)

    # --- Rebuild Inference Models (Needed for train_step and greedy_decode) ---
    encoder_model_inf = None
    decoder_model_inf = None
    try:
        print("Building inference models for training step...")
        # --- Inference Encoder ---
        encoder_inputs_inf = model.get_layer('encoder_input').input
        # Access states from the Bidirectional layer output
        # Output: [combined_sequence], state_h_fwd, state_c_fwd, state_h_bwd, state_c_bwd
        _enc_seq_out, state_h_fwd, state_c_fwd, state_h_bwd, state_c_bwd = model.get_layer('bidirectional_encoder_lstm').output

        # Use the *projected* states if they exist (from the corrected Phase 5)
        if 'project_state_h' in [l.name for l in model.layers]:
             print("  Using projected encoder states for decoder init.")
             state_h_inf = model.get_layer('project_state_h').output
             state_c_inf = model.get_layer('project_state_c').output
        else:
             # Fallback to using forward states if projection layers weren't added (old Phase 5)
             print("  Warning: Projection layers not found, using only FORWARD encoder states.")
             state_h_inf = state_h_fwd
             state_c_inf = state_c_fwd

        encoder_states_inf = [state_h_inf, state_c_inf]
        encoder_model_inf = Model(inputs=encoder_inputs_inf, outputs=encoder_states_inf, name="train_inference_encoder")
        print("Inference Encoder built.")

        # --- Inference Decoder ---
        decoder_state_input_h = Input(shape=(LSTM_UNITS,), name='train_inf_dec_state_h')
        decoder_state_input_c = Input(shape=(LSTM_UNITS,), name='train_inf_dec_state_c')
        decoder_states_inputs_inf = [decoder_state_input_h, decoder_state_input_c]
        decoder_inputs_inf_single = Input(shape=(1,), name='train_inf_dec_input_single')

        # Reuse layers from the main 'model'
        decoder_embedding_layer_inf = model.get_layer('shared_glove_embedding')
        decoder_lstm_inf_layer = model.get_layer('decoder_lstm')
        decoder_dense_inf_layer = model.get_layer('output_dense')

        # Connect inference decoder layers
        decoder_embedding_inf = decoder_embedding_layer_inf(decoder_inputs_inf_single)
        decoder_lstm_outputs_inf, state_h_out_inf, state_c_out_inf = decoder_lstm_inf_layer(decoder_embedding_inf, initial_state=decoder_states_inputs_inf)
        decoder_states_out_inf = [state_h_out_inf, state_c_out_inf]
        decoder_pred_inf = decoder_dense_inf_layer(decoder_lstm_outputs_inf)

        # Define the inference decoder model
        decoder_model_inf = Model(inputs=[decoder_inputs_inf_single] + decoder_states_inputs_inf,
                                  outputs=[decoder_pred_inf] + decoder_states_out_inf,
                                  name="train_inference_decoder")
        print("Inference Decoder built.")

    except Exception as e:
        print(f"ERROR: Failed to build inference models for training step: {e}")
        traceback.print_exc()
        encoder_model_inf = None; decoder_model_inf = None # Ensure they are None


    # --- Custom Training Step Function ---
    # Only define if inference models were built successfully
    if encoder_model_inf and decoder_model_inf:
        # @tf.function # Compile to graph for speed
        def train_step(enc_input_batch, dec_input_batch, dec_target_batch, sampling_prob_tensor):
            batch_loss = tf.constant(0.0, dtype=tf.float32)
            batch_acc_match = tf.constant(0.0, dtype=tf.float32)
            batch_acc_total = tf.constant(0.0, dtype=tf.float32)

            with tf.GradientTape() as tape:
                # Get initial states from the inference encoder model
                enc_states = encoder_model_inf(enc_input_batch, training=True)
                dec_state = enc_states # Initialize decoder state

                # Start token is the first token in the decoder input sequence
                dec_input_token = tf.expand_dims(dec_input_batch[:, 0], 1)

                # Loop through the target sequence (ignoring the first token, typically <start>)
                for t in range(1, dec_target_batch.shape[1]):
                    # Run the inference decoder model for one step
                    predictions, state_h, state_c = decoder_model_inf([dec_input_token] + dec_state, training=True)
                    dec_state = [state_h, state_c] # Update the state for the next step

                    # Calculate loss against the actual target token for this timestep
                    real_target_token = dec_target_batch[:, t]
                    loss_t = masked_loss(real_target_token, predictions)
                    batch_loss += loss_t # Accumulate loss over the sequence

                    # Calculate accuracy for this timestep
                    pred_ids_t = tf.argmax(predictions, axis=-1, output_type=real_target_token.dtype)
                    mask_t = tf.math.logical_not(tf.math.equal(real_target_token, 0))
                    match_t = tf.cast(tf.equal(real_target_token, pred_ids_t), dtype=tf.float32) * tf.cast(mask_t, dtype=tf.float32)
                    batch_acc_match += tf.reduce_sum(match_t)
                    batch_acc_total += tf.reduce_sum(tf.cast(mask_t, dtype=tf.float32))

                    # --- Scheduled Sampling Decision ---
                    # Decide whether to use the ground truth or the model's prediction as the next input
                    use_teacher_forcing = tf.random.uniform(()) < sampling_prob_tensor

                    if use_teacher_forcing:
                        # Use the actual target token from the *current* step as the *next* input
                        dec_input_token = tf.expand_dims(real_target_token, 1)
                    else:
                        # Use the model's prediction (highest probability token ID) as the *next* input
                        dec_input_token = tf.expand_dims(pred_ids_t, 1)

            # Calculate gradients based on the total loss for the sequence
            variables = model.trainable_variables
            gradients = tape.gradient(batch_loss, variables)
             # Clip gradients to prevent exploding gradients (optional but often helpful)
            gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
            # Apply gradients to update model weights
            optimizer.apply_gradients(zip(gradients, variables))

            # Calculate average loss & accuracy for the batch (over non-padded tokens)
            avg_batch_loss = batch_loss / batch_acc_total if batch_acc_total > 0 else 0.0
            avg_batch_acc = batch_acc_match / batch_acc_total if batch_acc_total > 0 else 0.0

            return avg_batch_loss, avg_batch_acc
    else:
        print("ERROR: Cannot define train_step because inference model building failed.")
        encoder_input_train = None # Prevent training loop start


    # --- Greedy Decoding Function (for validation ROUGE check) ---
    if encoder_model_inf and decoder_model_inf:
        # Use numpy for greedy decoding loop as it runs outside tf.function
        def greedy_decode_sequence(input_seq_padded, maxlen_out, start_token_id, end_token_id):
            # Ensure input has batch dimension
            if input_seq_padded.ndim == 1:
                 input_seq_padded = np.expand_dims(input_seq_padded, 0)
            # Predict initial states
            states_value = encoder_model_inf.predict(input_seq_padded, verbose=0)
            # Start sequence with BOS token ID
            target_seq = np.array([[start_token_id]])
            decoded_ids = []
            # Loop until max length or EOS token
            for _ in range(maxlen_out):
                # Predict next token and states
                output_tokens_dist, h, c = decoder_model_inf.predict([target_seq] + states_value, verbose=0)
                # Get the token ID with highest probability (greedy choice)
                sampled_token_id = np.argmax(output_tokens_dist[0, -1, :])
                # Stop if EOS or padding token is predicted
                if sampled_token_id == end_token_id or sampled_token_id == 0: break
                decoded_ids.append(sampled_token_id)
                # Update the target sequence for the next step
                target_seq = np.array([[sampled_token_id]])
                # Update the states
                states_value = [h, c]
            return decoded_ids
    else:
        print("ERROR: Cannot define greedy_decode_sequence because inference models failed.")


    # --- ROUGE Scorer Initialization ---
    scorer = None # Initialize
    if rouge_scorer_class_available:
        try:
            rouge_types = ['rouge1', 'rouge2', 'rougeL']
            scorer = rouge_scorer.RougeScorer(rouge_types=rouge_types, use_stemmer=True)
            print("ROUGE scorer initialized.")
        except Exception as e:
            print(f"Warning: Failed to initialize ROUGE scorer: {e}. Skipping ROUGE calculation.")
            scorer = None
    else:
        print("Warning: rouge-score library not available. Skipping ROUGE calculation.")


    # --- Training Loop ---
    # Final check before starting the potentially long loop
    if encoder_model_inf and decoder_model_inf and encoder_input_train is not None:
        print("\n--- Starting Custom Training Loop ---")
        start_train_time = time.time()
        best_val_metric = -np.inf # Initialize for maximizing ROUGE or -Loss
        metric_to_monitor = 'ROUGE-L F1' if scorer else 'Neg Validation Loss'
        epochs_no_improve = 0
        # Store history manually
        history_custom = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': [],
                          'val_rouge1': [], 'val_rouge2': [], 'val_rougeL': []}

        # --- Create TensorFlow Datasets ---
        print("Creating TensorFlow Datasets for efficient batching...")
        try:
            buffer_size = len(encoder_input_train)
            # Training dataset: shuffle, batch, prefetch
            train_dataset = tf.data.Dataset.from_tensor_slices(
                (encoder_input_train, decoder_input_train, decoder_target_train)
            )
            train_dataset = train_dataset.shuffle(buffer_size).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

            # Validation dataset: batch, prefetch (no shuffle)
            val_dataset = tf.data.Dataset.from_tensor_slices(
                (encoder_input_val, decoder_input_val, decoder_target_val)
            )
            val_dataset = val_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
            print("Datasets created.")
            # Get number of batches for progress display
            train_steps_per_epoch = tf.data.experimental.cardinality(train_dataset).numpy()
            val_steps_per_epoch = tf.data.experimental.cardinality(val_dataset).numpy()
            print(f"Train steps per epoch: {train_steps_per_epoch}, Val steps per epoch: {val_steps_per_epoch}")

        except Exception as e:
            print(f"ERROR creating TensorFlow Datasets: {e}")
            traceback.print_exc()
            encoder_input_train = None # Prevent loop start

        # --- Epoch Loop ---
        if encoder_input_train is not None: # Check dataset creation worked
            for epoch in range(EPOCHS):
                epoch_start_time = time.time()
                print(f"\nEpoch {epoch + 1}/{EPOCHS}")

                # Reset metrics at the start of each epoch
                train_loss_metric.reset_states(); train_accuracy_metric.reset_states()
                val_loss_metric.reset_states(); val_accuracy_metric.reset_states()

                # --- Training Batch Loop ---
                current_sampling_prob = get_sampling_prob(epoch, SCHEDULED_SAMPLING_K)
                sampling_prob_tensor = tf.constant(current_sampling_prob, dtype=tf.float32)
                print(f"  Scheduled Sampling Probability (Teacher Forcing): {current_sampling_prob:.4f}")

                for batch, (enc_in, dec_in, dec_target) in enumerate(train_dataset):
                    batch_loss, batch_acc = train_step(enc_in, dec_in, dec_target, sampling_prob_tensor)
                    train_loss_metric(batch_loss)
                    train_accuracy_metric(batch_acc)
                    # Print progress every N batches
                    if (batch + 1) % 100 == 0 or (batch + 1) == train_steps_per_epoch:
                        print(f'  Batch {batch + 1}/{train_steps_per_epoch}, Loss: {train_loss_metric.result():.4f}, Accuracy: {train_accuracy_metric.result():.4f}', end='\r')

                # End of Training Epoch
                epoch_train_loss = train_loss_metric.result().numpy()
                epoch_train_acc = train_accuracy_metric.result().numpy()
                history_custom['loss'].append(epoch_train_loss)
                history_custom['accuracy'].append(epoch_train_acc)
                print(f'\nEpoch {epoch + 1} Training ---- Loss: {epoch_train_loss:.4f}, Accuracy: {epoch_train_acc:.4f}')

                # --- Validation Loop ---
                print("  Running Validation...")
                all_preds_text = []
                all_reals_text = []
                # Limit ROUGE calculation for speed (e.g., first N batches or N samples)
                num_val_samples_for_rouge = BATCH_SIZE * 2 # Example: first 2 batches
                samples_evaluated_rouge = 0

                for val_batch_num, (enc_in_val, dec_in_val, dec_target_val) in enumerate(val_dataset):
                    # Calculate standard val loss/accuracy on ALL validation data
                    val_predictions = model([enc_in_val, dec_in_val], training=False) # Use main model
                    batch_val_loss = masked_loss(dec_target_val, val_predictions)
                    batch_val_acc = masked_accuracy(dec_target_val, val_predictions)
                    val_loss_metric(batch_val_loss)
                    val_accuracy_metric(batch_val_acc)

                    # Generate summaries & collect text for ROUGE on a SUBSET
                    if scorer and samples_evaluated_rouge < num_val_samples_for_rouge:
                        if val_batch_num == 0: print(f"    Generating summaries for ROUGE (approx {num_val_samples_for_rouge} samples)...")
                        for i in range(enc_in_val.shape[0]):
                             if samples_evaluated_rouge < num_val_samples_for_rouge:
                                 single_enc_input = tf.expand_dims(enc_in_val[i], 0)
                                 decoded_ids = greedy_decode_sequence(single_enc_input, MAXLEN_OUTPUT, sp.bos_id(), sp.eos_id())
                                 pred_text = sp.decode(decoded_ids)
                                 # Get non-padding target tokens for reference text
                                 real_target_tokens = [int(t) for t in dec_target_val[i].numpy() if t != 0 ]
                                 real_text = sp.decode(real_target_tokens)
                                 all_preds_text.append(pred_text)
                                 all_reals_text.append(real_text)
                                 samples_evaluated_rouge += 1

                epoch_val_loss = val_loss_metric.result().numpy()
                epoch_val_acc = val_accuracy_metric.result().numpy()
                history_custom['val_loss'].append(epoch_val_loss)
                history_custom['val_accuracy'].append(epoch_val_acc)

                # --- Calculate and Record ROUGE ---
                val_rouge1, val_rouge2, val_rougeL = np.nan, np.nan, np.nan # Default to NaN
                if scorer and all_preds_text:
                    try:
                        aggregator = rouge_scorer.scoring.BootstrapAggregator()
                        print(f"    Calculating ROUGE on {len(all_preds_text)} generated summaries...")
                        for pred, real in zip(all_preds_text, all_reals_text):
                            pred = pred if pred else " " # Handle empty strings
                            real = real if real else " "
                            scores = scorer.score(target=real, prediction=pred)
                            aggregator.add_scores(scores)
                        result = aggregator.aggregate()
                        val_rouge1 = result['rouge1'].mid.fmeasure * 100
                        val_rouge2 = result['rouge2'].mid.fmeasure * 100
                        val_rougeL = result['rougeL'].mid.fmeasure * 100
                        print(f'Epoch {epoch + 1} Validation -- Loss: {epoch_val_loss:.4f}, Acc: {epoch_val_acc:.4f}, ROUGE-L: {val_rougeL:.2f}')
                    except Exception as rouge_e:
                        print(f"Warning: Error calculating ROUGE scores: {rouge_e}")
                        # Keep ROUGE scores as NaN
                else:
                    print(f'Epoch {epoch + 1} Validation -- Loss: {epoch_val_loss:.4f}, Acc: {epoch_val_acc:.4f} (ROUGE not calculated or scorer unavailable)')

                # Append scores (will append NaN if not calculated)
                history_custom['val_rouge1'].append(val_rouge1)
                history_custom['val_rouge2'].append(val_rouge2)
                history_custom['val_rougeL'].append(val_rougeL)


                # --- Early Stopping & Model Saving Logic ---
                # Monitor ROUGE-L F1 score if available and not NaN, otherwise monitor inverted validation loss
                current_val_metric = np.nan # Default to NaN
                if scorer and not np.isnan(val_rougeL):
                    current_val_metric = val_rougeL
                    monitor_metric_name = 'ROUGE-L F1'
                    is_better = current_val_metric > best_val_metric # Higher ROUGE is better
                else:
                    current_val_metric = -epoch_val_loss # Use negative loss (higher is better)
                    monitor_metric_name = 'Neg Validation Loss'
                    is_better = current_val_metric > best_val_metric # Higher neg loss (lower actual loss) is better


                if is_better:
                    print(f'Validation {monitor_metric_name} improved from {best_val_metric:.4f} to {current_val_metric:.4f}. Saving model weights to {MODEL_WEIGHTS_SAVE_PATH}...')
                    best_val_metric = current_val_metric
                    epochs_no_improve = 0
                    # Save only weights
                    model.save_weights(MODEL_WEIGHTS_SAVE_PATH)
                else:
                    epochs_no_improve += 1
                    print(f'Validation {monitor_metric_name} did not improve ({current_val_metric:.4f} vs best {best_val_metric:.4f}). Patience: {epochs_no_improve}/{EARLY_STOPPING_PATIENCE}')

                if epochs_no_improve >= EARLY_STOPPING_PATIENCE:
                    print(f"\nEarly stopping triggered after epoch {epoch + 1}.")
                    break # Exit epoch loop

                print(f"Epoch {epoch + 1} Time: {time.time() - epoch_start_time:.2f} sec")


            # --- End of Epoch Loop ---
            total_train_time = time.time() - start_train_time
            print(f"\n--- Custom Training Loop Finished ---")
            print(f"Total training time: {total_train_time:.2f} seconds ({total_train_time / 60:.2f} minutes)")
            print(f"Best Validation Metric ({monitor_metric_name}): {best_val_metric:.4f}")
            if os.path.exists(MODEL_WEIGHTS_SAVE_PATH):
                 print(f"Model weights for best epoch saved to '{MODEL_WEIGHTS_SAVE_PATH}'")
            else:
                 print("Warning: Best model weights file not found (Training might have stopped before improvement).")


        else: # Handle dataset creation failure
             print("ERROR: Cannot start training loop because TensorFlow dataset creation failed.")
             history_custom = None

    else: # Handle failure to build inference models
        print("ERROR: Cannot start training loop because inference model building failed.")
        history_custom = None

else:
    print("Skipping custom training loop due to missing prerequisites (check previous cell outputs for errors).")
    history_custom = None # Ensure history is None


--- Phase 7: Custom Training Loop ---
Setting up training components...
Building inference models for training step...
  Using projected encoder states for decoder init.
Inference Encoder built.
Inference Decoder built.

--- Starting Custom Training Loop ---
Creating TensorFlow Datasets for efficient batching...
Datasets created.
Train steps per epoch: 982, Val steps per epoch: 174

Epoch 1/20
  Scheduled Sampling Probability (Teacher Forcing): 0.9091


2025-04-23 21:04:52.003699: I tensorflow/compiler/xla/service/service.cc:173] XLA service 0x55f0a24018a0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2025-04-23 21:04:52.003731: I tensorflow/compiler/xla/service/service.cc:181]   StreamExecutor device (0): NVIDIA L4, Compute Capability 8.9
2025-04-23 21:04:52.041755: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:268] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2025-04-23 21:04:52.491709: I tensorflow/compiler/jit/xla_compilation_cache.cc:477] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


  Batch 982/982, Loss: 6.6794, Accuracy: 2.4836
Epoch 1 Training ---- Loss: 6.6794, Accuracy: 2.4836
  Running Validation...
Epoch 1 Validation -- Loss: 6.9219, Acc: 0.0527 (ROUGE not calculated or scorer unavailable)


NameError: name 'MODEL_WEIGHTS_SAVE_PATH' is not defined