## Setup and Configuration

In [1]:
# Block 1: Setup and Configuration (Corrected with Definitions)
import os
import re
import json
import time
import logging
from datetime import datetime
import importlib.metadata # Use for getting package versions (Python 3.8+)
import gc # For garbage collection

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
# Callbacks might be replaced by custom loop logic later, but keep imports for now
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard

# Check if SentencePiece is available
try:
    import sentencepiece as spm
except ImportError:
    print("SentencePiece not found. You might need to install it (`pip install sentencepiece`)")
    spm = None

# Check if langdetect is available
try:
    from langdetect import detect, DetectorFactory
    from langdetect.lang_detect_exception import LangDetectException
    DetectorFactory.seed = 0
    _langdetect_installed = True
except ImportError:
    print("langdetect not found. You might need to install it (`pip install langdetect`)")
    _langdetect_installed = False


# Check if rouge-score is available (for evaluation later)
try:
    from rouge_score import rouge_scorer, scoring
except ImportError:
    print("rouge-score not found. You might need to install it (`pip install rouge-score nltk`)")
    try:
        import nltk
        nltk.download('punkt', quiet=True)
    except ImportError:
        print("NLTK not found, which might be needed for rouge-score.")


from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter/Vertex AI Notebooks

# --- Configuration ---
OUTPUT_DIR = 'model_attention_files' # Changed directory name
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Ensuring output directory exists: {os.path.abspath(OUTPUT_DIR)}")

# File Paths (relative to OUTPUT_DIR)
INPUT_JSONL = 'mergedt02.jsonl' # Input dataset (Make sure this is correct)
OUTPUT_PARQUET = os.path.join(OUTPUT_DIR, 'processed_dataframe.parquet') # Cached processed data
TOKENIZER_MODEL_PREFIX = os.path.join(OUTPUT_DIR, 'pib_summarizer_spm_50k') # Prefix for SentencePiece model
TOKENIZER_MODEL_FILE = f'{TOKENIZER_MODEL_PREFIX}.model'
# LOG_DIR will be used for TensorBoard path in custom loop
LOG_DIR = os.path.join(OUTPUT_DIR, "logs", "custom_train", datetime.now().strftime("%Y%m%d-%H%M%S"))
# MODEL_SAVE_PATH is now used for the *best* model path in custom loop
BEST_MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, 'pib_summarizer_attention_best.keras')
# Checkpoint directory for intermediate saves
CHECKPOINT_DIR = os.path.join(OUTPUT_DIR, 'training_checkpoints')


# Data Processing Params (Keep Same)
MIN_INPUT_WORDS = 20
MIN_SUMMARY_WORDS = 5
LANG_DETECT_THRESHOLD = 0.90

# Tokenizer Params (Keep Same)
VOCAB_SIZE = 30000 # Target vocabulary size
PAD_ID = 0
UNK_ID = 1
START_ID = 2
END_ID = 3

# Model Hyperparameters (Adjust as needed, add Attention Units)
EMBEDDING_DIM = 100 # Adjusted based on previous log output
LSTM_UNITS = 128   # Encoder units per direction (Adjusted based on previous log)
DECODER_LSTM_UNITS = LSTM_UNITS * 2 # 512 (Adjusted based on previous log)
NUM_ENCODER_LAYERS = 1
NUM_DECODER_LAYERS = 1
DROPOUT_RATE = 0.2
MAX_LEN_INPUT = 1024
MAX_LEN_SUMMARY = 150
ATTENTION_UNITS = 512 # Size of the dense layer in attention calculation (Adjusted based on previous log)

# Training Params (Adjust Batch Size, Epochs may be indicative for custom loop)
BATCH_SIZE = 32 # Adjust based on GPU memory with attention
EPOCHS = 30 # Max epochs for custom loop, early stopping will be manual
LEARNING_RATE = 0.001
# <<< NEW: Scheduled Sampling Params (Example values) >>>
INITIAL_SAMPLING_PROB = 1.0 # Start with 100% teacher forcing
SAMPLING_PROB_DECAY_RATE = 0.99 # Example: Multiplicative decay per epoch/step
MIN_SAMPLING_PROB = 0.1 # Minimum probability to use ground truth

# <<< NEW: Manual Early Stopping/LR Reduction Params (Example values) >>>
# --- ADDED THESE THREE LINES ---
EARLY_STOPPING_PATIENCE_MANUAL = 5 # Epochs to wait for val_loss improvement
REDUCE_LR_PATIENCE_MANUAL = 3    # Epochs to wait for val_loss improvement before reducing LR
REDUCE_LR_FACTOR_MANUAL = 0.2    # Factor to reduce LR by
# --- END OF ADDED LINES ---

# Inference Params
# <<< NEW: Beam Search Width >>>
BEAM_WIDTH = 4 # Set desired beam width

# --- Setup Logging ---
LOG_FILE_PATH = os.path.join(OUTPUT_DIR, 'training_log_attention.log') # Changed log name
# Ensure logging handlers are cleared if re-running the cell in the same kernel session
root_logger = logging.getLogger()
if root_logger.hasHandlers():
    root_logger.handlers.clear()

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(LOG_FILE_PATH),
                        logging.StreamHandler()
                    ])
print(f"Logging setup complete. Log file: {LOG_FILE_PATH}")


# --- Debugging Info ---
print("\n--- Configuration (Attention, SS, Beam Search) ---")
logging.info("--- Configuration (Attention, SS, Beam Search) ---")
print(f"TensorFlow Version: {tf.__version__}"); logging.info(f"TensorFlow Version: {tf.__version__}")
if spm: print(f"SentencePiece Version: {spm.__version__}"); logging.info(f"SentencePiece Version: {spm.__version__}")
else: print("SentencePiece not imported."); logging.warning("SentencePiece not imported.")

if _langdetect_installed:
    try:
        langdetect_version = importlib.metadata.version("langdetect")
        print(f"Langdetect Version: {langdetect_version}"); logging.info(f"Langdetect Version: {langdetect_version}")
    except Exception as e:
        print(f"Langdetect Version: Could not determine version ({e})"); logging.warning(f"Langdetect Version: Could not determine version ({e})")
else:
     print("Langdetect: Not installed or failed to import."); logging.warning("Langdetect: Not installed or failed to import.")

gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU Available: {gpu_devices}"); logging.info(f"GPU Available: {gpu_devices}")
print(f"Input JSONL: {INPUT_JSONL}"); logging.info(f"Input JSONL: {INPUT_JSONL}")
print(f"Output Parquet Cache: {OUTPUT_PARQUET}"); logging.info(f"Output Parquet Cache: {OUTPUT_PARQUET}")
print(f"Tokenizer Model Prefix: {TOKENIZER_MODEL_PREFIX}"); logging.info(f"Tokenizer Model Prefix: {TOKENIZER_MODEL_PREFIX}")
print(f"Model Checkpoint Dir: {CHECKPOINT_DIR}"); logging.info(f"Model Checkpoint Dir: {CHECKPOINT_DIR}")
print(f"Best Model Save Path: {BEST_MODEL_SAVE_PATH}"); logging.info(f"Best Model Save Path: {BEST_MODEL_SAVE_PATH}")
print(f"TensorBoard Log Dir: {LOG_DIR}"); logging.info(f"TensorBoard Log Dir: {LOG_DIR}")
print(f"Vocab Size: {VOCAB_SIZE}"); logging.info(f"Vocab Size: {VOCAB_SIZE}")
print(f"Embedding Dim: {EMBEDDING_DIM}"); logging.info(f"Embedding Dim: {EMBEDDING_DIM}")
print(f"Encoder LSTM Units (per dir): {LSTM_UNITS}"); logging.info(f"Encoder LSTM Units (per dir): {LSTM_UNITS}")
print(f"Decoder LSTM Units: {DECODER_LSTM_UNITS}"); logging.info(f"Decoder LSTM Units: {DECODER_LSTM_UNITS}")
print(f"Attention Units: {ATTENTION_UNITS}"); logging.info(f"Attention Units: {ATTENTION_UNITS}")
print(f"Encoder Layers: {NUM_ENCODER_LAYERS}"); logging.info(f"Encoder Layers: {NUM_ENCODER_LAYERS}")
print(f"Decoder Layers: {NUM_DECODER_LAYERS}"); logging.info(f"Decoder Layers: {NUM_DECODER_LAYERS}")
print(f"Max Input Length: {MAX_LEN_INPUT}"); logging.info(f"Max Input Length: {MAX_LEN_INPUT}")
print(f"Max Summary Length: {MAX_LEN_SUMMARY}"); logging.info(f"Max Summary Length: {MAX_LEN_SUMMARY}")
print(f"Batch Size: {BATCH_SIZE}"); logging.info(f"Batch Size: {BATCH_SIZE}")
print(f"Max Epochs: {EPOCHS}"); logging.info(f"Max Epochs: {EPOCHS}")
print(f"Learning Rate: {LEARNING_RATE}"); logging.info(f"Learning Rate: {LEARNING_RATE}")
print(f"Initial Sampling Prob: {INITIAL_SAMPLING_PROB}"); logging.info(f"Initial Sampling Prob: {INITIAL_SAMPLING_PROB}")
print(f"Sampling Prob Decay: {SAMPLING_PROB_DECAY_RATE}"); logging.info(f"Sampling Prob Decay: {SAMPLING_PROB_DECAY_RATE}")
# Print the newly added manual parameters
print(f"Manual Early Stopping Patience: {EARLY_STOPPING_PATIENCE_MANUAL}"); logging.info(f"Manual Early Stopping Patience: {EARLY_STOPPING_PATIENCE_MANUAL}")
print(f"Manual Reduce LR Patience: {REDUCE_LR_PATIENCE_MANUAL}"); logging.info(f"Manual Reduce LR Patience: {REDUCE_LR_PATIENCE_MANUAL}")
print(f"Manual Reduce LR Factor: {REDUCE_LR_FACTOR_MANUAL}"); logging.info(f"Manual Reduce LR Factor: {REDUCE_LR_FACTOR_MANUAL}")
print(f"Beam Width: {BEAM_WIDTH}"); logging.info(f"Beam Width: {BEAM_WIDTH}")
print("-" * 30); logging.info("-" * 30)

# Also print the paths used in Block 8 for confirmation
print(f"Checkpoint directory (for custom loop): {CHECKPOINT_DIR}")
print(f"Best model save path (for custom loop): {BEST_MODEL_SAVE_PATH}")
print(f"TensorBoard Log Directory (for custom loop): {LOG_DIR}")

2025-04-24 18:30:30.952820: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-24 18:30:31.078033: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-24 18:30:31.911768: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/

Ensuring output directory exists: /home/jupyter/model_attention_files
Logging setup complete. Log file: model_attention_files/training_log_attention.log

--- Configuration (Attention, SS, Beam Search) ---
TensorFlow Version: 2.11.0
SentencePiece Version: 0.2.0
Langdetect Version: 1.0.9
GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Input JSONL: mergedt02.jsonl
Output Parquet Cache: model_attention_files/processed_dataframe.parquet
Tokenizer Model Prefix: model_attention_files/pib_summarizer_spm_50k
Model Checkpoint Dir: model_attention_files/training_checkpoints
Best Model Save Path: model_attention_files/pib_summarizer_attention_best.keras
TensorBoard Log Dir: model_attention_files/logs/custom_train/20250424-183033
Vocab Size: 30000
Embedding Dim: 100
Encoder LSTM Units (per dir): 128
Decoder LSTM Units: 256
Attention Units: 512
Encoder Layers: 1
Decoder Layers: 1
Max Input Length: 1024
Max Summary Length: 150
Batch Size: 32
Max Epochs: 30
Learning R

## Data Loading and Initial Cleaning Functions + Execution

In [2]:
# Block 2: Data Loading and Initial Cleaning Functions + Execution
# (This block remains unchanged from the original working code)

# --- Function Definitions ---
def load_data(jsonl_path):
    """Loads data from a JSONL file."""
    logging.info(f"Attempting to load data from: {jsonl_path}")
    data = []
    lines_processed = 0
    lines_failed = 0
    if not os.path.exists(jsonl_path):
        logging.error(f"Input file not found: {jsonl_path}")
        print(f"\n--- Data Loading Error ---")
        print(f"Error: Input file not found at {jsonl_path}")
        print("Please ensure the file exists and the path is correct.")
        print("-" * 30)
        return pd.DataFrame()

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                lines_processed += 1
                try:
                    # Skip empty lines
                    if not line.strip():
                        logging.warning(f"Skipping empty line: {i+1}")
                        lines_failed +=1
                        continue
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logging.warning(f"Skipping malformed JSON line: {i+1}. Error: {e}")
                    lines_failed += 1
                    continue
        df = pd.DataFrame(data)
        logging.info(f"Loaded {len(df)} records successfully out of {lines_processed} lines ({lines_failed} failed/skipped).")
        # --- Debugging Info ---
        print("\n--- Data Loading ---")
        print(f"Processed {lines_processed} lines from {jsonl_path}.")
        print(f"Successfully loaded {len(df)} records.")
        print(f"Skipped/failed {lines_failed} lines.")
        if not df.empty:
            print("Columns:", df.columns.tolist())
            print("Data Types:\n", df.dtypes)
            print("Sample record (first 5 rows raw_df):\n", df.head())
        else:
            print("Loaded DataFrame (raw_df) is empty. Check input file content and format.")
            logging.warning("Loaded DataFrame (raw_df) is empty after processing the file.")
        print("-" * 30)
        return df
    except FileNotFoundError: # This case is already handled above, but keep for robustness
        logging.error(f"Error: Input file not found at {jsonl_path}")
        print(f"\n--- Data Loading Error ---")
        print(f"Error: Input file not found at {jsonl_path}")
        print("-" * 30)
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"An unexpected error occurred during data loading: {e}", exc_info=True)
        print(f"\n--- Data Loading Error ---")
        print(f"An unexpected error occurred: {e}")
        print("-" * 30)
        return pd.DataFrame()


def clean_text(text):
    """Applies cleaning steps to a single text string."""
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = text.replace('“', '"').replace('”', '"').replace("‘", "'").replace("’", "'")
    text = re.sub(r'file:///[^ ]+\.pdf', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    headers_footers = [
        r"press information bureau", r"government of india", r"ministry of [\w\s]+",
        r"posted on:\s*\d{1,2}\s+\w{3,}\s+\d{4}\s+\d{1,2}:\d{2}\s*[ap]m\s*(by pib \w+)?",
        r"release id: \d+",
        r"\(release id.*?\)","pib \w+",
        r"\*{3,}\s*[a-z\\/]+\s*\*{3,}", # Footer pattern (e.g., ***DS/AK***) - Need double backslash for literal \
    ]
    for pattern in headers_footers:
       text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    text = text.replace('\\n', ' ') # Replace literal '\n' strings if they exist
    text = text.replace('\n', ' ') # Replace actual newline characters
    text = re.sub(r'\s+', ' ', text) # Collapse multiple whitespace characters
    # Keep basic punctuation useful for summarization
    text = re.sub(r"[^a-z0-9\s.,!?'\"-]", "", text) # Allow letters, numbers, space, and .,!?'"-
    text = text.strip()
    return text

# --- Execution for Block 2 ---

# --- Debugging: Test cleaning function ---
print("\n--- Cleaning Function Test (Block 2) ---")
test_text = """
Press Information Bureau\nGovernment of India\nMinistry of Finance\nPosted on: 25 JUL 2024 6:00PM by PIB Delhi
This is a [ 1] test document from file:///path/to/doc.pdf. Check www.example.com.
It has “quotes” and ‘apostrophes’.   Extra spaces. And some !?.,'\"- punctuation.
Bad chars: #$%^&*()_+={}[]|\\:;<>~/
***DS/AK***
(Release ID: 12345)
"""
cleaned_test = clean_text(test_text)
print(f"Original:\n{test_text}")
print(f"\nCleaned:\n{cleaned_test}")
print("-" * 30)

# Load the raw data into a DataFrame called 'raw_df'
raw_df = load_data(INPUT_JSONL)

# <<< CRUCIAL CHECK >>>
if 'raw_df' not in locals() or raw_df.empty:
    print("************************************************************")
    print("ERROR: Block 2 failed to load data into `raw_df`.")
    print("Cannot proceed to Block 3. Please check the 'Data Loading' output above.")
    print(f"Verify the INPUT_JSONL path ('{INPUT_JSONL}') and the file content.")
    print("************************************************************")
    # Optional: Stop execution here in a notebook context if desired
    # raise RuntimeError("Failed to load raw data. Stopping execution.")
else:
    print("Block 2 completed. `raw_df` created successfully.")
    logging.info("Block 2 completed. `raw_df` created successfully.")

# A variable 'processed_df' will be created in the next block IF raw_df is valid

2025-04-24 18:30:33,319 - INFO - Attempting to load data from: mergedt02.jsonl



--- Cleaning Function Test (Block 2) ---
Original:

Press Information Bureau
Government of India
Ministry of Finance
Posted on: 25 JUL 2024 6:00PM by PIB Delhi
This is a [ 1] test document from file:///path/to/doc.pdf. Check www.example.com.
It has “quotes” and ‘apostrophes’.   Extra spaces. And some !?.,'"- punctuation.
Bad chars: #$%^&*()_+={}[]|\:;<>~/
***DS/AK***
(Release ID: 12345)


Cleaned:
25 jul 2024 600pm by this is a test document from . check it has "quotes" and 'apostrophes'. extra spaces. and some !?.,'"- punctuation. bad chars
------------------------------


2025-04-24 18:30:34,447 - INFO - Loaded 37392 records successfully out of 37392 lines (0 failed/skipped).
2025-04-24 18:30:34,465 - INFO - Block 2 completed. `raw_df` created successfully.



--- Data Loading ---
Processed 37392 lines from mergedt02.jsonl.
Successfully loaded 37392 records.
Skipped/failed 0 lines.
Columns: ['pdf_filename', 'extracted_text', 'gemini_summary', 'gemini_topics', 'preserve_case', 'stopword_removed']
Data Types:
 pdf_filename        object
extracted_text      object
gemini_summary      object
gemini_topics       object
preserve_case         bool
stopword_removed      bool
dtype: object
Sample record (first 5 rows raw_df):
                 pdf_filename  \
0  PIB_115363_2015_02_11.pdf   
1  PIB_115365_2015_02_11.pdf   
2  PIB_115367_2015_02_12.pdf   
3  PIB_115368_2015_02_12.pdf   
4  PIB_115369_2015_02_12.pdf   

                                      extracted_text  \
0  States can flexibly use of central assistance ...   
1  Consultative Committee of the Ministry of Tour...   
2  Prime Minister's Office\nPM pays tributes to S...   
3  Prime Minister's Office\nPM appalled at the ne...   
4  Shri Bandaru Dattatreya Chairs Tripartite Meet...   

  

In [3]:
# Block 3: Preprocessing Pipeline + Execution
# (This block also remains unchanged from the original working code)

# --- Function Definition ---
def preprocess_data(df, text_col='extracted_text', summary_col='gemini_summary', cache_path=OUTPUT_PARQUET):
    """Applies the full preprocessing pipeline to the dataframe."""
    logging.info(f"Preprocessing started. Cache path: {cache_path}")
    start_time = time.time()

    # --- Cache Check ---
    if os.path.exists(cache_path):
        try:
            logging.info(f"Loading processed data from cache: {cache_path}")
            processed_df_from_cache = pd.read_parquet(cache_path)
            # Basic validation of cached data
            if not isinstance(processed_df_from_cache, pd.DataFrame) or processed_df_from_cache.empty:
                 raise ValueError("Cached file did not contain a valid DataFrame.")
            if 'cleaned_text' not in processed_df_from_cache.columns or 'target_summary' not in processed_df_from_cache.columns:
                 raise ValueError("Cached DataFrame missing required columns ('cleaned_text', 'target_summary').")

            logging.info(f"Successfully loaded {len(processed_df_from_cache)} records from cache.")
            # --- Debugging Info ---
            print("\n--- Preprocessing Pipeline (Block 3) ---")
            print(f"Loaded {len(processed_df_from_cache)} records from cache: {cache_path}")
            print("Columns:", processed_df_from_cache.columns.tolist())
            print("Sample processed data (from cache):\n", processed_df_from_cache.head())
            print("-" * 30)
            return processed_df_from_cache # Return cached data
        except Exception as e:
            logging.error(f"Failed to load or parse cache file {cache_path}: {e}", exc_info=True)
            print(f"\n--- Preprocessing Cache Error ---")
            print(f"Error reading cache file {cache_path}: {e}. Proceeding with reprocessing.")
            # If cache fails, delete it to force reprocessing next time
            try:
                os.remove(cache_path)
                logging.info(f"Removed corrupted cache file: {cache_path}")
            except OSError:
                 pass # Ignore error if file couldn't be removed


    # --- Preprocessing Steps (if cache doesn't exist or failed) ---
    print("\n--- Preprocessing Pipeline (Block 3) ---")
    print(f"Cache not used or failed. Starting processing of {len(df)} raw records...")
    logging.info(f"Cache not used or failed. Starting processing of {len(df)} raw records...")


    # Ensure required columns exist in the input df
    if text_col not in df.columns or summary_col not in df.columns:
        logging.error(f"Input DataFrame missing required columns: '{text_col}' or '{summary_col}'. Available: {df.columns.tolist()}")
        print("\n--- Preprocessing Pipeline Error ---")
        print(f"Input DataFrame missing required columns: '{text_col}' or '{summary_col}'. Available: {df.columns.tolist()}")
        print("Cannot preprocess.")
        print("-" * 30)
        return pd.DataFrame() # Return empty dataframe

    # Make a copy to avoid SettingWithCopyWarning on the original raw_df
    df_processed = df[[text_col, summary_col]].copy() # Only copy necessary columns

    logging.info("Applying text cleaning...")
    tqdm.pandas(desc="Cleaning Text")
    df_processed['cleaned_text'] = df_processed[text_col].progress_apply(clean_text)
    tqdm.pandas(desc="Cleaning Summary")
    df_processed['cleaned_summary'] = df_processed[summary_col].progress_apply(clean_text)

    initial_count = len(df_processed)
    logging.info(f"Initial record count for processing: {initial_count}")
    print(f"Initial record count for processing: {initial_count}")
    print("Sample after basic cleaning:")
    cols_to_display = ['cleaned_text', 'cleaned_summary']
    if not df_processed.empty:
        print(df_processed[cols_to_display].head())


    # Language Filtering
    if _langdetect_installed: # Only run if library is available
        logging.info("Applying language filtering...")
        valid_indices = []
        skipped_lang = 0
        for index, text in tqdm(df_processed['cleaned_text'].items(), total=len(df_processed), desc="Language Filtering"):
            try:
                # Skip if text is too short or missing (less prone to langdetect errors)
                if not text or len(text.split()) < 5: # Use word count as proxy for meaningful text
                    skipped_lang += 1
                    continue
                # Detect on first 500 chars for efficiency
                lang = detect(text[:500])
                if lang == 'en': # Simple check for English
                    valid_indices.append(index)
                else:
                    skipped_lang += 1
            except LangDetectException:
                # This happens on very short/ambiguous text, treat as non-English
                skipped_lang += 1
            except Exception as e:
                # Catch any other unexpected errors during detection
                logging.warning(f"Language detection error on index {index}: {e}")
                skipped_lang += 1

        df_processed = df_processed.loc[valid_indices].copy() # Use .loc and .copy()
        lang_filtered_count = len(df_processed)
        logging.info(f"Language filtering: Kept {lang_filtered_count}, Removed {initial_count - lang_filtered_count} non-English/error/short records.")
        print(f"Count after language filtering: {lang_filtered_count} ({initial_count - lang_filtered_count} removed)")
    else:
        print("Skipping language filtering as langdetect is not available.")
        logging.warning("Skipping language filtering as langdetect is not available.")
        lang_filtered_count = len(df_processed) # Count remains the same


    # Length Filtering (only if records remain)
    original_count_before_len_filter = len(df_processed)
    if not df_processed.empty:
        logging.info("Applying length filtering...")
        # Avoid SettingWithCopyWarning using .loc for assignments
        df_processed.loc[:, 'text_word_count'] = df_processed['cleaned_text'].apply(lambda x: len(x.split()))
        df_processed.loc[:, 'summary_word_count'] = df_processed['cleaned_summary'].apply(lambda x: len(x.split()))

        df_processed = df_processed[
            (df_processed['text_word_count'] >= MIN_INPUT_WORDS) &
            (df_processed['summary_word_count'] >= MIN_SUMMARY_WORDS)
        ].copy() # Use boolean indexing and copy
        len_filtered_count = len(df_processed)
        logging.info(f"Length filtering: Kept {len_filtered_count}, Removed {original_count_before_len_filter - len_filtered_count} short records.")
        print(f"Count after length filtering: {len_filtered_count} ({original_count_before_len_filter - len_filtered_count} removed)")
    else:
        print("Skipping length filtering as DataFrame is empty after previous steps.")
        logging.warning("Skipping length filtering due to empty DataFrame.")
        len_filtered_count = 0

    # Add Start/End Tokens (only if records remain)
    final_processed_df = pd.DataFrame() # Initialize
    if not df_processed.empty:
        logging.info("Adding <start> and <end> tokens to summaries...")
        # Use .loc for assignment to avoid warnings
        df_processed.loc[:, 'cleaned_summary_tagged'] = df_processed['cleaned_summary'].apply(lambda x: f"<start> {x} <end>")
        # Final Selection and Renaming
        final_processed_df = df_processed[['cleaned_text', 'cleaned_summary_tagged']].rename(columns={'cleaned_summary_tagged': 'target_summary'})
    else:
        print("Skipping token tagging as DataFrame is empty.")
        logging.warning("Skipping token tagging due to empty DataFrame.")


    # --- Final Output and Caching ---
    final_count = len(final_processed_df)
    print(f"Final processed record count: {final_count}")
    logging.info(f"Final processed record count: {final_count}")
    if not final_processed_df.empty:
        print("Sample processed data (final):\n", final_processed_df.head())
        # Cache the result
        try:
            logging.info(f"Caching processed data to: {cache_path}")
            final_processed_df.to_parquet(cache_path, index=False)
            logging.info("Caching successful.")
            print(f"Processed data cached successfully to {cache_path}")
        except Exception as e:
            logging.error(f"Failed to cache processed data to {cache_path}: {e}", exc_info=True)
            print(f"\n--- Caching Error ---")
            print(f"Failed to cache data to {cache_path}: {e}")
    else:
        logging.warning("Processed DataFrame is empty. No data will be cached.")
        print("Warning: Processed DataFrame is empty. Nothing to cache.")

    end_time = time.time()
    logging.info(f"Preprocessing finished in {end_time - start_time:.2f} seconds.")
    print(f"Preprocessing finished in {end_time - start_time:.2f} seconds.")
    print("-" * 30)

    return final_processed_df

# --- Execution for Block 3 ---
processed_df = pd.DataFrame() # Initialize as empty DataFrame

# Only proceed if raw_df from Block 2 exists and is not empty
if 'raw_df' in locals() and isinstance(raw_df, pd.DataFrame) and not raw_df.empty:
    print("Proceeding with preprocessing using `raw_df`...")
    processed_df = preprocess_data(raw_df, cache_path=OUTPUT_PARQUET)
else:
    print("Skipping Block 3 execution because `raw_df` is not available or is empty.")
    print("Check the output of Block 2 for errors.")
    logging.error("Skipping Block 3 execution because `raw_df` is not available or is empty.")

# <<< CRUCIAL CHECK >>>
if 'processed_df' not in locals() or processed_df.empty:
    print("************************************************************")
    print("WARNING: Block 3 resulted in an empty `processed_df`.")
    print("This could be due to loading errors, aggressive filtering, or issues during processing.")
    print("Subsequent blocks (Tokenizer, Model Training) will likely fail or be skipped.")
    print("Review the 'Preprocessing Pipeline' output above.")
    print("************************************************************")
    logging.warning("Block 3 resulted in an empty `processed_df`.")
else:
    print("Block 3 completed. `processed_df` created or loaded successfully.")
    logging.info("Block 3 completed. `processed_df` created or loaded successfully.")

# Clean up raw_df if memory is a concern and processing was successful
if 'processed_df' in locals() and not processed_df.empty and 'raw_df' in locals():
   print("Cleaning up raw_df from memory...")
   del raw_df
   gc.collect()
   logging.info("Cleaned up raw_df from memory.")

2025-04-24 18:30:34,489 - INFO - Preprocessing started. Cache path: model_attention_files/processed_dataframe.parquet
2025-04-24 18:30:34,490 - INFO - Loading processed data from cache: model_attention_files/processed_dataframe.parquet


Proceeding with preprocessing using `raw_df`...


2025-04-24 18:30:35,017 - INFO - Successfully loaded 37310 records from cache.
2025-04-24 18:30:35,020 - INFO - Block 3 completed. `processed_df` created or loaded successfully.
2025-04-24 18:30:35,205 - INFO - Cleaned up raw_df from memory.



--- Preprocessing Pipeline (Block 3) ---
Loaded 37310 records from cache: model_attention_files/processed_dataframe.parquet
Columns: ['cleaned_text', 'target_summary']
Sample processed data (from cache):
                                         cleaned_text  \
0  states can flexibly use of central assistance ...   
1  consultative committee of the  dr mahesh sharm...   
2  prime minister's office pm pays tributes to sw...   
3  prime minister's office pm appalled at the new...   
4  shri bandaru dattatreya chairs tripartite meet...   

                                      target_summary  
0  <start> a meeting of the parliamentary consult...  
1  <start> a meeting of the parliamentary consult...  
2  <start> prime minister narendra modi paid trib...  
3  <start> prime minister narendra modi expressed...  
4  <start> this press release summarizes a tripar...  
------------------------------
Block 3 completed. `processed_df` created or loaded successfully.
Cleaning up raw_df from memory

## Block 4: Tokenizer Training and Usage + Execution

In [4]:
pip install --upgrade pyarrow

Note: you may need to restart the kernel to use updated packages.


In [5]:
# Block 4: Tokenizer Training and Usage + Execution
# (This block remains unchanged as tokenizer logic is independent of model architecture details like attention)

import os
import pandas as pd
import numpy as np
import logging
import time
import tensorflow as tf
from tqdm.notebook import tqdm
# Ensure sentencepiece is imported, handle if missing from Block 1
try:
    import sentencepiece as spm
except ImportError:
    spm = None # Already handled in Block 1, but good practice to check

# --- Function Definitions ---

def train_sentencepiece(data_series, model_prefix, vocab_size, special_tokens_map):
    """Trains a SentencePiece Unigram model."""
    if spm is None:
        logging.error("SentencePiece library not available. Cannot train tokenizer.")
        print("\n--- SentencePiece Training Error ---")
        print("SentencePiece library not available. Install it first.")
        print("-" * 30)
        return False

    logging.info(f"Starting SentencePiece training. Output prefix: {model_prefix}")
    start_time = time.time()

    # Create a temporary file to store the text data for training
    temp_dir = os.path.dirname(model_prefix)
    os.makedirs(temp_dir, exist_ok=True)
    temp_text_file = f"{model_prefix}_training_data.txt" # Use prefix for temp file name

    try:
        if data_series.empty:
            logging.error("Cannot train SentencePiece on empty data series.")
            print("\n--- SentencePiece Training Error ---")
            print("Input data series is empty. Cannot train tokenizer.")
            print("-" * 30)
            return False # Indicate failure

        # Write data ensuring strings and handling potential NaN
        with open(temp_text_file, 'w', encoding='utf-8') as f:
            for text in tqdm(data_series, desc="Writing Training Data"):
                 if pd.notna(text): # Check for NaN or None
                    f.write(str(text) + '\n') # Ensure text is string
        logging.info(f"Training data written to {temp_text_file}")

        # Build command string using the map for special tokens
        spm_command = (
            f'--input={temp_text_file} --model_prefix={model_prefix} '
            f'--vocab_size={vocab_size} --model_type=unigram '
            f'--pad_id={special_tokens_map["pad_id"]} --unk_id={special_tokens_map["unk_id"]} '
            f'--bos_id={special_tokens_map["bos_id"]} --eos_id={special_tokens_map["eos_id"]} '
            f'--unk_piece={special_tokens_map["unk_piece"]} --bos_piece={special_tokens_map["bos_piece"]} '
            f'--eos_piece={special_tokens_map["eos_piece"]} --pad_piece={special_tokens_map["pad_piece"]} '
            f'--hard_vocab_limit=false '
            f'--character_coverage=1.0 ' # Recommended default
            f'--shuffle_input_sentence=true --input_sentence_size=10000000 ' # Process up to 10M lines
            f'--seed_sentencepiece_size=1000000 ' # Use 1M sentences for seeding
            f'--shrinking_factor=0.75 '
            f'--num_threads=16 ' # Use multiple threads if available
            f'--num_sub_iterations=2 '
            f'--max_sentence_length=4192 ' # Default max length
            f'--model_type=unigram ' # Use Unigram model
        )

        print("\n--- SentencePiece Training ---")
        logging.info(f"Running SentencePiece with command args...")
        print(f"Running SentencePiece training command...") # Don't print full command with paths

        spm.SentencePieceTrainer.train(spm_command)

        training_duration = time.time() - start_time
        logging.info(f"SentencePiece training completed in {training_duration:.2f} seconds.")
        print(f"SentencePiece model files created: {model_prefix}.model, {model_prefix}.vocab")
        print(f"Training duration: {training_duration:.2f} seconds.")

        os.remove(temp_text_file)
        logging.info(f"Removed temporary training file: {temp_text_file}")
        print("-" * 30)
        return True # Indicate success

    except Exception as e:
        logging.error(f"SentencePiece training failed: {e}", exc_info=True)
        print(f"\n--- SentencePiece Training Error ---")
        print(f"SentencePiece training failed: {e}")
        # Clean up potentially created files on failure
        if os.path.exists(f"{model_prefix}.model"): os.remove(f"{model_prefix}.model")
        if os.path.exists(f"{model_prefix}.vocab"): os.remove(f"{model_prefix}.vocab")
        if os.path.exists(temp_text_file): os.remove(temp_text_file)
        print("-" * 30)
        return False # Indicate failure


def load_tokenizer(model_path):
    """Loads a trained SentencePiece model."""
    if spm is None:
        logging.error("SentencePiece library not available. Cannot load tokenizer.")
        print("\n--- Tokenizer Loading Error ---")
        print("SentencePiece library not available.")
        print("-" * 30)
        return None

    logging.info(f"Loading SentencePiece tokenizer from: {model_path}")
    if not os.path.exists(model_path):
        logging.error(f"Tokenizer model file not found at {model_path}")
        print(f"\n--- Tokenizer Loading Error ---")
        print(f"Error: Tokenizer model file not found at {model_path}")
        print("-" * 30)
        return None
    try:
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(model_path)
        print("\n--- Tokenizer Loading ---")
        logging.info(f"Successfully loaded tokenizer: {model_path}")
        print(f"Successfully loaded tokenizer: {model_path}")

        # Verify special token IDs match configuration (using global constants from Block 1)
        pad_id_val = tokenizer.pad_id()
        unk_id_val = tokenizer.unk_id()
        bos_id_val = tokenizer.bos_id()
        eos_id_val = tokenizer.eos_id()

        pad_piece_str = tokenizer.id_to_piece(pad_id_val) if pad_id_val != -1 else 'N/A'
        unk_piece_str = tokenizer.id_to_piece(unk_id_val) if unk_id_val != -1 else 'N/A'
        bos_piece_str = tokenizer.id_to_piece(bos_id_val) if bos_id_val != -1 else 'N/A'
        eos_piece_str = tokenizer.id_to_piece(eos_id_val) if eos_id_val != -1 else 'N/A'

        print(f"Vocabulary Size: {tokenizer.vocab_size()}")
        print(f"PAD ID ({pad_piece_str}): {pad_id_val} (Config: {PAD_ID})")
        print(f"UNK ID ({unk_piece_str}): {unk_id_val} (Config: {UNK_ID})")
        print(f"BOS/Start ID ({bos_piece_str}): {bos_id_val} (Config: {START_ID})")
        print(f"EOS/End ID ({eos_piece_str}): {eos_id_val} (Config: {END_ID})")
        print("-" * 30)

        # Sanity check special token IDs
        id_mismatch = False
        if pad_id_val != PAD_ID: logging.warning("PAD ID mismatch!"); id_mismatch=True
        if unk_id_val != UNK_ID: logging.warning("UNK ID mismatch!"); id_mismatch=True
        if bos_id_val != START_ID: logging.warning("BOS ID mismatch!"); id_mismatch=True
        if eos_id_val != END_ID: logging.warning("EOS ID mismatch!"); id_mismatch=True
        if id_mismatch:
             print("WARNING: Loaded tokenizer special token IDs DO NOT match configured IDs!")
             # Optional: Treat as error? For now, just warn.
             # return None # Uncomment to treat ID mismatch as a fatal error

        return tokenizer
    except Exception as e:
        logging.error(f"Failed to load or process tokenizer model from {model_path}: {e}", exc_info=True)
        print(f"\n--- Tokenizer Loading Error ---")
        print(f"Failed to load or process tokenizer model from {model_path}: {e}")
        print("-" * 30)
        return None


def tokenize_texts(texts, tokenizer, max_len):
    """Tokenizes a list/series of texts and pads/truncates them."""
    if tokenizer is None:
        logging.error("Cannot tokenize texts: Tokenizer is None.")
        return np.array([])
    if texts is None or texts.empty:
        logging.warning("Attempted to tokenize an empty list/series of texts.")
        print("Warning: Input texts for tokenization is empty.")
        return np.array([]) # Return empty numpy array

    logging.info(f"Tokenizing {len(texts)} texts with max_len={max_len}...")
    try:
        # Ensure all items are strings, replace None/NaN with empty string
        texts_list = [str(text) if pd.notna(text) else '' for text in texts.tolist()]
        # Use encode_as_ids for efficiency
        tokenized_sequences = tokenizer.encode_as_ids(texts_list)

        padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            tokenized_sequences,
            maxlen=max_len,
            padding='post',      # Pad at the end
            truncating='post',   # Truncate at the end
            value=PAD_ID         # Use the defined PAD_ID
        )
        print("\n--- Tokenization ---")
        logging.info(f"Tokenization successful for {len(texts_list)} texts.")
        print(f"Tokenized {len(texts_list)} texts.")
        print(f"Shape of padded sequences: {padded_sequences.shape}")
        if len(texts_list) > 0 and len(padded_sequences) > 0:
            print(f"Original Text (sample 0): {texts_list[0][:100]}...")
            print(f"Tokenized IDs (sample 0): {padded_sequences[0][:20]}...")
        print("-" * 30)
        # Ensure output is int32 for TensorFlow compatibility
        return padded_sequences.astype(np.int32)
    except Exception as e:
        logging.error(f"Error during text tokenization: {e}", exc_info=True)
        print(f"\n--- Tokenization Error ---")
        print(f"An error occurred during tokenization: {e}")
        print("-" * 30)
        return np.array([])


def detokenize_sequences(sequences, tokenizer):
    """Converts sequences of token IDs back to text. Handles single sequence or batch."""
    if tokenizer is None:
        logging.error("Detokenization failed: Tokenizer is None.")
        return "[Detokenization Error: No Tokenizer]"
    if sequences is None: return [] if isinstance(sequences, list) else ""

    try:
        # Handle different input types (Tensor, ndarray, list)
        if isinstance(sequences, tf.Tensor): sequences = sequences.numpy()
        if isinstance(sequences, np.ndarray): sequences = sequences.tolist()

        # Check if it's a batch or a single sequence
        is_batch = isinstance(sequences, list) and (len(sequences) == 0 or isinstance(sequences[0], list) or isinstance(sequences[0], np.ndarray))
        if not is_batch: sequences = [sequences] # Wrap single list/ndarray for uniform processing

        texts = []
        for seq in sequences:
            # Filter out PAD, START, END tokens before decoding
            actual_tokens = [int(token_id) for token_id in seq
                             if int(token_id) not in [PAD_ID, START_ID, END_ID]]
            # Use decode_ids for efficiency
            texts.append(tokenizer.decode_ids(actual_tokens))

        return texts if is_batch else texts[0]
    except Exception as e:
        logging.error(f"Error during detokenization: {e}", exc_info=True)
        # Try to decode pieces individually in case of error
        try:
            problem_seq_str = [tokenizer.id_to_piece(int(t)) for t in seq]
            logging.error(f"Problematic sequence pieces: {problem_seq_str}")
        except: pass # Ignore errors during error reporting
        return "[Detokenization Error]"

# --- Execution for Block 4 ---
tokenizer = None
encoder_input_data = np.array([])
decoder_input_data = np.array([])
decoder_target_data = np.array([])
tokenization_failed = False # Flag to track status

# Only proceed if processed_df from Block 3 exists and is not empty
if 'processed_df' in locals() and isinstance(processed_df, pd.DataFrame) and not processed_df.empty:
    print("Block 4: `processed_df` is valid. Proceeding with tokenizer.")
    logging.info("Block 4: `processed_df` is valid. Proceeding with tokenizer.")

    # Combine text and summary for tokenizer training data
    if 'cleaned_text' in processed_df.columns and 'target_summary' in processed_df.columns:
        print("Preparing data for tokenizer training...")
        # Ensure consistency by dropping NaN before concatenation
        text_data = processed_df['cleaned_text'].dropna()
        summary_data = processed_df['target_summary'].dropna()
        full_corpus = pd.concat([text_data, summary_data], ignore_index=True)

        print(f"Full corpus size for tokenizer: {len(full_corpus)}")

        # Define special tokens map using constants from Block 1
        special_tokens_map = {
            "pad_id": PAD_ID, "unk_id": UNK_ID, "bos_id": START_ID, "eos_id": END_ID,
            "pad_piece": "<pad>", "unk_piece": "<unk>", "bos_piece": "<start>", "eos_piece": "<end>"
        }

        # Train only if model file doesn't exist
        if not os.path.exists(TOKENIZER_MODEL_FILE):
            print(f"Tokenizer model {TOKENIZER_MODEL_FILE} not found. Starting training...")
            training_successful = train_sentencepiece(full_corpus, TOKENIZER_MODEL_PREFIX, VOCAB_SIZE, special_tokens_map)
            if not training_successful:
                 print("Tokenizer training failed. Cannot proceed with tokenization.")
                 logging.error("Tokenizer training failed.")
                 tokenization_failed = True # Set failure flag
            else:
                 print("Tokenizer training successful.")
                 logging.info("Tokenizer training successful.")
        else:
            print(f"Tokenizer model {TOKENIZER_MODEL_FILE} already exists. Skipping training.")
            logging.info(f"Found existing tokenizer model: {TOKENIZER_MODEL_FILE}. Skipping training.")

        # Load the tokenizer (only if training didn't fail)
        if not tokenization_failed:
            tokenizer = load_tokenizer(TOKENIZER_MODEL_FILE)

            if tokenizer:
                # --- Tokenization ---
                print("Tokenizing cleaned text...")
                encoder_input_data = tokenize_texts(processed_df['cleaned_text'], tokenizer, MAX_LEN_INPUT)

                print("Tokenizing target summaries...")
                # Tokenize summaries (which include <start>/<end>) up to MAX_LEN_SUMMARY
                decoder_full_data = tokenize_texts(processed_df['target_summary'], tokenizer, MAX_LEN_SUMMARY)

                # --- Create Decoder Input/Target ---
                # Check if tokenization produced valid results before slicing
                if encoder_input_data.size > 0 and decoder_full_data.size > 0:
                    if encoder_input_data.shape[0] != decoder_full_data.shape[0]:
                        logging.error(f"Mismatch in tokenized samples: Encoder {encoder_input_data.shape[0]}, Decoder {decoder_full_data.shape[0]}")
                        print("ERROR: Mismatch in number of tokenized encoder/decoder samples. Check preprocessing/tokenization.")
                        tokenization_failed = True
                    else:
                        print("Creating decoder input/target sequences...")
                        # Decoder input: <start> token ... second-to-last token
                        # Note: We slice decoder_full_data, max length becomes MAX_LEN_SUMMARY-1
                        decoder_input_data = decoder_full_data[:, :-1]

                        # Decoder target: first token ... <end> token
                        # Note: We slice decoder_full_data, max length becomes MAX_LEN_SUMMARY-1
                        decoder_target_data = decoder_full_data[:, 1:]

                        print("\n--- Data Shapes After Tokenization & Shifting ---")
                        print("Encoder Input Shape:", encoder_input_data.shape)
                        print("Decoder Input Shape:", decoder_input_data.shape)
                        print("Decoder Target Shape:", decoder_target_data.shape)
                        print("-" * 30)
                        logging.info(f"Tokenization successful. Shapes: Encoder={encoder_input_data.shape}, DecoderIn={decoder_input_data.shape}, DecoderOut={decoder_target_data.shape}")
                else:
                    print("Tokenization resulted in empty arrays. Cannot proceed.")
                    logging.error("Tokenization resulted in empty arrays.")
                    tokenization_failed = True # Set failure flag
                    # Ensure arrays are empty
                    encoder_input_data = np.array([])
                    decoder_input_data = np.array([])
                    decoder_target_data = np.array([])
            else:
                print("Failed to load tokenizer. Cannot proceed.")
                logging.error("Failed to load tokenizer.")
                tokenization_failed = True # Set failure flag
    else:
         print("Skipping tokenizer step: required columns ('cleaned_text', 'target_summary') missing in processed_df.")
         logging.error("Skipping tokenizer step: required columns missing.")
         tokenization_failed = True # Set failure flag
else:
     # This case should have been caught earlier if Blocks 2/3 failed
     print("Skipping Block 4 execution because `processed_df` is not available or is empty.")
     print("Check the output of Block 3.")
     logging.error("Skipping Block 4 execution because `processed_df` is not available or is empty.")
     tokenization_failed = True # Set failure flag


# Final status check for Block 4
if tokenization_failed:
    print("************************************************************")
    print("ERROR: Block 4 failed or was skipped due to issues in previous blocks or during tokenization.")
    print("Cannot proceed to Block 5 (Dataset Creation).")
    print("************************************************************")
else:
    print("Block 4 completed successfully. Tokenized data created.")
    logging.info("Block 4 completed successfully. Tokenized data created.")

2025-04-24 18:30:37,075 - INFO - Block 4: `processed_df` is valid. Proceeding with tokenizer.


Block 4: `processed_df` is valid. Proceeding with tokenizer.
Preparing data for tokenizer training...


2025-04-24 18:30:37,114 - INFO - Found existing tokenizer model: model_attention_files/pib_summarizer_spm_50k.model. Skipping training.
2025-04-24 18:30:37,115 - INFO - Loading SentencePiece tokenizer from: model_attention_files/pib_summarizer_spm_50k.model


Full corpus size for tokenizer: 74620
Tokenizer model model_attention_files/pib_summarizer_spm_50k.model already exists. Skipping training.


2025-04-24 18:30:37,163 - INFO - Successfully loaded tokenizer: model_attention_files/pib_summarizer_spm_50k.model
2025-04-24 18:30:37,164 - INFO - Tokenizing 37310 texts with max_len=1024...



--- Tokenizer Loading ---
Successfully loaded tokenizer: model_attention_files/pib_summarizer_spm_50k.model
Vocabulary Size: 30000
PAD ID (<pad>): 0 (Config: 0)
UNK ID (<unk>): 1 (Config: 1)
BOS/Start ID (<start>): 2 (Config: 2)
EOS/End ID (<end>): 3 (Config: 3)
------------------------------
Tokenizing cleaned text...


2025-04-24 18:30:40,863 - INFO - Tokenization successful for 37310 texts.



--- Tokenization ---
Tokenized 37310 texts.
Shape of padded sequences: (37310, 1024)
Original Text (sample 0): states can flexibly use of central assistance of inr4,000 per toilet, says shri venkaiah naidu parli...
Tokenized IDs (sample 0): [   84   183 18762   759   296   271     7    86   291     7    69  7086
   105  3498     5  2017    31  1648   824   713]...
------------------------------


2025-04-24 18:30:41,129 - INFO - Tokenizing 37310 texts with max_len=150...


Tokenizing target summaries...


2025-04-24 18:30:42,161 - INFO - Tokenization successful for 37310 texts.
2025-04-24 18:30:42,225 - INFO - Tokenization successful. Shapes: Encoder=(37310, 1024), DecoderIn=(37310, 149), DecoderOut=(37310, 149)
2025-04-24 18:30:42,226 - INFO - Block 4 completed successfully. Tokenized data created.



--- Tokenization ---
Tokenized 37310 texts.
Shape of padded sequences: (37310, 150)
Original Text (sample 0): <start> a meeting of the parliamentary consultative committee discussed shortcomings in the jawaharl...
Tokenized IDs (sample 0): [   12     1  8371     1    14    88     7     4  1381    12  3207   195
   598 10431    11     4  3281  3181  2737   308]...
------------------------------
Creating decoder input/target sequences...

--- Data Shapes After Tokenization & Shifting ---
Encoder Input Shape: (37310, 1024)
Decoder Input Shape: (37310, 149)
Decoder Target Shape: (37310, 149)
------------------------------
Block 4 completed successfully. Tokenized data created.


## Block 5: Data Preparation for TensorFlow (tf.data.Dataset) + Execution

In [6]:
# Block 5: Data Preparation for TensorFlow (tf.data.Dataset) + Execution
# (This block also remains unchanged as it prepares data for consumption, compatible with both model.fit and custom loops)

import tensorflow as tf
import numpy as np
import logging

# --- Function Definition ---
def create_tf_dataset(encoder_inputs, decoder_inputs, decoder_targets, batch_size, shuffle=True):
    """Creates a tf.data.Dataset for training or validation."""
    if not isinstance(encoder_inputs, np.ndarray) or \
       not isinstance(decoder_inputs, np.ndarray) or \
       not isinstance(decoder_targets, np.ndarray):
        logging.error("Inputs to create_tf_dataset must be numpy arrays.")
        print("Error: Inputs for dataset creation are not numpy arrays.")
        return None

    if encoder_inputs.size == 0 or decoder_inputs.size == 0 or decoder_targets.size == 0:
        logging.error("Cannot create dataset from empty numpy arrays.")
        print("Error: Input arrays for dataset creation are empty.")
        return None
    if not (encoder_inputs.shape[0] == decoder_inputs.shape[0] == decoder_targets.shape[0]):
        logging.error(f"Mismatch in number of samples: Enc={encoder_inputs.shape[0]}, DecIn={decoder_inputs.shape[0]}, DecOut={decoder_targets.shape[0]}")
        print("Error: Mismatch in number of samples between input/output arrays.")
        return None

    logging.info(f"Creating tf.data.Dataset. Shuffle={shuffle}, Batch Size={batch_size}")
    print("\n--- tf.data.Dataset Creation ---")
    print(f"Input shapes: Encoder={encoder_inputs.shape}, DecoderIn={decoder_inputs.shape}, DecoderOut={decoder_targets.shape}")

    try:
        # Create slices for inputs (dictionary) and targets (dictionary)
        # Ensure keys match the input/output names expected by the model later
        # Keras Functional API uses layer names by default.
        # If using subclassing, define input/output names in the call method.
        # For custom loop, we'll unpack this dictionary later.
        dataset = tf.data.Dataset.from_tensor_slices(
            (
                {"encoder_inputs": encoder_inputs.astype(np.int32), # Ensure int32
                 "decoder_inputs": decoder_inputs.astype(np.int32)}, # Ensure int32
                {"output_layer": decoder_targets.astype(np.int32)}  # Target key matches final Dense layer name
            )
        )

        if shuffle:
            # Use a buffer size approx the size of the dataset for good shuffling
            buffer_size = len(encoder_inputs)
            dataset = dataset.shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)
            logging.info(f"Shuffling dataset with buffer size {buffer_size}")

        # Batch the dataset. drop_remainder=True is often good for stateful operations or consistent processing.
        # For custom loops, you might handle the last partial batch manually if needed.
        dataset = dataset.batch(batch_size, drop_remainder=True)

        # Prefetch for performance
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        logging.info("tf.data.Dataset created successfully.")
        print(f"tf.data.Dataset created (shuffle={shuffle}).")
        print("Element Spec (structure of one batch):")
        print(dataset.element_spec)
        print("-" * 30)
        return dataset

    except Exception as e:
        logging.error(f"Failed to create tf.data.Dataset: {e}", exc_info=True)
        print(f"\n--- Dataset Creation Error ---")
        print(f"Failed to create tf.data.Dataset: {e}")
        print("-" * 30)
        return None

# --- Execution for Block 5 ---
train_dataset = None
val_dataset = None
num_train_samples = 0
num_val_samples = 0
dataset_creation_failed = False # Flag

# Check if Block 4 succeeded and data exists
if ('tokenization_failed' in locals() and not tokenization_failed and
   'encoder_input_data' in locals() and encoder_input_data.size > 0 and
   'decoder_input_data' in locals() and decoder_input_data.size > 0 and
   'decoder_target_data' in locals() and decoder_target_data.size > 0):

    print("Block 5: Tokenized data is valid. Proceeding with dataset creation.")
    logging.info("Block 5: Tokenized data is valid. Proceeding with dataset creation.")

    num_samples = encoder_input_data.shape[0]
    if num_samples > 0:
        # Simple 90/10 split, ensure validation set isn't empty if possible
        num_val_samples = max(1, int(0.1 * num_samples)) if num_samples > 1 else 0
        num_train_samples = num_samples - num_val_samples

        if num_train_samples <= 0:
             print(f"Error: Not enough samples ({num_samples}) for a train/validation split (Val samples = {num_val_samples}).")
             logging.error(f"Not enough samples ({num_samples}) for train/val split.")
             dataset_creation_failed = True
        else:
            print(f"Splitting data: {num_train_samples} train, {num_val_samples} validation.")
            logging.info(f"Splitting data: {num_train_samples} train, {num_val_samples} validation.")

            # Shuffle indices *before* splitting
            indices = np.arange(num_samples)
            np.random.seed(42) # for reproducibility
            np.random.shuffle(indices)
            encoder_input_data = encoder_input_data[indices]
            decoder_input_data = decoder_input_data[indices]
            decoder_target_data = decoder_target_data[indices]
            print("Shuffled data indices before splitting.")

            # Perform the split
            encoder_input_train = encoder_input_data[:num_train_samples]
            decoder_input_train = decoder_input_data[:num_train_samples]
            decoder_target_train = decoder_target_data[:num_train_samples]

            encoder_input_val = encoder_input_data[num_train_samples:]
            decoder_input_val = decoder_input_data[num_train_samples:]
            decoder_target_val = decoder_target_data[num_train_samples:]

            # Create datasets
            print("\nCreating training dataset...")
            train_dataset = create_tf_dataset(
                encoder_input_train, decoder_input_train, decoder_target_train, BATCH_SIZE, shuffle=True
            )
            print("\nCreating validation dataset...")
            val_dataset = create_tf_dataset(
                encoder_input_val, decoder_input_val, decoder_target_val, BATCH_SIZE, shuffle=False # No need to shuffle validation
            )

            if train_dataset is None or val_dataset is None:
                 print("Error: Failed to create train or validation dataset.")
                 logging.error("Failed to create train or validation dataset.")
                 dataset_creation_failed = True
            else:
                # Calculate steps per epoch (useful for custom loop)
                train_steps_per_epoch = len(train_dataset)
                val_steps_per_epoch = len(val_dataset)
                print("\n--- Dataset Splitting and Creation Summary ---")
                print(f"Total samples tokenized: {num_samples}")
                print(f"Training samples: {num_train_samples}, Validation samples: {num_val_samples}")
                print(f"Train dataset created: Yes (Steps per epoch: {train_steps_per_epoch})")
                print(f"Validation dataset created: Yes (Steps per epoch: {val_steps_per_epoch})")
                print("-" * 30)
    else:
        print("No samples found in tokenized data. Cannot create datasets.")
        logging.error("No samples found in tokenized data. Cannot create datasets.")
        dataset_creation_failed = True
else:
    print("Skipping Block 5 execution: Tokenization failed or resulted in empty data.")
    logging.error("Skipping Block 5 execution: Tokenization failed or resulted in empty data.")
    dataset_creation_failed = True


# Final status check for Block 5
if dataset_creation_failed:
    print("************************************************************")
    print("ERROR: Block 5 failed or was skipped.")
    print("Cannot proceed to Block 6 (Model Building).")
    print("Review the output from Block 4 and 5.")
    print("************************************************************")
else:
    print("Block 5 completed successfully. Train/Validation datasets created.")
    logging.info("Block 5 completed successfully. Train/Validation datasets created.")

2025-04-24 18:30:42,246 - INFO - Block 5: Tokenized data is valid. Proceeding with dataset creation.
2025-04-24 18:30:42,247 - INFO - Splitting data: 33579 train, 3731 validation.
2025-04-24 18:30:42,327 - INFO - Creating tf.data.Dataset. Shuffle=True, Batch Size=32


Block 5: Tokenized data is valid. Proceeding with dataset creation.
Splitting data: 33579 train, 3731 validation.
Shuffled data indices before splitting.

Creating training dataset...

--- tf.data.Dataset Creation ---
Input shapes: Encoder=(33579, 1024), DecoderIn=(33579, 149), DecoderOut=(33579, 149)


2025-04-24 18:30:42.407806: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-24 18:30:42.543715: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20750 MB memory:  -> device: 0, name: NVIDIA L4, pci bus id: 0000:00:03.0, compute capability: 8.9
2025-04-24 18:30:42,866 - INFO - Shuffling dataset with buffer size 33579
2025-04-24 18:30:42,873 - INFO - tf.data.Dataset created successfully.
2025-04-24 18:30:42,873 - INFO - Creating tf.data.Dataset. Shuffle=False, Batch Size=32
2025-04-24 18:30:42,887 - INFO - tf.data.Dataset created successfully.
2025-04-24 18:30:42,890 - INFO - Block 5 completed successfully. Train/Validation dat

tf.data.Dataset created (shuffle=True).
Element Spec (structure of one batch):
({'encoder_inputs': TensorSpec(shape=(32, 1024), dtype=tf.int32, name=None), 'decoder_inputs': TensorSpec(shape=(32, 149), dtype=tf.int32, name=None)}, {'output_layer': TensorSpec(shape=(32, 149), dtype=tf.int32, name=None)})
------------------------------

Creating validation dataset...

--- tf.data.Dataset Creation ---
Input shapes: Encoder=(3731, 1024), DecoderIn=(3731, 149), DecoderOut=(3731, 149)
tf.data.Dataset created (shuffle=False).
Element Spec (structure of one batch):
({'encoder_inputs': TensorSpec(shape=(32, 1024), dtype=tf.int32, name=None), 'decoder_inputs': TensorSpec(shape=(32, 149), dtype=tf.int32, name=None)}, {'output_layer': TensorSpec(shape=(32, 149), dtype=tf.int32, name=None)})
------------------------------

--- Dataset Splitting and Creation Summary ---
Total samples tokenized: 37310
Training samples: 33579, Validation samples: 3731
Train dataset created: Yes (Steps per epoch: 1049)

## Block 6: Model Architecture (Seq2Seq with Attention) + Execution

In [7]:
# Block 6: Model Architecture (Seq2Seq with Attention) + Execution

import tensorflow as tf
from tensorflow.keras.layers import (Input, Embedding, LSTM, Bidirectional, Dense,
                                     Concatenate, AdditiveAttention, Dropout)
from tensorflow.keras.models import Model
import logging

# --- Function Definition ---

def build_seq2seq_attention_model(vocab_size, embedding_dim, lstm_units, decoder_lstm_units,
                                  attention_units, num_encoder_layers, num_decoder_layers,
                                  dropout_rate, max_len_input, max_len_summary_dec_input):
    """Builds the Encoder-Decoder model with Additive (Bahdanau) Attention."""
    logging.info("Building Seq2Seq Model with Attention...")
    print("\n--- Seq2Seq Model Build (with Attention) ---")

    # --- Encoder ---
    encoder_input_layer = Input(shape=(max_len_input,), dtype='int32', name="encoder_inputs")

    # Embedding layer with masking
    encoder_embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True, name="encoder_embedding")
    encoder_embeddings = encoder_embedding_layer(encoder_input_layer)
    encoder_embeddings = Dropout(dropout_rate)(encoder_embeddings) # Dropout after embedding

    # Stacked Bidirectional LSTMs
    encoder_outputs = encoder_embeddings
    encoder_states_list = [] # To store final states [h, c] from each layer

    print(f"Building Encoder with {num_encoder_layers} BiLSTM layers...")
    for i in range(num_encoder_layers):
        # return_sequences=True is crucial for all layers to feed to the next, and for attention from the last layer.
        # return_state=True is needed to get the final states for decoder initialization.
        bilstm_layer = Bidirectional(
            LSTM(lstm_units, return_sequences=True, return_state=True, dropout=dropout_rate, name=f"encoder_bilstm_{i+1}")
        )
        encoder_outputs, forward_h, forward_c, backward_h, backward_c = bilstm_layer(encoder_outputs)

        # Store the states from the *last* layer for decoder initialization
        if i == num_encoder_layers - 1:
            state_h = Concatenate(name="encoder_final_h")([forward_h, backward_h])
            state_c = Concatenate(name="encoder_final_c")([forward_c, backward_c])
            # Final encoder states should match the decoder LSTM units if used directly
            # Ensure DECODER_LSTM_UNITS = LSTM_UNITS * 2 if using this state init scheme
            encoder_states_list = [state_h, state_c]

    # encoder_outputs shape: (batch_size, max_len_input, 2 * lstm_units) - This is the 'value' for attention
    # encoder_states_list: [ (batch_size, 2 * lstm_units), (batch_size, 2 * lstm_units) ] - Final h and c

    # --- Decoder ---
    decoder_input_layer = Input(shape=(max_len_summary_dec_input,), dtype='int32', name="decoder_inputs") # max_len_summary - 1

    # Decoder Embedding layer (separate instance recommended)
    decoder_embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True, name="decoder_embedding")
    decoder_embeddings = decoder_embedding_layer(decoder_input_layer)
    decoder_embeddings = Dropout(dropout_rate)(decoder_embeddings) # Dropout after embedding

    # Attention Layer (Additive/Bahdanau)
    # The query will be the decoder hidden state, value will be the encoder outputs
    attention_layer = AdditiveAttention(name="attention_layer")

    # Stacked Decoder LSTMs
    decoder_lstm_outputs = decoder_embeddings
    # Initial state for the first decoder LSTM comes from the final encoder state
    current_decoder_states = encoder_states_list

    print(f"Building Decoder with {num_decoder_layers} LSTM layers and Attention...")
    for i in range(num_decoder_layers):
        decoder_lstm_layer = LSTM(decoder_lstm_units, return_sequences=True, return_state=True, dropout=dropout_rate, name=f"decoder_lstm_{i+1}")

        # Pass initial state only to the first layer
        if i == 0:
             # We need the full sequence and the state from the LSTM
            decoder_lstm_outputs, state_h, state_c = decoder_lstm_layer(decoder_lstm_outputs, initial_state=current_decoder_states)
        else:
             # Subsequent layers receive the output sequence from the previous layer
             # Keras handles state propagation internally *if initial_state isn't provided*
             # However, for attention, we often need the state from the *last* decoder layer.
             # Let's explicitly manage states for clarity, though might be redundant for training model only.
             # Use the states from the previous decoder layer as initial_state for the current one.
             decoder_lstm_outputs, state_h, state_c = decoder_lstm_layer(decoder_lstm_outputs, initial_state=current_decoder_states)

        # Update the current states for the next iteration or for attention query
        current_decoder_states = [state_h, state_c]

    # Now, calculate attention using the final decoder LSTM output sequence and encoder outputs
    # The query for attention is typically the decoder output sequence (or just the state)
    # The value (and key) is the full encoder output sequence
    # Attention query shape: (batch_size, max_len_summary_dec_input, decoder_lstm_units)
    # Attention value shape: (batch_size, max_len_input, 2 * lstm_units)
    context_vector, attention_weights = attention_layer(
        [decoder_lstm_outputs, encoder_outputs], # Pass inputs as a list
        return_attention_scores=True # Keyword args for other options are fine
    )
    # context_vector shape: (batch_size, max_len_summary_dec_input, 2 * lstm_units)

    # Concatenate the context vector and the decoder LSTM output
    # This combined information is fed to the final prediction layer
    decoder_combined_context = Concatenate(axis=-1, name="decoder_attention_concat")(
        [decoder_lstm_outputs, context_vector]
    )
    decoder_combined_context = Dropout(dropout_rate)(decoder_combined_context) # Dropout before final Dense

    # --- Final Output Layer ---
    # Predicts the next token probability distribution
    # Name matches the key in the target part of the tf.data.Dataset
    output_dense_layer = Dense(vocab_size, activation='softmax', name="output_layer")
    decoder_pred_outputs = output_dense_layer(decoder_combined_context)

    # Define the full model for training
    model = Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=decoder_pred_outputs)

    print("\n--- Combined Training Model (with Attention) Summary ---")
    model.summary(line_length=120)
    print(f"Model Inputs: {[inp.name + ': ' + str(inp.shape) for inp in model.inputs]}")
    print(f"Model Outputs: {[out.name + ': ' + str(out.shape) for out in model.outputs]}")
    print(f"Output layer name: {model.layers[-1].name} (should match dataset target key: 'output_layer')")
    print("-" * 30)
    logging.info("Model with Attention built successfully.")

    return model


# --- Execution for Block 6 ---
model = None # Initialize model variable

# Check if datasets are ready from Block 5
if ('dataset_creation_failed' not in locals() or not dataset_creation_failed):
    print("Proceeding to build the Attention model...")
    # Build the model using constants defined in Block 1
    # Note: Decoder input length is MAX_LEN_SUMMARY - 1 because of the shifting
    model = build_seq2seq_attention_model(
        vocab_size=VOCAB_SIZE,
        embedding_dim=EMBEDDING_DIM,
        lstm_units=LSTM_UNITS, # Encoder units per direction
        decoder_lstm_units=DECODER_LSTM_UNITS, # Decoder units
        attention_units=ATTENTION_UNITS, # Attention layer units
        num_encoder_layers=NUM_ENCODER_LAYERS,
        num_decoder_layers=NUM_DECODER_LAYERS,
        dropout_rate=DROPOUT_RATE,
        max_len_input=MAX_LEN_INPUT,
        max_len_summary_dec_input=MAX_LEN_SUMMARY - 1 # Decoder input length
    )

    if model is None:
        print("************************************************************")
        print("ERROR: Model building failed. Check the logs and model definition.")
        print("Cannot proceed to Block 7 (Optimizer/Loss Setup).")
        print("************************************************************")
        logging.error("Model building failed.")
    else:
        print("Block 6 completed successfully. Model with Attention created.")
        logging.info("Block 6 completed successfully. Model with Attention created.")
else:
     print("Skipping Block 6 execution because dataset creation failed in Block 5.")
     logging.error("Skipping Block 6 execution due to dataset creation failure.")

2025-04-24 18:30:42,910 - INFO - Building Seq2Seq Model with Attention...


Proceeding to build the Attention model...

--- Seq2Seq Model Build (with Attention) ---
Building Encoder with 1 BiLSTM layers...
Building Decoder with 1 LSTM layers and Attention...

--- Combined Training Model (with Attention) Summary ---
Model: "model"
________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            
 encoder_inputs (InputLayer)           [(None, 1024)]             0             []                                      
                                                                                                                        
 encoder_embedding (Embedding)         (None, 1024, 100)          3000000       ['encoder_inputs[0][0]']                
                                                                                                                        
 decoder_inputs (I

2025-04-24 18:30:45,534 - INFO - Model with Attention built successfully.
2025-04-24 18:30:45,537 - INFO - Block 6 completed successfully. Model with Attention created.


Model Inputs: ['encoder_inputs: (None, 1024)', 'decoder_inputs: (None, 149)']
Model Outputs: ['output_layer/Softmax:0: (None, 149, 30000)']
Output layer name: output_layer (should match dataset target key: 'output_layer')
------------------------------
Block 6 completed successfully. Model with Attention created.


## Block 7: Optimizer and Loss Setup

In [8]:
# Block 7: Optimizer and Loss Setup + Execution
# (Modified for Custom Training Loop)

import tensorflow as tf
import logging

# --- Function Definitions (Simplified for Custom Loop) ---

def setup_optimizer_and_loss(learning_rate):
    """Sets up the optimizer and loss function for the custom training loop."""
    logging.info(f"Setting up optimizer and loss function...")
    print("\n--- Optimizer and Loss Setup ---")

    # Optimizer - Adam is a good default
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate, name='Adam')
    print(f"Optimizer: {optimizer.name} (LR={learning_rate})")
    logging.info(f"Optimizer: {optimizer.name} (LR={learning_rate})")

    # Loss Function - SparseCategoricalCrossentropy because target tokens are integers
    # reduction=NONE allows calculating loss per-token, then masking, then averaging.
    # Using from_logits=False because the model's last layer has a softmax activation.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=False, reduction='none'
    )
    print(f"Loss Function: {loss_object.name}")
    logging.info(f"Loss Function: {loss_object.name} (reduction=none)")

    # Metrics - We'll track these manually in the training loop
    # Example: train_loss = tf.keras.metrics.Mean(name='train_loss')
    # Example: train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    print("Metrics will be defined and updated within the custom training loop.")
    print("-" * 30)

    return optimizer, loss_object

# --- Execution for Block 7 ---
optimizer = None
loss_object = None

# Check if model exists from Block 6
if 'model' in locals() and model is not None:
    print("Model found. Setting up optimizer and loss...")
    optimizer, loss_object = setup_optimizer_and_loss(LEARNING_RATE)

    if optimizer is None or loss_object is None:
         print("************************************************************")
         print("ERROR: Failed to set up optimizer or loss object.")
         print("Cannot proceed to Block 8 (Custom Training Loop).")
         print("************************************************************")
         logging.error("Optimizer or loss object setup failed.")
    else:
         print("Block 7 completed successfully. Optimizer and loss object created.")
         logging.info("Block 7 completed successfully. Optimizer and loss object created.")

else:
     print("Skipping Block 7 execution because the model was not built successfully in Block 6.")
     logging.error("Skipping Block 7 execution due to model building failure.")

2025-04-24 18:30:45,551 - INFO - Setting up optimizer and loss function...
2025-04-24 18:30:45,556 - INFO - Optimizer: Adam (LR=0.001)
2025-04-24 18:30:45,557 - INFO - Loss Function: sparse_categorical_crossentropy (reduction=none)
2025-04-24 18:30:45,558 - INFO - Block 7 completed successfully. Optimizer and loss object created.


Model found. Setting up optimizer and loss...

--- Optimizer and Loss Setup ---
Optimizer: Adam (LR=0.001)
Loss Function: sparse_categorical_crossentropy
Metrics will be defined and updated within the custom training loop.
------------------------------
Block 7 completed successfully. Optimizer and loss object created.


## Block 8: Custom Training Loop

In [None]:
# Block 8: Custom Training Loop (Scheduled Sampling) + Execution (Corrected train/val steps)

import time
import tensorflow as tf
import numpy as np
import logging
import os

# --- Metrics Definition --- (Unchanged)
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
val_loss = tf.keras.metrics.Mean(name='val_loss')
val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='val_accuracy')

# --- Masked Loss Function --- (Unchanged)
def masked_loss(real, pred):
    loss = loss_object(real, pred)
    mask = tf.math.logical_not(tf.math.equal(real, PAD_ID))
    mask = tf.cast(mask, dtype=loss.dtype)
    loss *= mask
    total_loss = tf.reduce_sum(loss)
    num_active_elements = tf.reduce_sum(mask)
    return tf.math.divide_no_nan(total_loss, num_active_elements)

# --- Masked Accuracy Function (Removed - Handled directly by metric) ---
# We update metrics directly in train/val steps

# --- Train Step Function (Simplified Forward Pass) ---
@tf.function
def train_step(inputs):
    """Performs one training step with Gradient Tape."""
    # Unpack the dictionary format from tf.data.Dataset
    encoder_input_seq = inputs[0]['encoder_inputs']
    decoder_input_seq = inputs[0]['decoder_inputs'] # Ground truth for teacher forcing
    decoder_target_seq = inputs[1]['output_layer'] # Ground truth targets

    # We are using Teacher Forcing during the training forward pass defined by the Functional Model
    # The Model architecture handles the sequence processing internally.
    # Scheduled Sampling logic needs to be implemented differently, typically
    # by creating a separate model or modifying the layer's call method,
    # which is much more complex than originally anticipated with the Functional API.
    # For now, we will proceed WITHOUT scheduled sampling, using standard Teacher Forcing
    # as defined by the Keras model structure.

    with tf.GradientTape() as tape:
        # Perform the forward pass using the entire model
        # Input is a dictionary matching the keys used in tf.data.Dataset
        predictions = model({'encoder_inputs': encoder_input_seq, 'decoder_inputs': decoder_input_seq}, training=True)
        # predictions shape: (batch_size, max_len_summary_dec_input, vocab_size)

        # Calculate loss for the entire sequence prediction
        # The target sequence `decoder_target_seq` already corresponds to the `predictions`
        # (shifted by one, handled during data prep)
        batch_loss = masked_loss(decoder_target_seq, predictions)

    # Calculate Gradients
    variables = model.trainable_variables
    gradients = tape.gradient(batch_loss, variables)

    # Apply Gradients
    optimizer.apply_gradients(zip(gradients, variables))

    # Update Metrics (use the actual targets and predictions)
    train_loss.update_state(batch_loss)
    train_accuracy.update_state(decoder_target_seq, predictions,
                                sample_weight=tf.cast(tf.math.logical_not(tf.math.equal(decoder_target_seq, PAD_ID)), dtype=tf.float32))


    return batch_loss

# --- Validation Step Function (Simplified Forward Pass) ---
@tf.function
def val_step(inputs):
    """Performs one validation step (using Teacher Forcing)."""
    encoder_input_seq = inputs[0]['encoder_inputs']
    decoder_input_seq = inputs[0]['decoder_inputs'] # Ground truth for teacher forcing
    decoder_target_seq = inputs[1]['output_layer']

    # Perform forward pass in inference mode (training=False)
    predictions = model({'encoder_inputs': encoder_input_seq, 'decoder_inputs': decoder_input_seq}, training=False)

    # Calculate loss
    batch_loss = masked_loss(decoder_target_seq, predictions)

    # Update Validation Metrics
    val_loss.update_state(batch_loss)
    val_accuracy.update_state(decoder_target_seq, predictions,
                              sample_weight=tf.cast(tf.math.logical_not(tf.math.equal(decoder_target_seq, PAD_ID)), dtype=tf.float32))

    return batch_loss

# --- Custom Training Function (Simplified - No Scheduled Sampling Implemented Yet) ---
def train_model_custom(model_to_train, train_ds, val_ds, epochs, optimizer_obj,
                       # Remove sampling parameters for now
                       checkpoint_dir, best_model_path, tensorboard_log_dir,
                       early_stopping_patience, reduce_lr_patience, reduce_lr_factor):
    """Runs the custom training loop (currently using Teacher Forcing)."""

    logging.info("Starting custom model training loop (Teacher Forcing)...")
    print("\n--- Custom Model Training (Teacher Forcing) ---")
    print("NOTE: Scheduled Sampling implementation deferred due to complexity with Functional API.")

    # --- Setup Checkpointing --- (Unchanged)
    checkpoint = tf.train.Checkpoint(optimizer=optimizer_obj, model=model_to_train)
    ckpt_manager = tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=1) # Keep only the best
    best_val_loss = float('inf')
    patience_counter_early_stop = 0
    patience_counter_lr_reduce = 0

    # --- Setup TensorBoard --- (Unchanged)
    summary_writer = tf.summary.create_file_writer(tensorboard_log_dir)

    # --- Calculate steps per epoch --- (Unchanged)
    try:
        steps_per_epoch_train = len(train_ds)
        steps_per_epoch_val = len(val_ds)
        print(f"Train steps per epoch: {steps_per_epoch_train}")
        print(f"Validation steps per epoch: {steps_per_epoch_val}")
    except TypeError:
        print("Could not determine dataset lengths. Ensure drop_remainder=True.")
        logging.error("Could not determine dataset lengths.")
        return None # Exit if dataset length unknown

    # --- Epoch Loop ---
    for epoch in range(epochs):
        start_time_epoch = time.time()

        # Reset metrics at the start of each epoch (Unchanged)
        train_loss.reset_states()
        train_accuracy.reset_states()
        val_loss.reset_states()
        val_accuracy.reset_states()

        print(f"\nEpoch {epoch + 1}/{epochs}")
        logging.info(f"Epoch {epoch + 1} Starting")

        # --- Training Phase ---
        pb_train = tf.keras.utils.Progbar(steps_per_epoch_train, stateful_metrics=['loss', 'accuracy'])
        for i, batch_data in enumerate(train_ds):
            batch_train_loss = train_step(batch_data) # Call simplified train_step
            metrics_values = [('loss', train_loss.result()), ('accuracy', train_accuracy.result())]
            pb_train.update(i + 1, values=metrics_values)

        # --- Validation Phase ---
        print("\nValidation:")
        pb_val = tf.keras.utils.Progbar(steps_per_epoch_val, stateful_metrics=['val_loss', 'val_accuracy'])
        for i, batch_data in enumerate(val_ds):
            val_step(batch_data) # Call simplified val_step
            metrics_values = [('val_loss', val_loss.result()), ('val_accuracy', val_accuracy.result())]
            pb_val.update(i + 1, values=metrics_values)

        epoch_val_loss = val_loss.result()
        epoch_train_loss = train_loss.result()
        epoch_val_acc = val_accuracy.result()
        epoch_train_acc = train_accuracy.result()

        # --- Log Metrics to TensorBoard --- (Unchanged, but removed sampling prob)
        with summary_writer.as_default():
            tf.summary.scalar('train_loss', epoch_train_loss, step=epoch)
            tf.summary.scalar('train_accuracy', epoch_train_acc, step=epoch)
            tf.summary.scalar('val_loss', epoch_val_loss, step=epoch)
            tf.summary.scalar('val_accuracy', epoch_val_acc, step=epoch)
            tf.summary.scalar('learning_rate', optimizer_obj.learning_rate, step=epoch)
            # tf.summary.scalar('sampling_probability', current_sampling_prob, step=epoch) # Removed

        # --- Checkpointing (Save Best Model based on val_loss) --- (Unchanged)
        if epoch_val_loss < best_val_loss:
            print(f"\nValidation loss improved from {best_val_loss:.4f} to {epoch_val_loss:.4f}. Saving model...")
            best_val_loss = epoch_val_loss
            ckpt_save_path = ckpt_manager.save()
            logging.info(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')
            try:
                 model_to_train.save(best_model_path, save_format='keras')
                 print(f"Full model saved to {best_model_path}")
                 logging.info(f"Best model saved to {best_model_path}")
            except Exception as e:
                 print(f"Error saving full model: {e}")
                 logging.error(f"Error saving full model: {e}", exc_info=True)
            patience_counter_early_stop = 0
            patience_counter_lr_reduce = 0
        else:
            print(f"\nValidation loss did not improve from {best_val_loss:.4f}.")
            patience_counter_early_stop += 1
            patience_counter_lr_reduce += 1

        # --- Reduce Learning Rate on Plateau --- (Unchanged)
        if patience_counter_lr_reduce >= reduce_lr_patience:
            new_lr = optimizer_obj.learning_rate * reduce_lr_factor
            if new_lr >= 1e-6:
                optimizer_obj.learning_rate.assign(new_lr)
                print(f"Reducing learning rate to {optimizer_obj.learning_rate.numpy():.6f}.")
                logging.info(f"Reducing learning rate to {optimizer_obj.learning_rate.numpy():.6f}.")
                patience_counter_lr_reduce = 0
            else:
                print(f"Learning rate reduction skipped, already at minimum.")
                logging.warning(f"Learning rate reduction skipped, already at or below minimum.")

        # --- Early Stopping --- (Unchanged)
        if patience_counter_early_stop >= early_stopping_patience:
            print(f"\nEarly stopping triggered after {early_stopping_patience} epochs without improvement.")
            logging.info(f"Early stopping triggered at epoch {epoch+1}.")
            break

        # --- NO Epsilon Decay Update Needed Now ---

        end_time_epoch = time.time()
        print(f"Time taken for epoch {epoch + 1}: {end_time_epoch - start_time_epoch:.2f} sec")

    print("\n--- Training Finished ---")
    print(f"Best validation loss achieved: {best_val_loss:.4f}")
    logging.info(f"Training finished. Best validation loss: {best_val_loss:.4f}")

    # Optional: Load the best weights back into the model (Unchanged)
    print(f"Loading best model weights from {best_model_path}...")
    try:
        model_to_train = tf.keras.models.load_model(best_model_path)
        print("Best model loaded successfully.")
        logging.info("Best model loaded successfully.")
    except Exception as e:
        print(f"Error loading best model: {e}")
        logging.error(f"Error loading best model: {e}")

    return model_to_train

# --- Execution for Block 8 --- (Simplified Call)

# Check if all required components are available (Unchanged)
if ('model' in locals() and model and
    'optimizer' in locals() and optimizer and
    'loss_object' in locals() and loss_object and
    'train_dataset' in locals() and train_dataset and
    'val_dataset' in locals() and val_dataset):

    # Define paths (Unchanged)
    CHECKPOINT_DIR = os.path.join(OUTPUT_DIR, 'training_checkpoints')
    BEST_MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, 'pib_summarizer_attention_best.keras')
    TENSORBOARD_LOG_DIR = LOG_DIR

    print(f"Checkpoint directory: {CHECKPOINT_DIR}")
    print(f"Best model save path: {BEST_MODEL_SAVE_PATH}")
    print(f"TensorBoard Log Directory (for custom loop): {TENSORBOARD_LOG_DIR}")
    os.makedirs(CHECKPOINT_DIR, exist_ok=True)
    os.makedirs(os.path.dirname(TENSORBOARD_LOG_DIR), exist_ok=True)


    # Start training (call simplified function)
    trained_model = train_model_custom(
        model_to_train=model,
        train_ds=train_dataset,
        val_ds=val_dataset,
        epochs=EPOCHS,
        optimizer_obj=optimizer,
        # Remove sampling parameters from call
        checkpoint_dir=CHECKPOINT_DIR,
        best_model_path=BEST_MODEL_SAVE_PATH,
        tensorboard_log_dir=TENSORBOARD_LOG_DIR,
        early_stopping_patience=EARLY_STOPPING_PATIENCE_MANUAL,
        reduce_lr_patience=REDUCE_LR_PATIENCE_MANUAL,
        reduce_lr_factor=REDUCE_LR_FACTOR_MANUAL
    )

    # Update the 'model' variable (Unchanged)
    if trained_model:
        model = trained_model
        print("Block 8 completed. Model trained with custom loop (Teacher Forcing).")
        logging.info("Block 8 completed. Model trained with custom loop (Teacher Forcing).")
    else:
        print("Block 8: Custom training loop did not return a valid model.")
        logging.error("Block 8: Custom training loop did not return a valid model.")

else:
     # Error message (Unchanged)
     print("Skipping custom training (Block 8) due to missing model, optimizer, loss object, or datasets.")
     logging.error("Skipping Block 8 execution due to missing components.")
     print(f"  Model exists: {'model' in locals() and model is not None}")
     print(f"  Optimizer exists: {'optimizer' in locals() and optimizer is not None}")
     print(f"  Loss object exists: {'loss_object' in locals() and loss_object is not None}")
     print(f"  Train dataset exists: {'train_dataset' in locals() and train_dataset is not None}")
     print(f"  Validation dataset exists: {'val_dataset' in locals() and val_dataset is not None}")

2025-04-24 18:30:45,612 - INFO - Starting custom model training loop (Teacher Forcing)...
2025-04-24 18:30:45,621 - INFO - Epoch 1 Starting


Checkpoint directory: model_attention_files/training_checkpoints
Best model save path: model_attention_files/pib_summarizer_attention_best.keras
TensorBoard Log Directory (for custom loop): model_attention_files/logs/custom_train/20250424-183033

--- Custom Model Training (Teacher Forcing) ---
NOTE: Scheduled Sampling implementation deferred due to complexity with Functional API.
Train steps per epoch: 1049
Validation steps per epoch: 116

Epoch 1/30


2025-04-24 18:30:52.394065: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA L4" frequency: 2040 num_cores: 58 environment { key: "architecture" value: "8.9" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 50331648 shared_memory_size_per_multiprocessor: 102400 memory_size: 21758083072 bandwidth: 300048000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2025-04-24 18:30:53.062693: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: 


Validation:


2025-04-24 18:41:45.635978: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "GPU" vendor: "NVIDIA" model: "NVIDIA L4" frequency: 2040 num_cores: 58 environment { key: "architecture" value: "8.9" } environment { key: "cuda" value: "11020" } environment { key: "cudnn" value: "8100" } num_registers: 65536 l1_cache_size: 24576 l2_cache_size: 50331648 shared_memory_size_per_multiprocessor: 102400 memory_size: 21758083072 bandwidth: 300048000 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }



Validation loss improved from inf to 5.1210. Saving model...


2025-04-24 18:42:10,832 - INFO - Saving checkpoint for epoch 1 at model_attention_files/training_checkpoints/ckpt-1




2025-04-24 18:42:10,974 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 18:42:10,979 - INFO - Epoch 2 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 1: 685.36 sec

Epoch 2/30

Validation:

Validation loss improved from 5.1210 to 4.4453. Saving model...


2025-04-24 18:52:15,096 - INFO - Saving checkpoint for epoch 2 at model_attention_files/training_checkpoints/ckpt-2




2025-04-24 18:52:15,486 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 18:52:15,491 - INFO - Epoch 3 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 2: 604.51 sec

Epoch 3/30

Validation:

Validation loss improved from 4.4453 to 4.0901. Saving model...


2025-04-24 19:02:01,521 - INFO - Saving checkpoint for epoch 3 at model_attention_files/training_checkpoints/ckpt-3




2025-04-24 19:02:01,902 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:02:01,906 - INFO - Epoch 4 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 3: 586.42 sec

Epoch 4/30

Validation:

Validation loss improved from 4.0901 to 3.8742. Saving model...


2025-04-24 19:11:40,322 - INFO - Saving checkpoint for epoch 4 at model_attention_files/training_checkpoints/ckpt-4




2025-04-24 19:11:40,706 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:11:40,710 - INFO - Epoch 5 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 4: 578.80 sec

Epoch 5/30

Validation:

Validation loss improved from 3.8742 to 3.7333. Saving model...


2025-04-24 19:21:16,607 - INFO - Saving checkpoint for epoch 5 at model_attention_files/training_checkpoints/ckpt-5




2025-04-24 19:21:16,990 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:21:16,995 - INFO - Epoch 6 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 5: 576.28 sec

Epoch 6/30

Validation:

Validation loss improved from 3.7333 to 3.6355. Saving model...


2025-04-24 19:30:47,148 - INFO - Saving checkpoint for epoch 6 at model_attention_files/training_checkpoints/ckpt-6




2025-04-24 19:30:47,526 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:30:47,531 - INFO - Epoch 7 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 6: 570.54 sec

Epoch 7/30

Validation:

Validation loss improved from 3.6355 to 3.5583. Saving model...


2025-04-24 19:40:19,013 - INFO - Saving checkpoint for epoch 7 at model_attention_files/training_checkpoints/ckpt-7




2025-04-24 19:40:19,394 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:40:19,398 - INFO - Epoch 8 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 7: 571.87 sec

Epoch 8/30

Validation:

Validation loss improved from 3.5583 to 3.5038. Saving model...


2025-04-24 19:49:50,527 - INFO - Saving checkpoint for epoch 8 at model_attention_files/training_checkpoints/ckpt-8




2025-04-24 19:49:50,982 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:49:50,986 - INFO - Epoch 9 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 8: 571.59 sec

Epoch 9/30

Validation:

Validation loss improved from 3.5038 to 3.4577. Saving model...


2025-04-24 19:59:19,695 - INFO - Saving checkpoint for epoch 9 at model_attention_files/training_checkpoints/ckpt-9




2025-04-24 19:59:20,074 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 19:59:20,078 - INFO - Epoch 10 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 9: 569.09 sec

Epoch 10/30

Validation:

Validation loss improved from 3.4577 to 3.4221. Saving model...


2025-04-24 20:08:47,349 - INFO - Saving checkpoint for epoch 10 at model_attention_files/training_checkpoints/ckpt-10




2025-04-24 20:08:47,726 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 20:08:47,730 - INFO - Epoch 11 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 10: 567.65 sec

Epoch 11/30

Validation:

Validation loss improved from 3.3699 to 3.3503. Saving model...


2025-04-24 20:37:07,014 - INFO - Saving checkpoint for epoch 13 at model_attention_files/training_checkpoints/ckpt-13




2025-04-24 20:37:07,390 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 20:37:07,394 - INFO - Epoch 14 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 13: 566.80 sec

Epoch 14/30

Validation:

Validation loss improved from 3.3503 to 3.3363. Saving model...


2025-04-24 20:46:34,326 - INFO - Saving checkpoint for epoch 14 at model_attention_files/training_checkpoints/ckpt-14




2025-04-24 20:46:34,710 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 20:46:34,714 - INFO - Epoch 15 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 14: 567.32 sec

Epoch 15/30

Validation:

Validation loss improved from 3.3363 to 3.3220. Saving model...


2025-04-24 20:56:00,800 - INFO - Saving checkpoint for epoch 15 at model_attention_files/training_checkpoints/ckpt-15




2025-04-24 20:56:01,178 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 20:56:01,182 - INFO - Epoch 16 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 15: 566.47 sec

Epoch 16/30

Validation:

Validation loss improved from 3.3220 to 3.3100. Saving model...


2025-04-24 21:05:26,836 - INFO - Saving checkpoint for epoch 16 at model_attention_files/training_checkpoints/ckpt-16




2025-04-24 21:05:27,214 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 21:05:27,218 - INFO - Epoch 17 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 16: 566.04 sec

Epoch 17/30

Validation:

Validation loss improved from 3.3100 to 3.2983. Saving model...


2025-04-24 21:14:52,921 - INFO - Saving checkpoint for epoch 17 at model_attention_files/training_checkpoints/ckpt-17




2025-04-24 21:14:53,298 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 21:14:53,302 - INFO - Epoch 18 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 17: 566.08 sec

Epoch 18/30

Validation:

Validation loss improved from 3.2983 to 3.2891. Saving model...


2025-04-24 21:24:18,948 - INFO - Saving checkpoint for epoch 18 at model_attention_files/training_checkpoints/ckpt-18




2025-04-24 21:24:19,326 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 21:24:19,330 - INFO - Epoch 19 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 18: 566.03 sec

Epoch 19/30

Validation:

Validation loss improved from 3.2891 to 3.2802. Saving model...


2025-04-24 21:33:44,719 - INFO - Saving checkpoint for epoch 19 at model_attention_files/training_checkpoints/ckpt-19




2025-04-24 21:33:45,098 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 21:33:45,102 - INFO - Epoch 20 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 19: 565.77 sec

Epoch 20/30

Validation:

Validation loss improved from 3.2802 to 3.2742. Saving model...


2025-04-24 21:43:10,153 - INFO - Saving checkpoint for epoch 20 at model_attention_files/training_checkpoints/ckpt-20




2025-04-24 21:43:10,534 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 21:43:10,538 - INFO - Epoch 21 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 20: 565.44 sec

Epoch 21/30

Validation:

Validation loss improved from 3.2742 to 3.2683. Saving model...


2025-04-24 21:52:35,072 - INFO - Saving checkpoint for epoch 21 at model_attention_files/training_checkpoints/ckpt-21




2025-04-24 21:52:35,450 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 21:52:35,454 - INFO - Epoch 22 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 21: 564.92 sec

Epoch 22/30

Validation:


2025-04-24 22:01:59,741 - INFO - Epoch 23 Starting



Validation loss did not improve from 3.2683.
Time taken for epoch 22: 564.29 sec

Epoch 23/30

Validation:

Validation loss improved from 3.2683 to 3.2591. Saving model...


2025-04-24 22:11:25,340 - INFO - Saving checkpoint for epoch 23 at model_attention_files/training_checkpoints/ckpt-22




2025-04-24 22:11:25,718 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 22:11:25,723 - INFO - Epoch 24 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 23: 565.98 sec

Epoch 24/30

Validation:

Validation loss improved from 3.2591 to 3.2521. Saving model...


2025-04-24 22:20:50,551 - INFO - Saving checkpoint for epoch 24 at model_attention_files/training_checkpoints/ckpt-23




2025-04-24 22:20:50,930 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 22:20:50,935 - INFO - Epoch 25 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 24: 565.21 sec

Epoch 25/30

Validation:

Validation loss improved from 3.2521 to 3.2508. Saving model...


2025-04-24 22:30:15,836 - INFO - Saving checkpoint for epoch 25 at model_attention_files/training_checkpoints/ckpt-24




2025-04-24 22:30:16,214 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 22:30:16,218 - INFO - Epoch 26 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 25: 565.28 sec

Epoch 26/30

Validation:

Validation loss improved from 3.2508 to 3.2493. Saving model...


2025-04-24 22:39:40,528 - INFO - Saving checkpoint for epoch 26 at model_attention_files/training_checkpoints/ckpt-25




2025-04-24 22:39:40,906 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 22:39:40,910 - INFO - Epoch 27 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 26: 564.69 sec

Epoch 27/30

Validation:

Validation loss improved from 3.2493 to 3.2459. Saving model...


2025-04-24 22:49:06,442 - INFO - Saving checkpoint for epoch 27 at model_attention_files/training_checkpoints/ckpt-26




2025-04-24 22:49:06,822 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 22:49:06,827 - INFO - Epoch 28 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 27: 565.92 sec

Epoch 28/30

Validation:

Validation loss improved from 3.2459 to 3.2397. Saving model...


2025-04-24 22:58:31,033 - INFO - Saving checkpoint for epoch 28 at model_attention_files/training_checkpoints/ckpt-27




2025-04-24 22:58:31,414 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 22:58:31,418 - INFO - Epoch 29 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 28: 564.59 sec

Epoch 29/30

Validation:

Validation loss improved from 3.2397 to 3.2382. Saving model...


2025-04-24 23:07:56,458 - INFO - Saving checkpoint for epoch 29 at model_attention_files/training_checkpoints/ckpt-28




2025-04-24 23:07:56,838 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 23:07:56,842 - INFO - Epoch 30 Starting


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 29: 565.42 sec

Epoch 30/30

Validation:

Validation loss improved from 3.2382 to 3.2377. Saving model...


2025-04-24 23:17:21,910 - INFO - Saving checkpoint for epoch 30 at model_attention_files/training_checkpoints/ckpt-29




2025-04-24 23:17:22,290 - INFO - Best model saved to model_attention_files/pib_summarizer_attention_best.keras
2025-04-24 23:17:22,291 - INFO - Training finished. Best validation loss: 3.2377


Full model saved to model_attention_files/pib_summarizer_attention_best.keras
Time taken for epoch 30: 565.45 sec

--- Training Finished ---
Best validation loss achieved: 3.2377
Loading best model weights from model_attention_files/pib_summarizer_attention_best.keras...


2025-04-24 23:17:24,683 - INFO - Best model loaded successfully.
2025-04-24 23:17:24,685 - INFO - Block 8 completed. Model trained with custom loop (Teacher Forcing).


Best model loaded successfully.
Block 8 completed. Model trained with custom loop (Teacher Forcing).


In [16]:
# Block 9: Inference Model Setup (with Attention) + Execution (Corrected for 1 Enc/1 Dec Layer)

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Concatenate, AdditiveAttention, Dropout
from tensorflow.keras.models import Model, load_model
import os
import logging
import numpy as np

# --- Function Definition (Corrected for 1 Enc / 1 Dec Layer & Layer Names) ---
def setup_inference_models_attention_alt(trained_model_path,
                                         num_decoder_layers_config, # <<< ADDED ARGUMENT HERE
                                         decoder_lstm_units,
                                         max_len_input, lstm_units): # lstm_units = encoder units per dir # lstm_units = encoder units per dir
    """Loads the trained model and creates separate encoder/decoder models for inference."""
    logging.info(f"Setting up inference models with attention from: {trained_model_path}")
    print("\n--- Inference Setup (with Attention) ---")

    # --- 1. Load the trained model ---
    if not os.path.exists(trained_model_path):
        print(f"Error: Trained model file not found at {trained_model_path}. Cannot setup inference.")
        logging.error(f"Trained model not found: {trained_model_path}")
        print("-" * 30)
        return None, None
    try:
        logging.info("Loading the full trained model...")
        original_verbosity = tf.get_logger().level
        tf.get_logger().setLevel(logging.ERROR)
        trained_model = load_model(trained_model_path, compile=False)
        tf.get_logger().setLevel(original_verbosity)
        print(f"Successfully loaded trained model from {trained_model_path}")
        print("Note: 'No training configuration' warning during load is expected and benign.")
        loaded_layer_names = [layer.name for layer in trained_model.layers]
        print(f"Layers found in loaded model: {loaded_layer_names}")
        logging.info(f"Layers found in loaded model: {loaded_layer_names}")
    except Exception as e:
        tf.get_logger().setLevel(original_verbosity)
        logging.error(f"Failed to load trained model from {trained_model_path}: {e}", exc_info=True)
        print(f"Error: Failed to load trained model from {trained_model_path}: {e}")
        print("-" * 30)
        return None, None

    # --- 2. Create Inference Encoder Model ---
    logging.info("Creating inference encoder model...")
    inf_encoder = None
    try:
        encoder_input_layer = trained_model.get_layer("encoder_inputs").input
        # Use the correct layer name 'bidirectional' (assuming 1 encoder layer)
        last_encoder_bilstm_layer = trained_model.get_layer("bidirectional")
        encoder_full_seq_output = last_encoder_bilstm_layer.output[0]
        # Get the final concatenated states
        encoder_final_state_h = trained_model.get_layer("encoder_final_h").output
        encoder_final_state_c = trained_model.get_layer("encoder_final_c").output

        inf_encoder = Model(inputs=encoder_input_layer,
                            outputs=[encoder_full_seq_output, encoder_final_state_h, encoder_final_state_c],
                            name="inference_encoder")
        print("Inference Encoder created.")
        inf_encoder.summary(line_length=100) # Optional summary
    except Exception as e:
        logging.error(f"Failed to create inference encoder: {e}", exc_info=True)
        print(f"Error: Failed to create inference encoder: {e}")
        return None, None # Return None for both if encoder fails

    # --- 3. Create Inference Decoder Model ---
    logging.info("Creating inference decoder model...")
    inf_decoder = None
    try:
        # Define Inputs
        decoder_input_single_token = Input(shape=(1,), dtype='int32', name="inf_decoder_input_token")
        # Encoder output sequence depth = 2 * encoder lstm units per direction
        inf_encoder_output_seq = Input(shape=(max_len_input, lstm_units * 2), name="inf_encoder_output_seq")

        # --- Define All Decoder Layer Instances FIRST ---
        # Determine actual number of decoder layers (Should be 1 based on loaded layers)
        actual_num_decoder_layers = 0
        if "decoder_lstm_1" in loaded_layer_names: actual_num_decoder_layers = 1
        # if "decoder_lstm_2" in loaded_layer_names: actual_num_decoder_layers = 2 # Add checks if needed
        print(f"Using {actual_num_decoder_layers} decoder LSTM layer(s) based on loaded model.")
        logging.info(f"Using {actual_num_decoder_layers} decoder LSTM layer(s).")
        if actual_num_decoder_layers == 0:
             raise ValueError("No decoder LSTM layers found in the loaded model.")

        # State inputs only needed for the layers present (h0, c0)
        decoder_state_inputs = []
        state_h_input = Input(shape=(decoder_lstm_units,), name=f'inf_decoder_input_h_0')
        state_c_input = Input(shape=(decoder_lstm_units,), name=f'inf_decoder_input_c_0')
        decoder_state_inputs.extend([state_h_input, state_c_input])

        # Embedding Layer Instance
        trained_decoder_embedding = trained_model.get_layer("decoder_embedding")
        inf_decoder_embedding = Embedding(trained_decoder_embedding.input_dim,
                                           trained_decoder_embedding.output_dim,
                                           mask_zero=False, name="inf_decoder_embedding")
        inf_decoder_embedding.build(input_shape=(None, 1))
        inf_decoder_embedding.set_weights(trained_decoder_embedding.get_weights())
        # Dropout after embedding (Use name 'dropout_1' from loaded list)
        trained_dec_emb_dropout_layer = trained_model.get_layer("dropout_1") # CHECKED from list
        inf_dec_emb_dropout = Dropout(trained_dec_emb_dropout_layer.rate, name="inf_dropout_1")

        # LSTM Layer Instance (Only 1)
        layer_name = "decoder_lstm_1"
        trained_lstm_layer = trained_model.get_layer(layer_name)
        inf_decoder_lstm = LSTM(decoder_lstm_units, return_sequences=True, return_state=True,
                                dropout=trained_lstm_layer.dropout,
                                recurrent_dropout=trained_lstm_layer.recurrent_dropout,
                                name=f"inf_{layer_name}")
        # Build LSTM
        lstm_input_shape = (1, 1, trained_decoder_embedding.output_dim) # Takes embedding output
        lstm_state_shape = (1, decoder_lstm_units)
        inf_decoder_lstm.build(input_shape=[lstm_input_shape, lstm_state_shape, lstm_state_shape])
        inf_decoder_lstm.set_weights(trained_lstm_layer.get_weights())

        # Attention Layer Instance
        trained_attention_layer = trained_model.get_layer("attention_layer")
        inf_attention_layer = AdditiveAttention(name="inf_attention_layer")
        # Build explicitly
        build_query_shape = (1, 1, decoder_lstm_units)
        build_value_shape = (1, max_len_input, lstm_units * 2)
        inf_attention_layer.build(input_shape=[build_query_shape, build_value_shape])
        if trained_attention_layer.get_weights():
             inf_attention_layer.set_weights(trained_attention_layer.get_weights())

        # Concatenate Layer Instance
        inf_concat_layer = Concatenate(axis=-1, name="inf_decoder_attention_concat")

        # Post-Attention Dropout Instance (Use name 'dropout_2' from loaded list)
        trained_post_attn_dropout_layer = trained_model.get_layer("dropout_2") # CHECKED from list
        inf_post_attn_dropout = Dropout(trained_post_attn_dropout_layer.rate, name="inf_dropout_2")

        # Dense Layer Instance
        trained_dense_layer = trained_model.get_layer("output_layer")
        inf_dense_layer = Dense(trained_dense_layer.units,
                                activation=trained_dense_layer.activation, name="inf_output_layer")
        # Build explicitly
        concat_output_dim = decoder_lstm_units + (lstm_units * 2)
        inf_dense_layer.build(input_shape=(None, 1, concat_output_dim))
        inf_dense_layer.set_weights(trained_dense_layer.get_weights())

        # --- Connect Layers using Symbolic Tensors ---
        decoder_embeddings_tensor = inf_decoder_embedding(decoder_input_single_token)
        decoder_output_step = inf_dec_emb_dropout(decoder_embeddings_tensor) # Apply dropout

        # Call the single LSTM layer
        decoder_output_step, state_h_out, state_c_out = inf_decoder_lstm(
            decoder_output_step, initial_state=decoder_state_inputs # Pass [h0_in, c0_in]
        )
        # Define the state outputs
        decoder_state_outputs = [state_h_out, state_c_out]

        # Attention Call (Using positional list as keywords failed before)
        # <<< Trying positional list again, maybe build fixed it >>>
        context_vector, attention_weights = inf_attention_layer(
             [decoder_output_step, inf_encoder_output_seq],
             return_attention_scores=True # Get weights if needed later
        )

        # Concatenate
        decoder_combined_context = inf_concat_layer([decoder_output_step, context_vector])
        decoder_combined_context = inf_post_attn_dropout(decoder_combined_context) # Apply dropout

        # Dense prediction
        decoder_pred_outputs = inf_dense_layer(decoder_combined_context)

        # Define the inference decoder model
        inf_decoder = Model(
             inputs=[decoder_input_single_token, inf_encoder_output_seq] + decoder_state_inputs,
             outputs=[decoder_pred_outputs] + decoder_state_outputs, # logits + [h0_out, c0_out]
             name="inference_decoder"
        )

        print("Inference Decoder created.")
        inf_decoder.summary(line_length=120)

    except Exception as e:
        logging.error(f"Failed to create inference decoder: {e}", exc_info=True)
        print(f"Error: Failed to create inference decoder: {e}")
        inf_decoder = None

    finally:
        print("-" * 30)

    return inf_encoder, inf_decoder


# --- Execution for Block 9 ---
inference_encoder = None
inference_decoder = None

if os.path.exists(BEST_MODEL_SAVE_PATH):
    print(f"Attempting to load best model from: {BEST_MODEL_SAVE_PATH}")
    # Call the alternative setup function
    # Pass NUM_DECODER_LAYERS from config (currently 1), function verifies against loaded model
    inference_encoder, inference_decoder = setup_inference_models_attention_alt(
        BEST_MODEL_SAVE_PATH,
        num_decoder_layers_config=NUM_DECODER_LAYERS, # Pass config value
        decoder_lstm_units=DECODER_LSTM_UNITS,
        max_len_input=MAX_LEN_INPUT,
        lstm_units=LSTM_UNITS
    )

    if inference_encoder is None or inference_decoder is None:
        print("************************************************************")
        print("ERROR: Inference model setup failed. Cannot proceed to generation.")
        print("Check logs above for specific errors.")
        print("************************************************************")
        logging.error("Inference model setup failed.")
    else:
        print("Block 9 completed successfully. Inference encoder and decoder created.")
        logging.info("Block 9 completed successfully. Inference encoder and decoder created.")
else:
    print(f"Skipping inference setup: Trained model not found at {BEST_MODEL_SAVE_PATH}")
    print("Ensure that training (Block 8) ran successfully and saved the model.")
    logging.error(f"Trained model not found at {BEST_MODEL_SAVE_PATH}")

2025-04-25 03:28:05,490 - INFO - Setting up inference models with attention from: model_attention_files/pib_summarizer_attention_best.keras
2025-04-25 03:28:05,491 - INFO - Loading the full trained model...


Attempting to load best model from: model_attention_files/pib_summarizer_attention_best.keras

--- Inference Setup (with Attention) ---


2025-04-25 03:28:07,858 - INFO - Layers found in loaded model: ['encoder_inputs', 'encoder_embedding', 'decoder_inputs', 'dropout', 'decoder_embedding', 'bidirectional', 'dropout_1', 'encoder_final_h', 'encoder_final_c', 'decoder_lstm_1', 'attention_layer', 'decoder_attention_concat', 'dropout_2', 'output_layer']
2025-04-25 03:28:07,859 - INFO - Creating inference encoder model...


Successfully loaded trained model from model_attention_files/pib_summarizer_attention_best.keras
Layers found in loaded model: ['encoder_inputs', 'encoder_embedding', 'decoder_inputs', 'dropout', 'decoder_embedding', 'bidirectional', 'dropout_1', 'encoder_final_h', 'encoder_final_c', 'decoder_lstm_1', 'attention_layer', 'decoder_attention_concat', 'dropout_2', 'output_layer']
Inference Encoder created.
Model: "inference_encoder"
____________________________________________________________________________________________________
 Layer (type)                    Output Shape          Param #     Connected to                     
 encoder_inputs (InputLayer)     [(None, 1024)]        0           []                               
                                                                                                    
 encoder_embedding (Embedding)   (None, 1024, 100)     3000000     ['encoder_inputs[0][0]']         
                                                              

2025-04-25 03:28:07,874 - INFO - Creating inference decoder model...
2025-04-25 03:28:07,880 - INFO - Using 1 decoder LSTM layer(s).


Using 1 decoder LSTM layer(s) based on loaded model.
Inference Decoder created.
Model: "inference_decoder"
________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            
 inf_decoder_input_token (InputLayer)  [(None, 1)]                0             []                                      
                                                                                                                        
 inf_decoder_embedding (Embedding)     (None, 1, 100)             3000000       ['inf_decoder_input_token[0][0]']       
                                                                                                                        
 inf_dropout_1 (Dropout)               (None, 1, 100)             0             ['inf_decoder_embedding[0][0]']         
                                              

2025-04-25 03:28:08,229 - INFO - Block 9 completed successfully. Inference encoder and decoder created.


------------------------------
Block 9 completed successfully. Inference encoder and decoder created.


In [19]:
# Block 10: Generation Functions (Greedy and Beam Search) + Execution (with Detokenization Workaround)

import tensorflow as tf
import numpy as np
import logging
import time
from tqdm import tqdm # Use standard tqdm for non-notebook loops

# --- Greedy Search Generation Function (with Detokenization Workaround) ---
def generate_summary_greedy(input_text, tokenizer_obj, inf_encoder_model, inf_decoder_model,
                             max_len_input, max_len_summary, start_token_id, end_token_id,
                             pad_token_id):
    """Generates a summary using greedy decoding."""
    if not all([tokenizer_obj, inf_encoder_model, inf_decoder_model]):
        logging.error("Greedy generation failed: Missing tokenizer or inference models.")
        return "[Error: Missing components]"

    # 1. Tokenize and pad input text
    try:
        input_seq = tokenizer_obj.encode_as_ids(str(input_text))
        encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(
            [input_seq], maxlen=max_len_input, padding='post', truncating='post', value=pad_token_id
        ).astype(np.int32)
    except Exception as e:
        logging.error(f"Error tokenizing input for greedy generation: {e}")
        return "[Error: Tokenization failed]"

    # 2. Encode the input sequence
    try:
        encoder_outputs = inf_encoder_model.predict(encoder_input_data, verbose=0)
        encoder_output_seq = encoder_outputs[0]
        decoder_states_value = [encoder_outputs[1], encoder_outputs[2]] # [h, c] for 1 decoder layer
    except Exception as e:
        logging.error(f"Error running inference encoder: {e}", exc_info=True)
        return "[Error: Encoder failure]"

    # 3. Initialize Decoder Input
    decoder_input_token = np.array([[start_token_id]], dtype=np.int32)

    # 4. Greedy Decoding Loop
    decoded_tokens = []
    for _ in range(max_len_summary):
        try:
            decoder_inputs = [decoder_input_token, encoder_output_seq] + decoder_states_value
            decoder_outputs = inf_decoder_model.predict(decoder_inputs, verbose=0)
            output_tokens_logits = decoder_outputs[0]
            new_states = decoder_outputs[1:]
        except Exception as e:
            logging.error(f"Error during greedy decoder prediction step: {e}", exc_info=True)
            if not decoded_tokens or decoded_tokens[-1] != end_token_id:
                 decoded_tokens.append(end_token_id)
            break

        sampled_token_id = int(np.argmax(output_tokens_logits[0, -1, :]))

        if sampled_token_id == end_token_id:
            break

        if sampled_token_id != pad_token_id and sampled_token_id != start_token_id:
             if isinstance(sampled_token_id, int):
                decoded_tokens.append(sampled_token_id)
             else:
                logging.error(f"Non-integer sampled_token_id encountered in greedy: {sampled_token_id} (type: {type(sampled_token_id)})")
                break

        if len(decoded_tokens) >= max_len_summary:
            break

        decoder_input_token = np.array([[sampled_token_id]], dtype=np.int32)
        if len(new_states) == len(decoder_states_value):
           decoder_states_value = new_states
        else:
             logging.error(f"State length mismatch during greedy decoding.")
             break

    # 5. Detokenize the result (Piece by Piece Workaround)
    print(f"DEBUG: Greedy tokens before decode: {decoded_tokens}") # Keep debug print
    logging.debug(f"Greedy tokens before decode: {decoded_tokens}")
    summary = "[Error: Detokenization failed]" # Default error message
    try:
        if not decoded_tokens:
            summary = ""
            logging.warning("Token list is empty for detokenization (Greedy).")
        else:
            # Decode piece by piece
            pieces = [tokenizer_obj.id_to_piece(token_id) for token_id in decoded_tokens]
            # Join pieces - Replace SentencePiece space ' ' (U+2581) with normal space
            summary = "".join(pieces).replace(' ', ' ').strip()
            # Optional: More sophisticated joining
            # summary = tokenizer_obj.DecodePieces(pieces)

    except Exception as e:
        logging.error(f"Error detokenizing greedy result (piece by piece): {e}")
        summary = f"[Error: Detokenization failed ({e})]"

    return summary

# --- Beam Search Generation Function (with Detokenization Workaround) ---
def generate_summary_beam_search(input_text, tokenizer_obj, inf_encoder_model, inf_decoder_model,
                                 max_len_input, max_len_summary, start_token_id, end_token_id,
                                 pad_token_id, beam_width):
    """Generates a summary using beam search decoding."""
    if not all([tokenizer_obj, inf_encoder_model, inf_decoder_model]):
        logging.error("Beam search generation failed: Missing tokenizer or inference models.")
        return "[Error: Missing components]"

     # 1. Tokenize and pad input text
    try:
        input_seq = tokenizer_obj.encode_as_ids(str(input_text))
        encoder_input_data = tf.keras.preprocessing.sequence.pad_sequences(
            [input_seq], maxlen=max_len_input, padding='post', truncating='post', value=pad_token_id
        ).astype(np.int32)
    except Exception as e:
        logging.error(f"Error tokenizing input for beam search: {e}")
        return "[Error: Tokenization failed]"

    # 2. Encode the input sequence
    try:
        encoder_outputs = inf_encoder_model.predict(encoder_input_data, verbose=0)
        encoder_output_seq = encoder_outputs[0]
        decoder_initial_states = [encoder_outputs[1], encoder_outputs[2]]
    except Exception as e:
        logging.error(f"Error running inference encoder for beam search: {e}", exc_info=True)
        return "[Error: Encoder failure]"

    # 3. Initialize Beam Search
    initial_beam = (0.0, [start_token_id], decoder_initial_states)
    beams = [initial_beam]
    completed_beams = []

    # 4. Beam Search Decoding Loop
    for _ in range(max_len_summary):
        new_beams = []
        all_candidates = []
        processed_beams_count = 0

        for log_prob, seq, current_states in beams:
            processed_beams_count += 1
            if seq[-1] == end_token_id:
                completed_beams.append((log_prob, seq, current_states))
                continue

            decoder_input_token = np.array([[seq[-1]]], dtype=np.int32)
            try:
                decoder_inputs = [decoder_input_token, encoder_output_seq] + current_states
                decoder_outputs = inf_decoder_model.predict(decoder_inputs, verbose=0)
                output_tokens_logits = decoder_outputs[0][0, -1, :]
                new_states = decoder_outputs[1:]
            except Exception as e:
                 logging.warning(f"Decoder prediction failed for beam step {processed_beams_count}: {e}")
                 completed_beams.append((log_prob - 100.0, seq + [end_token_id], current_states))
                 continue

            log_probs = tf.nn.log_softmax(output_tokens_logits).numpy()
            vocab_size = len(log_probs)
            effective_beam_width = min(beam_width, vocab_size)
            top_k_indices = np.argsort(log_probs)[-effective_beam_width:]
            top_k_log_probs = log_probs[top_k_indices]

            for i in range(len(top_k_indices)):
                token_id = int(top_k_indices[i]) # Ensure integer
                if token_id == pad_token_id or token_id == start_token_id:
                    continue
                token_log_prob = top_k_log_probs[i]
                new_seq = seq + [token_id]
                new_log_prob = log_prob + token_log_prob
                if all(isinstance(t, int) for t in new_seq):
                    all_candidates.append((new_log_prob, new_seq, new_states))
                else:
                    logging.error(f"Non-integer found in beam candidate sequence: {new_seq}")


        if processed_beams_count == len(completed_beams) and processed_beams_count > 0:
            break
        if not all_candidates:
            break

        ordered_candidates = sorted(all_candidates, key=lambda x: x[0], reverse=True)
        beams = ordered_candidates[:beam_width]

        if all(b[1][-1] == end_token_id for b in beams):
            completed_beams.extend(beams)
            break

    # 5. Final Selection
    completed_beams.extend(beams)
    if not completed_beams:
         logging.warning("Beam search finished with no completed beams.")
         return "[Error: No sequences generated]"

    best_beam = sorted(completed_beams, key=lambda x: x[0], reverse=True)[0]
    best_seq = best_beam[1]

    # 6. Detokenize (Piece by Piece Workaround)
    final_tokens = [token for token in best_seq if token not in [start_token_id, end_token_id, pad_token_id]]
    print(f"DEBUG: Beam tokens before decode: {final_tokens}") # Keep debug print
    logging.debug(f"Beam tokens before decode: {final_tokens}")
    summary = "[Error: Detokenization failed]" # Default error message
    try:
        if not final_tokens:
            summary = ""
            logging.warning("Token list is empty for detokenization (Beam).")
        else:
             # Decode piece by piece
            pieces = [tokenizer_obj.id_to_piece(token_id) for token_id in final_tokens]
            # Join pieces
            summary = "".join(pieces).replace(' ', ' ').strip()
            # Optional: Try DecodePieces
            # summary = tokenizer_obj.DecodePieces(pieces)

    except Exception as e:
        logging.error(f"Error detokenizing beam search result (piece by piece): {e}")
        summary = f"[Error: Detokenization failed ({e})]"

    return summary


# --- Execution for Block 10 ---
print("\n--- Block 10: Generation Functions Defined ---")
print("Functions `generate_summary_greedy` and `generate_summary_beam_search` are now available.")
print("They will be used in Block 11 for evaluation.")
logging.info("Block 10 completed. Generation functions defined.")

# --- Optional: Test one generation ---
# Check if all necessary components exist before testing
if ('inference_encoder' in locals() and inference_encoder and
    'inference_decoder' in locals() and inference_decoder and
    'tokenizer' in locals() and tokenizer and
    'processed_df' in locals() and not processed_df.empty):

    # Select a sample ensuring it's not empty
    sample_text = None
    for text in processed_df['cleaned_text']:
        if text and isinstance(text, str) and len(text.split()) > 5:
             sample_text = text
             break

    if sample_text:
        print("\n--- Sample Generation Test ---")
        print(f"Input Text (sample):\n{sample_text[:300]}...")

        print("\nGenerating (Greedy):")
        start_g = time.time()
        greedy_summary = generate_summary_greedy(
            sample_text, tokenizer, inference_encoder, inference_decoder,
            MAX_LEN_INPUT, MAX_LEN_SUMMARY, START_ID, END_ID, PAD_ID
        )
        print(f"Time: {time.time() - start_g:.2f}s")
        print(f"Output:\n{greedy_summary}") # Check this output carefully

        print("\nGenerating (Beam Search):")
        start_b = time.time()
        beam_summary = generate_summary_beam_search(
            sample_text, tokenizer, inference_encoder, inference_decoder,
            MAX_LEN_INPUT, MAX_LEN_SUMMARY, START_ID, END_ID, PAD_ID, BEAM_WIDTH
        )
        print(f"Time: {time.time() - start_b:.2f}s")
        print(f"Output:\n{beam_summary}") # Check this output carefully
        print("-" * 30)
    else:
        print("\nCould not find a valid non-empty sample text in processed_df for testing.")

else:
    print("\nSkipping sample generation test: Inference models, tokenizer, or processed_df not available.")

2025-04-25 03:37:34,226 - INFO - Block 10 completed. Generation functions defined.



--- Block 10: Generation Functions Defined ---
Functions `generate_summary_greedy` and `generate_summary_beam_search` are now available.
They will be used in Block 11 for evaluation.

--- Sample Generation Test ---
Input Text (sample):
states can flexibly use of central assistance of inr4,000 per toilet, says shri venkaiah naidu parliament members discuss new urban schemes suggest assured flow of gst revenue to urban local bodies shortcomings of jnnurm reviewed shri naidu outlines new design improvements members of the parliamenta...

Generating (Greedy):
DEBUG: Greedy tokens before decode: [16924, 1, 4, 22, 7, 38, 13, 284, 211, 5, 31, 2156, 1776, 5, 375, 4, 203, 7, 4, 915, 179, 144, 5, 14, 1059, 149, 10, 205, 4, 233, 7, 362, 8, 592, 6, 4, 88, 5, 422, 18, 4, 22, 7, 38, 13, 284, 211, 5, 31, 1352, 768, 5, 150, 4, 177, 7, 4, 81, 1619, 11, 19, 108, 149, 6, 4, 22, 150, 4, 177, 7, 4, 81, 1619, 11, 19, 108, 149, 5, 2363, 32, 4, 36, 23, 15, 170, 10, 205, 4, 895, 7, 4, 1171, 5, 61, 4, 271, 7, 

In [20]:
# Block 12: Summarize a Single PDF File

import os
import logging
import time
# Attempt to import the PDF library
try:
    import fitz  # PyMuPDF
    _pymupdf_installed = True
except ImportError:
    print("PyMuPDF not found. Please install it to process PDFs:")
    print("!pip install PyMuPDF")
    print("Then RESTART the kernel and re-run this block.")
    _pymupdf_installed = False

# --- PDF Text Extraction Function ---
def extract_text_from_pdf(pdf_path):
    """Extracts text from all pages of a PDF file using PyMuPDF."""
    if not _pymupdf_installed:
        logging.error("PyMuPDF library not available. Cannot extract PDF text.")
        return None, "PyMuPDF library not installed."

    if not os.path.exists(pdf_path):
        logging.error(f"PDF file not found at: {pdf_path}")
        return None, f"PDF file not found at: {pdf_path}"

    extracted_text = ""
    try:
        doc = fitz.open(pdf_path)
        logging.info(f"Opened PDF: {pdf_path}, Pages: {doc.page_count}")
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)
            extracted_text += page.get_text("text") # Extract plain text
            extracted_text += "\n" # Add newline between pages
        doc.close()
        logging.info(f"Successfully extracted text from {pdf_path}.")
        return extracted_text, None # Return text and no error
    except Exception as e:
        logging.error(f"Failed to extract text from PDF {pdf_path}: {e}", exc_info=True)
        return None, f"Error extracting text from PDF: {e}"

# --- Summarization Execution ---

# <<< IMPORTANT: SET THE PATH TO YOUR PDF FILE BELOW >>>
pdf_path_to_summarize = "Test_Pdfs_folder/PIB_270235_2025_04_15.pdf" # REPLACE WITH ACTUAL PATH

print(f"\n--- Summarizing PDF: {pdf_path_to_summarize} ---")

# Check if all necessary components are ready
summarization_possible = True
if not _pymupdf_installed:
    print("Cannot proceed: PyMuPDF is not installed.")
    summarization_possible = False
elif 'tokenizer' not in locals() or tokenizer is None:
    print("Cannot proceed: Tokenizer not found (Run Block 4?).")
    summarization_possible = False
elif 'inference_encoder' not in locals() or inference_encoder is None:
    print("Cannot proceed: Inference encoder not found (Run Block 9?).")
    summarization_possible = False
elif 'inference_decoder' not in locals() or inference_decoder is None:
    print("Cannot proceed: Inference decoder not found (Run Block 9?).")
    summarization_possible = False
elif 'clean_text' not in globals():
     print("Cannot proceed: clean_text function not defined (Run Block 2?).")
     summarization_possible = False
elif 'generate_summary_beam_search' not in globals(): # Or check for greedy if preferred
      print("Cannot proceed: Generation function not defined (Run Block 10?).")
      summarization_possible = False


if summarization_possible:
    # 1. Extract Text
    print("\nStep 1: Extracting text from PDF...")
    start_extract = time.time()
    raw_text, error_msg = extract_text_from_pdf(pdf_path_to_summarize)
    print(f"Extraction time: {time.time() - start_extract:.2f}s")

    if error_msg:
        print(f"Extraction Failed: {error_msg}")
    elif not raw_text or not raw_text.strip():
         print("Extraction Failed: No text found in the PDF.")
    else:
        print(f"Successfully extracted {len(raw_text)} characters.")

        # 2. Clean Text
        print("\nStep 2: Cleaning extracted text...")
        start_clean = time.time()
        # Use the clean_text function defined in Block 2
        cleaned_input_text = clean_text(raw_text)
        print(f"Cleaning time: {time.time() - start_clean:.2f}s")

        if not cleaned_input_text or not cleaned_input_text.strip():
            print("Cleaning Failed: Text became empty after cleaning.")
        else:
            print("Text cleaned successfully.")
            print("\nCleaned Text (First 500 chars):\n", cleaned_input_text[:500], "...")

            # 3. Generate Summary
            print("\nStep 3: Generating summary using Beam Search...")
            start_gen = time.time()
            # Use the generate_summary_beam_search function from Block 10
            generated_summary = generate_summary_beam_search(
                input_text=cleaned_input_text,
                tokenizer_obj=tokenizer,
                inf_encoder_model=inference_encoder,
                inf_decoder_model=inference_decoder,
                max_len_input=MAX_LEN_INPUT,
                max_len_summary=MAX_LEN_SUMMARY,
                start_token_id=START_ID,
                end_token_id=END_ID,
                pad_token_id=PAD_ID,
                beam_width=BEAM_WIDTH
            )
            print(f"Generation time: {time.time() - start_gen:.2f}s")

            # 4. Display Summary
            print("\n--- Generated Summary ---")
            print(generated_summary)
            print("-" * 25)

print("\nPDF Summarization Test Complete.")

2025-04-25 03:54:01,468 - INFO - Opened PDF: Test_Pdfs_folder/PIB_270235_2025_04_15.pdf, Pages: 1
2025-04-25 03:54:01,488 - INFO - Successfully extracted text from Test_Pdfs_folder/PIB_270235_2025_04_15.pdf.



--- Summarizing PDF: Test_Pdfs_folder/PIB_270235_2025_04_15.pdf ---

Step 1: Extracting text from PDF...
Extraction time: 0.04s
Successfully extracted 2446 characters.

Step 2: Cleaning extracted text...
Cleaning time: 0.00s
Text cleaned successfully.

Cleaned Text (First 500 chars):
 15-april-2025 1054 ist ' haj pilgrimage the accords high priority for indian muslims to undertake the annual haj pilgrimage. as a result of its efforts, the country allocation for india which was 136,020 in 2014 has gradually increased to 175,025 in 2025. these quotas are finalized by the saudi authorities closer to the time of the pilgrimage. the moma through the haj committee of india manages arrangements for the bulk of the quota allotted to india, which is 122,518 in the current year. all th ...

Step 3: Generating summary using Beam Search...
DEBUG: Beam tokens before decode: [29854, 1, 8371, 1, 25, 315, 228, 24, 4, 384, 15, 170, 10, 205, 19, 23, 15, 299, 8, 121, 6, 40, 150, 4, 36, 23, 15, 226, 10, 