In [None]:
# # NEW Block 1: Setup and Configuration (Modified for Speed)
# import os
# import re
# import json
# import time
# import logging
# from datetime import datetime
# import importlib.metadata
# import gc

# import pandas as pd
# import numpy as np
# import tensorflow as tf
# from tensorflow import keras
# from tensorflow.keras import layers
# from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard

# # <<< CHANGE: Enable Mixed Precision >>>
# # Using float16 for computations where possible can speed up training on compatible GPUs (like L4)
# # and reduce memory usage.
# tf.keras.mixed_precision.set_global_policy('mixed_float16')
# print("Attempting to enable Mixed Precision (mixed_float16)")
# logging.info("Attempting to enable Mixed Precision (mixed_float16)")


# # --- Library Imports/Checks (Keep as before) ---
# try:
#     import sentencepiece as spm
# except ImportError:
#     print("SentencePiece not found. You might need to install it (`pip install sentencepiece`)")
#     spm = None # Indicate missing library

# try:
#     from langdetect import detect, DetectorFactory
#     from langdetect.lang_detect_exception import LangDetectException
#     DetectorFactory.seed = 0
#     _langdetect_installed = True
# except ImportError:
#     print("langdetect not found. You might need to install it (`pip install langdetect`)")
#     _langdetect_installed = False

# try:
#     from rouge_score import rouge_scorer, scoring
# except ImportError:
#     print("rouge-score not found. You might need to install it (`pip install rouge-score nltk`)")
#     try:
#         import nltk
#         nltk.download('punkt', quiet=True)
#     except ImportError:
#         print("NLTK not found, which might be needed for rouge-score.")

# from tqdm.notebook import tqdm

# # --- Configuration ---
# OUTPUT_DIR = 'model3_files'
# os.makedirs(OUTPUT_DIR, exist_ok=True)
# print(f"Ensuring output directory exists: {os.path.abspath(OUTPUT_DIR)}")

# # File Paths
# INPUT_JSONL = 'mergedt04.jsonl'
# OUTPUT_PARQUET = os.path.join(OUTPUT_DIR, 'processed_dataframe.parquet')
# TOKENIZER_MODEL_PREFIX = os.path.join(OUTPUT_DIR, 'pib_summarizer_spm_50k') # Keep same tokenizer
# TOKENIZER_MODEL_FILE = f'{TOKENIZER_MODEL_PREFIX}.model'
# LOG_DIR = os.path.join(OUTPUT_DIR, "logs", "fit", datetime.now().strftime("%Y%m%d-%H%M%S"))
# MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, 'pib_summarizer_no_attention_FAST.keras') # Changed model name

# # Data Processing Params (Keep Same - avoid re-running Blocks 2-4 if possible)
# MIN_INPUT_WORDS = 20
# MIN_SUMMARY_WORDS = 5
# LANG_DETECT_THRESHOLD = 0.90

# # Tokenizer Params (Keep Same - avoid re-running Block 4)
# VOCAB_SIZE = 50000
# PAD_ID = 0
# UNK_ID = 1
# START_ID = 2
# END_ID = 3

# # <<< CHANGE: Model Hyperparameters for Speed >>>
# EMBEDDING_DIM = 300 # Keep embedding size (important for representation)
# LSTM_UNITS = 512  # Reduced from 1024
# DECODER_LSTM_UNITS = LSTM_UNITS * 2 # Now 1024 (was 2048)
# NUM_ENCODER_LAYERS = 2 # Reduced from 3
# NUM_DECODER_LAYERS = 2 # Reduced from 3
# DROPOUT_RATE = 0.2 # Keep dropout
# MAX_LEN_INPUT = 1024 # Keep sequence lengths
# MAX_LEN_SUMMARY = 150 # Keep sequence lengths

# # <<< CHANGE: Training Params - Increase Batch Size (Trial) >>>
# BATCH_SIZE = 128 # Increased from 32/64 - Relying on Mixed Precision & smaller model
# EPOCHS = 30 # Reduce max epochs slightly, relying on early stopping
# LEARNING_RATE = 0.001 # Keep LR
# EARLY_STOPPING_PATIENCE = 5 # Keep patience
# REDUCE_LR_PATIENCE = 3 # Keep patience
# REDUCE_LR_FACTOR = 0.2 # Keep factor

# # Inference Params
# BEAM_WIDTH = 5

# # --- Setup Logging ---
# LOG_FILE_PATH = os.path.join(OUTPUT_DIR, 'training_log_FAST.log') # Changed log file name
# # Ensure logging handlers are cleared if re-running the cell in the same kernel session
# root_logger = logging.getLogger()
# if root_logger.hasHandlers():
#     root_logger.handlers.clear()

# logging.basicConfig(level=logging.INFO,
#                     format='%(asctime)s - %(levelname)s - %(message)s',
#                     handlers=[
#                         logging.FileHandler(LOG_FILE_PATH),
#                         logging.StreamHandler()
#                     ])
# print(f"Logging setup complete. Log file: {LOG_FILE_PATH}")


# # --- Debugging Info ---
# print("\n--- Configuration (Optimized for Speed) ---")
# logging.info("--- Configuration (Optimized for Speed) ---")
# print(f"Mixed Precision Enabled: {'mixed_float16' in tf.keras.mixed_precision.global_policy().name}")
# logging.info(f"Mixed Precision Enabled: {'mixed_float16' in tf.keras.mixed_precision.global_policy().name}")
# # ... (rest of the debugging prints remain the same, but will reflect new values) ...
# print(f"TensorFlow Version: {tf.__version__}"); logging.info(f"TensorFlow Version: {tf.__version__}")
# try:
#     print(f"SentencePiece Version: {spm.__version__}"); logging.info(f"SentencePiece Version: {spm.__version__}")
# except NameError: print("SentencePiece not imported."); logging.warning("SentencePiece not imported.")

# if _langdetect_installed:
#     try:
#         langdetect_version = importlib.metadata.version("langdetect")
#         print(f"Langdetect Version: {langdetect_version}"); logging.info(f"Langdetect Version: {langdetect_version}")
#     except Exception as e:
#         print(f"Langdetect Version: Could not determine version ({e})"); logging.warning(f"Langdetect Version: Could not determine version ({e})")
# else:
#      print("Langdetect: Not installed or failed to import."); logging.warning("Langdetect: Not installed or failed to import.")

# gpu_devices = tf.config.list_physical_devices('GPU')
# print(f"GPU Available: {gpu_devices}"); logging.info(f"GPU Available: {gpu_devices}")
# print(f"Input JSONL: {INPUT_JSONL}"); logging.info(f"Input JSONL: {INPUT_JSONL}")
# print(f"Output Parquet Cache: {OUTPUT_PARQUET}"); logging.info(f"Output Parquet Cache: {OUTPUT_PARQUET}")
# print(f"Tokenizer Model Prefix: {TOKENIZER_MODEL_PREFIX}"); logging.info(f"Tokenizer Model Prefix: {TOKENIZER_MODEL_PREFIX}")
# print(f"Model Save Path: {MODEL_SAVE_PATH}"); logging.info(f"Model Save Path: {MODEL_SAVE_PATH}")
# print(f"TensorBoard Log Dir: {LOG_DIR}"); logging.info(f"TensorBoard Log Dir: {LOG_DIR}")
# print(f"Vocab Size: {VOCAB_SIZE}"); logging.info(f"Vocab Size: {VOCAB_SIZE}")
# print(f"Embedding Dim: {EMBEDDING_DIM}"); logging.info(f"Embedding Dim: {EMBEDDING_DIM}")
# print(f"Encoder LSTM Units (per direction): {LSTM_UNITS}"); logging.info(f"Encoder LSTM Units (per direction): {LSTM_UNITS}")
# print(f"Decoder LSTM Units: {DECODER_LSTM_UNITS}"); logging.info(f"Decoder LSTM Units: {DECODER_LSTM_UNITS}")
# print(f"Encoder Layers: {NUM_ENCODER_LAYERS}"); logging.info(f"Encoder Layers: {NUM_ENCODER_LAYERS}")
# print(f"Decoder Layers: {NUM_DECODER_LAYERS}"); logging.info(f"Decoder Layers: {NUM_DECODER_LAYERS}")
# print(f"Max Input Length: {MAX_LEN_INPUT}"); logging.info(f"Max Input Length: {MAX_LEN_INPUT}")
# print(f"Max Summary Length: {MAX_LEN_SUMMARY}"); logging.info(f"Max Summary Length: {MAX_LEN_SUMMARY}")
# print(f"Batch Size: {BATCH_SIZE}"); logging.info(f"Batch Size: {BATCH_SIZE}")
# print(f"Max Epochs: {EPOCHS}"); logging.info(f"Max Epochs: {EPOCHS}")
# print(f"Learning Rate: {LEARNING_RATE}"); logging.info(f"Learning Rate: {LEARNING_RATE}")
# print("-" * 30); logging.info("-" * 30)

In [1]:
# Block 1: Setup and Configuration (Modified for Output Directory)
import os
import re
import json
import time
import logging
from datetime import datetime
import importlib.metadata # Use for getting package versions (Python 3.8+)
# import pkg_resources # Fallback if importlib.metadata is not available or desired
import gc # For garbage collection

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, TensorBoard

# Check if SentencePiece is available, install if not (useful in notebooks)
try:
    import sentencepiece as spm
except ImportError:
    print("SentencePiece not found. You might need to install it (`pip install sentencepiece`)")
    # If in a notebook, uncomment and run the next line, then RESTART the kernel
    # %pip install sentencepiece
    # import sentencepiece as spm # Try importing again after potential install

# Check if langdetect is available
try:
    from langdetect import detect, DetectorFactory
    from langdetect.lang_detect_exception import LangDetectException
    # Ensure consistent results for langdetect
    DetectorFactory.seed = 0
    _langdetect_installed = True
except ImportError:
    print("langdetect not found. You might need to install it (`pip install langdetect`)")
    # If in a notebook, uncomment and run the next line, then RESTART the kernel
    # %pip install langdetect
    _langdetect_installed = False


# Check if rouge-score is available (for evaluation later)
try:
    from rouge_score import rouge_scorer, scoring
except ImportError:
    print("rouge-score not found. You might need to install it (`pip install rouge-score nltk`)")
    # If in a notebook, uncomment and run the next line, then RESTART the kernel
    # %pip install rouge-score nltk
    # Need to download nltk data if not already present
    try:
        import nltk
        nltk.download('punkt', quiet=True)
    except ImportError:
        print("NLTK not found, which might be needed for rouge-score.")


from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter/Vertex AI Notebooks

# --- Configuration ---
# <<< CHANGE: Define Base Output Directory >>>
OUTPUT_DIR = 'model3_files'
os.makedirs(OUTPUT_DIR, exist_ok=True) # Create the directory if it doesn't exist
print(f"Ensuring output directory exists: {os.path.abspath(OUTPUT_DIR)}")

# File Paths (now relative to OUTPUT_DIR)
INPUT_JSONL = 'mergedt04.jsonl' # Input dataset (Assuming it's in the same dir as notebook or provide full path)
OUTPUT_PARQUET = os.path.join(OUTPUT_DIR, 'processed_dataframe.parquet') # Cached processed data
TOKENIZER_MODEL_PREFIX = os.path.join(OUTPUT_DIR, 'pib_summarizer_spm_50k') # Prefix for SentencePiece model files
TOKENIZER_MODEL_FILE = f'{TOKENIZER_MODEL_PREFIX}.model'
LOG_DIR = os.path.join(OUTPUT_DIR, "logs", "fit", datetime.now().strftime("%Y%m%d-%H%M%S"))
MODEL_SAVE_PATH = os.path.join(OUTPUT_DIR, 'pib_summarizer_no_attention.keras') # Where to save the best model

# Data Processing Params
MIN_INPUT_WORDS = 20
MIN_SUMMARY_WORDS = 5
LANG_DETECT_THRESHOLD = 0.90 # Note: Threshold not used in current langdetect implementation, just checking 'en'

# Tokenizer Params
VOCAB_SIZE = 30000 # Target vocabulary size
PAD_ID = 0
UNK_ID = 1
START_ID = 2
END_ID = 3

# Model Hyperparameters
EMBEDDING_DIM = 100
LSTM_UNITS = 256
DECODER_LSTM_UNITS = LSTM_UNITS * 2
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2
DROPOUT_RATE = 0.2
MAX_LEN_INPUT = 1024
MAX_LEN_SUMMARY = 150

# Training Params
BATCH_SIZE = 64 # Tune based on GPU memory
EPOCHS = 30
LEARNING_RATE = 0.001
EARLY_STOPPING_PATIENCE = 5
REDUCE_LR_PATIENCE = 3
REDUCE_LR_FACTOR = 0.2

# Inference Params
BEAM_WIDTH = 5 # For beam search (if implemented later)

# --- Setup Logging ---
# <<< CHANGE: Log file within OUTPUT_DIR >>>
LOG_FILE_PATH = os.path.join(OUTPUT_DIR, 'training_log.log')
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(LOG_FILE_PATH), # Log to file
                        logging.StreamHandler() # Also log to console
                    ])
print(f"Logging setup complete. Log file: {LOG_FILE_PATH}")


# --- Debugging Info ---
print("\n--- Configuration ---")
logging.info("--- Configuration ---")
print(f"TensorFlow Version: {tf.__version__}"); logging.info(f"TensorFlow Version: {tf.__version__}")
try:
    print(f"SentencePiece Version: {spm.__version__}"); logging.info(f"SentencePiece Version: {spm.__version__}")
except NameError: print("SentencePiece not imported."); logging.warning("SentencePiece not imported.")

if _langdetect_installed:
    try:
        langdetect_version = importlib.metadata.version("langdetect")
        print(f"Langdetect Version: {langdetect_version}"); logging.info(f"Langdetect Version: {langdetect_version}")
    except Exception as e:
        print(f"Langdetect Version: Could not determine version ({e})"); logging.warning(f"Langdetect Version: Could not determine version ({e})")
else:
     print("Langdetect: Not installed or failed to import."); logging.warning("Langdetect: Not installed or failed to import.")

gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU Available: {gpu_devices}"); logging.info(f"GPU Available: {gpu_devices}")
print(f"Input JSONL: {INPUT_JSONL}"); logging.info(f"Input JSONL: {INPUT_JSONL}")
print(f"Output Parquet Cache: {OUTPUT_PARQUET}"); logging.info(f"Output Parquet Cache: {OUTPUT_PARQUET}")
print(f"Tokenizer Model Prefix: {TOKENIZER_MODEL_PREFIX}"); logging.info(f"Tokenizer Model Prefix: {TOKENIZER_MODEL_PREFIX}")
print(f"Model Save Path: {MODEL_SAVE_PATH}"); logging.info(f"Model Save Path: {MODEL_SAVE_PATH}")
print(f"TensorBoard Log Dir: {LOG_DIR}"); logging.info(f"TensorBoard Log Dir: {LOG_DIR}")
print(f"Vocab Size: {VOCAB_SIZE}"); logging.info(f"Vocab Size: {VOCAB_SIZE}")
print(f"Embedding Dim: {EMBEDDING_DIM}"); logging.info(f"Embedding Dim: {EMBEDDING_DIM}")
print(f"Encoder LSTM Units (per direction): {LSTM_UNITS}"); logging.info(f"Encoder LSTM Units (per direction): {LSTM_UNITS}")
print(f"Decoder LSTM Units: {DECODER_LSTM_UNITS}"); logging.info(f"Decoder LSTM Units: {DECODER_LSTM_UNITS}")
print(f"Max Input Length: {MAX_LEN_INPUT}"); logging.info(f"Max Input Length: {MAX_LEN_INPUT}")
print(f"Max Summary Length: {MAX_LEN_SUMMARY}"); logging.info(f"Max Summary Length: {MAX_LEN_SUMMARY}")
print(f"Batch Size: {BATCH_SIZE}"); logging.info(f"Batch Size: {BATCH_SIZE}")
print(f"Epochs: {EPOCHS}"); logging.info(f"Epochs: {EPOCHS}")
print(f"Learning Rate: {LEARNING_RATE}"); logging.info(f"Learning Rate: {LEARNING_RATE}")
print("-" * 30); logging.info("-" * 30)

2025-04-24 15:44:49.109618: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-24 15:44:49.570731: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-04-24 15:44:51.480725: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nccl2/lib:/usr/local/cuda/extras/CUPTI/lib64:/usr/lib/

Ensuring output directory exists: /home/jupyter/model3_files
Logging setup complete. Log file: model3_files/training_log.log

--- Configuration ---
TensorFlow Version: 2.11.0
SentencePiece Version: 0.2.0
Langdetect Version: 1.0.9


2025-04-24 15:44:54,915 - INFO - GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
2025-04-24 15:44:54,916 - INFO - Input JSONL: mergedt04.jsonl
2025-04-24 15:44:54,918 - INFO - Output Parquet Cache: model3_files/processed_dataframe.parquet
2025-04-24 15:44:54,919 - INFO - Tokenizer Model Prefix: model3_files/pib_summarizer_spm_50k
2025-04-24 15:44:54,919 - INFO - Model Save Path: model3_files/pib_summarizer_no_attention.keras
2025-04-24 15:44:54,920 - INFO - TensorBoard Log Dir: model3_files/logs/fit/20250424-154454
2025-04-24 15:44:54,922 - INFO - Vocab Size: 30000
2025-04-24 15:44:54,922 - INFO - Embedding Dim: 100
2025-04-24 15:44:54,923 - INFO - Encoder LSTM Units (per direction): 256
2025-04-24 15:44:54,925 - INFO - Decoder LSTM Units: 512
2025-04-24 15:44:54,926 - INFO - Max Input Length: 1024
2025-04-24 15:44:54,926 - INFO - Max Summary Length: 150
2025-04-24 15:44:54,927 - INFO - Batch Size: 64
2025-04-24 15:44:54,929 - INFO - Epochs: 30
2025-04

GPU Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Input JSONL: mergedt04.jsonl
Output Parquet Cache: model3_files/processed_dataframe.parquet
Tokenizer Model Prefix: model3_files/pib_summarizer_spm_50k
Model Save Path: model3_files/pib_summarizer_no_attention.keras
TensorBoard Log Dir: model3_files/logs/fit/20250424-154454
Vocab Size: 30000
Embedding Dim: 100
Encoder LSTM Units (per direction): 256
Decoder LSTM Units: 512
Max Input Length: 1024
Max Summary Length: 150
Batch Size: 64
Epochs: 30
Learning Rate: 0.001
------------------------------


In [2]:
# Block 2: Data Loading and Initial Cleaning Functions + Execution

# --- Function Definitions ---
def load_data(jsonl_path):
    """Loads data from a JSONL file."""
    logging.info(f"Attempting to load data from: {jsonl_path}")
    data = []
    lines_processed = 0
    lines_failed = 0
    if not os.path.exists(jsonl_path):
        logging.error(f"Input file not found: {jsonl_path}")
        print(f"\n--- Data Loading Error ---")
        print(f"Error: Input file not found at {jsonl_path}")
        print("Please ensure the file exists and the path is correct.")
        print("-" * 30)
        return pd.DataFrame()

    try:
        with open(jsonl_path, 'r', encoding='utf-8') as f:
            for i, line in enumerate(f):
                lines_processed += 1
                try:
                    # Skip empty lines
                    if not line.strip():
                        logging.warning(f"Skipping empty line: {i+1}")
                        lines_failed +=1
                        continue
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    logging.warning(f"Skipping malformed JSON line: {i+1}. Error: {e}")
                    lines_failed += 1
                    continue
        df = pd.DataFrame(data)
        logging.info(f"Loaded {len(df)} records successfully out of {lines_processed} lines ({lines_failed} failed/skipped).")
        # --- Debugging Info ---
        print("\n--- Data Loading ---")
        print(f"Processed {lines_processed} lines from {jsonl_path}.")
        print(f"Successfully loaded {len(df)} records.")
        print(f"Skipped/failed {lines_failed} lines.")
        if not df.empty:
            print("Columns:", df.columns.tolist())
            print("Data Types:\n", df.dtypes)
            print("Sample record (first 5 rows raw_df):\n", df.head())
        else:
            print("Loaded DataFrame (raw_df) is empty. Check input file content and format.")
            logging.warning("Loaded DataFrame (raw_df) is empty after processing the file.")
        print("-" * 30)
        return df
    except FileNotFoundError: # This case is already handled above, but keep for robustness
        logging.error(f"Error: Input file not found at {jsonl_path}")
        print(f"\n--- Data Loading Error ---")
        print(f"Error: Input file not found at {jsonl_path}")
        print("-" * 30)
        return pd.DataFrame()
    except Exception as e:
        logging.error(f"An unexpected error occurred during data loading: {e}", exc_info=True)
        print(f"\n--- Data Loading Error ---")
        print(f"An unexpected error occurred: {e}")
        print("-" * 30)
        return pd.DataFrame()


def clean_text(text):
    """Applies cleaning steps to a single text string."""
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = text.replace('“', '"').replace('”', '"').replace("‘", "'").replace("’", "'")
    text = re.sub(r'file:///[^ ]+\.pdf', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\[\s*\d+\s*\]', '', text)
    headers_footers = [
        r"press information bureau", r"government of india", r"ministry of [\w\s]+",
        # Make date pattern more robust (optional day, month formats, etc.)
        r"posted on:\s*\d{1,2}\s+\w{3,}\s+\d{4}\s+\d{1,2}:\d{2}\s*[ap]m\s*(by pib \w+)?",
        r"release id: \d+",
        r"\(release id.*?\)",
        r"pib \w+", # Keep this general
        r"\*{3,}\s*[a-z\/]+\s*\*{3,}", # More general footer pattern
    ]
    for pattern in headers_footers:
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    text = text.replace('\n', ' ')
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-z0-9\s.,!?'\"-]", "", text) # Keep allowed chars
    text = text.strip()
    return text

# --- Execution for Block 2 ---

# --- Debugging: Test cleaning function ---
print("\n--- Cleaning Function Test (Block 2) ---")
test_text = """
Press Information Bureau\nGovernment of India\nMinistry of Finance\nPosted on: 25 JUL 2024 6:00PM by PIB Delhi
This is a [ 1] test document from file:///path/to/doc.pdf. Check www.example.com.
It has “quotes” and ‘apostrophes’.   Extra spaces. And some !?.,'"- punctuation.
Bad chars: #$%^&*()_+={}[]|\\:;<>~/
***DS/AK***
(Release ID: 12345)
"""
cleaned_test = clean_text(test_text)
print(f"Original:\n{test_text}")
print(f"\nCleaned:\n{cleaned_test}")
print("-" * 30)

# Load the raw data into a DataFrame called 'raw_df'
raw_df = load_data(INPUT_JSONL)

# <<< CRUCIAL CHECK >>>
if 'raw_df' not in locals() or raw_df.empty:
    print("************************************************************")
    print("ERROR: Block 2 failed to load data into `raw_df`.")
    print("Cannot proceed to Block 3. Please check the 'Data Loading' output above.")
    print("Verify the INPUT_JSONL path and the file content.")
    print("************************************************************")
    # Optional: Stop execution here in a notebook context if desired
    # raise RuntimeError("Failed to load raw data. Stopping execution.")
else:
    print("Block 2 completed. `raw_df` created successfully.")
    logging.info("Block 2 completed. `raw_df` created successfully.")

# A variable 'processed_df' will be created in the next block IF raw_df is valid

2025-04-24 15:45:02,311 - INFO - Attempting to load data from: mergedt04.jsonl



--- Cleaning Function Test (Block 2) ---
Original:

Press Information Bureau
Government of India
Ministry of Finance
Posted on: 25 JUL 2024 6:00PM by PIB Delhi
This is a [ 1] test document from file:///path/to/doc.pdf. Check www.example.com.
It has “quotes” and ‘apostrophes’.   Extra spaces. And some !?.,'"- punctuation.
Bad chars: #$%^&*()_+={}[]|\:;<>~/
***DS/AK***
(Release ID: 12345)


Cleaned:
25 jul 2024 600pm by this is a test document from . check it has "quotes" and 'apostrophes'. extra spaces. and some !?.,'"- punctuation. bad chars
------------------------------


2025-04-24 15:45:04,558 - INFO - Loaded 74128 records successfully out of 74128 lines (0 failed/skipped).
2025-04-24 15:45:04,594 - INFO - Block 2 completed. `raw_df` created successfully.



--- Data Loading ---
Processed 74128 lines from mergedt04.jsonl.
Successfully loaded 74128 records.
Skipped/failed 0 lines.
Columns: ['pdf_filename', 'extracted_text', 'gemini_summary', 'gemini_topics']
Data Types:
 pdf_filename      object
extracted_text    object
gemini_summary    object
gemini_topics     object
dtype: object
Sample record (first 5 rows raw_df):
                 pdf_filename  \
0  PIB_115363_2015_02_11.pdf   
1  PIB_115365_2015_02_11.pdf   
2  PIB_115366_2015_02_11.pdf   
3  PIB_115367_2015_02_12.pdf   
4  PIB_115368_2015_02_12.pdf   

                                      extracted_text  \
0  Ministry of Housing & Urban Affairs\nStates ca...   
1  Ministry of Culture\nConsultative Committee of...   
2  Ministry of Tourism\nConsultative Committee of...   
3  Prime Minister's Office\nPM pays tributes to S...   
4  Prime Minister's Office\nPM appalled at the ne...   

                                      gemini_summary  \
0  A meeting of the Parliamentary Consultativ

In [3]:
# Block 3: Preprocessing Pipeline + Execution

# --- Function Definition ---
def preprocess_data(df, text_col='extracted_text', summary_col='gemini_summary', cache_path=OUTPUT_PARQUET):
    """Applies the full preprocessing pipeline to the dataframe."""
    logging.info(f"Preprocessing started. Cache path: {cache_path}")
    start_time = time.time()

    # --- Cache Check ---
    if os.path.exists(cache_path):
        try:
            logging.info(f"Loading processed data from cache: {cache_path}")
            processed_df_from_cache = pd.read_parquet(cache_path)
            logging.info(f"Successfully loaded {len(processed_df_from_cache)} records from cache.")
            # --- Debugging Info ---
            print("\n--- Preprocessing Pipeline (Block 3) ---")
            print(f"Loaded {len(processed_df_from_cache)} records from cache: {cache_path}")
            if not processed_df_from_cache.empty:
                print("Columns:", processed_df_from_cache.columns.tolist())
                print("Sample processed data (from cache):\n", processed_df_from_cache.head())
            else:
                print("Warning: Cache file loaded an empty DataFrame.")
                logging.warning(f"Cache file {cache_path} contained an empty DataFrame.")
            print("-" * 30)
            return processed_df_from_cache # Return cached data
        except Exception as e:
            logging.error(f"Failed to load or parse cache file {cache_path}: {e}", exc_info=True)
            print(f"\n--- Preprocessing Cache Error ---")
            print(f"Error reading cache file {cache_path}: {e}. Proceeding with reprocessing.")
            # If cache fails, delete it to force reprocessing next time? Optional.
            # try: os.remove(cache_path) except OSError: pass


    # --- Preprocessing Steps (if cache doesn't exist or failed) ---
    print("\n--- Preprocessing Pipeline (Block 3) ---")
    print(f"Cache not used or failed. Starting processing of {len(df)} raw records...")
    logging.info(f"Cache not used or failed. Starting processing of {len(df)} raw records...")


    # Ensure required columns exist in the input df
    if text_col not in df.columns or summary_col not in df.columns:
        logging.error(f"Input DataFrame missing required columns: '{text_col}' or '{summary_col}'. Available: {df.columns.tolist()}")
        print("\n--- Preprocessing Pipeline Error ---")
        print(f"Input DataFrame missing required columns: '{text_col}' or '{summary_col}'. Available: {df.columns.tolist()}")
        print("Cannot preprocess.")
        print("-" * 30)
        return pd.DataFrame() # Return empty dataframe

    # Make a copy to avoid SettingWithCopyWarning on the original raw_df
    df_processed = df.copy()

    logging.info("Applying text cleaning...")
    tqdm.pandas(desc="Cleaning Text")
    df_processed['cleaned_text'] = df_processed[text_col].progress_apply(clean_text)
    tqdm.pandas(desc="Cleaning Summary")
    df_processed['cleaned_summary'] = df_processed[summary_col].progress_apply(clean_text)

    initial_count = len(df_processed)
    logging.info(f"Initial record count for processing: {initial_count}")
    print(f"Initial record count for processing: {initial_count}")
    print("Sample after basic cleaning:")
    cols_to_display = [c for c in ['cleaned_text', 'cleaned_summary'] if c in df_processed.columns]
    if cols_to_display and not df_processed.empty:
        print(df_processed[cols_to_display].head())


    # Language Filtering
    if _langdetect_installed: # Only run if library is available
        logging.info("Applying language filtering...")
        valid_indices = []
        skipped_lang = 0
        for index, text in tqdm(df_processed['cleaned_text'].items(), total=len(df_processed), desc="Language Filtering"):
            try:
                if not text or len(text.split()) < 5:
                    skipped_lang += 1
                    continue
                # Use detect_langs for probability, though simple detect('en') is faster if threshold isn't strict
                lang = detect(text[:500]) # Detect on first 500 chars
                if lang == 'en': # Simple check for English
                    valid_indices.append(index)
                else:
                    skipped_lang += 1
            except LangDetectException:
                skipped_lang += 1
            except Exception as e:
                logging.warning(f"Language detection error on index {index}: {e}")
                skipped_lang += 1

        df_processed = df_processed.loc[valid_indices].copy()
        lang_filtered_count = len(df_processed)
        logging.info(f"Language filtering: Kept {lang_filtered_count}, Removed {initial_count - lang_filtered_count} non-English/error records.")
        print(f"Count after language filtering: {lang_filtered_count} ({initial_count - lang_filtered_count} removed)")
    else:
        print("Skipping language filtering as langdetect is not available.")
        logging.warning("Skipping language filtering as langdetect is not available.")


    # Length Filtering (only if records remain)
    if not df_processed.empty:
        logging.info("Applying length filtering...")
        df_processed['text_word_count'] = df_processed['cleaned_text'].apply(lambda x: len(x.split()))
        df_processed['summary_word_count'] = df_processed['cleaned_summary'].apply(lambda x: len(x.split()))

        original_count_before_len_filter = len(df_processed)
        df_processed = df_processed[df_processed['text_word_count'] >= MIN_INPUT_WORDS].copy()
        df_processed = df_processed[df_processed['summary_word_count'] >= MIN_SUMMARY_WORDS].copy()
        len_filtered_count = len(df_processed)
        logging.info(f"Length filtering: Kept {len_filtered_count}, Removed {original_count_before_len_filter - len_filtered_count} short records.")
        print(f"Count after length filtering: {len_filtered_count} ({original_count_before_len_filter - len_filtered_count} removed)")
    else:
        print("Skipping length filtering as DataFrame is empty after previous steps.")
        logging.warning("Skipping length filtering due to empty DataFrame.")


    # Add Start/End Tokens (only if records remain)
    final_processed_df = pd.DataFrame() # Initialize
    if not df_processed.empty:
        logging.info("Adding <start> and <end> tokens to summaries...")
        df_processed['cleaned_summary_tagged'] = df_processed['cleaned_summary'].apply(lambda x: f"<start> {x} <end>")
        # Final Selection and Renaming
        final_processed_df = df_processed[['cleaned_text', 'cleaned_summary_tagged']].rename(columns={'cleaned_summary_tagged': 'target_summary'})
    else:
        print("Skipping token tagging as DataFrame is empty.")
        logging.warning("Skipping token tagging due to empty DataFrame.")


    # --- Final Output and Caching ---
    print(f"Final processed record count: {len(final_processed_df)}")
    logging.info(f"Final processed record count: {len(final_processed_df)}")
    if not final_processed_df.empty:
        print("Sample processed data (final):\n", final_processed_df.head())
        # Cache the result
        try:
            logging.info(f"Caching processed data to: {cache_path}")
            final_processed_df.to_parquet(cache_path, index=False)
            logging.info("Caching successful.")
            print(f"Processed data cached successfully to {cache_path}")
        except Exception as e:
            logging.error(f"Failed to cache processed data to {cache_path}: {e}", exc_info=True)
            print(f"\n--- Caching Error ---")
            print(f"Failed to cache data to {cache_path}: {e}")
    else:
        logging.warning("Processed DataFrame is empty. No data will be cached.")
        print("Warning: Processed DataFrame is empty. Nothing to cache.")

    end_time = time.time()
    logging.info(f"Preprocessing finished in {end_time - start_time:.2f} seconds.")
    print(f"Preprocessing finished in {end_time - start_time:.2f} seconds.")
    print("-" * 30)

    return final_processed_df

# --- Execution for Block 3 ---
processed_df = pd.DataFrame() # Initialize as empty DataFrame

# Only proceed if raw_df from Block 2 exists and is not empty
if 'raw_df' in locals() and isinstance(raw_df, pd.DataFrame) and not raw_df.empty:
    print("Proceeding with preprocessing using `raw_df`...")
    processed_df = preprocess_data(raw_df, cache_path=OUTPUT_PARQUET)
else:
    print("Skipping Block 3 execution because `raw_df` is not available or is empty.")
    print("Check the output of Block 2 for errors.")
    logging.error("Skipping Block 3 execution because `raw_df` is not available or is empty.")

# <<< CRUCIAL CHECK >>>
if 'processed_df' not in locals() or processed_df.empty:
    print("************************************************************")
    print("WARNING: Block 3 resulted in an empty `processed_df`.")
    print("This could be due to loading errors, aggressive filtering, or issues during processing.")
    print("Subsequent blocks (Tokenizer, Model Training) will likely fail or be skipped.")
    print("Review the 'Preprocessing Pipeline' output above.")
    print("************************************************************")
    logging.warning("Block 3 resulted in an empty `processed_df`.")
else:
    print("Block 3 completed. `processed_df` created or loaded successfully.")
    logging.info("Block 3 completed. `processed_df` created or loaded successfully.")

# Clean up raw_df if memory is a concern and processing was successful (optional)
if 'processed_df' in locals() and not processed_df.empty and 'raw_df' in locals():
   print("Cleaning up raw_df from memory...")
   del raw_df
   gc.collect()
   logging.info("Cleaned up raw_df from memory.")

2025-04-24 15:45:11,161 - INFO - Preprocessing started. Cache path: model3_files/processed_dataframe.parquet
2025-04-24 15:45:11,163 - INFO - Loading processed data from cache: model3_files/processed_dataframe.parquet


Proceeding with preprocessing using `raw_df`...


2025-04-24 15:45:12,746 - INFO - Successfully loaded 73568 records from cache.
2025-04-24 15:45:12,749 - INFO - Block 3 completed. `processed_df` created or loaded successfully.



--- Preprocessing Pipeline (Block 3) ---
Loaded 73568 records from cache: model3_files/processed_dataframe.parquet
Columns: ['cleaned_text', 'target_summary']
Sample processed data (from cache):
                                         cleaned_text  \
0  urban affairs states can flexibly use of centr...   
1  dr mahesh sharma the parliamentary consultativ...   
2  dr mahesh sharma the parliamentary consultativ...   
3  prime minister's office pm pays tributes to sw...   
4  prime minister's office pm appalled at the new...   

                                      target_summary  
0  <start> a meeting of the parliamentary consult...  
1  <start> a meeting of the parliamentary consult...  
2  <start> a meeting of the parliamentary consult...  
3  <start> prime minister narendra modi paid trib...  
4  <start> prime minister narendra modi expressed...  
------------------------------
Block 3 completed. `processed_df` created or loaded successfully.
Cleaning up raw_df from memory...


2025-04-24 15:45:13,005 - INFO - Cleaned up raw_df from memory.


In [4]:
# Block 4: Tokenizer Training and Usage + Execution

import os
import pandas as pd
import numpy as np
import logging
import time
import tensorflow as tf
from tqdm.notebook import tqdm
import sentencepiece as spm

# --- Function Definitions ---

def train_sentencepiece(data_series, model_prefix, vocab_size, special_tokens):
    """Trains a SentencePiece Unigram model."""
    logging.info(f"Starting SentencePiece training. Output prefix: {model_prefix}")
    start_time = time.time()

    # Create a temporary file to store the text data for training
    # Ensure the directory for the temp file exists (it should be OUTPUT_DIR)
    temp_dir = os.path.dirname(model_prefix)
    os.makedirs(temp_dir, exist_ok=True)
    temp_text_file = f"{model_prefix}_training_data.txt" # Use prefix for temp file name

    try:
        if data_series.empty:
            logging.error("Cannot train SentencePiece on empty data series.")
            print("\n--- SentencePiece Training Error ---")
            print("Input data series is empty. Cannot train tokenizer.")
            print("-" * 30)
            return False # Indicate failure

        with open(temp_text_file, 'w', encoding='utf-8') as f:
            for text in tqdm(data_series, desc="Writing Training Data"):
                f.write(str(text) + '\n') # Ensure text is string
        logging.info(f"Training data written to {temp_text_file}")

        spm_command = (
            f'--input={temp_text_file} --model_prefix={model_prefix} '
            f'--vocab_size={vocab_size} --model_type=unigram '
            f'--pad_id={PAD_ID} --unk_id={UNK_ID} '
            f'--bos_id={START_ID} --eos_id={END_ID} ' # bos = <start>, eos = <end>
            f'--unk_piece=<unk> --bos_piece=<start> --eos_piece=<end> --pad_piece=<pad> '
            f'--hard_vocab_limit=false '
            f'--character_coverage=1.0 '
            f'--shuffle_input_sentence=true --input_sentence_size=10000000' # Sample size
        )

        print("\n--- SentencePiece Training ---")
        logging.info(f"Running SentencePiece with command args...")
        print(f"Running SentencePiece with command args (simplified): {spm_command.split('--input')[0]}...")

        spm.SentencePieceTrainer.train(spm_command)

        training_duration = time.time() - start_time
        logging.info(f"SentencePiece training completed in {training_duration:.2f} seconds.")
        print(f"SentencePiece model files created: {model_prefix}.model, {model_prefix}.vocab")
        print(f"Training duration: {training_duration:.2f} seconds.")

        os.remove(temp_text_file)
        logging.info(f"Removed temporary training file: {temp_text_file}")
        print("-" * 30)
        return True # Indicate success

    except Exception as e:
        logging.error(f"SentencePiece training failed: {e}", exc_info=True)
        print(f"\n--- SentencePiece Training Error ---")
        print(f"SentencePiece training failed: {e}")
        if os.path.exists(f"{model_prefix}.model"): os.remove(f"{model_prefix}.model")
        if os.path.exists(f"{model_prefix}.vocab"): os.remove(f"{model_prefix}.vocab")
        if os.path.exists(temp_text_file): os.remove(temp_text_file)
        print("-" * 30)
        return False # Indicate failure


# <<< CORRECTED VERSION of load_tokenizer >>>
def load_tokenizer(model_path):
    """Loads a trained SentencePiece model."""
    logging.info(f"Loading SentencePiece tokenizer from: {model_path}")
    if not os.path.exists(model_path):
        logging.error(f"Tokenizer model file not found at {model_path}")
        print(f"\n--- Tokenizer Loading Error ---")
        print(f"Error: Tokenizer model file not found at {model_path}")
        print("-" * 30)
        return None
    try:
        tokenizer = spm.SentencePieceProcessor()
        tokenizer.load(model_path)
        print("\n--- Tokenizer Loading ---")
        logging.info(f"Successfully loaded tokenizer: {model_path}")
        print(f"Successfully loaded tokenizer: {model_path}")

        # <<< FIX: Use id_to_piece() for string representation >>>
        pad_id_val = tokenizer.pad_id()
        unk_id_val = tokenizer.unk_id()
        bos_id_val = tokenizer.bos_id()
        eos_id_val = tokenizer.eos_id()

        # Handle cases where special IDs might not be set (though unlikely with our training flags)
        pad_piece_str = tokenizer.id_to_piece(pad_id_val) if pad_id_val is not None and pad_id_val >= 0 else 'N/A'
        unk_piece_str = tokenizer.id_to_piece(unk_id_val) if unk_id_val is not None and unk_id_val >= 0 else 'N/A'
        bos_piece_str = tokenizer.id_to_piece(bos_id_val) if bos_id_val is not None and bos_id_val >= 0 else 'N/A'
        eos_piece_str = tokenizer.id_to_piece(eos_id_val) if eos_id_val is not None and eos_id_val >= 0 else 'N/A'

        print(f"Vocabulary Size: {tokenizer.vocab_size()}")
        print(f"PAD ID ({pad_piece_str}): {pad_id_val}")
        print(f"UNK ID ({unk_piece_str}): {unk_id_val}")
        print(f"BOS/Start ID ({bos_piece_str}): {bos_id_val}")
        print(f"EOS/End ID ({eos_piece_str}): {eos_id_val}")
        print("-" * 30)

        # Sanity check special token IDs (remains the same)
        if not (pad_id_val == PAD_ID and unk_id_val == UNK_ID and \
                bos_id_val == START_ID and eos_id_val == END_ID):
             logging.warning("Loaded tokenizer special token IDs DO NOT match configured IDs!")
             print("WARNING: Loaded tokenizer special token IDs DO NOT match configured IDs!")
             print(f"  Loaded: PAD={pad_id_val}, UNK={unk_id_val}, BOS={bos_id_val}, EOS={eos_id_val}")
             print(f"  Config: PAD={PAD_ID}, UNK={UNK_ID}, BOS={START_ID}, EOS={END_ID}")
             # Optional: Treat as error? For now, just warn.
             # return None # Uncomment to treat ID mismatch as a fatal error

        return tokenizer
    except Exception as e:
        logging.error(f"Failed to load or process tokenizer model from {model_path}: {e}", exc_info=True)
        print(f"\n--- Tokenizer Loading Error ---")
        print(f"Failed to load or process tokenizer model from {model_path}: {e}") # Show the specific error
        print("-" * 30)
        return None


def tokenize_texts(texts, tokenizer, max_len):
    """Tokenizes a list/series of texts and pads/truncates them."""
    if texts is None or texts.empty:
        logging.warning("Attempted to tokenize an empty list/series of texts.")
        print("Warning: Input texts for tokenization is empty.")
        return np.array([]) # Return empty numpy array
    if tokenizer is None:
        logging.error("Cannot tokenize texts: Tokenizer is None.")
        return np.array([])

    logging.info(f"Tokenizing {len(texts)} texts with max_len={max_len}...")
    try:
        texts_list = [str(text) if pd.notna(text) else '' for text in texts.tolist()]
        tokenized_sequences = tokenizer.encode(texts_list)

        padded_sequences = tf.keras.preprocessing.sequence.pad_sequences(
            tokenized_sequences,
            maxlen=max_len,
            padding='post',
            truncating='post',
            value=PAD_ID # Explicitly use PAD_ID
        )
        print("\n--- Tokenization ---")
        logging.info(f"Tokenization successful for {len(texts_list)} texts.")
        print(f"Tokenized {len(texts_list)} texts.")
        print(f"Shape of padded sequences: {padded_sequences.shape}")
        if len(texts_list) > 0 and len(padded_sequences) > 0:
            print(f"Original Text (sample 0): {texts_list[0][:100]}...")
            print(f"Tokenized IDs (sample 0): {padded_sequences[0][:20]}...")
        print("-" * 30)
        return padded_sequences
    except Exception as e:
        logging.error(f"Error during text tokenization: {e}", exc_info=True)
        print(f"\n--- Tokenization Error ---")
        print(f"An error occurred during tokenization: {e}")
        print("-" * 30)
        return np.array([])


def detokenize_sequences(sequences, tokenizer):
    """Converts sequences of token IDs back to text. Handles single sequence or batch."""
    if tokenizer is None:
        logging.error("Detokenization failed: Tokenizer is None.")
        return "[Detokenization Error: No Tokenizer]"
    if sequences is None: return [] if isinstance(sequences, list) else ""

    try:
        if isinstance(sequences, tf.Tensor): sequences = sequences.numpy()
        if isinstance(sequences, np.ndarray): sequences = sequences.tolist()

        is_batch = isinstance(sequences, list) and (len(sequences) == 0 or isinstance(sequences[0], list))
        if not is_batch: sequences = [sequences] # Wrap single list for uniform processing

        texts = []
        for seq in sequences:
            actual_tokens = []
            # Ensure seq is iterable (list/tuple)
            if not hasattr(seq, '__iter__'): continue
            for token_id_float in seq:
                token_id = int(token_id_float)
                if token_id == END_ID: break
                if token_id != PAD_ID and token_id != START_ID:
                     actual_tokens.append(token_id)
            texts.append(tokenizer.decode(actual_tokens))

        return texts if is_batch else texts[0]
    except Exception as e:
        logging.error(f"Error during detokenization: {e}", exc_info=True)
        return "[Detokenization Error]"


# --- Execution for Block 4 ---
tokenizer = None
encoder_input_data = np.array([])
decoder_input_data = np.array([])
decoder_target_data = np.array([])
tokenization_failed = False # Flag to track status

# Only proceed if processed_df from Block 3 exists and is not empty
if 'processed_df' in locals() and isinstance(processed_df, pd.DataFrame) and not processed_df.empty:
    print("Block 4: `processed_df` is valid. Proceeding with tokenizer.")
    logging.info("Block 4: `processed_df` is valid. Proceeding with tokenizer.")

    # Combine text and summary for tokenizer training data
    if 'cleaned_text' in processed_df.columns and 'target_summary' in processed_df.columns:
        print("Preparing data for tokenizer training...")
        full_corpus = pd.concat([processed_df['cleaned_text'], processed_df['target_summary']], ignore_index=True)
        full_corpus.dropna(inplace=True)
        print(f"Full corpus size for tokenizer: {len(full_corpus)}")

        special_tokens = ['<pad>', '<unk>', '<start>', '<end>'] # For reference

        # Train only if model file doesn't exist
        if not os.path.exists(TOKENIZER_MODEL_FILE):
            print(f"Tokenizer model {TOKENIZER_MODEL_FILE} not found. Starting training...")
            training_successful = train_sentencepiece(full_corpus, TOKENIZER_MODEL_PREFIX, VOCAB_SIZE, special_tokens)
            if not training_successful:
                 print("Tokenizer training failed. Cannot proceed with tokenization.")
                 logging.error("Tokenizer training failed.")
                 tokenization_failed = True # Set failure flag
            else:
                 print("Tokenizer training successful.")
                 logging.info("Tokenizer training successful.")
        else:
            print(f"Tokenizer model {TOKENIZER_MODEL_FILE} already exists. Skipping training.")
            logging.info(f"Found existing tokenizer model: {TOKENIZER_MODEL_FILE}. Skipping training.")

        # Load the tokenizer (only if training didn't fail)
        if not tokenization_failed:
            tokenizer = load_tokenizer(TOKENIZER_MODEL_FILE) # Use the corrected function

            if tokenizer:
                print("Tokenizing cleaned text...")
                encoder_input_data = tokenize_texts(processed_df['cleaned_text'], tokenizer, MAX_LEN_INPUT)
                print("Tokenizing target summaries...")
                decoder_full_data = tokenize_texts(processed_df['target_summary'], tokenizer, MAX_LEN_SUMMARY)

                if encoder_input_data.size > 0 and decoder_full_data.size > 0:
                    print("Creating decoder input/target sequences...")
                    decoder_input_data = decoder_full_data[:, :-1]
                    decoder_target_data = decoder_full_data[:, 1:]

                    print("\n--- Data Shapes After Tokenization & Shifting ---")
                    print("Encoder Input Shape:", encoder_input_data.shape)
                    print("Decoder Input Shape:", decoder_input_data.shape)
                    print("Decoder Target Shape:", decoder_target_data.shape)
                    print("-" * 30)
                    logging.info(f"Tokenization successful. Shapes: Encoder={encoder_input_data.shape}, DecoderIn={decoder_input_data.shape}, DecoderOut={decoder_target_data.shape}")

                else:
                    print("Tokenization resulted in empty arrays. Cannot proceed.")
                    logging.error("Tokenization resulted in empty arrays.")
                    tokenization_failed = True # Set failure flag
                    # Clear arrays
                    encoder_input_data = np.array([])
                    decoder_input_data = np.array([])
                    decoder_target_data = np.array([])


            else:
                print("Failed to load tokenizer. Cannot proceed.")
                logging.error("Failed to load tokenizer.")
                tokenization_failed = True # Set failure flag

    else:
         print("Skipping tokenizer step: required columns ('cleaned_text', 'target_summary') missing in processed_df.")
         logging.error("Skipping tokenizer step: required columns missing.")
         tokenization_failed = True # Set failure flag
else:
     # This case should have been caught earlier if Blocks 2/3 failed
     print("Skipping Block 4 execution because `processed_df` is not available or is empty.")
     print("Check the output of Block 3.")
     logging.error("Skipping Block 4 execution because `processed_df` is not available or is empty.")
     tokenization_failed = True # Set failure flag


# Final status check for Block 4
if tokenization_failed:
    print("************************************************************")
    print("ERROR: Block 4 failed or was skipped due to issues in previous blocks or during tokenization.")
    print("Cannot proceed to Block 5 (Dataset Creation).")
    print("************************************************************")
else:
    print("Block 4 completed successfully. Tokenized data created.")
    logging.info("Block 4 completed successfully. Tokenized data created.")

2025-04-24 15:45:19,034 - INFO - Block 4: `processed_df` is valid. Proceeding with tokenizer.
2025-04-24 15:45:19,060 - INFO - Found existing tokenizer model: model3_files/pib_summarizer_spm_50k.model. Skipping training.
2025-04-24 15:45:19,061 - INFO - Loading SentencePiece tokenizer from: model3_files/pib_summarizer_spm_50k.model
2025-04-24 15:45:19,158 - INFO - Successfully loaded tokenizer: model3_files/pib_summarizer_spm_50k.model
2025-04-24 15:45:19,159 - INFO - Tokenizing 73568 texts with max_len=1024...


Block 4: `processed_df` is valid. Proceeding with tokenizer.
Preparing data for tokenizer training...
Full corpus size for tokenizer: 147136
Tokenizer model model3_files/pib_summarizer_spm_50k.model already exists. Skipping training.

--- Tokenizer Loading ---
Successfully loaded tokenizer: model3_files/pib_summarizer_spm_50k.model
Vocabulary Size: 30000
PAD ID (<pad>): 0
UNK ID (<unk>): 1
BOS/Start ID (<start>): 2
EOS/End ID (<end>): 3
------------------------------
Tokenizing cleaned text...


2025-04-24 15:45:38,959 - INFO - Tokenization successful for 73568 texts.



--- Tokenization ---
Tokenized 73568 texts.
Shape of padded sequences: (73568, 1024)
Original Text (sample 0): urban affairs states can flexibly use of central assistance of rs.4,000 per toilet, says shri venkai...
Tokenized IDs (sample 0): [  253   188    82   203 18188   606   320   289     7    76   235     7
    43     6  8390    80  4236     5  2429    31]...
------------------------------


2025-04-24 15:45:39,328 - INFO - Tokenizing 73568 texts with max_len=150...


Tokenizing target summaries...


2025-04-24 15:45:44,221 - INFO - Tokenization successful for 73568 texts.
2025-04-24 15:45:44,314 - INFO - Tokenization successful. Shapes: Encoder=(73568, 1024), DecoderIn=(73568, 149), DecoderOut=(73568, 149)
2025-04-24 15:45:44,316 - INFO - Block 4 completed successfully. Tokenized data created.



--- Tokenization ---
Tokenized 73568 texts.
Shape of padded sequences: (73568, 150)
Original Text (sample 0): <start> a meeting of the parliamentary consultative committee discussed shortcomings in the jawaharl...
Tokenized IDs (sample 0): [   11     1  8959     1    13    88     7     4  1239  2583   135   569
 10151    10     4  2925  2058   253   629   184]...
------------------------------
Creating decoder input/target sequences...

--- Data Shapes After Tokenization & Shifting ---
Encoder Input Shape: (73568, 1024)
Decoder Input Shape: (73568, 149)
Decoder Target Shape: (73568, 149)
------------------------------
Block 4 completed successfully. Tokenized data created.


In [5]:
# Block 5: Data Preparation for TensorFlow (tf.data.Dataset) + Execution

# --- Function Definition ---
# (create_tf_dataset defined as in the previous good version)
# ... (insert definition for create_tf_dataset here) ...
def create_tf_dataset(encoder_inputs, decoder_inputs, decoder_targets, batch_size, shuffle=True):
    """Creates a tf.data.Dataset for training or validation."""
    if not isinstance(encoder_inputs, np.ndarray) or \
       not isinstance(decoder_inputs, np.ndarray) or \
       not isinstance(decoder_targets, np.ndarray):
        logging.error("Inputs to create_tf_dataset must be numpy arrays.")
        print("Error: Inputs for dataset creation are not numpy arrays.")
        return None

    if encoder_inputs.size == 0 or decoder_inputs.size == 0 or decoder_targets.size == 0:
        logging.error("Cannot create dataset from empty numpy arrays.")
        print("Error: Input arrays for dataset creation are empty.")
        return None
    if not (encoder_inputs.shape[0] == decoder_inputs.shape[0] == decoder_targets.shape[0]):
        logging.error(f"Mismatch in number of samples: Enc={encoder_inputs.shape[0]}, DecIn={decoder_inputs.shape[0]}, DecOut={decoder_targets.shape[0]}")
        print("Error: Mismatch in number of samples between input/output arrays.")
        return None

    logging.info(f"Creating tf.data.Dataset. Shuffle={shuffle}, Batch Size={batch_size}")
    print("\n--- tf.data.Dataset Creation ---")
    print(f"Input shapes: Encoder={encoder_inputs.shape}, DecoderIn={decoder_inputs.shape}, DecoderOut={decoder_targets.shape}")

    try:
        dataset = tf.data.Dataset.from_tensor_slices(
            (
                {"encoder_inputs": encoder_inputs, "decoder_inputs": decoder_inputs}, # Model inputs dict
                {"output_layer": decoder_targets} # Model outputs dict
            )
        )

        if shuffle:
            # Use a buffer size approx the size of the dataset for good shuffling
            buffer_size = len(encoder_inputs)
            dataset = dataset.shuffle(buffer_size=buffer_size, reshuffle_each_iteration=True)
            logging.info(f"Shuffling dataset with buffer size {buffer_size}")

        dataset = dataset.batch(batch_size, drop_remainder=True) # Drop remainder is important for stateful RNNs if used, generally good practice
        dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

        logging.info("tf.data.Dataset created successfully.")
        print(f"tf.data.Dataset created (shuffle={shuffle}).")
        print("Element Spec (structure of one batch):")
        print(dataset.element_spec)
        print("-" * 30)
        return dataset

    except Exception as e:
        logging.error(f"Failed to create tf.data.Dataset: {e}", exc_info=True)
        print(f"\n--- Dataset Creation Error ---")
        print(f"Failed to create tf.data.Dataset: {e}")
        print("-" * 30)
        return None


# --- Execution for Block 5 ---
train_dataset = None
val_dataset = None
num_train_samples = 0
num_val_samples = 0
dataset_creation_failed = False # Flag

# Only proceed if Block 4 succeeded (tokenization_failed is False)
# and the resulting numpy arrays exist and are not empty
if 'tokenization_failed' in locals() and not tokenization_failed and \
   'encoder_input_data' in locals() and encoder_input_data.size > 0 and \
   'decoder_input_data' in locals() and decoder_input_data.size > 0 and \
   'decoder_target_data' in locals() and decoder_target_data.size > 0:

    print("Block 5: Tokenized data is valid. Proceeding with dataset creation.")
    logging.info("Block 5: Tokenized data is valid. Proceeding with dataset creation.")

    num_samples = encoder_input_data.shape[0]
    if num_samples > 0:
        num_val_samples = int(0.1 * num_samples)
        # Ensure there's at least one validation sample if possible, and enough training samples
        if num_val_samples == 0 and num_samples > 1 : num_val_samples = 1
        num_train_samples = num_samples - num_val_samples

        if num_train_samples <= 0:
             print(f"Error: Not enough samples ({num_samples}) for a train/validation split (Val samples = {num_val_samples}).")
             logging.error(f"Not enough samples ({num_samples}) for train/val split.")
             dataset_creation_failed = True
        else:
            print(f"Splitting data: {num_train_samples} train, {num_val_samples} validation.")
            logging.info(f"Splitting data: {num_train_samples} train, {num_val_samples} validation.")

            # Shuffle indices *before* splitting
            indices = np.arange(num_samples)
            np.random.shuffle(indices)
            encoder_input_data = encoder_input_data[indices]
            decoder_input_data = decoder_input_data[indices]
            decoder_target_data = decoder_target_data[indices]
            print("Shuffled data indices before splitting.")

            # Perform the split
            encoder_input_train = encoder_input_data[:num_train_samples]
            decoder_input_train = decoder_input_data[:num_train_samples]
            decoder_target_train = decoder_target_data[:num_train_samples]

            encoder_input_val = encoder_input_data[num_train_samples:]
            decoder_input_val = decoder_input_data[num_train_samples:]
            decoder_target_val = decoder_target_data[num_train_samples:]

            # Create datasets
            print("\nCreating training dataset...")
            train_dataset = create_tf_dataset(
                encoder_input_train, decoder_input_train, decoder_target_train, BATCH_SIZE, shuffle=True
            )
            print("\nCreating validation dataset...")
            val_dataset = create_tf_dataset(
                encoder_input_val, decoder_input_val, decoder_target_val, BATCH_SIZE, shuffle=False # No need to shuffle validation
            )

            if train_dataset is None or val_dataset is None:
                 print("Error: Failed to create train or validation dataset.")
                 logging.error("Failed to create train or validation dataset.")
                 dataset_creation_failed = True
            else:
                print("\n--- Dataset Splitting and Creation Summary ---")
                print(f"Total samples tokenized: {num_samples}")
                print(f"Training samples: {num_train_samples}, Validation samples: {num_val_samples}")
                print(f"Train dataset created: Yes")
                print(f"Validation dataset created: Yes")
                try: print(f" Approx. train steps per epoch: {len(train_dataset)}")
                except TypeError: print(" Approx. train steps per epoch: Unknown (infinite dataset?)")
                try: print(f" Approx. validation steps per epoch: {len(val_dataset)}")
                except TypeError: print(" Approx. validation steps per epoch: Unknown (infinite dataset?)")
                print("-" * 30)

    else:
        print("No samples found in tokenized data. Cannot create datasets.")
        logging.error("No samples found in tokenized data. Cannot create datasets.")
        dataset_creation_failed = True

else:
    print("Skipping Block 5 execution: Tokenization failed or resulted in empty data.")
    logging.error("Skipping Block 5 execution: Tokenization failed or resulted in empty data.")
    dataset_creation_failed = True


# Final status check for Block 5
if dataset_creation_failed:
    print("************************************************************")
    print("ERROR: Block 5 failed or was skipped.")
    print("Cannot proceed to Block 6 (Model Building).")
    print("Review the output from Block 4 and 5.")
    print("************************************************************")
else:
    print("Block 5 completed successfully. Train/Validation datasets created.")
    logging.info("Block 5 completed successfully. Train/Validation datasets created.")

2025-04-24 15:48:23,349 - INFO - Block 5: Tokenized data is valid. Proceeding with dataset creation.
2025-04-24 15:48:23,351 - INFO - Splitting data: 66212 train, 7356 validation.
2025-04-24 15:48:23,516 - INFO - Creating tf.data.Dataset. Shuffle=True, Batch Size=64


Block 5: Tokenized data is valid. Proceeding with dataset creation.
Splitting data: 66212 train, 7356 validation.
Shuffled data indices before splitting.

Creating training dataset...

--- tf.data.Dataset Creation ---
Input shapes: Encoder=(66212, 1024), DecoderIn=(66212, 149), DecoderOut=(66212, 149)


2025-04-24 15:48:23.524812: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-24 15:48:23.688464: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1613] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 20750 MB memory:  -> device: 0, name: NVIDIA L4, pci bus id: 0000:00:03.0, compute capability: 8.9
2025-04-24 15:48:24,134 - INFO - Shuffling dataset with buffer size 66212
2025-04-24 15:48:24,141 - INFO - tf.data.Dataset created successfully.
2025-04-24 15:48:24,142 - INFO - Creating tf.data.Dataset. Shuffle=False, Batch Size=64
2025-04-24 15:48:24,161 - INFO - tf.data.Dataset created successfully.
2025-04-24 15:48:24,166 - INFO - Block 5 completed successfully. Train/Validation dat

tf.data.Dataset created (shuffle=True).
Element Spec (structure of one batch):
({'encoder_inputs': TensorSpec(shape=(64, 1024), dtype=tf.int32, name=None), 'decoder_inputs': TensorSpec(shape=(64, 149), dtype=tf.int32, name=None)}, {'output_layer': TensorSpec(shape=(64, 149), dtype=tf.int32, name=None)})
------------------------------

Creating validation dataset...

--- tf.data.Dataset Creation ---
Input shapes: Encoder=(7356, 1024), DecoderIn=(7356, 149), DecoderOut=(7356, 149)
tf.data.Dataset created (shuffle=False).
Element Spec (structure of one batch):
({'encoder_inputs': TensorSpec(shape=(64, 1024), dtype=tf.int32, name=None), 'decoder_inputs': TensorSpec(shape=(64, 149), dtype=tf.int32, name=None)}, {'output_layer': TensorSpec(shape=(64, 149), dtype=tf.int32, name=None)})
------------------------------

--- Dataset Splitting and Creation Summary ---
Total samples tokenized: 73568
Training samples: 66212, Validation samples: 7356
Train dataset created: Yes
Validation dataset crea

In [6]:
# Block 6: Model Architecture + Execution
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, Concatenate
from tensorflow.keras.models import Model

# --- Function Definition ---
def build_training_model(vocab_size, embedding_dim, lstm_units, decoder_lstm_units,
                         num_encoder_layers, num_decoder_layers, dropout_rate,
                         max_len_input, max_len_summary):
    """Builds the combined Encoder-Decoder model for training."""
    logging.info("Building Combined Training Model...")
    print("\n--- Combined Training Model Build ---")

    # --- Encoder part ---
    encoder_input_layer = Input(shape=(max_len_input,), name="encoder_inputs")
    # Use mask_zero=True for Embedding layers
    encoder_embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True, name="encoder_embedding")
    encoder_embeddings = encoder_embedding_layer(encoder_input_layer)

    current_sequence = encoder_embeddings
    encoder_states = [] # Will hold final states [h, c] from the last layer

    print(f"Building Encoder with {num_encoder_layers} BiLSTM layers...")
    for i in range(num_encoder_layers):
        is_last_layer = (i == num_encoder_layers - 1)
        bilstm = Bidirectional(
            LSTM(lstm_units, return_sequences=True, return_state=True, dropout=dropout_rate, name=f"encoder_bilstm_{i+1}")
        )
        # We need return_sequences=True for stacking, even on the last layer,
        # but we only *use* the states from the last layer for the decoder init.
        # The state outputs (forward_h, forward_c, backward_h, backward_c) are always returned when return_state=True.
        encoder_output_seq, forward_h, forward_c, backward_h, backward_c = bilstm(current_sequence)

        if is_last_layer:
            state_h = Concatenate(name="encoder_final_h")([forward_h, backward_h])
            state_c = Concatenate(name="encoder_final_c")([forward_c, backward_c])
            encoder_states = [state_h, state_c] # Final states for decoder init
        # Update current_sequence for the next layer (always use the sequence output)
        current_sequence = encoder_output_seq # Output sequence from BiLSTM

    # --- Decoder part (using teacher forcing) ---
    decoder_input_layer = Input(shape=(max_len_summary - 1,), name="decoder_inputs") # Shifted target summary
    # Use a *separate* Embedding layer for the decoder is good practice
    decoder_embedding_layer = Embedding(vocab_size, embedding_dim, mask_zero=True, name="decoder_embedding")
    decoder_embeddings = decoder_embedding_layer(decoder_input_layer)

    current_sequence = decoder_embeddings
    # Use encoder final states as initial state for the *first* decoder LSTM
    current_states = encoder_states

    print(f"Building Decoder with {num_decoder_layers} LSTM layers...")
    for i in range(num_decoder_layers):
        # We need return_sequences=True to feed into the final Dense layer
        # We don't need return_state=True here for the *training* model output
        decoder_lstm = LSTM(decoder_lstm_units, return_sequences=True, return_state=False, dropout=dropout_rate, name=f"decoder_lstm_{i+1}")
        # Pass initial state only to the first layer
        if i == 0:
            current_sequence = decoder_lstm(current_sequence, initial_state=current_states)
        else:
            # Keras handles state propagation between stacked LSTMs internally during training
            current_sequence = decoder_lstm(current_sequence)

    # --- Final output layer ---
    output_dense_layer = Dense(vocab_size, activation='softmax', name="output_layer")
    decoder_outputs = output_dense_layer(current_sequence)

    # Define the complete model for training
    training_model = Model(inputs=[encoder_input_layer, decoder_input_layer], outputs=decoder_outputs)

    # --- Debugging Info ---
    print("\n--- Combined Training Model Summary ---")
    training_model.summary(line_length=120)
    print(f"Model Inputs: {[inp.name + ': ' + str(inp.shape) for inp in training_model.inputs]}")
    print(f"Model Outputs: {[out.name + ': ' + str(out.shape) for out in training_model.outputs]}")
    # Verify output layer name matches dataset key
    print(f"Output layer name: {training_model.layers[-1].name} (should match dataset target key: 'output_layer')")
    print("-" * 30)

    return training_model

# --- Execution for Block 6 ---
model = None # Initialize model variable

# Build the model
# Use the constants defined in Block 1
model = build_training_model(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    lstm_units=LSTM_UNITS, # Encoder units per direction
    decoder_lstm_units=DECODER_LSTM_UNITS, # Decoder units (matching concatenated encoder state)
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS, # Use separate constant if needed
    dropout_rate=DROPOUT_RATE,
    max_len_input=MAX_LEN_INPUT,
    max_len_summary=MAX_LEN_SUMMARY # Use max_len_summary - 1 due to decoder input shape? No, model handles length internally
)

if model is None:
    print("Model building failed.")

2025-04-24 15:48:30,248 - INFO - Building Combined Training Model...



--- Combined Training Model Build ---
Building Encoder with 2 BiLSTM layers...
Building Decoder with 2 LSTM layers...

--- Combined Training Model Summary ---
Model: "model"
________________________________________________________________________________________________________________________
 Layer (type)                          Output Shape               Param #       Connected to                            
 encoder_inputs (InputLayer)           [(None, 1024)]             0             []                                      
                                                                                                                        
 encoder_embedding (Embedding)         (None, 1024, 100)          3000000       ['encoder_inputs[0][0]']                
                                                                                                                        
 bidirectional (Bidirectional)         [(None, 1024, 512),        731136        ['encoder_embedding

In [7]:
# Block 7: Model Compilation and Callbacks + Execution

# --- Function Definitions ---
def compile_model(model_to_compile, learning_rate):
    """Compiles the Keras model."""
    if model_to_compile is None:
         print("Error: Cannot compile a None model.")
         return None

    logging.info(f"Compiling model with AdamW optimizer and learning rate {learning_rate}...")

    # Optimizer - Use AdamW if available (good default), fallback to Adam
    try:
        optimizer = tf.keras.optimizers.AdamW(learning_rate=learning_rate)
        opt_name = "AdamW"
    except AttributeError:
        logging.warning("tf.keras.optimizers.AdamW not found, using Adam instead.")
        print("Warning: tf.keras.optimizers.AdamW not found, using Adam instead.")
        optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
        opt_name = "Adam"

    # Loss Function - SparseCategoricalCrossentropy because target tokens are integers (not one-hot)
    loss = tf.keras.losses.SparseCategoricalCrossentropy()

    # Metrics - Accuracy is useful to monitor during training
    metrics = ['accuracy'] # Can add custom metrics later if needed

    # Set jit_compile=False for stability based on project description
    model_to_compile.compile(optimizer=optimizer, loss=loss, metrics=metrics, jit_compile=False)

    # --- Debugging Info ---
    print("\n--- Model Compilation ---")
    print(f"Optimizer: {opt_name} (LR={learning_rate})")
    print(f"Loss Function: {loss.__class__.__name__}")
    print(f"Metrics: {metrics}")
    print("Model compiled successfully.")
    print("-" * 30)
    return model_to_compile # Return compiled model


def get_callbacks(model_save_path, log_dir, early_stopping_patience, reduce_lr_patience, reduce_lr_factor):
    """Sets up Keras callbacks for training."""
    logging.info("Setting up callbacks...")

    # Ensure log directory exists
    os.makedirs(log_dir, exist_ok=True)
    print(f"TensorBoard log directory: {log_dir}")

    # ModelCheckpoint: Save the best model based on validation loss
    checkpoint = ModelCheckpoint(
        filepath=model_save_path,
        monitor='val_loss',          # Monitor validation loss
        save_best_only=True,         # Only save if validation loss improves
        save_weights_only=False,     # Save the entire model (architecture + weights + optimizer state)
        mode='min',                  # The lower the validation loss, the better
        verbose=1
    )

    # EarlyStopping: Stop training if validation loss doesn't improve
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=early_stopping_patience,
        restore_best_weights=True,   # Restore weights from the epoch with the best val_loss
        mode='min',
        verbose=1
    )

    # ReduceLROnPlateau: Reduce learning rate if validation loss plateaus
    reduce_lr = ReduceLROnPlateau(
        monitor='val_loss',
        factor=reduce_lr_factor,     # Factor by which LR is reduced (new_lr = lr * factor)
        patience=reduce_lr_patience,
        min_lr=1e-6,                 # Lower bound for the learning rate
        mode='min',
        verbose=1
    )

    # TensorBoard: Log metrics for visualization
    tensorboard = TensorBoard(
        log_dir=log_dir,
        histogram_freq=1,            # Log histogram visualizations every 1 epoch
        write_graph=True,           # Visualize the graph in TensorBoard
        update_freq='epoch'          # Log metrics after each epoch
    )

    callback_list = [checkpoint, early_stopping, reduce_lr, tensorboard]

    # --- Debugging Info ---
    print("\n--- Callbacks Setup ---")
    print(f"ModelCheckpoint: monitor='val_loss', save_best_only=True, path='{model_save_path}'")
    print(f"EarlyStopping: monitor='val_loss', patience={early_stopping_patience}, restore_best_weights=True")
    print(f"ReduceLROnPlateau: monitor='val_loss', patience={reduce_lr_patience}, factor={reduce_lr_factor}")
    print(f"TensorBoard: log_dir='{log_dir}'")
    print(f"Total callbacks: {len(callback_list)}")
    print("-" * 30)

    return callback_list

# --- Execution for Block 7 ---
callbacks = [] # Initialize callbacks list

if 'model' in locals() and model is not None:
     # Compile the model created in Block 6
     model = compile_model(model, LEARNING_RATE) # Reassign model to the compiled version
     # Get the callbacks
     callbacks = get_callbacks(
         MODEL_SAVE_PATH,
         LOG_DIR,
         EARLY_STOPPING_PATIENCE,
         REDUCE_LR_PATIENCE,
         REDUCE_LR_FACTOR
     )
else:
     print("Skipping compilation and callback setup as the model was not built successfully in Block 6.")

2025-04-24 15:48:47,275 - INFO - Compiling model with AdamW optimizer and learning rate 0.001...
2025-04-24 15:48:47,292 - INFO - Setting up callbacks...



--- Model Compilation ---
Optimizer: Adam (LR=0.001)
Loss Function: SparseCategoricalCrossentropy
Metrics: ['accuracy']
Model compiled successfully.
------------------------------
TensorBoard log directory: model3_files/logs/fit/20250424-154454

--- Callbacks Setup ---
ModelCheckpoint: monitor='val_loss', save_best_only=True, path='model3_files/pib_summarizer_no_attention.keras'
EarlyStopping: monitor='val_loss', patience=5, restore_best_weights=True
ReduceLROnPlateau: monitor='val_loss', patience=3, factor=0.2
TensorBoard: log_dir='model3_files/logs/fit/20250424-154454'
Total callbacks: 4
------------------------------


In [None]:
# Block 8: Model Training + Execution

import time
import logging
import tensorflow as tf
import numpy as np # Needed for np.argmin in history analysis
import pandas as pd # Needed for plotting history
import matplotlib.pyplot as plt # Needed for plotting history
from tensorflow.keras.callbacks import EarlyStopping # Needed for type checking in history analysis


# --- Function Definition (Corrected Indentation) ---
def train_model(model_to_train, train_data, val_data, epochs_to_run, callback_list):
    """Trains the model using model.fit()"""
    logging.info("Starting model training...")
    # --- Debugging Info ---
    print("\n--- Model Training ---")

    # --- Input Checks ---
    if model_to_train is None:
        print("Error: Model is not defined or not compiled. Cannot train.")
        logging.error("Attempted to train a None or uncompiled model.")
        print("-" * 30)
        return None
    if not hasattr(model_to_train, 'optimizer') or model_to_train.optimizer is None:
        print("Error: Model has not been compiled. Cannot train.")
        logging.error("Attempted to train an uncompiled model.")
        print("-" * 30)
        return None
    if train_data is None:
        print("Error: Training dataset is not defined. Cannot train.")
        logging.error("Attempted to train with a None training dataset.")
        print("-" * 30)
        return None
    if val_data is None:
        print("Warning: Validation dataset is not defined. Training without validation monitoring (EarlyStopping/ModelCheckpoint might not work as expected).")
        logging.warning("Training without validation dataset.")

    # --- Print Training Info ---
    print(f"Training for a maximum of {epochs_to_run} epochs.")
    # Use BATCH_SIZE constant defined in Block 1 for reporting
    print(f"Using Batch Size: {BATCH_SIZE}")
    print(f"Using {len(callback_list)} callbacks: {[cb.__class__.__name__ for cb in callback_list]}")
    try:
        train_steps = len(train_data)
        print(f"Train dataset steps per epoch: {train_steps}")
    except TypeError:
        print("Could not determine train dataset length (infinite dataset?).")
        train_steps = None
    if val_data:
        try:
             val_steps = len(val_data)
             print(f"Validation dataset steps per epoch: {val_steps}")
        except TypeError:
             print("Could not determine validation dataset length.")
             val_steps = None
    else:
         val_steps = None

    # --- Training Loop with Corrected try...except ---
    start_time = time.time()
    train_history = None
    try:  # <--- Start of the try block
        train_history = model_to_train.fit(
            train_data,
            epochs=epochs_to_run,
            validation_data=val_data, # Pass validation data
            callbacks=callback_list,
            verbose=1 # Show progress bar and metrics per epoch
        )
        training_time = time.time() - start_time
        logging.info(f"Model training finished in {training_time:.2f} seconds.")
        print(f"\nTraining complete. Total time: {training_time:.2f} seconds.")
        # --- Debugging: Print final metrics ---
        if train_history and train_history.history:
            print("Final Training Metrics (from last epoch):")
            for metric, value in train_history.history.items():
                 if value:
                     print(f"  {metric}: {value[-1]:.4f}")
                 else:
                     print(f"  {metric}: No data recorded")
            # Note about EarlyStopping
            # Check if EarlyStopping callback is present in the list
            early_stopping_callback = next((cb for cb in callback_list if isinstance(cb, EarlyStopping)), None)
            if early_stopping_callback and early_stopping_callback.restore_best_weights:
                 if 'val_loss' in train_history.history and train_history.history['val_loss']:
                     # Find the epoch with the minimum validation loss
                     best_epoch_idx = np.argmin(train_history.history['val_loss'])
                     best_val_loss = train_history.history['val_loss'][best_epoch_idx]
                     print(f"Best validation loss ({best_val_loss:.4f}) occurred at epoch {best_epoch_idx + 1}.")
                 else:
                      print("Note: Early stopping with restore_best_weights used, but val_loss history unavailable to determine best epoch.")
        else:
            print("Training history is not available.")

    # <<< CORRECTED INDENTATION for except/finally >>>
    except tf.errors.ResourceExhaustedError as e: # Catch the more common OOM error type
        logging.error(f"Out of Memory (ResourceExhaustedError) during training: {e}", exc_info=True)
        print(f"\n--- Training Error: Out of Memory ---")
        print(f"GPU ran out of memory. Reduce BATCH_SIZE in Block 1 and restart the kernel.")
        # Access BATCH_SIZE directly (assuming it's a global constant from Block 1)
        print(f"Current BATCH_SIZE: {BATCH_SIZE}")
        print(f"Error details: {e}")
        print("Suggestion: If memory fragmentation is suspected, try setting environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' before starting Python/Jupyter.")
    except Exception as e: # Keep generic catch-all
        if "OOM" in str(e) or "out of memory" in str(e).lower() or "resource exhausted" in str(e).lower():
             logging.error(f"Likely Out of Memory error during training (caught by general Exception): {e}", exc_info=True)
             print(f"\n--- Training Error: Likely Out of Memory ---")
             print(f"GPU ran out of memory. Reduce BATCH_SIZE in Block 1 and restart the kernel.")
             print(f"Current BATCH_SIZE: {BATCH_SIZE}")
             print(f"Error details: {e}")
        else:
             logging.error(f"An unexpected error occurred during model training: {e}", exc_info=True)
             print(f"\n--- Training Error ---")
             print(f"An unexpected error occurred during training: {e}")

    finally: # <--- Correctly indented finally
        print("-" * 30)

    return train_history # <--- Return statement should be outside finally, aligned with try/except


# --- Execution for Block 8 ---
history = None # Initialize history variable

# Check if all required components are available
if ('model' in locals() and model and hasattr(model, 'optimizer') and model.optimizer and
    'train_dataset' in locals() and train_dataset and
    'val_dataset' in locals() and val_dataset and # Make sure validation exists for callbacks
    'callbacks' in locals() and callbacks):

    # Assuming BATCH_SIZE is accessible from Block 1's scope
    print(f"Starting training with BATCH_SIZE = {BATCH_SIZE}")
    history = train_model(
        model,
        train_dataset,
        val_dataset,
        EPOCHS, # Use constant from Block 1
        callbacks
    )
else:
     print("Skipping training due to missing compiled model, datasets, or callbacks.")
     # Add more detailed checks for debugging
     print(f"  Model exists and compiled: {'model' in locals() and model and hasattr(model, 'optimizer') and model.optimizer is not None}")
     print(f"  Train dataset exists: {'train_dataset' in locals() and train_dataset is not None}")
     print(f"  Validation dataset exists: {'val_dataset' in locals() and val_dataset is not None}")
     print(f"  Callbacks exist: {'callbacks' in locals() and callbacks and len(callbacks)>0}")


# --- Debugging: Plot training history (optional) ---
if history and history.history:
    print("\n--- Plotting Training History ---")
    try:
        # Ensure Matplotlib is imported if not already done globally
        # import matplotlib.pyplot as plt
        # Ensure pandas is imported if not already done globally
        # import pandas as pd
        pd.DataFrame(history.history).plot(figsize=(10, 6))
        plt.grid(True)
        plt.title("Model Training History")
        plt.xlabel("Epoch")
        plt.ylabel("Metric Value")
        # Adjust ylim based on metrics present
        min_loss = float('inf')
        if 'loss' in history.history and history.history['loss']:
            min_loss = min(min_loss, min(history.history['loss']))
        if 'val_loss' in history.history and history.history['val_loss']:
             min_loss = min(min_loss, min(history.history['val_loss']))

        # Set bottom ylim slightly below min loss, but not below 0
        plt.ylim(bottom=max(0, min_loss - 0.1 if min_loss != float('inf') else 0))

        # Save plot to file
        plot_path = os.path.join(OUTPUT_DIR, 'training_history.png') # Use OUTPUT_DIR from Block 1
        plt.savefig(plot_path)
        print(f"Plot saved to {plot_path}")
        plt.show() # Display the plot in the notebook
        print("Plot displayed.")
    except Exception as e:
        print(f"Could not plot or save history: {e}")
        logging.error(f"Could not plot or save history: {e}", exc_info=True)
    finally:
        print("-" * 30)
elif 'history' in locals() and history is None:
     print("Skipping history plot because training did not run or failed.")

2025-04-24 15:48:51,965 - INFO - Starting model training...


Starting training with BATCH_SIZE = 64

--- Model Training ---
Training for a maximum of 30 epochs.
Using Batch Size: 64
Using 4 callbacks: ['ModelCheckpoint', 'EarlyStopping', 'ReduceLROnPlateau', 'TensorBoard']
Train dataset steps per epoch: 1034
Validation dataset steps per epoch: 114
Epoch 1/30


2025-04-24 15:49:06.804629: W tensorflow/core/common_runtime/type_inference.cc:339] Type inference failed. This indicates an invalid graph that escaped type checking. Error message: INVALID_ARGUMENT: expected compatible input types, but input 1:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_INT32
    }
  }
}
 is neither a subtype nor a supertype of the combined inputs preceding it:
type_id: TFT_OPTIONAL
args {
  type_id: TFT_PRODUCT
  args {
    type_id: TFT_TENSOR
    args {
      type_id: TFT_FLOAT
    }
  }
}

	while inferring type of node 'cond_20/output/_23'
2025-04-24 15:49:07.112046: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:428] Loaded cuDNN version 8900
2025-04-24 15:49:07.986716: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:630] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2025-04-24 15:49:08.107215: I tensorflow/compiler/xla/servic

Epoch 1: val_loss improved from inf to 6.47542, saving model to model3_files/pib_summarizer_no_attention.keras
Epoch 2/30
Epoch 2: val_loss improved from 6.47542 to 5.04802, saving model to model3_files/pib_summarizer_no_attention.keras
Epoch 3/30
Epoch 3: val_loss improved from 5.04802 to 4.37027, saving model to model3_files/pib_summarizer_no_attention.keras
Epoch 4/30
Epoch 4: val_loss improved from 4.37027 to 4.03666, saving model to model3_files/pib_summarizer_no_attention.keras
Epoch 5/30

In [None]:
# Block 9: Inference Setup + Execution

# --- Function Definition ---
def setup_inference_models(trained_model_path, num_decoder_layers, decoder_lstm_units):
    """Loads the trained model and creates separate encoder/decoder models for inference."""
    logging.info(f"Setting up inference models from: {trained_model_path}")
    # --- Debugging Info ---
    print("\n--- Inference Setup ---")

    # --- 1. Load the trained model ---
    if not os.path.exists(trained_model_path):
        print(f"Error: Trained model file not found at {trained_model_path}. Cannot setup inference.")
        logging.error(f"Trained model not found: {trained_model_path}")
        print("-" * 30)
        return None, None

    try:
        logging.info("Loading the full trained model...")
        # No custom objects expected based on the architecture
        trained_model = tf.keras.models.load_model(trained_model_path)
        print(f"Successfully loaded trained model from {trained_model_path}")
    except Exception as e:
        logging.error(f"Failed to load trained model from {trained_model_path}: {e}", exc_info=True)
        print(f"Error: Failed to load trained model from {trained_model_path}: {e}")
        print("-" * 30)
        return None, None

    # --- 2. Create Inference Encoder Model ---
    logging.info("Creating inference encoder model...")
    inf_encoder = None
    try:
        # Extract layers by name from the loaded trained_model
        encoder_input_layer = trained_model.get_layer("encoder_inputs").input
        # Get the final concatenated states from the encoder part
        encoder_state_h = trained_model.get_layer("encoder_final_h").output
        encoder_state_c = trained_model.get_layer("encoder_final_c").output
        encoder_states = [encoder_state_h, encoder_state_c]

        inf_encoder = Model(inputs=encoder_input_layer, outputs=encoder_states, name="inference_encoder")
        print("Inference Encoder created.")
        # --- Debugging Info ---
        print("\n--- Inference Encoder Summary ---")
        inf_encoder.summary(line_length=100)
        print(f"Inference Encoder Inputs: {inf_encoder.input_shape}")
        print(f"Inference Encoder Outputs (States): {[s.shape for s in inf_encoder.output]}")

    except Exception as e:
        logging.error(f"Failed to create inference encoder: {e}", exc_info=True)
        print(f"Error: Failed to create inference encoder: {e}")
        inf_encoder = None # Ensure it's None on failure

    # --- 3. Create Inference Decoder Model ---
    # This requires rebuilding the decoder part to handle state inputs/outputs explicitly.
    logging.info("Creating inference decoder model...")
    inf_decoder = None
    try:
        # Decoder Inputs: Single token + state from previous step for EACH LSTM layer
        decoder_input_single_token = Input(shape=(1,), name="inf_decoder_input_token")

        # State inputs: one pair (h, c) for EACH decoder LSTM layer
        decoder_state_inputs = []
        for i in range(num_decoder_layers):
            state_h = Input(shape=(decoder_lstm_units,), name=f'inf_decoder_input_h_{i}')
            state_c = Input(shape=(decoder_lstm_units,), name=f'inf_decoder_input_c_{i}')
            decoder_state_inputs.extend([state_h, state_c])

        # Embedding layer (reuse weights from trained model)
        decoder_embedding_layer = trained_model.get_layer("decoder_embedding")
        # Need to ensure the inference embedding layer has the same config
        inf_embedding_layer = Embedding(decoder_embedding_layer.input_dim,
                                        decoder_embedding_layer.output_dim,
                                        mask_zero=decoder_embedding_layer.mask_zero, # Usually False or not needed for single token input
                                        name="inf_decoder_embedding")
        inf_embedding_layer.build(input_shape=(None, 1)) # Build layer with expected input shape
        inf_embedding_layer.set_weights(decoder_embedding_layer.get_weights())
        decoder_embeddings = inf_embedding_layer(decoder_input_single_token)

        # Recreate LSTM layers, setting return_state=True and loading weights
        current_sequence = decoder_embeddings
        decoder_state_outputs = [] # To collect output states from each layer

        for i in range(num_decoder_layers):
            # Get the corresponding trained LSTM layer
            trained_lstm_layer = trained_model.get_layer(f"decoder_lstm_{i+1}")

            # Create a new LSTM layer configured for inference
            inf_decoder_lstm = LSTM(decoder_lstm_units, return_sequences=True, return_state=True, name=f"inf_decoder_lstm_{i+1}")

            # Build the layer before setting weights (important!)
            # Determine input shape for build: sequence comes from previous layer or embedding, states come from input list
            if i == 0:
                 lstm_input_shape = [decoder_embeddings.shape, decoder_state_inputs[0].shape, decoder_state_inputs[1].shape]
            else:
                 # Input sequence shape is output sequence shape of previous LSTM
                 lstm_input_shape = [current_sequence.shape, decoder_state_inputs[i*2].shape, decoder_state_inputs[i*2+1].shape]
            # inf_decoder_lstm.build(input_shape=lstm_input_shape) # Build may not be needed if called directly

            # Call the layer, passing the states for this specific layer
            # States for layer 'i' are at indices i*2 and i*2+1 in decoder_state_inputs
            current_sequence, state_h_out, state_c_out = inf_decoder_lstm(
                current_sequence, initial_state=decoder_state_inputs[i*2 : i*2+2]
            )

            # Set weights *after* the layer has been called/built implicitly
            inf_decoder_lstm.set_weights(trained_lstm_layer.get_weights())


            # Store the output states for this layer
            decoder_state_outputs.extend([state_h_out, state_c_out])

        # Final Dense layer (reuse weights)
        output_dense_layer = trained_model.get_layer("output_layer")
        # Create a new Dense layer with same config
        inf_dense_layer = Dense(output_dense_layer.units,
                                activation=output_dense_layer.activation,
                                name="inf_output_layer")
        # Build layer before setting weights
        inf_dense_layer.build(input_shape=current_sequence.shape)
        # Set weights
        inf_dense_layer.set_weights(output_dense_layer.get_weights())
        decoder_dense_outputs = inf_dense_layer(current_sequence) # Shape: (batch, 1, vocab_size)

        # Define the inference decoder model
        inf_decoder = Model(
             inputs=[decoder_input_single_token] + decoder_state_inputs,
             outputs=[decoder_dense_outputs] + decoder_state_outputs, # Return logits + ALL output states
             name="inference_decoder"
        )

        print("Inference Decoder created.")
        # --- Debugging Info ---
        print("\n--- Inference Decoder Summary ---")
        inf_decoder.summary(line_length=120)
        print(f"Inference Decoder Inputs: {[inp.name + ': ' + str(inp.shape) for inp in inf_decoder.inputs]}")
        print(f"Inference Decoder Outputs: {[out.name + ': ' + str(out.shape) for out in inf_decoder.outputs]}")

    except Exception as e:
        logging.error(f"Failed to create inference decoder: {e}", exc_info=True)
        print(f"Error: Failed to create inference decoder: {e}")
        print("Hint: Rebuilding the decoder for inference with correct state flow and weight loading is complex.")
        inf_decoder = None # Ensure it's None on failure

    finally:
        print("-" * 30)

    return inf_encoder, inf_decoder


# --- Execution for Block 9 ---
inference_encoder = None
inference_decoder = None

# Load the best model saved by ModelCheckpoint during training
if os.path.exists(MODEL_SAVE_PATH):
    inference_encoder, inference_decoder = setup_inference_models(
        MODEL_SAVE_PATH,
        num_decoder_layers=NUM_DECODER_LAYERS, # Pass necessary dimensions
        decoder_lstm_units=DECODER_LSTM_UNITS
    )
    if inference_encoder is None or inference_decoder is None:
        print("Inference model setup failed.")
else:
    print(f"Skipping inference setup: Trained model not found at {MODEL_SAVE_PATH}")
    print("Ensure that training (Block 8) ran successfully and saved the model.")

In [None]:
# Block 11: Evaluation (ROUGE Score) + Execution

# --- Function Definitions ---
def calculate_rouge_scores(predictions, references):
    """Calculates ROUGE-1, ROUGE-2, ROUGE-L F1 scores."""
    logging.info(f"Calculating ROUGE scores for {len(predictions)} pairs...")
    # --- Debugging Info ---
    print("\n--- ROUGE Score Calculation ---")

    if not isinstance(predictions, list) or not isinstance(references, list):
         print("Error: Predictions and References must be lists.")
         return {}
    if not predictions or not references :
        print("Error: Predictions or References list is empty.")
        return {}
    if len(predictions) != len(references):
        print(f"Error: Mismatch in number of predictions ({len(predictions)}) and references ({len(references)}).")
        logging.error("ROUGE calculation failed: Mismatched lengths.")
        print("-" * 30)
        return {}

    # Initialize scorer
    try:
        # Uses stemming by default which is standard practice
        scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    except Exception as e:
        print(f"Error initializing RougeScorer: {e}")
        print("Make sure the 'rouge-score' library and potentially 'nltk' are installed.")
        return {}

    # Aggregate scores
    aggregator = scoring.BootstrapAggregator()

    print(f"Calculating scores for {len(predictions)} prediction-reference pairs...")
    skipped_pairs = 0
    for i, (pred, ref) in enumerate(tqdm(zip(predictions, references), total=len(predictions), desc="ROUGE Calculation")):
        # Ensure inputs are strings
        pred_str = str(pred) if pred is not None else ""
        ref_str = str(ref) if ref is not None else ""

        # Handle potential empty strings (assign zero score)
        if not pred_str or not ref_str:
            skipped_pairs += 1
            # Create a zero score dictionary matching RougeScorer output structure
            zero_scores = {
                'rouge1': scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0),
                'rouge2': scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0),
                'rougeL': scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0),
            }
            aggregator.add_scores(zero_scores)
        else:
            try:
                 scores = scorer.score(target=ref_str, prediction=pred_str)
                 aggregator.add_scores(scores)
            except Exception as e:
                 print(f"\nError scoring pair {i}: Pred='{pred_str[:50]}...', Ref='{ref_str[:50]}...'. Error: {e}")
                 skipped_pairs += 1
                 # Add zero scores on error as well
                 zero_scores = {
                     'rouge1': scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0),
                     'rouge2': scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0),
                     'rougeL': scoring.Score(precision=0.0, recall=0.0, fmeasure=0.0),
                 }
                 aggregator.add_scores(zero_scores)


    if skipped_pairs > 0:
        print(f"Warning: Skipped or encountered errors for {skipped_pairs} pairs during ROUGE calculation.")

    # Compute aggregated results (confidence intervals available via result.low/mid/high)
    try:
        result = aggregator.aggregate()
    except ValueError as e:
         print(f"Error during ROUGE aggregation: {e}")
         print("This might happen if all pairs were skipped or resulted in errors.")
         return {}


    # Extract F1 scores (mid represents the point estimate)
    rouge_f1_scores = {key: result[key].mid.fmeasure * 100 for key in result} # Convert to percentage

    logging.info(f"ROUGE calculation complete. Scores: {rouge_f1_scores}")
    # --- Debugging Info ---
    print("\nAggregated ROUGE F1 Scores (%):")
    for key, score in rouge_f1_scores.items():
        print(f"  {key}: {score:.2f}")
    print("-" * 30)

    return rouge_f1_scores


def evaluate_model_on_set(data_df, text_col, target_col, # Use target_col which has <start>/<end>
                          tokenizer_obj, inf_encoder_model, inf_decoder_model,
                          max_len_input, max_len_summary, start_token_id, end_token_id,
                          num_decoder_layers):
    """Generates summaries for a given DataFrame slice and calculates ROUGE scores."""
    logging.info("Starting model evaluation...")
    # --- Debugging Info ---
    print("\n--- Model Evaluation ---")

    if data_df is None or data_df.empty:
        print("Error: Input DataFrame for evaluation is empty. Cannot evaluate.")
        logging.error("Evaluation failed: Input DataFrame empty.")
        return {}

    # Check for required columns
    if text_col not in data_df.columns or target_col not in data_df.columns:
         print(f"Error: Missing required columns in evaluation DataFrame: need '{text_col}' and '{target_col}'. Found: {data_df.columns.tolist()}")
         logging.error(f"Evaluation failed: Missing columns '{text_col}' or '{target_col}'")
         return {}


    # Prepare input texts (assuming they are already cleaned if df came from processed_df)
    input_texts = data_df[text_col].tolist()

    # Prepare reference summaries: Need to strip <start> and <end> tokens for ROUGE
    reference_summaries_raw = data_df[target_col].tolist()
    reference_summaries_cleaned = []
    print("Cleaning reference summaries (removing <start>/<end>)...")
    for ref in reference_summaries_raw:
        cleaned_ref = str(ref).replace("<start>", "").replace("<end>", "").strip()
        reference_summaries_cleaned.append(cleaned_ref)

    print(f"Prepared {len(input_texts)} input texts and {len(reference_summaries_cleaned)} reference summaries for evaluation.")

    # Generate summaries for all evaluation inputs
    generated_summaries = []
    print("Generating summaries for evaluation set...")
    for text in tqdm(input_texts, desc="Generating Evaluation Summaries"):
        # Use the greedy generation function (replace with beam search if implemented)
        summary = generate_summary_greedy(
            input_text=text, # Pass raw (but cleaned) text
            tokenizer_obj=tokenizer_obj,
            inf_encoder_model=inf_encoder_model,
            inf_decoder_model=inf_decoder_model,
            max_len_input=max_len_input,
            max_len_summary=max_len_summary,
            start_token_id=start_token_id,
            end_token_id=end_token_id,
            num_decoder_layers=num_decoder_layers
        )
        # Handle potential errors from generation
        if summary.startswith("[Error"):
            print(f"Warning: Generation failed for one input, using empty string for evaluation.")
            generated_summaries.append("") # Append empty string on error
        else:
             generated_summaries.append(summary)

    print(f"Generated {len(generated_summaries)} summaries.")
    # --- Debugging: Show a few generated vs reference ---
    print("\nSample Generated vs Reference Summaries (Evaluation Set):")
    num_samples_to_show = min(3, len(generated_summaries))
    for i in range(num_samples_to_show):
        print(f"\n--- Eval Sample {i+1} ---")
        print(f"Input Text (start): {str(input_texts[i])[:200]}...") # Ensure string conversion
        print(f"Reference Summary: {reference_summaries_cleaned[i]}")
        print(f"Generated Summary: {generated_summaries[i]}")


    # Calculate ROUGE scores
    final_rouge_scores = calculate_rouge_scores(generated_summaries, reference_summaries_cleaned)

    logging.info("Evaluation complete.")
    print("-" * 30)
    return final_rouge_scores


# --- Execution for Block 11 ---
rouge_results = {} # Initialize results dictionary

# Evaluate on the validation set created in Block 5.
# We need the DataFrame slice corresponding to the validation data.
# We need the original 'processed_df' before shuffling for consistent indices,
# OR reconstruct the val df from the numpy arrays if 'processed_df' was modified/deleted.

# Let's try to reconstruct from numpy arrays (assuming they exist from Block 5)
if ('encoder_input_val' in locals() and encoder_input_val.size > 0 and
    'decoder_target_val' in locals() and decoder_target_val.size > 0 and # Use decoder target for reference summary IDs
    'tokenizer' in locals() and tokenizer):

    print("\nReconstructing validation DataFrame subset for evaluation...")
    # Detokenize encoder inputs to get the original 'cleaned_text' (approx)
    # This might not be perfect due to truncation/padding
    # For accurate evaluation, it's better to save the test split *before* tokenization.
    # Let's proceed with detokenization, acknowledging potential inaccuracies.

    # Detokenize the validation encoder inputs - this is computationally expensive!
    # val_texts_detokenized = [detokenize_sequences(seq, tokenizer) for seq in tqdm(encoder_input_val, desc="Detokenizing val inputs")]
    # print("Warning: Evaluating on detokenized validation inputs, which might differ slightly from original text due to tokenization limits.")

    # Alternative: If 'processed_df' still exists and indices were tracked/saved, use that.
    # Assuming we don't have the original text readily available, we'll use the reference summaries.

    # Detokenize the validation *target* summaries (which have start/end)
    val_summaries_detokenized = [detokenize_sequences(seq, tokenizer) for seq in tqdm(decoder_target_val, desc="Detokenizing val targets")]

    # Create a temporary DataFrame for evaluation
    # We need input texts. Since detokenizing encoder inputs is slow/lossy,
    # let's *skip* providing the original text source for evaluation printouts,
    # and focus only on comparing generated vs reference summaries.
    # We will generate summaries based on the *tokenized* validation encoder input.

    print("Generating summaries for validation set using tokenized inputs...")
    generated_val_summaries = []
    skipped_val_gen = 0

    # Check if inference models are ready
    if ('inference_encoder' in locals() and inference_encoder and
        'inference_decoder' in locals() and inference_decoder):

        # Need to iterate through validation *encoder inputs* and generate
        for i, enc_input_seq in enumerate(tqdm(encoder_input_val, desc="Generating Val Summaries")):
             # We need to feed the *encoder input sequence* to the encoder,
             # then use the states to start greedy search.
             # The `generate_summary_greedy` expects raw text, so we adapt the logic here.

             # 1. Encode the input sequence
             try:
                  # Reshape sequence for batch dimension (1, max_len_input)
                  input_seq_batch = np.expand_dims(enc_input_seq, axis=0)
                  initial_encoder_states = inference_encoder.predict(input_seq_batch, verbose=0)
                  # Initialize decoder states (same logic as in generate_summary_greedy)
                  decoder_states_value = initial_encoder_states * NUM_DECODER_LAYERS
             except Exception as e:
                  print(f"Error encoding val sequence {i}: {e}")
                  generated_val_summaries.append("[Error: Encoding failed]")
                  skipped_val_gen += 1
                  continue

             # 2. Initialize Decoder Input
             target_seq = np.array([[START_ID]])

             # 3. Greedy Decoding Loop (Simplified version of the function's loop)
             decoded_tokens_val = []
             for step in range(MAX_LEN_SUMMARY):
                 try:
                      decoder_inputs = [target_seq] + decoder_states_value
                      decoder_outputs = inference_decoder.predict(decoder_inputs, verbose=0)
                      output_tokens_logits = decoder_outputs[0]
                      new_states = decoder_outputs[1:]
                 except Exception as e:
                      print(f"Error during val decoder prediction step {step+1} for sequence {i}: {e}")
                      decoded_tokens_val.append(END_ID) # Try to end gracefully
                      break

                 sampled_token_id = np.argmax(output_tokens_logits[0, -1, :])
                 if sampled_token_id == END_ID:
                      break
                 if sampled_token_id != START_ID and sampled_token_id != PAD_ID:
                    decoded_tokens_val.append(sampled_token_id)

                 target_seq = np.array([[sampled_token_id]])
                 if len(new_states) == len(decoder_states_value):
                    decoder_states_value = new_states
                 else:
                    print(f"State mismatch during val generation {i}, step {step+1}")
                    break
                 if len(decoded_tokens_val) >= MAX_LEN_SUMMARY:
                     break

             # 4. Detokenize
             summary = detokenize_sequences(np.array(decoded_tokens_val), tokenizer)
             generated_val_summaries.append(summary)

        if skipped_val_gen > 0:
            print(f"Skipped generation for {skipped_val_gen} validation samples due to errors.")

        # --- Debugging: Show a few generated vs reference ---
        print("\nSample Generated vs Reference Summaries (Validation Set):")
        num_samples_to_show = min(3, len(generated_val_summaries))
        for i in range(num_samples_to_show):
            print(f"\n--- Val Sample {i+1} ---")
            # print(f"Input Text (Detokenized - Approx): {val_texts_detokenized[i][:200]}...") # If available
            print(f"Reference Summary (Detokenized): {val_summaries_detokenized[i]}")
            print(f"Generated Summary: {generated_val_summaries[i]}")

        # Calculate ROUGE scores using the generated summaries and the detokenized references
        rouge_results = calculate_rouge_scores(generated_val_summaries, val_summaries_detokenized)

        print("\n--- Final Evaluation ROUGE Scores (on Validation Set) ---")
        print(rouge_results)
        print("-" * 30)

    else:
        print("Skipping validation set generation: Missing inference models.")

elif not ('encoder_input_val' in locals() and encoder_input_val.size > 0):
     print("Skipping evaluation: Validation input data not found (Block 5 might have failed or was not run).")
elif not ('decoder_target_val' in locals() and decoder_target_val.size > 0):
      print("Skipping evaluation: Validation target data not found (Block 5 might have failed or was not run).")
elif not ('tokenizer' in locals() and tokenizer):
      print("Skipping evaluation: Tokenizer not found (Block 4 might have failed or was not run).")