In [1]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf aclImdb_v1.tar.gz
!rm -rf aclImdb/train/unsup

--2025-05-05 04:51:10--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2025-05-05 04:51:12 (52.9 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [2]:
# Fine-tuning a Pretrained Backbone with Keras NLP
# --- VERSION WITH OPTIMIZATIONS ---

import tensorflow as tf
import keras_nlp
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
import time
import os
import math # For ceiling division

print("TensorFlow version:", tf.__version__)
print("Keras NLP version:", keras_nlp.__version__)

TensorFlow version: 2.18.0
Keras NLP version: 0.18.1


In [None]:

# --- Configuration ---
# Dataset
dataset_dir = "aclImdb" # Make sure this directory exists
# Model & Sequence Length
# Consider smaller models like "distilbert_base_en_uncased" if memory is tight
# PRESET = "distilbert_base_en_uncased"
PRESET = "bert_base_en_uncased"
# Try reducing sequence length (e.g., 256, 128) if memory is an issue and texts allow
max_sequence_length = 256
# Training Hyperparameters
BATCH_SIZE = 16 # Reduced batch size to save memory (adjust based on your GPU)
INITIAL_EPOCHS = 2 # Epochs for training the head
FINE_TUNE_EPOCHS = 3 # Epochs for fine-tuning the whole model
INITIAL_LR = 1e-4 # Learning rate for initial head training
END_LR = 1e-7 # End learning rate for decay schedule during fine-tuning
# Learning rate for fine-tuning - will be overwritten by schedule if enabled
# FINE_TUNE_LR = 5e-5
WEIGHT_DECAY = 0.01 # Weight decay for AdamW
WARMUP_RATE = 0.1 # Proportion of fine-tuning steps for warmup
ENABLE_LR_SCHEDULE = True # Set to False to use fixed FINE_TUNE_LR
# Mixed Precision (requires compatible GPU - e.g., T4, V100, A100)
ENABLE_MIXED_PRECISION = True

# --- (Optional) Enable Mixed Precision Training ---
if ENABLE_MIXED_PRECISION:
    try:
        print("\n=== Enabling Mixed Precision Training (float16) ===")
        keras.mixed_precision.set_global_policy('mixed_float16')
        print("Mixed precision policy set to 'mixed_float16'.")
        # Note: Ensure your GPU supports float16 operations for performance benefits.
    except Exception as e:
        print(f"Could not enable mixed precision: {e}. Continuing with float32.")
        ENABLE_MIXED_PRECISION = False # Fallback if policy setting fails

# --- 1. Load and Prepare Dataset ---
print("\n=== 1. Loading and Preparing Dataset ===")
if not os.path.exists(dataset_dir):
    print(f"ERROR: Dataset directory '{dataset_dir}' not found.")
    print("Please download the IMDB dataset (aclImdb_v1.tar.gz) and extract it.")
    raise FileNotFoundError(f"Dataset directory '{dataset_dir}' not found.")

In [None]:

# Load raw text data directly into tf.data.Dataset objects
train_ds, validation_ds = keras.utils.text_dataset_from_directory(
    dataset_dir,
    batch_size=BATCH_SIZE, # Use config variable
    validation_split=0.2,
    subset="both",
    seed=42,
)

# Get dataset sizes for LR schedule calculation (more robust than hardcoding)
try:
    num_train_examples = tf.data.experimental.cardinality(train_ds).numpy() * BATCH_SIZE
    num_val_examples = tf.data.experimental.cardinality(validation_ds).numpy() * BATCH_SIZE
    # Note: Cardinality might be infinite/unknown if dataset ops aren't fully defined yet.
    # If it fails, fall back to info printed during loading (usually reliable here)
    if num_train_examples <= 0: raise ValueError("Cardinality failed")
    print(f"Detected {num_train_examples} training examples, {num_val_examples} validation examples.")
except:
    print("Could not determine dataset size via cardinality, using loading info.")
    # Use info from loading printout (update if your split/dataset changes)
    num_train_examples = 40004
    num_val_examples = 10001
    print(f"Using assumed sizes: {num_train_examples} training, {num_val_examples} validation.")


print(f"Training dataset specs: {train_ds.element_spec}")
print(f"Validation dataset specs: {validation_ds.element_spec}")


In [None]:

# Apply caching and prefetching for performance
train_ds = train_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
validation_ds = validation_ds.cache().prefetch(buffer_size=tf.data.AUTOTUNE)

# --- 2. Load Pretrained Backbone & Preprocessor ---
print(f"\n=== 2. Loading Pretrained Backbone & Preprocessor ({PRESET}) ===")

# Use specific preprocessor/backbone based on PRESET
if "distilbert" in PRESET:
    PreprocessorClass = keras_nlp.models.DistilBertPreprocessor
    BackboneClass = keras_nlp.models.DistilBertBackbone
elif "bert" in PRESET:
    PreprocessorClass = keras_nlp.models.BertPreprocessor
    BackboneClass = keras_nlp.models.BertBackbone
else:
    raise ValueError(f"Unsupported preset type: {PRESET}")

preprocessor = PreprocessorClass.from_preset(
    PRESET,
    sequence_length=max_sequence_length,
)
backbone = BackboneClass.from_preset(PRESET)



In [None]:

# --- 3. Build the Fine-tuning Model ---
print("\n=== 3. Building the Fine-tuning Model ===")

inputs = keras.Input(shape=(), dtype="string", name="text_input")
preprocessed_inputs = preprocessor(inputs)
backbone_outputs = backbone(preprocessed_inputs)

# Use pooled_output if available and suitable (often pre-processed for classification)
# Otherwise, fallback to CLS token from sequence_output
if "pooled_output" in backbone_outputs:
     # Check if pooled_output is valid (might be None for some models/configs)
    if backbone_outputs["pooled_output"] is not None:
        print("Using 'pooled_output' from backbone.")
        sequence_representation = backbone_outputs["pooled_output"]
    else: # Fallback if pooled_output is None
         print("Using 'sequence_output[:, 0, :]' (CLS token) as pooled_output was None.")
         sequence_representation = backbone_outputs["sequence_output"][:, 0, :]
else: # Fallback if key doesn't exist
    print("Using 'sequence_output[:, 0, :]' (CLS token).")
    sequence_representation = backbone_outputs["sequence_output"][:, 0, :]

# Add dropout and classification head
intermediate_output = keras.layers.Dropout(0.1)(sequence_representation)
# Use float32 for the final Dense layer when using mixed precision for stability
outputs = keras.layers.Dense(1, activation="sigmoid", name="classifier_head", dtype=tf.float32)(intermediate_output)

model = keras.Model(inputs, outputs)
model.summary()



In [None]:
# OPTIMIZATION: Use AdamW optimizer
optimizer_tuned = keras.optimizers.AdamW(
    learning_rate=final_learning_rate, # Use schedule or fixed LR
    weight_decay=WEIGHT_DECAY,
    # clipnorm=1.0 # Optional: gradient clipping
)

model.compile(
    optimizer=optimizer_tuned,
    loss=loss_fn, # Reuse loss from before
    metrics=["accuracy"],
    jit_compile=False # Keep False
)

print("\nContinuing training (fine-tuning) with unfrozen backbone...")
total_epochs = INITIAL_EPOCHS + FINE_TUNE_EPOCHS
start_time = time.time()
history_unfrozen = model.fit(
    train_ds,
    epochs=total_epochs,
    initial_epoch=INITIAL_EPOCHS, # Start counting from here
    validation_data=validation_ds,
)
end_time = time.time()
print(f"Time taken for fine-tuning: {end_time - start_time:.2f} seconds")


In [None]:

# --- 4. Initial Training with Frozen Backbone ---
print("\n=== 4. Training with Frozen Backbone (Warm-up) ===")
print("Initially freezing the backbone layers...")
backbone.trainable = False

# OPTIMIZATION: Use AdamW optimizer
optimizer_frozen = keras.optimizers.AdamW(
    learning_rate=INITIAL_LR,
    weight_decay=WEIGHT_DECAY,
    # clipnorm=1.0 # Optional: gradient clipping
)
# Loss needs to be float32 when using mixed precision
loss_fn = keras.losses.BinaryCrossentropy(from_logits=False) # Sigmoid applied

model.compile(
    optimizer=optimizer_frozen,
    loss=loss_fn,
    metrics=["accuracy"],
    jit_compile=False # Keep False to avoid XLA string issues
)

print(f"Training head for {INITIAL_EPOCHS} epochs...")
start_time = time.time()
history_frozen = model.fit(
    train_ds,
    epochs=INITIAL_EPOCHS,
    validation_data=validation_ds,
)
end_time = time.time()
print(f"Time taken for training with frozen backbone: {end_time - start_time:.2f} seconds")


# --- 5. Fine-tuning the Entire Model ---
print("\n=== 5. Fine-tuning the Entire Model ===")
print("Unfreezing the backbone layers...")
backbone.trainable = True

# OPTIMIZATION: Calculate steps for LR schedule
steps_per_epoch = math.ceil(num_train_examples / BATCH_SIZE)
fine_tune_steps = steps_per_epoch * FINE_TUNE_EPOCHS
warmup_steps = int(WARMUP_RATE * fine_tune_steps)
print(f"Steps per epoch: {steps_per_epoch}")
print(f"Total fine-tune steps: {fine_tune_steps}")
print(f"Warmup steps: {warmup_steps}")

if ENABLE_LR_SCHEDULE:
    print("Using learning rate schedule with linear decay and warmup.")
    # Start LR is the peak LR after warmup
    start_lr_schedule = INITIAL_LR # Or use FINE_TUNE_LR if defined separately

    # Create Polynomial Decay schedule (often used, approximates linear)
    lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
        initial_learning_rate=start_lr_schedule,
        decay_steps=fine_tune_steps - warmup_steps, # Steps after warmup
        end_learning_rate=END_LR,
        power=1.0, # Power 1.0 is linear decay
    )

    # Apply warmup phase
    if warmup_steps > 0:
        warmup_schedule = tf.keras.optimizers.schedules.PolynomialDecay(
             initial_learning_rate=1e-9, # Start from near zero
             decay_steps=warmup_steps,
             end_learning_rate=start_lr_schedule,
             power=1.0
        )
        # Combine warmup and decay
        # Note: Keras schedules are based on step counts directly
        # We need to manually switch based on step or use a custom schedule wrapper
        # For simplicity here, we might just use AdamW's internal schedule capabilities if simpler,
        # or accept that PolynomialDecay starts decay immediately after warmup peak.
        # A simpler approximation using AdamW's beta_1 for warmup isn't standard.
        # Let's stick to the PolynomialDecay but acknowledge the warmup integration isn't perfect via built-ins easily.
        # A more robust way involves custom schedule objects or optimizers with built-in warmup+decay.
        # Let's just use the decay part for now, starting from peak LR, assuming warmup happens implicitly or via optimizer properties
        # Reverting to a simpler setup for PolynomialDecay post-warmup:
        lr_schedule_post_warmup = tf.keras.optimizers.schedules.PolynomialDecay(
             initial_learning_rate=start_lr_schedule, # Peak LR
             decay_steps=fine_tune_steps - warmup_steps,
             end_learning_rate=END_LR,
             power=1.0
        )
        # We'll pass this to AdamW. Warmup needs custom handling or specific optimizer support usually.
        # Keras's AdamW doesn't have built-in linear warmup + decay schedule.
        # For demonstration, we proceed with decay from peak LR. Actual warmup requires more code.
        print(f"WARNING: Using PolynomialDecay starting from peak LR ({start_lr_schedule}). Proper warmup requires custom schedule.")
        final_learning_rate = lr_schedule_post_warmup # Use the decay schedule
else:
    print(f"Using fixed learning rate: {INITIAL_LR}") # Use initial LR as FINE_TUNE_LR wasn't set
    final_learning_rate = INITIAL_LR


In [None]:


# --- 6. Visualize Training History ---
# (Plotting code remains the same as previous version)
print("\n=== 6. Visualizing Training History ===")
history = {}
try:
    # Combine histories only if both phases ran successfully
    history["accuracy"] = history_frozen.history["accuracy"] + history_unfrozen.history["accuracy"]
    history["val_accuracy"] = history_frozen.history["val_accuracy"] + history_unfrozen.history["val_accuracy"]
    history["loss"] = history_frozen.history["loss"] + history_unfrozen.history["loss"]
    history["val_loss"] = history_frozen.history["val_loss"] + history_unfrozen.history["val_loss"]
except KeyError as e:
     print(f"Warning: Could not combine histories completely due to missing key: {e}. Plotting might be incomplete.")
     history = history_unfrozen.history # Plot only the fine-tuning part if frozen part failed

if history and "accuracy" in history: # Check if history is usable
    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(history["accuracy"], label="Training Accuracy")
    plt.plot(history["val_accuracy"], label="Validation Accuracy")
    plt.axvline(x=INITIAL_EPOCHS-1, color="r", linestyle="--", label="Start Fine Tuning")
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.ylim([0, 1]) # Standardize y-axis for accuracy
    plt.legend()
    plt.title("Training and Validation Accuracy")

    plt.subplot(1, 2, 2)
    plt.plot(history["loss"], label="Training Loss")
    plt.plot(history["val_loss"], label="Validation Loss")
    plt.axvline(x=INITIAL_EPOCHS-1, color="r", linestyle="--", label="Start Fine Tuning")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.title("Training and Validation Loss")

    plt.tight_layout()
    plt.show()
else:
    print("Skipping plotting as history data is missing or incomplete.")


# --- 7. Final Model Evaluation ---
print("\n=== 7. Final Model Evaluation ===")
loss, accuracy = model.evaluate(validation_ds)
# If using mixed precision, accuracy might be float16, convert for display
accuracy = float(accuracy)
loss = float(loss)
print(f"Final Validation Accuracy: {accuracy:.4f}")
print(f"Final Validation Loss: {loss:.4f}")



In [None]:

# --- 8. Testing on New Examples ---
# (Testing code remains the same)
print("\n=== 8. Testing on New Examples ===")
test_examples = [
    "This movie was fantastic! I really enjoyed the plot and the acting was superb.",
    "What a waste of time. Poor acting, terrible script, and boring storyline.",
    "The movie had good special effects but the story was somewhat confusing.",
    "I fell asleep halfway through the movie.",
    "The characters were well-developed and the dialogue was engaging."
]
predictions = model.predict(tf.constant(test_examples))
for i, (text, pred) in enumerate(zip(test_examples, predictions)):
    pred_value = float(pred[0]) # Ensure float for comparison
    sentiment = "Positive" if pred_value >= 0.5 else "Negative"
    confidence = pred_value if sentiment == "Positive" else 1 - pred_value
    print(f"\nExample {i+1}: {text}")
    print(f"Prediction: {sentiment} (confidence: {confidence:.4f})")


# --- 9. Saving the Fine-tuned Model ---
# (Saving code remains largely the same)
print("\n=== 9. Saving the Fine-tuned Model ===")
model_save_path = f"./{PRESET}_imdb_finetuned.keras" # Dynamic name based on preset
try:
    model.save(model_save_path)
    print(f"Model saved successfully to {model_save_path}")

    # Optional: Test loading the saved model
    print("\nTesting loading the saved model...")
    # When loading mixed precision models, custom objects usually not needed
    # but good practice if complex layers were added
    loaded_model = keras.models.load_model(model_save_path)
    print("Model loaded successfully.")
    loaded_pred = loaded_model.predict(["A quick test after loading."])
    print(f"Test prediction with loaded model: {float(loaded_pred[0][0]):.4f}")

except Exception as e:
    print(f"Error saving or loading model: {e}")


# --- Tips ---
# (Tips remain the same)
print("\n=== Tips for Improving Fine-tuning Results ===")
# ... (tips copied from previous version) ...
print("1. Use longer training with more epochs (monitor validation loss for overfitting)")
print("2. Experiment with different learning rates or learning rate schedules (e.g., linear decay)")
print("3. Try larger batch sizes if your GPU memory allows (or use gradient accumulation)")
print("4. Explore gradual unfreezing (unfreeze layer groups incrementally)")
print("5. Apply techniques like weight decay (AdamW helps here)")
print("6. Use longer maximum sequence length if needed and memory allows")
print("7. Consider data augmentation techniques for text (e.g., back-translation, synonym replacement)")
print("8. Try different pretrained backbones (e.g., RoBERTa, DistilBERT, ELECTRA)")


print(f"\nComplete! Optimized fine-tuning script finished for {PRESET}.")


=== 8. Testing on New Examples ===
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step

Example 1: This movie was fantastic! I really enjoyed the plot and the acting was superb.
Prediction: Positive (confidence: 0.5115)

Example 2: What a waste of time. Poor acting, terrible script, and boring storyline.
Prediction: Positive (confidence: 0.5115)

Example 3: The movie had good special effects but the story was somewhat confusing.
Prediction: Positive (confidence: 0.5115)

Example 4: I fell asleep halfway through the movie.
Prediction: Positive (confidence: 0.5116)

Example 5: The characters were well-developed and the dialogue was engaging.
Prediction: Positive (confidence: 0.5115)

=== 9. Saving the Fine-tuned Model ===
Model saved successfully to ./bert_base_en_uncased_imdb_finetuned.keras

Testing loading the saved model...
Error saving or loading model: Exception encountered when calling BertTextClassifierPreprocessor.call().

[1mCould not automatically infer the