# In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Dataset hyperparameters
max_features = 5000  # Vocabulary cap passed to imdb.load_data as num_words
maxlen = 200  # Every review is padded/truncated to this many tokens

# Load the IMDb reviews (already encoded as lists of integer word ids)
(X_train, y_train), (X_valid, y_valid) = imdb.load_data(num_words=max_features)

# Bring both splits to a uniform length so they can be batched as 2-D arrays
X_train, X_valid = (
    pad_sequences(reviews, maxlen=maxlen) for reviews in (X_train, X_valid)
)

# Transformer Encoder Layer
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout_rate):
    """Apply one post-norm Transformer encoder block to ``inputs``.

    The block is: multi-head self-attention -> dropout -> residual add ->
    layer norm, followed by a two-layer feed-forward network -> dropout ->
    residual add -> layer norm.

    Args:
        inputs: Symbolic Keras tensor to encode. Its last dimension is reused
            as the output width of the feed-forward network so the residual
            additions line up.
        head_size: ``key_dim`` handed to ``MultiHeadAttention``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) feed-forward layer.
        dropout_rate: Rate used by both ``Dropout`` layers.

    Returns:
        The output tensor of the second layer normalization.
    """
    model_dim = inputs.shape[-1]

    # Self-attention sub-block: the sequence attends to itself
    # (query and value are both ``inputs``).
    self_attention = layers.MultiHeadAttention(key_dim=head_size, num_heads=num_heads)
    attended = self_attention(inputs, inputs)
    attended = layers.Dropout(dropout_rate)(attended)
    normed = layers.LayerNormalization(epsilon=1e-6)(inputs + attended)

    # Position-wise feed-forward sub-block: expand to ff_dim, then project
    # back to the input width before the second residual connection.
    hidden = layers.Dense(ff_dim, activation='relu')(normed)
    projected = layers.Dense(model_dim)(hidden)
    projected = layers.Dropout(dropout_rate)(projected)

    return layers.LayerNormalization(epsilon=1e-6)(normed + projected)

# Build the Transformer model
def _sinusoidal_position_encoding(length, depth):
    """Return a ``(length, depth)`` float32 table of sinusoidal position encodings.

    Even columns hold ``sin(pos / 10000**(2i/depth))`` and odd columns hold the
    matching ``cos``; ``i`` is the index of the sin/cos pair. Works for odd
    ``depth`` as well.
    """
    positions = np.arange(length, dtype=np.float64)[:, np.newaxis]
    # Columns 2i and 2i+1 share one frequency, hence the integer division.
    pair_index = np.arange(depth)[np.newaxis, :] // 2
    angles = positions / np.power(10000.0, 2.0 * pair_index / depth)

    table = np.empty((length, depth), dtype=np.float32)
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return table


def build_transformer_model(input_shape, num_classes, head_size=256, num_heads=8, ff_dim=256, num_layers=2, dropout_rate=0.1, vocab_size=None):
    """Build a Transformer-encoder classifier over integer token sequences.

    Pipeline: token embedding (+ positional encoding) -> ``num_layers``
    encoder blocks -> global average pooling over the sequence axis ->
    Dense(64, relu) -> Dense(num_classes, sigmoid).

    Args:
        input_shape: Shape of one sample without the batch axis, e.g.
            ``(maxlen,)``.
        num_classes: Number of sigmoid output units (1 for binary
            classification).
        head_size: Embedding width; also passed to each encoder block as the
            attention ``key_dim``.
        num_heads: Attention heads per encoder block.
        ff_dim: Hidden width of each encoder's feed-forward network.
        num_layers: Number of stacked encoder blocks.
        dropout_rate: Dropout rate inside the encoder blocks.
        vocab_size: Size of the embedding table. Defaults to the module-level
            ``max_features`` when omitted.

    Returns:
        An uncompiled ``Model`` mapping token ids to sigmoid scores.
    """
    if vocab_size is None:
        vocab_size = max_features

    inputs = layers.Input(shape=input_shape)

    # Embedding layer
    x = layers.Embedding(input_dim=vocab_size, output_dim=head_size)(inputs)

    # Self-attention followed by mean pooling is order-agnostic, so without
    # positional information the model cannot tell "not good" from "good not".
    # Inject fixed sinusoidal encodings; embeddings are scaled by sqrt(d) first
    # so the position signal does not swamp the (small) initial token vectors.
    seq_len = inputs.shape[-1]
    if seq_len is not None:
        x = x * (head_size ** 0.5)
        x = x + _sinusoidal_position_encoding(seq_len, head_size)
    # else: sequence length is dynamic, so no static table can be built;
    # fall back to token embeddings only.

    # Stacked Transformer Encoders
    for _ in range(num_layers):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout_rate)

    # Pooling: collapse the sequence axis into a single vector per sample
    x = layers.GlobalAveragePooling1D()(x)

    # Output Layer
    x = layers.Dense(64, activation='relu')(x)
    output = layers.Dense(num_classes, activation='sigmoid')(x)

    return Model(inputs, output)

# Build the classifier: one sigmoid unit for binary sentiment
model = build_transformer_model(input_shape=(maxlen,), num_classes=1)

# Compile with Adam and binary cross-entropy, tracking accuracy
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train, monitoring the held-out split after every epoch
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    epochs=5,
    batch_size=64,
)

# Evaluate silently on the held-out split; score is [loss, accuracy]
score = model.evaluate(X_valid, y_valid, verbose=0)
print(f"Test loss: {score[0]}")
print(f"Test accuracy: {score[1]}")

# Example prediction
sample_review = ["This movie was fantastic! I loved it."]

# Encode the review with the SAME vocabulary the model was trained on.
# Fitting a fresh Tokenizer on the sample text would assign ids 1..n by local
# frequency, which have nothing to do with the IMDb ids seen during training.
from tensorflow.keras.preprocessing.text import Tokenizer

# Conventions used by imdb.load_data with its default arguments:
# id 1 marks the start of a review, id 2 replaces out-of-vocabulary words,
# and real word ranks from get_word_index() are shifted up by 3.
start_char = 1
oov_char = 2
index_from = 3

tokenizer = Tokenizer(num_words=max_features, oov_token="<oov>")
tokenizer.word_index = {
    word: rank + index_from for word, rank in imdb.get_word_index().items()
}
# Unknown words, and words whose id is >= max_features, map to the OOV id.
tokenizer.word_index[tokenizer.oov_token] = oov_char

sequence = [
    [start_char] + word_ids
    for word_ids in tokenizer.texts_to_sequences(sample_review)
]
padded_sequence = pad_sequences(sequence, maxlen=maxlen)

# Predict sentiment (0: negative, 1: positive)
prediction = model.predict(padded_sequence)
# prediction has one row per review and one sigmoid column; index the scalar.
print(f"Prediction: {'Positive' if prediction[0][0] > 0.5 else 'Negative'}")
