In [None]:
# Step 1: Load IMDb dataset
from datasets import load_dataset

# Fetch the IMDb dataset and pull out its two splits in one step.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]

# Step 2: Load Pretrained BERT Model and Tokenizer
# NOTE(review): BertTokenizer is not imported in the visible part of this
# file - confirm `from transformers import BertTokenizer` runs before this.
# model_name is reused further down when the classification model is loaded,
# so the tokenizer and the model come from the same checkpoint.
model_name = "bert-base-uncased"  # You can use other models like 'bert-large-uncased', 'bert-base-cased', etc.
tokenizer = BertTokenizer.from_pretrained(model_name)

# Step 3: Preprocess the Dataset (Tokenization)
def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The module-level ``tokenizer`` is called with truncation enabled and
    padding set to ``"max_length"`` with ``max_length=512``.

    Args:
        examples: A mapping with a ``"text"`` entry holding the raw reviews
            (this is what ``Dataset.map(..., batched=True)`` passes in below).

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Run the tokenizer over both splits, train first and then test, in batched
# mode so preprocess_function receives many examples per call.
train_data, test_data = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, test_data)
)

# Step 4: Drop columns the model does not consume
# Only the raw review text is removed. "label" has to stay: it supplies the
# training/evaluation targets below (removing it, as this step previously
# did, makes every later reference to the "label" column fail).
train_data = train_data.remove_columns(["text"])
test_data = test_data.remove_columns(["text"])

# Step 5: Create TensorFlow Dataset
# Dataset.to_tf_dataset builds a batched tf.data.Dataset of
# (features, labels) pairs: `columns` become the feature dict fed to the
# model and `label_cols` becomes the target. This replaces
# from_tensor_slices(dict(...)), because a datasets.Dataset is not a
# column -> tensor mapping that dict() can consume.
#
# shuffle=True for training follows the to_tf_dataset documentation
# (shuffle for training, not for evaluation).
# NOTE(review): the IMDb train split appears to be stored grouped by label -
# confirm; if so, unshuffled batches would each contain a single class.
train_dataset = train_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    batch_size=32,
    shuffle=True,
)

# Evaluation order does not affect the metrics, so the test split is left
# in its original order.
test_dataset = test_data.to_tf_dataset(
    columns=["input_ids", "attention_mask"],
    label_cols=["label"],
    batch_size=32,
    shuffle=False,
)

# Step 6: Load BERT Model for Sequence Classification
# NOTE(review): neither `tf` nor TFBertForSequenceClassification is imported
# in the visible part of this file - confirm `import tensorflow as tf` and
# `from transformers import TFBertForSequenceClassification` run before this.
# The same checkpoint name as the tokenizer is used, with num_labels=2.
model = TFBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Step 7: Compile the Model
# from_logits=True: predict_sentiment below reads `outputs.logits`, i.e. the
# model output is treated as raw logits rather than probabilities.
metric = tf.keras.metrics.SparseCategoricalAccuracy()
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=[metric],
)

# Step 8: Train the Model
# train_dataset already yields batches of 32 (set where it is built), and
# Keras rejects an explicit batch_size for dataset inputs, so fit() only
# receives the number of epochs.
model.fit(train_dataset, epochs=3)

# Step 9: Evaluate the Model
# The model was compiled with a single metric, so the code reads results[0]
# as the loss and results[1] as that metric (sparse categorical accuracy).
results = model.evaluate(test_dataset)
print(f"Test Loss: {results[0]}")
print(f"Test Accuracy: {results[1]}")

# Step 10: Make Predictions (optional)
def predict_sentiment(texts):
    """Label review texts as ``"positive"`` or ``"negative"``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        texts: The review text(s) to classify, passed straight to the
            tokenizer (the example in this file passes a list of strings).

    Returns:
        A list with one label per prediction: ``"positive"`` when the
        arg-max class index is 1, otherwise ``"negative"``. An empty list or
        tuple of texts yields an empty list.
    """
    # An empty batch has nothing to classify; return early instead of handing
    # an empty list to the tokenizer and the model.
    if isinstance(texts, (list, tuple)) and not texts:
        return []

    # padding=True pads within this batch only; max_length=512 matches the
    # limit used when tokenizing the training data.
    inputs = tokenizer(texts, return_tensors="tf", truncation=True, padding=True, max_length=512)
    logits = model(inputs).logits

    # Highest-scoring class index per input row.
    class_ids = tf.argmax(logits, axis=-1).numpy()
    return ["positive" if class_id == 1 else "negative" for class_id in class_ids]

# Example prediction
# Two hand-written reviews used as a quick check of predict_sentiment.
sample_reviews = [
    "This movie was amazing! The plot and acting were fantastic.",
    "I hated this movie. The storyline was boring and predictable."
]

# predict_sentiment returns one "positive"/"negative" string per prediction;
# the resulting list is printed as-is.
predictions = predict_sentiment(sample_reviews)
print(predictions)
