In [1]:

import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import re
import string
from transformers import BertTokenizer, TFBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalAccuracy, Precision, Recall
from sklearn.metrics import classification_report
import os

# Load IMDb dataset
dataset, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_data, test_data = dataset["train"], dataset["test"]

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Preprocessing function
def preprocess_text(text):
    text = text.numpy().decode("utf-8").lower()  # Convert to lowercase
    text = re.sub(f"[{string.punctuation}]", "", text)  # Remove punctuation
    return text

# Prepare training and testing data
train_texts, train_labels = [], []
test_texts, test_labels = [], []

for text, label in train_data:
    train_texts.append(preprocess_text(text))
    train_labels.append(label.numpy())

for text, label in test_data:
    test_texts.append(preprocess_text(text))
    test_labels.append(label.numpy())

# Tokenization and Padding
max_length = 200
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=max_length, return_tensors='tf')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=max_length, return_tensors='tf')

train_input_ids = train_encodings["input_ids"]
test_input_ids = test_encodings["input_ids"]

train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Compile model
optimizer = Adam(learning_rate=2e-5, epsilon=1e-08)
loss = SparseCategoricalCrossentropy(from_logits=True)
metrics = [SparseCategoricalAccuracy(name="accuracy"), Precision(name="precision"), Recall(name="recall")]
model.compile()

# Train model
history = model.fit(train_input_ids, train_labels, validation_data=(test_input_ids, test_labels), epochs=3, batch_size=16)

# Model evaluation
y_pred_logits = model.predict(test_input_ids).logits
y_pred = np.argmax(y_pred_logits, axis=1)

print("Classification Report:")
print(classification_report(test_labels, y_pred, target_names=["Negative", "Positive"]))

print("BERT Model Training and Evaluation Complete!")

# Save model and tokenizer
save_directory = "bert_sentiment_model"
os.makedirs(save_directory, exist_ok=True)
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")







All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.








Epoch 1/3






Epoch 2/3
Epoch 3/3
Classification Report:
              precision    recall  f1-score   support

    Negative       0.50      1.00      0.67     12500
    Positive       0.00      0.00      0.00     12500

    accuracy                           0.50     25000
   macro avg       0.25      0.50      0.33     25000
weighted avg       0.25      0.50      0.33     25000

BERT Model Training and Evaluation Complete!


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Model and tokenizer saved to bert_sentiment_model
