In [2]:
# Import necessary libraries
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Sample dataset: list of texts and their corresponding labels (0 for negative, 1 for positive)
texts = [
    "I love this movie!",  # Positive
    "This was a terrible film.",  # Negative
    "What a fantastic experience!",  # Positive
    "I did not enjoy this at all.",  # Negative
]
labels = [1, 0, 1, 0]  # Corresponding labels

# Step 1: Load the pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Step 2: Tokenize and encode the dataset
# Tokenization converts text to input IDs and attention masks
def tokenize_data(texts, labels):
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=32,
        return_tensors='tf'  # Return TensorFlow tensors
    )
    return encodings['input_ids'], encodings['attention_mask'], np.array(labels)

# Tokenize and encode the texts and labels
input_ids, attention_masks, labels = tokenize_data(texts, labels)

# Step 3: Split the dataset into training and validation sets
train_input_ids, val_input_ids, train_attention_masks, val_attention_masks, train_labels, val_labels = train_test_split(
    input_ids, attention_masks, labels, test_size=0.2, random_state=42
)

# Step 4: Create TensorFlow datasets for training and validation
train_dataset = tf.data.Dataset.from_tensor_slices((train_input_ids, train_attention_masks, train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((val_input_ids, val_attention_masks, val_labels))

# Batch and shuffle the training dataset
train_dataset = train_dataset.shuffle(100).batch(2)
val_dataset = val_dataset.batch(2)

# Step 5: Set training parameters
training_args = TrainingArguments(
    output_dir='./results',          # Output directory for model predictions and checkpoints
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=2,   # Batch size for training
    per_device_eval_batch_size=2,    # Batch size for evaluation
    warmup_steps=10,                  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,                # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
)

# Step 6: Define the compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)  # Get the predicted class indices
    return {
        'accuracy': accuracy_score(p.label_ids, preds)
    }

# Step 7: Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Step 8: Train the model
trainer.train()

# Step 9: Evaluate the model
eval_results = trainer.evaluate()
print(f"Validation Accuracy: {eval_results['eval_accuracy']:.4f}")


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got array([3, 0, 2])