In [1]:
from datasets import load_dataset
dataset = load_dataset("stanfordnlp/sst2", cache_dir='./datasets')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
training_dataset = dataset["train"]
validation_dataset = dataset["validation"]

In [3]:
training_dataset_sentence = training_dataset['sentence']
training_dataset_label = training_dataset['label']
validation_dataset_sentence = validation_dataset['sentence']
validation_dataset_label = validation_dataset['label']

In [4]:
import pandas as pd
training_dataframe = pd.DataFrame.from_dict({
  "sentence": training_dataset_sentence,
  "label": training_dataset_label
})
validation_dataframe = pd.DataFrame.from_dict({
  "sentence": validation_dataset_sentence,
  "label": validation_dataset_label
})

In [5]:
import re
from bs4 import BeautifulSoup
import unicodedata
def cleandata(text):
  soup = BeautifulSoup(text)
  text = soup.getText()
  text = unicodedata.normalize("NFKC", text)
  text = re.sub(r"[\u0000-\u001F\u007F]+", "", text)
  text.strip()
  return str(text)

In [6]:
training_dataframe['clean'] = training_dataframe['sentence'].apply(cleandata).tolist()
validation_dataframe['clean'] = validation_dataframe['sentence'].apply(cleandata).tolist()

In [7]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", cache_dir="./tokenizer")

In [8]:
print(type(training_dataframe['clean']))
training_encoded = tokenizer(
  training_dataframe['clean'].tolist(),
  padding=True,
  truncation=True,
  max_length=128,
  return_tensors='tf'
)
validation_encoded = tokenizer(
  validation_dataframe['clean'].tolist(),
  padding=True,
  truncation=True,
  max_length=128,
  return_tensors='tf'
)

<class 'pandas.core.series.Series'>


2025-05-02 00:15:37.509321: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-05-02 00:15:37.509339: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-02 00:15:37.509343: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-02 00:15:37.509356: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-02 00:15:37.509366: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [9]:
index = 0
print(training_dataframe['sentence'][index])
print(training_encoded["input_ids"][index])
print(training_encoded['token_type_ids'][index])
print(training_encoded['attention_mask'][index])
print(tokenizer.decode(training_encoded["input_ids"][index]))

hide new secretions from the parental units 
tf.Tensor(
[  101  5342  2047  3595  8496  2013  1996 18643  3197   102     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0], shape=(66,), dtype=int32)
tf.Tensor(
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(66,), dtype=int32)
tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(66,), dtype=int32)
[CLS] hide new secretions from the parental units [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD

In [10]:
from transformers import TFBertForSequenceClassification
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", cache_dir="./model", num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from transformers import create_optimizer

batch_size = 32
epochs=4
train_data_size = len(training_dataset)
steps_per_epoch = train_data_size // batch_size
num_train_steps = steps_per_epoch * epochs 
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
  init_lr=2e-5,
  num_train_steps= num_train_steps,
  num_warmup_steps=num_warmup_steps,
  weight_decay_rate=0.01
)


In [12]:
from tf_keras.losses import SparseCategoricalCrossentropy

model.compile(
  optimizer=optimizer,
  loss=SparseCategoricalCrossentropy(from_logits=True),
  metrics=['accuracy'], 
)

In [13]:
from tf_keras.callbacks import TensorBoard, ModelCheckpoint, BackupAndRestore, RemoteMonitor
tensorboard_callbacks = TensorBoard(
  log_dir="./logs", histogram_freq=1, write_graph=True,write_images=True,write_steps_per_second=True,
   
)
model_callbacks = ModelCheckpoint(
  filepath="./assets/checkpoint.keras",
  save_best_only=True,
  monitor="val_accuracy",
  mode="max"
)
backup_callbacks = BackupAndRestore(
  backup_dir="./backup/",
  save_freq=1,
  save_before_preemption=True
)

In [14]:
import tensorflow as tf
def batch_to_tf(encodings, labels):
    # Convert BatchEncoding to dict of NumPy arrays
    encodings['labels'] = tf.convert_to_tensor(labels)

    dataset = tf.data.Dataset.from_tensor_slices(dict(encodings))
    return dataset.batch(32).prefetch(tf.data.AUTOTUNE)


In [15]:
training_tf = batch_to_tf(training_encoded, training_dataframe["label"])
validation_tf = batch_to_tf(validation_encoded, validation_dataframe['label'])

In [18]:
history = model.fit(
  training_tf,
  validation_data=validation_tf,
  epochs=3,
  batch_size=32,
  callbacks=[backup_callbacks, tensorboard_callbacks]
)
model.save_pretrained("./pretrained/model")
tokenizer.save_pretrained("./pretrained/tokenizer")
model.save("./pretrained/all")

Epoch 1/3

KeyboardInterrupt: 