In [1]:
from datasets import load_dataset

# Load the "stanfordnlp/imdb" dataset; files are cached under ./datasets.
ds = load_dataset("stanfordnlp/imdb", cache_dir="./datasets")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub repository id. Later cells load the tokenizer and model from this repo
# and also upload the saved artifacts back to it.
REPOSITORY_ID="tianharjuno/imdb-bert-training"

In [3]:
# Short aliases for the dataset splits used below.
training_split = ds["train"]
# The dataset's "test" split is used as this notebook's validation data.
validation_split = ds["test"]
# NOTE(review): this points at the split named "unsupervised"; confirm it
# carries usable labels before treating it as a held-out test set.
test_split = ds["unsupervised"]

def printline(index):
  """Print the record stored at position `index` of `training_split`."""
  record = training_split[index]
  print(record)

In [4]:
# Pull the text and label columns out of each split.
training_sentences, training_labels = training_split["text"], training_split["label"]
validation_sentences, validation_labels = validation_split["text"], validation_split["label"]

# Pool both splits (training rows first) so they can be re-split later.
sentences = training_sentences + validation_sentences
labels = training_labels + validation_labels

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Re-split the pooled data 70/30. The seed is fixed so the shuffled split is
# the same on every run; without it each kernel restart trains and validates
# on different rows.
training_sentence, validation_sentence, training_label, validation_label = train_test_split(
  sentences, labels, train_size=0.7, shuffle=True, random_state=42
)

print(f"Sentence total: {len(sentences)}\nLabel total: {len(labels)}\nTraining total: {len(training_sentence)}\nValidation total: {len(validation_sentence)}")

Sentence total: 50000
Label total: 50000
Training total: 35000
Validation total: 15000


In [6]:
import pandas as pd

# One two-column frame ("text", "label") per split.
training_df = pd.DataFrame(
  {"text": training_sentence, "label": training_label}
)
validation_df = pd.DataFrame(
  {"text": validation_sentence, "label": validation_label}
)

In [7]:
from bs4 import BeautifulSoup
import re
def cleanParagraph(text):
  """Remove HTML markup from `text` and trim surrounding whitespace.

  Args:
    text: Raw review text that may contain HTML tags.

  Returns:
    The text content as a plain `str`, with tags removed and leading and
    trailing whitespace stripped.
  """
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed (and to avoid bs4's parser-guessing warning).
  soup = BeautifulSoup(text, "html.parser")
  innertext = soup.get_text()
  # Drop anything that still looks like a tag after parsing.
  innertext = re.sub(r'<[^>]+>', '', innertext)
  # str.strip() returns a new string, so its result must be used.
  return str(innertext.strip())

In [8]:
# Add a "cleaned" column to both frames using the same cleaning function.
for frame in (training_df, validation_df):
  frame["cleaned"] = frame["text"].apply(cleanParagraph).tolist()

# Preview the first rows of each frame.
print(training_df.head())
print(validation_df.head())

                                                text  label  \
0  I stumbled onto this movie when I was eBay'ing...      1   
1  So far Miguel Bardem's career it's been one of...      1   
2  Director Fabio Barreto got a strange Academy N...      0   
3  What an utter disappointment! The score of 6,1...      0   
4  Having spent all of her money caring for her t...      0   

                                             cleaned  
0  I stumbled onto this movie when I was eBay'ing...  
1  So far Miguel Bardem's career it's been one of...  
2  Director Fabio Barreto got a strange Academy N...  
3  What an utter disappointment! The score of 6,1...  
4  Having spent all of her money caring for her t...  
                                                text  label  \
0  "200l: A Space Odyssey" is a supremely intrigu...      1   
1  "Eh-heh eh-heh hey, dude - look at these alien...      0   
2  Being the second last of Chaplin's Essanay fil...      1   
3  here was no effort put into Valentin

In [9]:
from transformers import BertTokenizerFast
# Load the tokenizer stored in the Hub repo; downloaded files are cached under ./cache.
tokenizer = BertTokenizerFast.from_pretrained(REPOSITORY_ID, cache_dir="./cache")

In [10]:
# Both splits are tokenized with the same settings, returned as TensorFlow tensors.
tokenizer_kwargs = dict(
  padding=True,
  truncation=True,
  max_length=256,
  verbose=True,
  return_tensors="tf",
)

training_encoded = tokenizer(training_df["cleaned"].to_list(), **tokenizer_kwargs)
validation_encoded = tokenizer(validation_df["cleaned"].to_list(), **tokenizer_kwargs)

2025-05-01 18:39:24.107557: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-05-01 18:39:24.107584: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-01 18:39:24.107588: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-01 18:39:24.107604: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-01 18:39:24.107613: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
def printEncoded(index):
  """Print one training example next to its tokenized form.

  Prints, in order: the cleaned text, the input ids, the ids decoded back to
  text, the attention mask, and the label for `index`.
  """
  print(training_df["cleaned"][index])
  token_ids = training_encoded["input_ids"][index]
  print(token_ids)
  print(tokenizer.decode(token_ids))
  print(training_encoded["attention_mask"][index])
  print(training_df["label"][index])

In [12]:
# Spot-check the training example at index 1: text, token ids, decoded text, mask, label.
printEncoded(1)

So far Miguel Bardem's career it's been one of the more dreadful of recent Spanish cinema. He's made nothing but rubbish... until now. "Incautos" has been quite a surprise: it's a serious film, with rhythm, with a great cast and very entertaining.The art of robbing, that's what "Incautos" is about. A film much alike to David Mamet's "House of game" and stuff like that. A thousand of twists in the script, and a story where nothing's like it seems.The weak points in latest Bardem's movie may be the so-American language, that makes some of the characters look rather unnatural (especially Victoria Abril's. She's a hell of an actress, but in "incautos" she looks a little bit forced). Ernesto Alterio is not that bad, but he's not half as good actor as his father... And what to say about Luppi?? Well, he's the MAN.In short: a good movie. The best that Miguel Bardem has ever made. I hope this is the beginning of a brand new stage in his career.*My rate: 7/10
tf.Tensor(
[  101  2061  2521  8374

In [13]:
from transformers import TFBertForSequenceClassification

# Load the sequence-classification model from the Hub repo, configured for two labels.
model = TFBertForSequenceClassification.from_pretrained(REPOSITORY_ID, num_labels=2)



Some layers from the model checkpoint at tianharjuno/imdb-bert-training were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at tianharjuno/imdb-bert-training.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


In [14]:
from transformers import create_optimizer

import math

batch_size = 32
epochs = 4
train_data_size = len(training_df)
# Round up: the final partial batch is also a training step. The training log
# in this notebook shows 1094 steps per epoch for 35000 rows at batch size 32,
# whereas floor division gives 1093 and sizes the schedule too short.
steps_per_epoch = math.ceil(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
  init_lr=2e-5,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  weight_decay_rate=0.01,
)


In [15]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# from_logits=True: the loss is told to expect raw logits, not probabilities.
loss_fn = SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [16]:
from tf_keras.callbacks import TensorBoard, ModelCheckpoint, BackupAndRestore, RemoteMonitor
# TensorBoard logging under ./logs (histograms every epoch, graph, images, steps/sec).
tensorboard_callbacks = TensorBoard(
  log_dir="./logs",
  histogram_freq=1,
  write_graph=True,
  write_images=True,
  write_steps_per_second=True,
)

# Keep only the checkpoint with the highest validation accuracy seen so far.
model_callbacks = ModelCheckpoint(
  filepath="./assets/checkpoint.keras",
  save_best_only=True,
  monitor="val_accuracy",
  mode="max",
)

# BackupAndRestore documents `save_freq` as "epoch", an integer number of
# batches, or False; the string "batch" is not one of these. An integer gives
# mid-epoch backups without writing a full checkpoint after every batch.
# `save_before_preemption` is left at its default because it is documented as
# supporting only tf.distribute.MultiWorkerMirroredStrategy.
BACKUP_EVERY_N_BATCHES = 100
backup_callbacks = BackupAndRestore(
  backup_dir="./backup",
  save_freq=BACKUP_EVERY_N_BATCHES,
)

In [17]:
# `epochs` and `batch_size` come from the optimizer cell, so the number of
# steps actually run matches the step count the learning-rate schedule was
# built for. Each callback is registered once; listing the TensorBoard
# callback twice would run its logging hooks twice per event.
# NOTE(review): `backup_callbacks` is defined earlier but is not passed here —
# confirm whether it was meant to take the place of the duplicated entry.
history = model.fit(
  training_encoded,
  training_df['label'],
  validation_data=(validation_encoded, validation_df["label"]),
  epochs=epochs,
  batch_size=batch_size,
  callbacks=[model_callbacks, tensorboard_callbacks],
)

Epoch 1/4
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


2025-05-01 18:39:59.579358: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


  37/1094 [>.............................] - ETA: 1:30:06 - loss: 0.1528 - accuracy: 0.9434

KeyboardInterrupt: 

In [18]:
# Export the model and tokenizer via save_pretrained into separate local folders.
model.save_pretrained("./pretrained_model")
tokenizer.save_pretrained("./pretrained_tokenizer")
# Also save the full Keras model under ./pretrained (the log reports assets
# written to ./pretrained/assets).
model.save("./pretrained")
# NOTE(review): the trailing slash seems to leave the checkpoint file prefix
# empty (the upload log lists a file named ".data-00000-of-00001") — confirm
# this naming is intended.
model.save_weights("./pretrained_weights/")

INFO:tensorflow:Assets written to: ./pretrained/assets


INFO:tensorflow:Assets written to: ./pretrained/assets


In [19]:
from huggingface_hub import upload_folder
message = "clean folder"

# (local folder, destination path inside the repo). None leaves `path_in_repo`
# at its default, exactly as if the argument were omitted.
upload_plan = [
  ("./pretrained_model", None),
  ("./pretrained_tokenizer", None),
  ("./pretrained_weights", "/pretrained_weights"),
  ("./pretrained", "/pretrained"),
]

# Upload the folders one after another, in the order listed above.
commit_infos = [
  upload_folder(
    folder_path=local_folder,
    repo_id=REPOSITORY_ID,
    commit_message=message,
    path_in_repo=repo_path,
  )
  for local_folder, repo_path in upload_plan
]

# Show the result of the final upload as the cell's output.
commit_infos[-1]

tf_model.h5: 100%|██████████| 438M/438M [00:31<00:00, 14.1MB/s] 
.data-00000-of-00001: 100%|██████████| 1.31G/1.31G [01:18<00:00, 16.6MB/s]
fingerprint.pb:   0%|          | 0.00/56.0 [00:00<?, ?B/s]
[A

[A[A


[A[A[A
fingerprint.pb: 100%|██████████| 56.0/56.0 [00:00<00:00, 173B/s]


[A[A[A
[A
fingerprint.pb: 100%|██████████| 56.0/56.0 [00:00<00:00, 101B/s]
keras_metadata.pb: 100%|██████████| 164k/164k [00:00<00:00, 262kB/s]  

[A
[A
[A
[A

[A[A
[A
[A
[A
[A
[A
saved_model.pb: 100%|██████████| 7.79M/7.79M [00:02<00:00, 3.16MB/s]

[A
[A
[A
[A

[A[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A


CommitInfo(commit_url='https://huggingface.co/tianharjuno/imdb-bert-training/commit/8476ef8b59ff34fc0b7a2d3d3b0cf69625fd4565', commit_message='clean folder', commit_description='', oid='8476ef8b59ff34fc0b7a2d3d3b0cf69625fd4565', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/imdb-bert-training', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/imdb-bert-training'), pr_revision=None, pr_num=None)