In [1]:
from datasets import load_dataset

# Load the "stanfordnlp/imdb" dataset, using ./datasets as the cache directory.
ds = load_dataset(
  "stanfordnlp/imdb",
  cache_dir="./datasets",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face Hub repository id that the upload cell passes as `repo_id`.
REPOSITORY_ID = "tianharjuno/imdb-bert-training"

In [3]:
# Give each split of `ds` a working name.
# NOTE(review): the "test" split is bound to `validation_split` and the
# "unsupervised" split to `test_split`; "unsupervised" is presumably unlabeled,
# so confirm before evaluating anything on `test_split`.
training_split, validation_split, test_split = (
  ds[split_name] for split_name in ("train", "test", "unsupervised")
)

def printline(index):
  """Print the record of `training_split` found at `index`."""
  record = training_split[index]
  print(record)

In [4]:
# Extract the "text" and "label" columns from both splits.
training_sentences, training_labels = training_split["text"], training_split["label"]
validation_sentences, validation_labels = validation_split["text"], validation_split["label"]

# Pool both splits; the pooled data is re-partitioned by train_test_split later on.
sentences = training_sentences + validation_sentences
labels = training_labels + validation_labels

In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Re-partition the pooled reviews 70/30 into training and validation sets.
# A fixed `random_state` makes the shuffle reproducible after a kernel restart,
# and stratifying on `labels` keeps the class ratio equal in both partitions.
training_sentence, validation_sentence, training_label, validation_label = train_test_split(
  sentences,
  labels,
  train_size=0.7,
  shuffle=True,
  random_state=42,
  stratify=labels,
)

# Sanity check: the partition sizes should add up to the pooled total.
print(f"Sentence total: {len(sentences)}\nLabel total: {len(labels)}\nTraining total: {len(training_sentence)}\nValidation total: {len(validation_sentence)}")

Sentence total: 50000
Label total: 50000
Training total: 35000
Validation total: 15000


In [6]:
import pandas as pd

# Pair every review text with its label, one column mapping per partition.
training_columns = {"text": training_sentence, "label": training_label}
validation_columns = {"text": validation_sentence, "label": validation_label}

# Materialise the mappings as DataFrames with "text" and "label" columns.
training_df = pd.DataFrame.from_dict(training_columns)
validation_df = pd.DataFrame.from_dict(validation_columns)

In [7]:
from bs4 import BeautifulSoup
import re
def cleanParagraph(text):
  """Return `text` with HTML markup removed and surrounding whitespace trimmed.

  Args:
    text: Raw review text that may contain HTML markup.

  Returns:
    The plain-text content as a `str`, without leading/trailing whitespace.
  """
  # Name the parser explicitly so the result does not depend on which optional
  # parsers are installed (and to avoid the "no parser specified" warning).
  soup = BeautifulSoup(text, "html.parser")
  innertext = soup.get_text()
  # Remove any tag-like fragments that are still present in the extracted text.
  innertext = re.sub(r'<[^>]+>', '', innertext)
  # str.strip() returns a new string, so its result has to be the one returned.
  return innertext.strip()

In [8]:
# Store the output of cleanParagraph for every review in a new "cleaned" column.
for frame in (training_df, validation_df):
  frame["cleaned"] = frame["text"].apply(cleanParagraph).tolist()

# Show the first rows of both frames.
for frame in (training_df, validation_df):
  print(frame.head())

                                                text  label  \
0  This is a hilarious film. Burt Reynolds is a N...      1   
1  This is the second Animatrix short, and the fi...      1   
2  I really wanted to like this, but in the end i...      0   
3  By no means is this movie as bad as 'Perfect S...      0   
4  I didn't expect much when I first saw the DVD ...      1   

                                             cleaned  
0  This is a hilarious film. Burt Reynolds is a N...  
1  This is the second Animatrix short, and the fi...  
2  I really wanted to like this, but in the end i...  
3  By no means is this movie as bad as 'Perfect S...  
4  I didn't expect much when I first saw the DVD ...  
                                                text  label  \
0  I'm afraid I must disagree with Mr. Radcliffe,...      1   
1  Homegrown is one of those movies which sort of...      1   
2  Michael Keaton has really never been a good ac...      0   
3  The film "Cross Eyed" by Adam Jones 

In [9]:
from transformers import BertTokenizerFast
# Load the tokenizer of the "bert-base-uncased" checkpoint, using ./cache as cache directory.
tokenizer = BertTokenizerFast.from_pretrained(
  "bert-base-uncased",
  cache_dir="./cache",
)

In [10]:
# Both partitions are tokenized with exactly the same settings, so the options
# are declared once and reused for each call.
tokenizer_options = dict(
  padding=True,
  truncation=True,
  max_length=256,
  verbose=True,
  return_tensors="tf",
)

training_encoded = tokenizer(training_df["cleaned"].to_list(), **tokenizer_options)
validation_encoded = tokenizer(validation_df["cleaned"].to_list(), **tokenizer_options)

2025-05-03 23:50:36.605281: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-05-03 23:50:36.605310: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-03 23:50:36.605315: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-03 23:50:36.605332: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-03 23:50:36.605343: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [11]:
def printEncoded(index):
  """Print one training example at `index` in every stage of preprocessing.

  Printed in order: the cleaned text, its token ids, the ids decoded back
  to text, the attention mask, and the label.
  """
  print(training_df["cleaned"][index])
  token_ids = training_encoded["input_ids"][index]
  print(token_ids)
  print(tokenizer.decode(token_ids))
  print(training_encoded["attention_mask"][index])
  print(training_df["label"][index])

In [12]:
printEncoded(1)

This is the second Animatrix short, and the first of them to be what one could call 'artistic'. It contains a lot of references, metaphors and symbols in the dense amount of material, especially with a running time of 9 minutes. I've heard some complaints that this is "anti-human", or tries to direct hate towards man, for their "sins against machine". I don't think that's true; it merely uses the robots to show us, that as humans, we aren't particularly accepting or open-minded towards anyone different from ourselves. I'd say it does a great job of that. The plot is good... it plays as a historical document, recounting what led to one of the main conflicts in the trilogy. Thus it holds clips from fictional news reports and the like. The voice acting is very good, if there is not a lot of it. The animation is nice, and the use of color, in spite of the usually realistic drawing style, makes it more open to do the smooth transitions and other surreal imagery. This has several bits of str

In [13]:
import tensorflow as tf
def batch_to_tf(encodings, labels, batch_size=32):
    """Build a batched, prefetching `tf.data.Dataset` from encodings and labels.

    Args:
        encodings: Mapping of feature name to tensor (e.g. the tokenizer output).
            It is left unmodified.
        labels: Labels aligned with the rows of `encodings`; converted with
            `tf.convert_to_tensor`.
        batch_size: Number of examples per batch. Defaults to 32.

    Returns:
        A dataset of feature dicts (including a "labels" entry), batched and
        prefetched with `tf.data.AUTOTUNE`.
    """
    # Copy into a plain dict so adding "labels" does not mutate the caller's object.
    features = dict(encodings)
    features['labels'] = tf.convert_to_tensor(labels)

    dataset = tf.data.Dataset.from_tensor_slices(features)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)
  
# Build the batched tf.data pipelines for both partitions.
training_tf = batch_to_tf(encodings=training_encoded, labels=training_df["label"])
validation_tf = batch_to_tf(encodings=validation_encoded, labels=validation_df["label"])

In [14]:
from transformers import TFBertForSequenceClassification

# Load the "bert-base-uncased" checkpoint as a sequence classifier with two labels.
model = TFBertForSequenceClassification.from_pretrained(
  "bert-base-uncased",
  num_labels=2,
)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import create_optimizer

# Hyper-parameters that determine the length of the learning-rate schedule.
batch_size = 32
epochs = 3
train_data_size = len(training_df)
# Ceiling division: the dataset is batched without dropping the remainder, so
# the final partial batch is one more optimizer step in every epoch. Floor
# division would make the schedule finish before training does.
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
  init_lr=2e-5,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  weight_decay_rate=0.01,
)


In [16]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

# from_logits=True: the loss is told that the model output is not yet normalised.
classification_loss = SparseCategoricalCrossentropy(from_logits=True)

model.compile(
  optimizer=optimizer,
  loss=classification_loss,
  metrics=["accuracy"],
)

In [19]:
from tf_keras.callbacks import TensorBoard, ModelCheckpoint, BackupAndRestore, RemoteMonitor
# TensorBoard callback writing to ./logs.
tensorboard_callbacks = TensorBoard(
  log_dir="./logs",
  histogram_freq=1,
  write_graph=True,
  write_images=True,
  write_steps_per_second=True,
  update_freq=1,
)

# Backup callback writing its state to ./backup.
backup_callbacks = BackupAndRestore(
  backup_dir="./backup",
  save_freq=20,
  save_before_preemption=True,
)

In [20]:
# Validate on the held-out partition: passing `training_tf` as validation data
# would only repeat the training-set metrics and hide overfitting.
# `batch_size` is not passed because `training_tf` is already batched, and
# `epochs` reuses the value the learning-rate schedule was sized for.
history = model.fit(
  training_tf,
  validation_data=validation_tf,
  epochs=epochs,
  callbacks=[backup_callbacks, tensorboard_callbacks],
)

Epoch 3/3


In [21]:
# Persist the fine-tuned model and the tokenizer in separate folders; a later
# cell reloads both with from_pretrained().
model.save_pretrained("./pretrained_imdb/model")
tokenizer.save_pretrained("./pretrained_imdb/tokenizer")
# Additionally export the complete model object via model.save().
model.save("./pretrained_imdb/all")

INFO:tensorflow:Assets written to: ./pretrained_imdb/all/assets


INFO:tensorflow:Assets written to: ./pretrained_imdb/all/assets


In [3]:
from huggingface_hub import upload_folder
# Commit message shared by both uploads below.
message = "clean folder"
# Upload the saved model folder to the Hub repository named by REPOSITORY_ID.
upload_folder(
  folder_path="./pretrained_imdb/model",
  repo_id=REPOSITORY_ID,
  commit_message=message,
)
# Upload the saved tokenizer folder to the same repository.
# This call is the cell's last expression, so its return value is displayed.
upload_folder(
  folder_path="./pretrained_imdb/tokenizer",
  repo_id=REPOSITORY_ID,
  commit_message=message,
)

tf_model.h5: 100%|██████████| 438M/438M [00:48<00:00, 8.99MB/s]  


CommitInfo(commit_url='https://huggingface.co/tianharjuno/imdb-bert-training/commit/7c4859d676f432b0761dba81e44af3dcd512b4c3', commit_message='clean folder', commit_description='', oid='7c4859d676f432b0761dba81e44af3dcd512b4c3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/tianharjuno/imdb-bert-training', endpoint='https://huggingface.co', repo_type='model', repo_id='tianharjuno/imdb-bert-training'), pr_revision=None, pr_num=None)

In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification, pipeline
# Reload the saved model and tokenizer from disk.
trained_model = TFBertForSequenceClassification.from_pretrained("pretrained_imdb/model")
trained_tokenizer = BertTokenizer.from_pretrained("pretrained_imdb/tokenizer")

# Wrap both in a text-classification pipeline.
nlp = pipeline(
  task="text-classification",
  model=trained_model,
  tokenizer=trained_tokenizer,
  framework="tf",
)

2025-05-04 11:46:30.817733: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-05-04 11:46:30.817763: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-04 11:46:30.817769: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-04 11:46:30.817921: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-04 11:46:30.817931: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at 

In [6]:
# Smoke test: classify one hand-written review with the reloaded pipeline and print the prediction.
result = nlp("This movie is bad compared to the previous releases.")
print(result)

[{'label': 'LABEL_0', 'score': 0.9794296026229858}]
