In [1]:
from datasets import load_dataset
# Load the stanfordnlp/sst2 dataset, keeping downloaded files under ./datasets.
dataset = load_dataset(
  "stanfordnlp/sst2",
  cache_dir='./datasets',
)

ModuleNotFoundError: No module named 'datasets'

In [None]:
# Select the two splits used for fitting and for validation during training.
training_dataset, validation_dataset = (
  dataset[split_name] for split_name in ("train", "validation")
)


In [None]:
# Pull the 'sentence' and 'label' columns out of each split.
training_dataset_sentence, training_dataset_label = (
  training_dataset['sentence'],
  training_dataset['label'],
)
validation_dataset_sentence, validation_dataset_label = (
  validation_dataset['sentence'],
  validation_dataset['label'],
)

In [None]:
import pandas as pd
# One pandas frame per split, with a "sentence" column and a "label" column.
training_dataframe = pd.DataFrame(
  {"sentence": training_dataset_sentence, "label": training_dataset_label}
)
validation_dataframe = pd.DataFrame(
  {"sentence": validation_dataset_sentence, "label": validation_dataset_label}
)

In [None]:
import re
from bs4 import BeautifulSoup
import unicodedata
def cleandata(text):
  """Return ``text`` as plain, normalized text.

  Steps, in order: remove HTML markup with BeautifulSoup, apply Unicode NFKC
  normalization, delete control characters (U+0000-U+001F and U+007F), and
  trim leading/trailing whitespace.

  Args:
    text: Raw input string, possibly containing HTML markup.

  Returns:
    The cleaned string.
  """
  # Name the parser explicitly so the result does not depend on which optional
  # parsers happen to be installed (and to avoid the "no parser specified"
  # warning BeautifulSoup emits otherwise).
  soup = BeautifulSoup(text, "html.parser")
  text = soup.get_text()
  text = unicodedata.normalize("NFKC", text)
  text = re.sub(r"[\u0000-\u001F\u007F]+", "", text)
  # str.strip() returns a new string; the result must be used, otherwise the
  # surrounding whitespace is never removed.
  return text.strip()

In [None]:
# Add a 'clean' column to both frames by running every sentence through cleandata.
for _frame in (training_dataframe, validation_dataframe):
  _frame['clean'] = _frame['sentence'].apply(cleandata).tolist()

In [None]:
from transformers import BertTokenizer
# Load the bert-base-uncased tokenizer, caching downloaded files under ./tokenizer.
tokenizer = BertTokenizer.from_pretrained(
  "bert-base-uncased",
  cache_dir="./tokenizer",
)

In [None]:
print(type(training_dataframe['clean']))

# Both splits are tokenized with the same settings: padded, truncated to at
# most 128 tokens, and returned as TensorFlow tensors.
_encode_options = dict(
  padding=True,
  truncation=True,
  max_length=128,
  return_tensors='tf',
)
training_encoded = tokenizer(training_dataframe['clean'].tolist(), **_encode_options)
validation_encoded = tokenizer(validation_dataframe['clean'].tolist(), **_encode_options)

In [None]:
# Sanity check: show one raw sentence, each encoded field for it, and the
# text obtained by decoding its input ids.
index = 0
print(training_dataframe['sentence'][index])
for _field in ("input_ids", "token_type_ids", "attention_mask"):
  print(training_encoded[_field][index])
print(tokenizer.decode(training_encoded["input_ids"][index]))

In [None]:
from transformers import TFBertForSequenceClassification
# bert-base-uncased with a 2-label sequence-classification head; downloaded
# weights are cached under ./model.
model = TFBertForSequenceClassification.from_pretrained(
  "bert-base-uncased",
  num_labels=2,
  cache_dir="./model",
)

In [None]:
from transformers import create_optimizer

# Training hyperparameters used to size the learning-rate schedule.
# Keep `epochs` in sync with the value passed to model.fit.
batch_size = 32
epochs=4
train_data_size = len(training_dataset)
# Ceiling division: the training dataset is batched without dropping the
# remainder, so a partial final batch still counts as one optimizer step.
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of all training steps.
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
  init_lr=2e-5,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps,
  weight_decay_rate=0.01
)


In [None]:
from tf_keras.losses import SparseCategoricalCrossentropy

# The loss is built with from_logits=True, i.e. it is fed the model's raw
# logits rather than probabilities.
_logits_loss = SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=_logits_loss, metrics=['accuracy'])

In [None]:
from tf_keras.callbacks import TensorBoard, ModelCheckpoint, BackupAndRestore, RemoteMonitor
# TensorBoard logging under ./logs (histograms every epoch, metrics written
# every batch because update_freq=1).
tensorboard_callbacks = TensorBoard(
  log_dir="./logs",
  histogram_freq=1,
  write_graph=True,
  write_images=True,
  write_steps_per_second=True,
  update_freq=1
)
# Keeps only the checkpoint with the highest validation accuracy.
model_callbacks = ModelCheckpoint(
  filepath="./assets/checkpoint.keras",
  save_best_only=True,
  monitor="val_accuracy",
  mode="max"
)
# Periodic backup (every 50 batches) so an interrupted fit() can resume.
# `save_before_preemption` is intentionally not set: it is documented as
# supported only with tf.distribute.MultiWorkerMirroredStrategy, which this
# notebook does not use.
backup_callbacks = BackupAndRestore(
  backup_dir="./backup/",
  save_freq=50
)

In [None]:
import tensorflow as tf
def batch_to_tf(encodings, labels, batch_size=32):
    """Build a batched ``tf.data.Dataset`` from tokenizer output and labels.

    Args:
        encodings: Mapping of feature name to tensor/array (for example the
            tokenizer output). It is copied, not modified.
        labels: Labels aligned with the rows of ``encodings``; converted with
            ``tf.convert_to_tensor``.
        batch_size: Number of examples per batch. Defaults to 32.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts holding the encoding
        fields plus a ``'labels'`` entry, batched and prefetched.
    """
    # Copy into a plain dict so the caller's encodings object is left
    # untouched (the labels used to be written into it in place).
    features = dict(encodings)
    features['labels'] = tf.convert_to_tensor(labels)

    dataset = tf.data.Dataset.from_tensor_slices(features)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)


In [None]:
# Wrap the tokenized splits and their labels as batched tf.data datasets.
training_tf = batch_to_tf(
  encodings=training_encoded,
  labels=training_dataframe["label"],
)
validation_tf = batch_to_tf(
  encodings=validation_encoded,
  labels=validation_dataframe['label'],
)

In [None]:
# Fine-tune the model. `batch_size` is deliberately not passed: the inputs
# are tf.data datasets that are already batched, and Keras documents that
# batch_size must not be specified for dataset inputs.
# `epochs` reuses the value that sized the learning-rate schedule so the
# warmup/decay spans exactly the steps that are actually run.
history = model.fit(
  training_tf,
  validation_data=validation_tf,
  epochs=epochs,
  callbacks=[backup_callbacks, tensorboard_callbacks]
)
# Persist the fine-tuned weights and tokenizer in Hugging Face format, plus a
# full Keras export of the model.
model.save_pretrained("./pretrained/model")
tokenizer.save_pretrained("./pretrained/tokenizer")
model.save("./pretrained/all")

In [None]:
from huggingface_hub import upload_folder
# Push the saved model folder, then the saved tokenizer folder, to the same
# Hub repository using the same commit message.
message = "initial commit"
REPOSITORY_ID="tianharjuno/sst2-bert-training"
for _folder in ("./pretrained/model", "./pretrained/tokenizer"):
  upload_folder(
    folder_path=_folder,
    repo_id=REPOSITORY_ID,
    commit_message=message
  )

In [None]:
import pandas as pd
import tensorflow as tf
# Prepare the test split the same way as the training data: clean, tokenize,
# and wrap in a batched tf.data dataset.
test_dataset = dataset["test"]
test_df = pd.DataFrame.from_dict({
  "sentence" : test_dataset['sentence'],
  "label": test_dataset['label']
})
test_df['cleaned'] = test_df['sentence'].apply(cleandata).tolist()
inputs = tokenizer(
  test_df["cleaned"].tolist(),
  return_tensors='tf',
  padding=True,
  truncation=True,
  max_length=128  # same cap as used for the training/validation encodings
)
# dict(...) turns the tokenizer output into a plain mapping, matching how
# batch_to_tf builds its datasets.
test_tf = tf.data.Dataset.from_tensor_slices((
  dict(inputs),
  test_df["label"].to_numpy())).batch(32).prefetch(tf.data.AUTOTUNE)
# Predict batch by batch instead of pushing the whole split through a single
# forward pass, which needs memory proportional to the full test set.
# The returned output still exposes `.logits` (as a NumPy array).
outputs = model.predict(test_tf)

In [None]:
import tensorflow as tf
# Index of the largest logit along axis 1 for every row, as a NumPy array.
logits = outputs.logits
results = tf.argmax(input=logits, axis=1).numpy()

In [None]:
def viewPrediction(index):
  """Print the predicted class and the original sentence for one test row."""
  predicted = results[index]
  sentence = test_df["sentence"][index]
  print(f'{predicted}: {sentence}')
  
# Show the predictions for the first 50 test sentences.
for i in range(0, 50):
  viewPrediction(index=i)

In [None]:
from transformers import TFBertForSequenceClassification, BertTokenizer, pipeline

# Reload the published model and tokenizer from the Hub and wrap them in a
# TensorFlow text-classification pipeline.
load_pretrained_model = TFBertForSequenceClassification.from_pretrained(
  "tianharjuno/sst2-bert-training",
  cache_dir="./model",
)
load_pretrained_tokenizer = BertTokenizer.from_pretrained(
  "tianharjuno/sst2-bert-training",
  cache_dir="./tokenizer",
)
nlp = pipeline(
  "text-classification",
  model=load_pretrained_model,
  tokenizer=load_pretrained_tokenizer,
  framework='tf',
)

In [None]:
# Classify a single example sentence with the reloaded pipeline.
pipeline_result = nlp.predict(
  "i would not date you even if you become a princess"
)
print(pipeline_result)

The trained tokenizer and model are saved to huggingface.co:

tianharjuno/sst2-bert-training