In [1]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/sst2", cache_dir="./datasets")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAINING_DATASET = ds["train"]
VALIDATION_DATASET = ds["validation"]
TEST_DATASET = ds["test"]

In [3]:
idx = 1

print(TRAINING_DATASET[idx])
print(VALIDATION_DATASET[idx])
print(TEST_DATASET[idx])

{'idx': 1, 'sentence': 'contains no wit , only labored gags ', 'label': 0}
{'idx': 1, 'sentence': 'unflinchingly bleak and desperate ', 'label': 0}
{'idx': 1, 'sentence': "this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .", 'label': -1}


In [4]:
from sklearn.model_selection import train_test_split

training_sentence = TRAINING_DATASET["sentence"]
training_labels = TRAINING_DATASET["label"]

validation_sentence = VALIDATION_DATASET["sentence"]
validation_labels = VALIDATION_DATASET["label"]


In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd

sentence_concat = training_sentence + validation_sentence
label_concat = training_labels + validation_labels

sentence_training, sentence_validation, label_training, label_validation = train_test_split(sentence_concat, label_concat, test_size=0.2, random_state=42, stratify=label_concat)

training_dataframe = pd.DataFrame.from_dict({"sentence": sentence_training, "label": label_training})
validation_dataframe = pd.DataFrame.from_dict({"sentence": sentence_validation, "label": label_validation})
test_dataframe = pd.DataFrame.from_dict({"sentence": TEST_DATASET["sentence"], "label": TEST_DATASET["label"]})

print(training_dataframe.head())
print(validation_dataframe.head())
print(test_dataframe.head())

                                            sentence  label
0  an adventure story and history lesson all in one       1
1                                  's good enough .       1
2  the color sense of stuart little 2 is its most...      1
3                     a lot of stamina and vitality       1
4             sucks , but has a funny moment or two       0
                                            sentence  label
0                          our infantilized culture       0
1                                a luis buñuel film       1
2  a macabre and very stylized swedish fillm abou...      1
3                               outrageous or funny       1
4                            comes from the heart .       1
                                            sentence  label
0             uneasy mishmash of styles and genres .     -1
1  this film 's relationship to actual tension is...     -1
2  by the end of no such thing the audience , lik...     -1
3  director rob marshall went out gunnin

In [7]:
import re
def clean_text(text):
  text = text.strip()
  return str(text)

In [8]:
training_dataframe["cleaned"] = training_dataframe["sentence"].apply(clean_text).tolist()
validation_dataframe["cleaned"] = validation_dataframe["sentence"].apply(clean_text).tolist()
test_dataframe["cleaned"] = test_dataframe["sentence"].apply(clean_text).tolist()

In [9]:
print(set(training_dataframe['label']))
print(training_dataframe['label'].value_counts())

{0, 1}
label
1    30410
0    24166
Name: count, dtype: int64


In [10]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [11]:
sentence_training_encoded = tokenizer(
  training_dataframe["cleaned"].tolist(),
  padding=True,
  truncation=True,
  return_tensors='tf',
  max_length=128,
  verbose=True
)
sentence_validation_encoded = tokenizer(
  validation_dataframe["cleaned"].tolist(),
  padding=True,
  truncation=True,
  return_tensors="tf",
  max_length=128,
  verbose=True
)
sentence_test_encoded = tokenizer(
  test_dataframe["cleaned"].tolist(),
  padding=True,
  truncation=True,
  return_tensors="tf",
  max_length=128,
  verbose=True
)

2025-05-01 14:35:21.578000: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-05-01 14:35:21.578026: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-05-01 14:35:21.578030: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-05-01 14:35:21.578047: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-05-01 14:35:21.578056: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [12]:
k = 100
print(training_dataframe["cleaned"][k])
print(sentence_training_encoded["input_ids"][k])
print(tokenizer.decode(sentence_training_encoded["input_ids"][k]))
print(sentence_training_encoded["attention_mask"][k])
print(training_dataframe["label"][k])
print(sentence_test_encoded)

makes you feel like a chump
tf.Tensor(
[  101  3084  2017  2514  2066  1037 14684  8737   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0], shape=(66,), dtype=int32)
[CLS] makes you feel like a chump [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tf.Tensor(
[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], sh

In [13]:
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
    
)


In [15]:
history = model.fit(
  [sentence_training_encoded["input_ids"],sentence_training_encoded["token_type_ids"],sentence_training_encoded["attention_mask"]],
  training_dataframe["label"],
  validation_data=(
    [sentence_validation_encoded["input_ids"],sentence_validation_encoded["token_type_ids"],sentence_validation_encoded["attention_mask"]],validation_dataframe["label"]
  ),
  batch_size=32,
  epochs=5,
  shuffle=True
)

model.save_pretrained("./self_training_model")
tokenizer.save_pretrained("./self_training_tokenizer")

Epoch 1/5
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


2025-05-01 14:36:10.885329: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


 331/1706 [====>.........................] - ETA: 17:56 - loss: 0.7237 - accuracy: 0.5186

KeyboardInterrupt: 