In [2]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/sst2", cache_dir="./datasets")

In [3]:
TRAINING_DATASET = ds["train"]
VALIDATION_DATASET = ds["validation"]
TEST_DATASET = ds["test"]

In [4]:
idx = 1

print(TRAINING_DATASET[idx])
print(VALIDATION_DATASET[idx])
print(TEST_DATASET[idx])

{'idx': 1, 'sentence': 'contains no wit , only labored gags ', 'label': 0}
{'idx': 1, 'sentence': 'unflinchingly bleak and desperate ', 'label': 0}
{'idx': 1, 'sentence': "this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .", 'label': -1}


In [6]:
from sklearn.model_selection import train_test_split

training_sentence = TRAINING_DATASET["sentence"]
training_labels = TRAINING_DATASET["label"]

validation_sentence = VALIDATION_DATASET["sentence"]
validation_labels = VALIDATION_DATASET["label"]


In [7]:
from sklearn.model_selection import train_test_split
import pandas as pd

sentence_concat = training_sentence + validation_sentence
label_concat = training_labels + validation_labels

sentence_training, sentence_validation, label_training, label_validation = train_test_split(sentence_concat, label_concat, test_size=0.25, random_state=42)

training_dataframe = pd.DataFrame.from_dict({"sentence": sentence_training, "label": label_training})
validation_dataframe = pd.DataFrame.from_dict({"sentence": sentence_validation, "label": label_validation})
test_dataframe = pd.DataFrame.from_dict({"sentence": TEST_DATASET["sentence"], "label": TEST_DATASET["label"]})

print(training_dataframe.head())
print(validation_dataframe.head())
print(test_dataframe.head())

                                            sentence  label
0  lies in the ease with which it integrates thou...      1
1       is funny , touching , smart and complicated       1
2  juliette binoche 's sand is vivacious , but it...      1
3         seen it all before in one form or another       0
4                                           literal       1
                                            sentence  label
0                                   the best script       1
1                                      vividly back       1
2               than most third-rate horror sequels       0
3  , byler reveals the ways in which a sultry eve...      1
4                    is funny , charming and quirky       1
                                            sentence  label
0             uneasy mishmash of styles and genres .     -1
1  this film 's relationship to actual tension is...     -1
2  by the end of no such thing the audience , lik...     -1
3  director rob marshall went out gunnin

In [18]:
import re
def clean_text(text):
  text = re.sub(r' , ', ", ", text)
  text = re.sub(r' . ', ". ", text)
  text = re.sub(r'\. $', ".", text)
  text = re.sub(r' $', "", text)
  text = re.sub(r" 's", "'s", text)
  text = re.sub(r" n't", "n't", text)
  text = re.sub(r" 'll", "'ll", text)
  text = re.sub(r'^,\s*', '', text)
  text = re.sub(r"^'s. ", '', text)
  text = re.sub(r"('' || ``)", '', text)
  text = re.sub(r'(?<=\w)\.\s(?=\w)', ' ', text)
  text = re.sub(r',\.', ' ', text)
  text = re.sub(r'\s*--\s*', ' ', text)
  text = re.sub(r',$', '', text)
  text = text.strip()
  return str(text)

In [19]:
training_dataframe["cleaned"] = training_dataframe["sentence"].apply(clean_text).to_list()
validation_dataframe["cleaned"] = validation_dataframe["sentence"].apply(clean_text).to_list()
test_dataframe["cleaned"] = test_dataframe["sentence"].apply(clean_text).to_list()

In [20]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)

In [21]:
sentence_training_encoded = tokenizer.batch_encode_plus(training_dataframe["cleaned"].tolist(),
                                                        padding=True,
                                                        truncation=True,
                                                        return_tensors="tf"
                                                        )
sentence_validation_encoded = tokenizer.batch_encode_plus(validation_dataframe["cleaned"].tolist(),
                                                          padding=True,
                                                          truncation=True,
                                                          return_tensors="tf"
                                                          )
sentence_test_encoded = tokenizer.batch_encode_plus(test_dataframe["cleaned"].tolist(),
                                                    padding=True,
                                                    truncation=True,
                                                    return_tensors="tf"
                                                    )


In [22]:
k = 0
print(training_dataframe["cleaned"][k])
print(sentence_training_encoded["input_ids"][k])
print(tokenizer.decode(sentence_training_encoded["input_ids"][k]))
print(sentence_training_encoded["attention_mask"][k])
print(training_dataframe["label"][k])

lies in the ease with which it integrates thoughtfulness and pasta-fagioli comedy.
tf.Tensor(
[  101  3658  1999  1996  7496  2007  2029  2009 17409  2015 16465  2791
  1998 24857  1011  6904 11411  3669  4038  1012   102     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0], shape=(61,), dtype=int32)
[CLS] lies in the ease with which it integrates thoughtfulness and pasta - fagioli comedy. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0], shape=(61,), dtype=int32)
1


In [28]:
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
import tensorflow as tf
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


ValueError: Could not interpret optimizer identifier: <keras.src.optimizers.adam.Adam object at 0x3c7f8fcd0>

In [25]:
history = model.fit(
  [sentence_training_encoded["input_ids"],sentence_training_encoded["token_type_ids"],sentence_training_encoded["attention_mask"]],
  training_dataframe["label"],
  validation_data=(
    [sentence_validation_encoded["input_ids"],sentence_validation_encoded["token_type_ids"],sentence_validation_encoded["attention_mask"]],validation_dataframe["label"]
  ),
  batch_size=32,
  epochs=3,
)

model.save_pretrained("./self_training_model")
tokenizer.save_pretrained("./self_training_tokenizer")

Epoch 1/3


TypeError: in user code:

    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 1398, in train_function  *
        return step_function(self, iterator)
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 1381, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/engine/training.py", line 1370, in run_step  **
        outputs = model.train_step(data)
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1691, in train_step
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/engine/compile_utils.py", line 620, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/utils/metrics_utils.py", line 77, in decorated
        result = update_state_fn(*args, **kwargs)
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/metrics/base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "/opt/anaconda3/envs/sentiment/lib/python3.9/site-packages/tf_keras/src/metrics/base_metric.py", line 722, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)

    TypeError: 'str' object is not callable
