In [1]:
from datasets import load_dataset

ds = load_dataset("stanfordnlp/sst2", cache_dir="./datasets")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
TRAINING_DATASET = ds["train"]
VALIDATION_DATASET = ds["validation"]
TEST_DATASET = ds["test"]

In [3]:
idx = 1

print(TRAINING_DATASET[idx])
print(VALIDATION_DATASET[idx])
print(TEST_DATASET[idx])

{'idx': 1, 'sentence': 'contains no wit , only labored gags ', 'label': 0}
{'idx': 1, 'sentence': 'unflinchingly bleak and desperate ', 'label': 0}
{'idx': 1, 'sentence': "this film 's relationship to actual tension is the same as what christmas-tree flocking in a spray can is to actual snow : a poor -- if durable -- imitation .", 'label': -1}


In [4]:
from sklearn.model_selection import train_test_split

training_sentence = TRAINING_DATASET["sentence"]
training_labels = TRAINING_DATASET["label"]

validation_sentence = VALIDATION_DATASET["sentence"]
validation_labels = VALIDATION_DATASET["label"]


In [5]:
from sklearn.model_selection import train_test_split
import pandas as pd

sentence_concat = training_sentence + validation_sentence
label_concat = training_labels + validation_labels

sentence_training, sentence_validation, label_training, label_validation = train_test_split(sentence_concat, label_concat, test_size=0.15, random_state=100, stratify=label_concat)

training_dataframe = pd.DataFrame.from_dict({"sentence": sentence_training, "label": label_training})
validation_dataframe = pd.DataFrame.from_dict({"sentence": sentence_validation, "label": label_validation})
test_dataframe = pd.DataFrame.from_dict({"sentence": TEST_DATASET["sentence"], "label": TEST_DATASET["label"]})

print(training_dataframe.head())
print(validation_dataframe.head())
print(test_dataframe.head())

                                            sentence  label
0                                        much humor       1
1                  an overstylized , puréed mélange       0
2  the film 's hackneyed message is not helped by...      0
3  is very choppy and monosyllabic despite the fact       0
4                 more than a whiff of exploitation       0
                                            sentence  label
0                                , color and cringe       0
1  not only of one man 's quest to be president ,...      1
2  sends you away a believer again and quite chee...      1
3                  this delicate coming-of-age tale       1
4  airs just about every cliche in the war movie ...      0
                                            sentence  label
0             uneasy mishmash of styles and genres .     -1
1  this film 's relationship to actual tension is...     -1
2  by the end of no such thing the audience , lik...     -1
3  director rob marshall went out gunnin

In [22]:
import re
from bs4 import BeautifulSoup
def clean_text(text):
  soup = BeautifulSoup(text)
  textcleaned = soup.getText()
  text = textcleaned.strip()
  return str(text)

In [25]:
training_dataframe["cleaned"] = training_dataframe["sentence"].apply(clean_text).tolist()
validation_dataframe["cleaned"] = validation_dataframe["sentence"].apply(clean_text).tolist()
test_dataframe["cleaned"] = test_dataframe["sentence"].apply(clean_text).tolist()

print(training_dataframe["cleaned"])

0                                               much humor
1                         an overstylized , puréed mélange
2        the film 's hackneyed message is not helped by...
3         is very choppy and monosyllabic despite the fact
4                        more than a whiff of exploitation
                               ...                        
57982    what ( frei ) gives us ... is a man who uses t...
57983                  could have guessed at the beginning
57984    reveals the ways in which a sultry evening or ...
57985    is also elevated by it -- the kind of movie th...
57986    must be given to the water-camera operating te...
Name: cleaned, Length: 57987, dtype: object


In [26]:
print(set(training_dataframe['label']))
print(training_dataframe['label'].value_counts())

{0, 1}
label
1    32311
0    25676
Name: count, dtype: int64


In [27]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased", do_lower_case=True)

In [28]:
sentence_training_encoded = tokenizer(
  training_dataframe["cleaned"].tolist(),
  padding=True,
  truncation=True,
  return_tensors='tf',
)
sentence_validation_encoded = tokenizer(
  validation_dataframe["cleaned"].tolist(),
  padding=True,
  truncation=True,
  return_tensors="tf",
)
sentence_test_encoded = tokenizer(
  test_dataframe["cleaned"].tolist(),
  padding=True,
  truncation=True,
  return_tensors="tf",
)

In [29]:
k = 100
print(training_dataframe["cleaned"][k])
print(sentence_training_encoded["input_ids"][k])
print(tokenizer.decode(sentence_training_encoded["input_ids"][k]))
print(sentence_training_encoded["attention_mask"][k])
print(training_dataframe["label"][k])
print(sentence_test_encoded)

exude an air of dignity that 's perfect for the proud warrior that still lingers in the souls of these characters
tf.Tensor(
[  101  4654 12672  2019  2250  1997 13372  2008  1005  1055  3819  2005
  1996  7098  6750  2008  2145 26577  2015  1999  1996  9293  1997  2122
  3494   102     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0], shape=(66,), dtype=int32)
[CLS] exude an air of dignity that ' s perfect for the proud warrior that still lingers in the souls of these characters [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
tf.Tensor(
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 

In [30]:
from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [31]:
from transformers import create_optimizer

batch_size = 32
epochs=4
train_data_size = len(training_dataframe)
steps_per_epoch = train_data_size // batch_size
num_train_steps = steps_per_epoch * epochs 
num_warmup_steps = int(0.1 * num_train_steps)

optimizer, schedule = create_optimizer(
  init_lr=2e-5,
  num_train_steps= num_train_steps,
  num_warmup_steps=num_warmup_steps,
  weight_decay_rate=0.02
)


In [32]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(
    optimizer=optimizer,
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
    
)


In [33]:
from tf_keras.callbacks import TensorBoard, ModelCheckpoint, BackupAndRestore, RemoteMonitor
tensorboard_callbacks = TensorBoard(
  log_dir="./logs", histogram_freq=1, write_graph=True,write_images=True,write_steps_per_second=True, update_freq=1
   
)
model_callbacks = ModelCheckpoint(
  filepath="./assets/checkpoint.keras",
  save_best_only=True,
  monitor="val_accuracy",
  mode="max"
)
backup_callbacks = BackupAndRestore(
  backup_dir="./backup",
  save_freq=1,
  save_before_preemption=True
)

In [34]:
history = model.fit(
  sentence_training_encoded,
  training_dataframe["label"],
  validation_data=(
    sentence_validation_encoded,validation_dataframe["label"]
  ),
  batch_size=32,
  epochs=5,
  shuffle=True,
  callbacks=[tensorboard_callbacks, model_callbacks, backup_callbacks]
)

model.save_pretrained("./self_training_model")
tokenizer.save_pretrained("./self_training_tokenizer")

Epoch 1/5

KeyboardInterrupt: 