In [2]:
!pip install transformers datasets -q

[K     |████████████████████████████████| 3.4 MB 5.4 MB/s 
[K     |████████████████████████████████| 311 kB 46.6 MB/s 
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
[K     |████████████████████████████████| 596 kB 51.9 MB/s 
[K     |████████████████████████████████| 3.3 MB 41.7 MB/s 
[K     |████████████████████████████████| 895 kB 53.8 MB/s 
[K     |████████████████████████████████| 133 kB 48.3 MB/s 
[K     |████████████████████████████████| 1.1 MB 31.8 MB/s 
[K     |████████████████████████████████| 243 kB 50.6 MB/s 
[K     |████████████████████████████████| 271 kB 55.4 MB/s 
[K     |████████████████████████████████| 144 kB 55.0 MB/s 
[K     |████████████████████████████████| 94 kB 3.2 MB/s 
[?25h

In [5]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset 

In [6]:
raw_data = load_dataset('glue', 'sst2')

Reusing dataset glue (/root/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [8]:
raw_data['train'].features

{'idx': Value(dtype='int32', id=None),
 'label': ClassLabel(num_classes=2, names=['negative', 'positive'], names_file=None, id=None),
 'sentence': Value(dtype='string', id=None)}

In [9]:
check_point = 'bert-base-uncased'

In [10]:
tokenizer = AutoTokenizer.from_pretrained(check_point)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [11]:
print(tokenizer(raw_data['train']['sentence'][0]))

{'input_ids': [101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [12]:
def tokenize_data(example):
  return tokenizer(example['sentence'], truncation=True,)

In [13]:
raw_data = raw_data.map(tokenize_data, batched=True)

  0%|          | 0/68 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [14]:
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")

In [19]:
tf_train_dataset = raw_data["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

tf_validation_dataset = raw_data["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [20]:
model = TFAutoModelForSequenceClassification.from_pretrained(check_point, num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
from tensorflow.keras.losses import SparseCategoricalCrossentropy

model.compile(
    optimizer="adam",
    loss=SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

In [None]:
model.fit(
    tf_train_dataset,
    validation_data=tf_validation_dataset,
    epochs=3
)

Epoch 1/3
Epoch 2/3
 655/8418 [=>............................] - ETA: 24:29 - loss: 0.6917 - accuracy: 0.5435