In [1]:
from tensorflow import keras
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TFAutoModelForSequenceClassification, create_optimizer
import datasets
import numpy as np

# Load IMDb dataset
![](images/imdb.png)
Website: [https://www.imdb.com/](https://www.imdb.com/)
Dataset Card: [https://huggingface.co/datasets/imdb](https://huggingface.co/datasets/imdb)

In [2]:
imdb = datasets.load_dataset("imdb")
imdb

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to C:/Users/Timo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to C:/Users/Timo/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

# Tokenization
Each HuggingFace Language Model comes with a tokenizer.
The tokenizer tokenizes text into tokens and sub-tokens.
We will use the model [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France).

In [3]:
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)


tokenized_imdb = imdb.map(preprocess_function, batched=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [4]:
# Raw text
imdb['test'][0]['text']

'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as they have

In [5]:
# Tokenized text
tokenized_imdb['test'][0]['input_ids'][:16], tokenized_imdb['test'][0][
                                                 'attention_mask'][:16]

([101,
  1045,
  2293,
  16596,
  1011,
  10882,
  1998,
  2572,
  5627,
  2000,
  2404,
  2039,
  2007,
  1037,
  2843,
  1012],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Model definition
Load pre-trained model distilbert-base-uncased.

In [1]:
model = TFAutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=2)

NameError: name 'TFAutoModelForSequenceClassification' is not defined

Define training parameters and compile model.

In [None]:
batch_size = 16
num_epochs = 5
batches_per_epoch = len(tokenized_imdb['train']) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0,
                                       num_train_steps=total_train_steps)
# Feature freezing if desired
model.get_layer('distilbert').trainable = False

model.compile(optimizer=optimizer, metrics='accuracy')
model.summary()

Datasets for training and validation will be prepared, this includes padding to equal length of all sequences of a batch.

In [24]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer,
                                        return_tensors="tf")

tf_train_set = model.prepare_tf_dataset(
    tokenized_imdb["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_imdb["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

In [25]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='tf')

# Model training

In [19]:
# Load pre-computed model (training will take some time...)

#model.set_weights(keras.models.load_model('distilbert-imdb', custom_objects={
#    'dummy_loss': None}).get_weights())

train = True
if train:
    model.fit(x=tf_train_set, validation_data=tf_validation_set,
              epochs=1)
    model.save('distilbert-imdb')





INFO:tensorflow:Assets written to: distilbert-imdb\assets


INFO:tensorflow:Assets written to: distilbert-imdb\assets


In [20]:
def predict(t):
    tt = tokenizer(t, return_tensors='tf')
    x = model(tt).logits.numpy().reshape((2,))
    return np.exp(x) / sum(np.exp(x))

In [21]:
predict('I really liked that film.')

array([0.28644052, 0.7135595 ], dtype=float32)

In [22]:
predict('This movie was lame.')

array([0.82576483, 0.17423517], dtype=float32)

In [23]:
t='This movie was lame.'
tt = tokenizer(t, return_tensors='tf')