### Importing libraries

In [2]:
from datasets import load_dataset, Features, Dataset, DatasetDict
from transformers import AutoTokenizer, DataCollatorWithPadding
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from keras import backend as K
K.clear_session()

### Setting Up GPU as a training device

In [4]:
my_gpu =  tf.config.list_physical_devices('GPU')[0]
print(my_gpu)

tf.config.set_logical_device_configuration(my_gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])
tf.config.set_visible_devices(my_gpu, 'GPU')

# tf.config.experimental.set_memory_growth(my_gpu, True)


PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Processing data

### Loading datasets

In [5]:
raw_dataset = load_dataset('shawhin/imdb-truncated')
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

### Shuffling data

In [6]:
N_TRAIN_EXAMPLES = 100
N_VALIDATION_EXAMPLES = 100
N_UNSUPERVISED_EXAMPLES = 100

In [7]:
from sklearn.model_selection import train_test_split

X = np.concatenate((raw_dataset['train']['text'], raw_dataset['validation']['text']))
y = np.concatenate((raw_dataset['train']['label'], raw_dataset['validation']['label']))

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=N_TRAIN_EXAMPLES, test_size=N_VALIDATION_EXAMPLES, random_state=1)

df_train = pd.DataFrame(columns=['text', 'label'])
df_train['text'] = X_train
df_train['label'] = y_train

df_test = pd.DataFrame(columns=['text', 'label'])
df_test['text'] = X_test
df_test['label'] = y_test

In [8]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(df_train)
dataset['test'] = Dataset.from_pandas(df_test)

dataset

  if _pandas_api.is_sparse(col):


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

### Tokenizing data

In [9]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

In [10]:
def tokenize_function(examples):
  # extract text
  text = examples["text"]
  
  # Tokenize and truncate text
  tokenizer.truncation_side = "left"
  tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=512,
  )
  
  return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 100/100 [00:00<00:00, 4347.96 examples/s]
Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map: 100%|██████████| 100/100 [00:00<00:00, 3225.22 examples/s]


## Creating a model

In [11]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
  columns=["attention_mask", "input_ids"],
  label_cols=["labels"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=8,
)

tf_validation_dataset = tokenized_dataset['test'].to_tf_dataset(
  columns=["attention_mask", "input_ids"],
  label_cols=["labels"],
  shuffle=False,
  collate_fn=data_collator,
  batch_size=8,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Testing untrained model

In [13]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
  
  # tokenize text
  inputs = tokenizer.encode(text, return_tensors="tf")
  
  # # compute logits
  logits = model.predict(inputs).logits
  
  # convert logits to label
  predictions = np.argmax(logits)
  
  # print(logits)
  print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


## Training model

### Seting up optimizer (with progressive learning rate)

In [14]:
from keras.optimizers import schedules
from keras.optimizers import Adam

batch_size = 8
num_epochs = 1

# The number of training steps is the number of samples in the dataset, divided by the batch size then multiplied
# by the total number of epochs. Note that the tf_train_dataset here is a batched tf.data.Dataset,
# not the original Hugging Face Dataset, so its len() is already num_samples // batch_size.
# num_train_steps = len(tf_train_dataset) * num_epochs

# lr_scheduler = schedules.PolynomialDecay(
#   initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
# )

# opt = Adam(learning_rate=lr_scheduler)
model.compile(optimizer='adam', metrics=["accuracy"])

In [16]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
 dropout_19 (Dropout)        multiple                  0         
                                                                 
Total params: 66,955,010
Trainable params: 66,955,010
Non-trainable params: 0
_________________________________________________________________


In [43]:
with tf.device('/cpu:0'):
  model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=num_epochs, batch_size=batch_size)



In [44]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass.", "The best movie!"]

print("Trained model predictions:")
print("----------------------------")
for text in text_list:
  
  # tokenize text
  inputs = tokenizer.encode(text, return_tensors="tf")
  
  # compute logits
  logits = model.predict(inputs)['logits']
  
  # convert logits to label
  predictions = np.argmax(logits)
  
  # print(logits)
  print(text + " - " + id2label[predictions.tolist()], " with logits: ", logits)

Trained model predictions:
----------------------------
It was good. - Positive  with logits:  [[-0.17935103  0.11032692]]
Not a fan, don't recommed. - Positive  with logits:  [[-0.17919557  0.11016697]]
Better than the first one. - Positive  with logits:  [[-0.17931211  0.11032647]]
This is not worth watching even once. - Positive  with logits:  [[-0.1794228   0.11036927]]
This one is a pass. - Positive  with logits:  [[-0.17896593  0.1100044 ]]
The best movie! - Positive  with logits:  [[-0.17992589  0.11080353]]


In [48]:
for text in df_test.head(10)['text']:
  # tokenize text
  inputs = tokenizer.encode(text, return_tensors="tf")
  
  # compute logits
  logits = model.predict(inputs)['logits']
  
  # convert logits to label
  predictions = np.argmax(logits)
  
  print(predictions)
  # print(" - " + id2label[predictions.tolist()], " with logits: ", logits)

1
1
1
1
1
1
1
1
1
1
