## Preparing Environment


### Importing libraries


In [141]:
!pip install datasets transformers evaluate peft -q
!pip install accelerate -U



In [142]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
  AutoTokenizer,
  DataCollatorWithPadding,
  TrainingArguments,
  Trainer,
  DistilBertForSequenceClassification
)
import evaluate

from huggingface_hub import from_pretrained_keras
import numpy as np
import tensorflow as tf
from tensorflow import keras
import torch
import pandas as pd

In [143]:
# Reset the global state Keras has accumulated in this kernel so that re-running
# the notebook starts from a clean slate.
# NOTE(review): no Keras model is built in the visible cells — confirm this
# reset is still needed.
from keras import backend as K
K.clear_session()

### Setting Up GPU as a training device


In [144]:
# List every physical device TensorFlow can see (CPU and, when present, GPU).
devices = tf.config.list_physical_devices()

# Take the first GPU when one exists. Indexing the list unconditionally raises
# IndexError on CPU-only machines, which would defeat the CPU fallback below.
gpus = tf.config.list_physical_devices('GPU')
my_gpu = gpus[0] if gpus else None
print(devices)

# Device used for the PyTorch model and its inputs in the later cells.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

# Optional TensorFlow GPU tuning (only meaningful when my_gpu is not None):
# tf.config.set_logical_device_configuration(my_gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])
# tf.config.set_visible_devices(my_gpu, 'GPU')

# tf.config.experimental.set_memory_growth(my_gpu, True)


[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
cuda


In [145]:
# Check whether PyTorch can see a CUDA device; the cell displays the boolean.
torch.cuda.is_available()

True

## Processing data


### Loading datasets


In [146]:
# Load the 'shawhin/imdb-truncated' dataset and display its splits.
raw_dataset = load_dataset('shawhin/imdb-truncated')
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

### Shuffling data


In [147]:
# Row count of the training split (passed as train_size to train_test_split).
N_TRAIN_EXAMPLES = 1000
# Row count of the held-out split (passed as test_size to train_test_split).
N_VALIDATION_EXAMPLES = 1000
# NOTE(review): not referenced in any visible cell — confirm it is still needed.
N_UNSUPERVISED_EXAMPLES = 1000

In [148]:
from sklearn.model_selection import train_test_split

# Pool the published train/validation splits into one set of texts and labels.
X = np.concatenate([raw_dataset[part]['text'] for part in ('train', 'validation')])
y = np.concatenate([raw_dataset[part]['label'] for part in ('train', 'validation')])

# Draw a fresh split of the configured sizes; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
  X,
  y,
  train_size=N_TRAIN_EXAMPLES,
  test_size=N_VALIDATION_EXAMPLES,
  random_state=1,
)

# One frame per split, with the columns in (text, label) order.
df_train = pd.DataFrame({'text': X_train, 'label': y_train})
df_test = pd.DataFrame({'text': X_test, 'label': y_test})

In [149]:
# Wrap both frames as Hugging Face datasets under the split names used below.
dataset = DatasetDict({
  'train': Dataset.from_pandas(df_train),
  'test': Dataset.from_pandas(df_test),
})

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})

### Tokenizing data


In [150]:
# Checkpoint shared by the tokenizer and the classification model.
MODEL_CHECKPOINT = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Class index <-> human-readable name lookups; the reverse map is derived from
# the forward one so the two cannot drift apart.
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

In [151]:
def tokenize_function(examples):
  """Tokenize the "text" field of a batch of examples.

  Texts longer than 512 tokens are truncated on the left side, and the
  tokenizer is asked for NumPy outputs. The tokenizer result is returned as-is.
  """
  # truncation_side is an attribute of the shared tokenizer; it is set on every
  # call, exactly as before.
  tokenizer.truncation_side = "left"

  return tokenizer(
    examples["text"],
    max_length=512,
    truncation=True,
    return_tensors="np",
  )

# Tokenize every split; batched=True hands tokenize_function a batch of examples per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [152]:
# Collator that pads the examples of a batch to a common length and returns
# PyTorch tensors; it is also handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")


def _as_tf_dataset(split, shuffle):
  """Export one tokenized split as a tf.data.Dataset in batches of 8."""
  return tokenized_dataset[split].to_tf_dataset(
    columns=["attention_mask", "input_ids"],
    label_cols=["label"],
    shuffle=shuffle,
    collate_fn=data_collator,
    batch_size=8,
  )


# NOTE(review): these TensorFlow datasets are not consumed by any visible cell
# (the Trainer reads tokenized_dataset directly) — confirm they are still needed.
tf_train_dataset = _as_tf_dataset("train", shuffle=True)
tf_validation_dataset = _as_tf_dataset("test", shuffle=False)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Configuring Model


### Loading raw model

In [153]:
# Generate classification model from checkpoint
print(MODEL_CHECKPOINT)
model_raw = DistilBertForSequenceClassification.from_pretrained(
  MODEL_CHECKPOINT,
  num_labels=len(id2label),
  # Store the label names in the model config so saved checkpoints and
  # pipelines report "Negative"/"Positive" instead of LABEL_0/LABEL_1.
  id2label=id2label,
  label2id=label2id,
)
# Move to the device selected earlier instead of a hard-coded 'cuda', so this
# cell also runs on machines without a GPU.
model_raw.to(DEVICE)

distilbert-base-uncased


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Configuring LoRA

In [154]:
from peft import get_peft_model, LoraConfig

In [155]:
# LoRA adapter configuration for the sequence-classification model.
peft_config = LoraConfig(
  task_type="SEQ_CLS", # Sequence classification
  r=4, # Intrinsic rank of the trainable low-rank update matrices
  lora_alpha=32, # Scaling factor applied to the LoRA update (not a learning rate)
  lora_dropout=0.01, # Dropout probability used inside the LoRA layers
  target_modules = ['q_lin'] # Which layers we apply LoRA to (the attention q_lin projections)
)

peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules=['q_lin'], lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [156]:
# Wrap the base model with the adapters described by peft_config, then report
# how many parameters remain trainable.
model = get_peft_model(model_raw, peft_config)
model.print_trainable_parameters()

trainable params: 1,221,124 || all params: 67,584,004 || trainable%: 1.8068239934408148


### Configuring evaluation metrics

In [157]:
# Import accuracy evaluation metric
# NOTE(review): no live code in the visible cells reads `accuracy`;
# compute_metrics uses a separately loaded `metric` — confirm this handle is needed.
accuracy = evaluate.load("accuracy")

In [158]:
# Accuracy metric used by the evaluation callback handed to the Trainer.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  """Score one evaluation pass.

  `eval_pred` unpacks into (logits, labels). The predicted class is the index
  of the largest logit along the last axis, and the result is whatever
  `metric.compute` returns for those predictions against the labels.
  """
  scores, references = eval_pred
  predicted_ids = np.argmax(scores, axis=-1)
  return metric.compute(predictions=predicted_ids, references=references)

### Configuring Trainer


In [159]:
# hyperparameters (all three are passed to TrainingArguments below)
lr = 1e-3  # learning rate
batch_size = 8  # per-device batch size, used for both training and evaluation
num_epochs = 2  # number of training epochs

In [160]:
# Training configuration: evaluate and checkpoint once per epoch, and reload
# the best checkpoint when training finishes.
training_args = TrainingArguments(
  output_dir=f"{MODEL_CHECKPOINT}-lora-text-classification",
  num_train_epochs=num_epochs,
  learning_rate=lr,
  weight_decay=0.01,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

# The Trainer ties together the LoRA-wrapped model, the tokenized splits, the
# padding collator and the accuracy callback.
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test'],
  tokenizer=tokenizer,
  data_collator=data_collator, # pads the examples of each batch to equal length
  compute_metrics=compute_metrics,
)

## Training Model

In [161]:
# Run fine-tuning; with the per-epoch evaluation strategy configured above, validation metrics are reported each epoch.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.277326,0.892
2,No log,0.30412,0.901


TrainOutput(global_step=250, training_loss=0.33813250732421873, metrics={'train_runtime': 102.8895, 'train_samples_per_second': 19.438, 'train_steps_per_second': 2.43, 'total_flos': 253635273881664.0, 'train_loss': 0.33813250732421873, 'epoch': 2.0})

In [162]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

# Put the model in evaluation mode so dropout is disabled and the predictions
# are deterministic.
model.eval()

print("Model predictions:")
print("----------------------------")
for text in text_list:

  # tokenize text and move the token ids to the device the model runs on
  inputs = tokenizer.encode(text, return_tensors="pt").to(DEVICE)

  # compute logits; inference only, so skip autograd bookkeeping
  with torch.no_grad():
    logits = model(inputs).logits

  # convert logits to label: index of the largest logit along the class axis
  # (an argmax without `dim` works on the flattened tensor and is only correct
  # for a batch of one)
  predictions = torch.argmax(logits, dim=-1)

  print(text + " - " + id2label[predictions.item()])

Model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive
