## Preparing Environment


### Importing libraries


In [62]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import (
  AutoTokenizer,
  DataCollatorWithPadding,
  TrainingArguments,
  Trainer,
  DistilBertForSequenceClassification
)
import evaluate

from huggingface_hub import from_pretrained_keras
import numpy as np
import tensorflow as tf
from tensorflow import keras
import pandas as pd

In [63]:
from keras import backend as K
K.clear_session()

### Setting Up GPU as a training device


In [64]:
my_gpu =  tf.config.list_physical_devices('GPU')[0]
print(my_gpu)

tf.config.set_logical_device_configuration(my_gpu, [tf.config.LogicalDeviceConfiguration(memory_limit=4096)])
tf.config.set_visible_devices(my_gpu, 'GPU')

# tf.config.experimental.set_memory_growth(my_gpu, True)


PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')


## Processing data


### Loading datasets


In [65]:
raw_dataset = load_dataset('shawhin/imdb-truncated')
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

### Shuffling data


In [66]:
N_TRAIN_EXAMPLES = 100
N_VALIDATION_EXAMPLES = 100
N_UNSUPERVISED_EXAMPLES = 100

In [67]:
from sklearn.model_selection import train_test_split

X = np.concatenate((raw_dataset['train']['text'], raw_dataset['validation']['text']))
y = np.concatenate((raw_dataset['train']['label'], raw_dataset['validation']['label']))

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=N_TRAIN_EXAMPLES, test_size=N_VALIDATION_EXAMPLES, random_state=1)

df_train = pd.DataFrame(columns=['text', 'label'])
df_train['text'] = X_train
df_train['label'] = y_train

df_test = pd.DataFrame(columns=['text', 'label'])
df_test['text'] = X_test
df_test['label'] = y_test

In [68]:
dataset = DatasetDict()
dataset['train'] = Dataset.from_pandas(df_train)
dataset['test'] = Dataset.from_pandas(df_test)

dataset

  if _pandas_api.is_sparse(col):


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 100
    })
})

### Tokenizing data


In [69]:
MODEL_CHECKPOINT = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}

In [70]:
def tokenize_function(examples):
  # extract text
  text = examples["text"]
  
  # Tokenize and truncate text
  tokenizer.truncation_side = "left"
  tokenized_inputs = tokenizer(
    text,
    return_tensors="np",
    truncation=True,
    max_length=512,
  )
  
  return tokenized_inputs

tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 100/100 [00:00<00:00, 4999.95 examples/s]

Map: 100%|██████████| 100/100 [00:00<00:00, 4255.20 examples/s]


In [71]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
  columns=["attention_mask", "input_ids"],
  label_cols=["label"],
  shuffle=True,
  collate_fn=data_collator,
  batch_size=8,
)

tf_validation_dataset = tokenized_dataset['test'].to_tf_dataset(
  columns=["attention_mask", "input_ids"],
  label_cols=["label"],
  shuffle=False,
  collate_fn=data_collator,
  batch_size=8,
)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Configuring Model


### Loading raw model

In [72]:
# Generate classification model from checkpoint
print(MODEL_CHECKPOINT)
model_raw = DistilBertForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=2)

distilbert-base-uncased


### Configuring Lora

In [73]:
from peft import get_peft_model, LoraConfig

In [74]:
peft_config = LoraConfig(
  task_type="SEQ_CLS", # Sequence classification
  r=4, # Intrictic rank of trainable weiht matrix
  lora_alpha=32, # This is like a learning rate
  lora_dropout=0.01, # Probability of dropout (zero-ing random weights)
  target_modules = ['q_lin'] # Which layers do we apply LORA to
)

peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules=['q_lin'], lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None)

In [75]:
model = get_peft_model(model_raw, peft_config)
model.print_trainable_parameters()

trainable params: 36,864 || all params: 66,399,744 || trainable%: 0.05551828633556177


### Configuring evaluation metrics

In [76]:
# Import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [82]:
# Define an evaluation function to pass into trainer later
# def compute_metrics(p):
#   predictions, labels = p
#   predictions = np.argmax(predictions, axis=1)

#   return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  logits, labels = eval_pred
  predictions = np.argmax(logits, axis=-1)
  return metric.compute(predictions=predictions, references=labels)

### Configuring Trainer


In [83]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 1

In [87]:
# define training arguments
training_args = TrainingArguments(
  output_dir= MODEL_CHECKPOINT + "-lora-text-classification",
  learning_rate=lr,
  per_device_train_batch_size=batch_size,
  per_device_eval_batch_size=batch_size,
  num_train_epochs=num_epochs,
  weight_decay=0.01,
  evaluation_strategy="epoch",
  save_strategy="epoch",
  load_best_model_at_end=True,
)

# creater trainer object
trainer = Trainer(
  model=model,
  args=training_args,
  train_dataset=tokenized_dataset['train'],
  eval_dataset=tokenized_dataset['test'],
  tokenizer=tokenizer,
  data_collator=data_collator, # this will dynamically pad examples in each batch to be equal length
  compute_metrics=compute_metrics,
)

## Training Model

In [85]:
trainer.train()

  0%|          | 0/25 [03:30<?, ?it/s]
  0%|          | 0/25 [00:00<?, ?it/s]

TypeError: DistilBertModel.forward() got an unexpected keyword argument 'labels'