### Import libraries

In [2]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import(
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

The dataset has 1000 in the training datasets and validation also has 1000. This is okay because we dont need much data since we are not training from scratch we are just fine tuning.

### Model selection

In [4]:
model_checkpoint = 'distilbert-base-uncased'

# define label maps
id2label = {0: "Negative", 1: "Positive"}
label2id = {"Negative": 0, "Positive": 1}


# generate classification model from checkpoint 
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2, id2label=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

### Import Tokenizer:
this is to convert text to numerical values as models dont understand string.

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

In [7]:
# Tokenization function

def tokenize(examples):
    # extract text
    text = examples["text"] # the colums with the reviews is called 'text'


    # tokenize and truncate
    tokenizer.truncation_side= "left"
    tokenized_input = tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=512
        )
    return tokenized_input

In [8]:
# add pad token if none exists

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [9]:
# tokenize training and validation

tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [10]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

A data collator will dynamically pad examples in a given batch to be as long as the longest sequence in the batch. If the sequence is padded with collator its more computationally efficient than to pad the whole dataset.

### Evaluation Metrics

In [11]:
# import accuracy evaluation

accuracy = evaluate.load("accuracy")

In [12]:
# define an evaluation function to pass into training later

def compute_metrics(p):
    predictions, labels = p # model output
    predictions = np.argmax(predictions, axis=1) # Anyone thats larger will be returned

    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

### Apply model to Text

In [13]:
# define list of examples
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")
for text in text_list:
    # tokenize text
    inputs = tokenizer.encode(text, return_tensors="pt")
    # compute logits
    logits = model(inputs).logits
    # convert logits to label
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


The model lacking fine tuning doesnt do a good job with the predictions. From the example above we see it predicting all positive.

### Training the model

#### Fine Tuning with LoRA

In [14]:
peft_config = LoraConfig(task_type="SEQ_CLS", # sequence classification
                        r=4, # intrinsic risk if trainable weight matrix
                        lora_alpha=32, # this is like a learning rate
                        lora_dropout=0.01, # probability of dropout
                        target_modules = ['q_lin']) # we apply lora to query layer

In [15]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [16]:
modeltune = get_peft_model(model, peft_config)
modeltune.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [17]:
# hyperparameters

lr = 1e-3 # size of parameter step
batch_size = 4 # number of examples processed per optimization step
num_epochs = 10 # number of times model runs through training data

In [18]:
# define training arguments

training_args = TrainingArguments(
    output_dir= model_checkpoint + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)



In [19]:
# creator trainer object

trainer = Trainer(
    model=modeltune, # Our peft model
    args=training_args, # hyper parameter
    train_dataset=tokenized_dataset['train'], # training data
    eval_dataset=tokenized_dataset['validation'], # validation Data
    tokenizer=tokenizer, # tokenizer
    data_collator=data_collator, # this will dynamically pad examples in each batch
    compute_metrics=compute_metrics # evaluate model using compute_metrics() function
)

  trainer = Trainer(


In [20]:
# Train model
trainer.train()

  0%|          | 0/2500 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.5092339515686035, 'eval_accuracy': {'accuracy': 0.854}, 'eval_runtime': 374.0053, 'eval_samples_per_second': 2.674, 'eval_steps_per_second': 0.334, 'epoch': 1.0}
{'loss': 0.4381, 'grad_norm': 0.1591380536556244, 'learning_rate': 0.0008, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.5283243060112, 'eval_accuracy': {'accuracy': 0.865}, 'eval_runtime': 448.5723, 'eval_samples_per_second': 2.229, 'eval_steps_per_second': 0.279, 'epoch': 2.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.6352825164794922, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 412.8871, 'eval_samples_per_second': 2.422, 'eval_steps_per_second': 0.303, 'epoch': 3.0}
{'loss': 0.1825, 'grad_norm': 0.0071978759951889515, 'learning_rate': 0.0006, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.6676529049873352, 'eval_accuracy': {'accuracy': 0.893}, 'eval_runtime': 457.4627, 'eval_samples_per_second': 2.186, 'eval_steps_per_second': 0.273, 'epoch': 4.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.845109224319458, 'eval_accuracy': {'accuracy': 0.883}, 'eval_runtime': 464.7249, 'eval_samples_per_second': 2.152, 'eval_steps_per_second': 0.269, 'epoch': 5.0}
{'loss': 0.0571, 'grad_norm': 1.8349426984786987, 'learning_rate': 0.0004, 'epoch': 6.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.8428108096122742, 'eval_accuracy': {'accuracy': 0.888}, 'eval_runtime': 441.9914, 'eval_samples_per_second': 2.262, 'eval_steps_per_second': 0.283, 'epoch': 6.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.939274787902832, 'eval_accuracy': {'accuracy': 0.888}, 'eval_runtime': 366.8645, 'eval_samples_per_second': 2.726, 'eval_steps_per_second': 0.341, 'epoch': 7.0}
{'loss': 0.0149, 'grad_norm': 0.0016348258359357715, 'learning_rate': 0.0002, 'epoch': 8.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.957373857498169, 'eval_accuracy': {'accuracy': 0.885}, 'eval_runtime': 367.3362, 'eval_samples_per_second': 2.722, 'eval_steps_per_second': 0.34, 'epoch': 8.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 0.9886038899421692, 'eval_accuracy': {'accuracy': 0.888}, 'eval_runtime': 365.2277, 'eval_samples_per_second': 2.738, 'eval_steps_per_second': 0.342, 'epoch': 9.0}
{'loss': 0.0084, 'grad_norm': 0.0008641273598186672, 'learning_rate': 0.0, 'epoch': 10.0}


  0%|          | 0/125 [00:00<?, ?it/s]

{'eval_loss': 1.0054031610488892, 'eval_accuracy': {'accuracy': 0.887}, 'eval_runtime': 412.0993, 'eval_samples_per_second': 2.427, 'eval_steps_per_second': 0.303, 'epoch': 10.0}
{'train_runtime': 11143.9016, 'train_samples_per_second': 0.897, 'train_steps_per_second': 0.224, 'train_loss': 0.1401919818878174, 'epoch': 10.0}


TrainOutput(global_step=2500, training_loss=0.1401919818878174, metrics={'train_runtime': 11143.9016, 'train_samples_per_second': 0.897, 'train_steps_per_second': 0.224, 'total_flos': 1112883852759936.0, 'train_loss': 0.1401919818878174, 'epoch': 10.0})

In [21]:
model.to('cpu') # moving to mps for Mac (can alternatively do 'cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu") # moving to mps for Mac (can alternatively do 'cpu')

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


Clear Improvement of the model, Fine tuning is essential in making a pretrained model more efficient and able to handle different task as it cuts the training time of these models.