In [1]:
!pip install datasets



In [2]:
!pip install peft



In [3]:
!pip install evaluate



In [4]:
from datasets import load_dataset, DatasetDict, Dataset

from transformers import (

    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [5]:
# load dataset
dataset = load_dataset('shawhin/imdb-truncated')
dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [6]:
#display % of training data with label = 1
np.array(dataset['train']['label']).sum()/len(dataset['train']['label'])

0.5

#Model

In [7]:
base_model = 'distilbert-base-uncased'

#define label maps
id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

#generate classification model from base_model
model = AutoModelForSequenceClassification.from_pretrained(base_model, num_labels = 2, id2label = id2label, label2id = label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

#Preprocess data

In [9]:
#create tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, add_prefix_space = True)

#add pad token if none exist
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [10]:
#create tokenize function
def tokenize_function(examples):
  #extract text
  text = examples['text']

  #tokenize and truncate the text
  tokenizer.truncation_side = "left"
  tokenized_inputs = tokenizer(
      text,
      return_tensors = "np",
      truncation = True,
      max_length = 512
  )
  return tokenized_inputs

In [11]:
#tokenize training and validation datasets
tokenized_dataset = dataset.map(tokenize_function, batched = True)
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [12]:
#create data collator
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

#Evaluation

In [13]:
#import accuracy evaluation metric
accuracy = evaluate.load("accuracy")

In [14]:
#define an evaluation function to pass into trainer later
def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis = 1)
  return {"accuracy": accuracy.compute(predictions=predictions, references = labels)}

In [15]:
id2label

{0: 'Negative', 1: 'Positive'}

#Apply untrained model to text

In [16]:
#define list of examples
text_list = ["It was good.", "Not a fan, don't recommend.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass.", "The action scene was amazing"]

print("Untrained model predictions:")
print("----------------------------")

for text in text_list:
  #tokenize text
  inputs = tokenizer.encode(text, return_tensors = "pt")
  #compute logits
  logits = model(inputs).logits
  #convert logits to labels
  predictions = torch.argmax(logits)

  print(text + " - " + id2label[predictions.tolist()])


Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommend. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive
The action scene was amazing - Positive


#Train model

In [17]:
peft_config = LoraConfig(
    task_type = "SEQ_CLS",
    r = 4,
    lora_alpha = 32,
    lora_dropout = 0.01,
    target_modules = ['q_lin']
)

In [18]:
peft_config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type='SEQ_CLS', inference_mode=False, r=4, target_modules={'q_lin'}, lora_alpha=32, lora_dropout=0.01, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None)

In [19]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [20]:
# hyperparameters
lr = 1e-3
batch_size = 4
num_epochs = 10

In [76]:
!pip install transformers[torch]



In [77]:
import transformers

In [21]:
!pip show accelerate

Name: accelerate
Version: 0.31.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: zach.mueller@huggingface.co
License: Apache
Location: /usr/local/lib/python3.10/dist-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: peft


In [None]:
# !pip uninstall accelerate -y
# !pip install accelerate
# !pip install transformers[torch] -U

# # Restart runtime
# import os
# os._exit(00)

Found existing installation: accelerate 0.31.0
Uninstalling accelerate-0.31.0:
  Successfully uninstalled accelerate-0.31.0
Collecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 3108, in _dep_map
    return self.__dep_map
  File "/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py", line 2901, in __getattr__
    raise AttributeError(attr)
AttributeError: _DistInfoDistribution__dep_map

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py", line 169, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py", line 242, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 441, in run
    conflicts = self._determine_conflicts(to_install)
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py", line 

In [22]:
# define training arguments
training_args = TrainingArguments(
    output_dir= base_model + "-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [26]:
#create trainer object
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["validation"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

#train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.328326,{'accuracy': 0.891}
2,0.428500,0.446888,{'accuracy': 0.865}
3,0.428500,0.595706,{'accuracy': 0.881}
4,0.221500,0.626899,{'accuracy': 0.886}
5,0.221500,0.657517,{'accuracy': 0.902}
6,0.075100,0.814712,{'accuracy': 0.898}
7,0.075100,0.910821,{'accuracy': 0.894}
8,0.014000,0.972097,{'accuracy': 0.901}
9,0.014000,1.006714,{'accuracy': 0.896}
10,0.007300,0.994521,{'accuracy': 0.9}


Trainer is attempting to log a value of "{'accuracy': 0.891}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.865}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.881}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.886}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'accuracy': 0.902}" of type <class 'dict'> for key "eval/accuracy" as a scalar. This i

TrainOutput(global_step=2500, training_loss=0.14929916496276854, metrics={'train_runtime': 460.1168, 'train_samples_per_second': 21.734, 'train_steps_per_second': 5.433, 'total_flos': 1112883852759936.0, 'train_loss': 0.14929916496276854, 'epoch': 10.0})

#Generate predictions

In [33]:
model.to('cpu')

print("Trained model predictions:")
print("--------------------------")
for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to("cpu")

    logits = model(inputs).logits
    predictions = torch.max(logits,1).indices

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative
The action scene was amazing - Positive
