In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (AutoTokenizer, AutoConfig,
    AutoModelForSequenceClassification, Trainer, TrainingArguments,
    DataCollatorWithPadding)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import torch
import numpy as np
import evaluate

In [3]:
model_checkpoint = "distilbert-base-uncased"

id2label = {0: 'Negative', 1: 'Positive'}
label2id = {'Negative': 0, 'Positive': 1}

model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2, id2label=id2label, label2id=label2id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
dataset = load_dataset("shawhin/imdb-truncated")
print(dataset)

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

(…)-00000-of-00001-5a744bf76a1d84b2.parquet:   0%|          | 0.00/836k [00:00<?, ?B/s]

(…)-00000-of-00001-a3a52fabb70c739f.parquet:   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})


In [5]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# create tokenize function
def tokenize_function(examples):
    text = examples["text"]

    # tokenize and truncate the text if it is too long
    tokenizer.truncation_side = 'left'
    tokenized_inputs = tokenizer(text, return_tensors='np', truncation=True, max_length=512)

    return tokenized_inputs

# add pad token if None exists
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

# tokenize the training and validation sets
tokenized_dataset = dataset.map(tokenize_function, batched=True)
print(tokenized_dataset)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})


In [6]:
# create data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [7]:
accuracy = evaluate.load("accuracy")

# define evaluation function to pass into trainer later
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [8]:
# Untrained model predictions/performance

text_list = ["It was good.", "Not a fan, don't recommend.",
             "Better than the first one.", "This is not worth watching even once.",
             "This one is a pass."]

print("Untrained model predictions:")
print("---------------------------")

for text in text_list:
    # tokenize the text
    inputs = tokenizer.encode(text, return_tensors='pt')
    # compute logits
    logits =  model(inputs).logits
    # convert logits to labels
    predictions = torch.argmax(logits)

    print(text + " - " + id2label[predictions.tolist()])

Untrained model predictions:
---------------------------
It was good. - Positive
Not a fan, don't recommend. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


In [9]:
peft_config = LoraConfig(task_type='SEQ_CLS', # sequence classification
                         r = 4, # intrinsic rank of trainable weight matrix
                         lora_alpha=32,  # this is like learning rate
                         lora_dropout=0.01, # probability of dropout
                         target_modules=['q_lin'])  # we apply lora to query layer

In [10]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [11]:
# hyperparameters
lr = 1e-4
batch_size = 8
num_epochs = 10

# define training arguments
training_args = TrainingArguments(output_dir=model_checkpoint + '-lora-text-classification',
                learning_rate=lr,
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                num_train_epochs=num_epochs,
                weight_decay=0.01,
                evaluation_strategy='epoch',
                save_strategy='epoch',
                load_best_model_at_end=True,
                report_to="none")



In [12]:
# create trainer object
trainer = Trainer(model=model, args=training_args, data_collator=data_collator,
                  train_dataset=tokenized_dataset['train'], eval_dataset=tokenized_dataset['validation'],
                  tokenizer = tokenizer, compute_metrics=compute_metrics)

trainer.train()

  trainer = Trainer(model=model, args=training_args, data_collator=data_collator,


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.434072,{'accuracy': 0.856}
2,No log,0.260274,{'accuracy': 0.891}
3,No log,0.257122,{'accuracy': 0.897}
4,0.380100,0.256306,{'accuracy': 0.902}
5,0.380100,0.282092,{'accuracy': 0.903}
6,0.380100,0.269165,{'accuracy': 0.905}
7,0.380100,0.278006,{'accuracy': 0.904}
8,0.227000,0.28366,{'accuracy': 0.906}
9,0.227000,0.289213,{'accuracy': 0.904}
10,0.227000,0.290627,{'accuracy': 0.903}


TrainOutput(global_step=1250, training_loss=0.2824243591308594, metrics={'train_runtime': 485.3625, 'train_samples_per_second': 20.603, 'train_steps_per_second': 2.575, 'total_flos': 1253694805157184.0, 'train_loss': 0.2824243591308594, 'epoch': 10.0})

In [18]:
print("Trained model predictions:")
print("-------------------------")

device = next(model.parameters()).device

for text in text_list:
    # tokenize the text
    inputs = tokenizer.encode(text, return_tensors='pt').to(device)
    # compute logits
    logits =  model(inputs).logits
    # convert logits to labels
    predictions = torch.argmax(logits, 1)

    print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
-------------------------
It was good. - Positive
Not a fan, don't recommend. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Positive
