In [None]:
from datasets import load_dataset,DatasetDict,Dataset

from transformers import(
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel,PeftConfig,get_peft_model,LoraConfig
import evaluate
import torch
import numpy as np

In [None]:
!pip install evaluate




In [None]:
### Base model

# Hugging Face Hub checkpoint that the classifier is initialised from.
model_checkpoint = "distilbert-base-uncased"

# Label maps: class id -> name, and the inverse name -> class id.
id2label = {0: "Negative", 1: "Positive"}
label2id = {name: idx for idx, name in id2label.items()}

# Build a sequence-classification model from `model_checkpoint` with one
# output per entry in the label map.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Load the truncated IMDB dataset from the Hugging Face Hub.
dataset = load_dataset("shawhin/imdb-truncated")

# Show the available splits and their features.
dataset

README.md:   0%|          | 0.00/592 [00:00<?, ?B/s]

data/train-00000-of-00001-5a744bf76a1d84(…):   0%|          | 0.00/836k [00:00<?, ?B/s]

data/validation-00000-of-00001-a3a52fabb(…):   0%|          | 0.00/853k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text'],
        num_rows: 1000
    })
})

In [None]:
pip install -U transformers peft accelerate




In [None]:
### Preprocess data

# Create the tokenizer that matches the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True)

# When a text is too long, drop tokens from the left so its end is kept.
# Set once here instead of on every call of `tokenize_function`.
tokenizer.truncation_side = "left"

# Add a pad token if the tokenizer has none, and grow the model's embedding
# matrix to match. This check used to sit after the `return` inside
# `tokenize_function`, where it could never run.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))


def tokenize_function(examples):
    """Tokenize the "text" entry of a batch of examples.

    Args:
        examples: Mapping with a "text" entry holding the text(s) to
            tokenize (a batch when called via `dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output as NumPy data, truncated to at most 512 tokens.
    """
    return tokenizer(
        examples["text"],
        return_tensors="np",
        truncation=True,
        max_length=512,
    )


In [None]:
# Apply `tokenize_function` to every split of the dataset, one batch at a time.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Show the splits with the columns added by tokenization.
tokenized_datasets

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [None]:
# Create the data collator; it uses the tokenizer to pad the examples of a
# batch when batches are assembled for the Trainer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
# Accuracy metric loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute accuracy; passed to the Trainer as its metrics callback.

    Args:
        p: Pair that unpacks into (predictions, labels). The predicted class
           is taken as the argmax of the predictions along axis 1.

    Returns:
        The result of `accuracy.compute` for the predicted classes.
    """
    raw_predictions, references = p
    predicted_classes = np.argmax(raw_predictions, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)



Apply the untrained model to example text



In [None]:
# Example sentences used to compare the model before and after fine-tuning.
text_list = ["It was good.", "Not a fan, don't recommed.", "Better than the first one.", "This is not worth watching even once.", "This one is a pass."]

print("Untrained model predictions:")
print("----------------------------")

# Inference only: no gradients are needed, so skip building autograd graphs.
with torch.no_grad():
    for text in text_list:
        # Tokenize the text and place the ids on the same device as the model,
        # so the cell also works if the model has been moved off the CPU.
        inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)
        # Compute the class logits for this single example.
        logits = model(inputs).logits
        # Pick the class along the label axis instead of flattening the tensor.
        predictions = torch.argmax(logits, dim=-1)

        print(text + " - " + id2label[predictions.item()])



Untrained model predictions:
----------------------------
It was good. - Positive
Not a fan, don't recommed. - Positive
Better than the first one. - Positive
This is not worth watching even once. - Positive
This one is a pass. - Positive


Fine-tuning with LoRA

In [None]:
# LoRA configuration for the sequence-classification task.
peft_config = LoraConfig(
    task_type="SEQ_CLS",       # sequence classification
    r=4,                       # rank of the trainable low-rank matrices
    lora_alpha=32,             # scaling factor applied to the LoRA update
    lora_dropout=0.01,         # dropout probability used in the LoRA layers
    target_modules=["q_lin"],  # apply LoRA to the query linear layers
)

In [None]:
# Wrap the model with the LoRA adapters described by `peft_config`.
# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell would pass the already-wrapped model back in; re-create the base model
# first when re-running.
model = get_peft_model(model, peft_config)

# Report how many parameters are trainable compared with the total.
model.print_trainable_parameters()

trainable params: 628,994 || all params: 67,584,004 || trainable%: 0.9307


In [None]:
### Hyperparameters
lr = 2e-5        # learning rate
batch_size = 4   # per-device batch size, used for both training and evaluation
num_epochs = 10  # upper bound on the number of training epochs

In [None]:
## Define training arguments
from transformers import EarlyStoppingCallback  # used when the Trainer is built

training_args = TrainingArguments(
    # Checkpoints are written to a directory named after the base checkpoint.
    output_dir=f"{model_checkpoint}-lora-text-classification",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # Evaluate and save once per epoch, and reload the best checkpoint when
    # training ends.
    # NOTE(review): no `metric_for_best_model` is set, so "best" is decided by
    # the Trainer's default metric; confirm that this is the intended one.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)


In [None]:
### Create the Trainer object

# Stop training early after 3 evaluations without improvement.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Train the model; the returned training summary is shown as the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.586486,0.891
2,0.051000,0.610091,0.888
3,0.051000,0.63087,0.889
4,0.105900,0.643768,0.887


TrainOutput(global_step=1000, training_loss=0.07846935653686524, metrics={'train_runtime': 181.6012, 'train_samples_per_second': 55.066, 'train_steps_per_second': 13.766, 'total_flos': 444610902443520.0, 'train_loss': 0.07846935653686524, 'epoch': 4.0})

In [None]:
# Inference only: put the model in evaluation mode so dropout (including the
# LoRA dropout) is disabled and predictions are deterministic.
model.eval()

print("Trained model predictions:")
print("--------------------------")

# No gradients are needed for prediction, so skip building autograd graphs.
with torch.no_grad():
    for text in text_list:
        # Tokenize the text and move the ids to the same device as the model.
        inputs = tokenizer.encode(text, return_tensors="pt").to(model.device)

        logits = model(inputs).logits
        # Index of the highest-scoring class for each row of the batch.
        predictions = torch.max(logits, 1).indices

        print(text + " - " + id2label[predictions.tolist()[0]])

Trained model predictions:
--------------------------
It was good. - Positive
Not a fan, don't recommed. - Negative
Better than the first one. - Positive
This is not worth watching even once. - Negative
This one is a pass. - Negative


In [None]:
# Local directory that receives the fine-tuned model and its tokenizer.
save_directory = "./my_fine_tuned_model"

# Save the model through the Trainer.
trainer.save_model(save_directory)

# Save the tokenizer into the same directory; the paths of the written files
# are returned and shown as the cell output.
tokenizer.save_pretrained(save_directory)

('./my_fine_tuned_model/tokenizer_config.json',
 './my_fine_tuned_model/special_tokens_map.json',
 './my_fine_tuned_model/vocab.txt',
 './my_fine_tuned_model/added_tokens.json',
 './my_fine_tuned_model/tokenizer.json')

In [None]:
# Log in to the Hugging Face Hub from the shell; the CLI prompts for an access
# token interactively, so no token is stored in the notebook source.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `hf auth whoami` to get more information or `hf auth logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `ne

In [None]:
# NOTE(review): `repo_name` is not passed to anything in this cell. The
# recorded output shows the push going to a repo named after the training
# `output_dir`, not to this name — confirm which repo is intended.
repo_name = "fine-tuned_distilbert-base-uncased_model"

# Push the Trainer's model to the Hub with the given commit message.
# Only `commit_message` is passed here; no `private` flag is set.
trainer.push_to_hub(commit_message="Fine-tuned model")


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...ification/adapter_model.safetensors:  22%|##1       |  547kB / 2.52MB            

  ...vents.1755600263.60cd175791b9.926.0:  22%|##1       | 1.99kB / 9.14kB            

  ...vents.1755601612.60cd175791b9.926.1:  22%|##1       | 2.11kB / 9.71kB            

  ...vents.1755602744.60cd175791b9.926.2:  22%|##1       | 1.55kB / 7.14kB            

  ...vents.1755603406.60cd175791b9.926.3:  22%|##1       | 1.55kB / 7.14kB            

  ...xt-classification/training_args.bin:  22%|##1       | 1.18kB / 5.43kB            

CommitInfo(commit_url='https://huggingface.co/Prerna43/distilbert-base-uncased-lora-text-classification/commit/597ca5f53987a47452ec2c7409cf5556eae5d919', commit_message='Fine-tuned model', commit_description='', oid='597ca5f53987a47452ec2c7409cf5556eae5d919', pr_url=None, repo_url=RepoUrl('https://huggingface.co/Prerna43/distilbert-base-uncased-lora-text-classification', endpoint='https://huggingface.co', repo_type='model', repo_id='Prerna43/distilbert-base-uncased-lora-text-classification'), pr_revision=None, pr_num=None)

In [None]:
# NOTE(review): `peft` is already imported and used by earlier cells, so this
# install comes too late for a fresh run — consider moving it to the top of
# the notebook.
%pip install peft

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->peft)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->peft)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.13.0->peft)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.13.0->peft)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting

In [None]:
# NOTE(review): `datasets` is already imported and used by earlier cells, so
# this install comes too late for a fresh run — consider moving it to the top
# of the notebook.
%pip install datasets

