In [1]:
%%capture
# Install the exact library versions used by this notebook; %%capture suppresses pip's output.
!pip install transformers==4.29.0  peft==0.3.0 datasets==2.12.0

# Checking installed versions

In [2]:
# Confirm which versions of the three pinned libraries are actually installed.
# The ^ anchor keeps the last grep from matching other packages whose names merely contain "datasets".
!pip list |egrep transformers
!pip list |egrep peft
!pip list |egrep ^datasets


transformers                     4.29.0
peft                             0.3.0
datasets                         2.12.0


In [3]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers import (default_data_collator,
                          get_linear_schedule_with_warmup)
from peft import (get_peft_config,
                  get_peft_model,
                  get_peft_model_state_dict,
                  LoraConfig,
                  TaskType)
import torch, os
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
from datasets import (load_dataset,
                      Dataset, DatasetDict)

# Loading the SNLI dataset from the Hugging Face Hub

In [4]:
# Download SNLI, then keep a reproducible 1% random sample of its training
# split as a pandas DataFrame.
dataset = load_dataset("snli")
snli_sampled = (
    pd.DataFrame(dataset["train"])
    .sample(frac=0.01, random_state=123)
)

Downloading builder script:   0%|          | 0.00/3.82k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.90k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/14.1k [00:00<?, ?B/s]

Downloading and preparing dataset snli/plain_text to /root/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b...


Downloading:   0%|          | 0.00/1.93k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Dataset snli downloaded and prepared to /root/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# (rows, columns) of the sampled DataFrame.
snli_sampled.shape

(5502, 3)

In [6]:
# How many sampled rows carry each label value.
snli_sampled.label.value_counts()

 2    1866
 0    1828
 1    1803
-1       5
Name: label, dtype: int64

In [7]:
# Keep only rows with a label greater than -1 (i.e. drop the rows labelled -1).
snli_sampled = snli_sampled.loc[snli_sampled["label"] > -1]

In [8]:
# Class names of the "label" feature, as declared by the dataset's training split.
names=dataset["train"].features["label"].names
names

['entailment', 'neutral', 'contradiction']

In [9]:
# Lookup table from integer class id (position in `names`) to class name.
mapp = {class_id: class_name for class_id, class_name in enumerate(names)}
mapp

{0: 'entailment', 1: 'neutral', 2: 'contradiction'}

## Preparing dataset for training

In [10]:
# Build the seq2seq training pairs: a prompt column ("text") that embeds both
# sentences, and an answer column ("label") holding a short sentence that
# names the class.
#
# Work on an explicit copy. `snli_sampled` is a filtered slice, and wrapping it
# with pd.DataFrame(...) can share its underlying data; the column assignments
# below could then raise SettingWithCopyWarning and overwrite the integer
# labels of `snli_sampled`, so that re-running this cell fails with a KeyError
# when looking the (already replaced) labels up in `mapp`.
snli_sampled_df = snli_sampled.copy()
snli_sampled_df["text"] = snli_sampled_df.apply(
    lambda row: "S1:" + row.premise
    + " S2:" + row.hypothesis
    + ". The relation between S1 and S2 is labeled "
    + "as entailment, neutral or contradiction ?",
    axis=1,
)
# Replace the integer class id with the target sentence the model must produce.
snli_sampled_df["label"] = snli_sampled_df.apply(
    lambda row: f"It is {mapp[row.label]}",
    axis=1,
)

In [11]:
# Preview the first three rows, including the new "text" and rewritten "label" columns.
snli_sampled_df.head(3)

Unnamed: 0,premise,hypothesis,label,text
190268,Two firefighters clad in protective gear are e...,Two firefighters are entering a house.,It is entailment,S1:Two firefighters clad in protective gear ar...
300525,Two men work together on a construction project.,Two men are working.,It is entailment,S1:Two men work together on a construction pro...
465851,Three men in uniform walk around town.,Three men rob the residents.,It is contradiction,S1:Three men in uniform walk around town. S2:T...


In [12]:
# 70/30 split by row position: the first CUT rows are used for training and
# the remaining rows for validation.
n_examples = len(snli_sampled_df)
CUT = n_examples * 7 // 10
print(f"Training set size is {CUT}")
# The run of spaces before the number is part of this cell's printed output.
print(f"Validation set size is {' ' * 8}{n_examples - CUT}")
print(f"Total size is {n_examples}")

train_frame, validation_frame = snli_sampled_df[:CUT], snli_sampled_df[CUT:]
snli_sampled_dict = DatasetDict({
    "train": Dataset.from_pandas(train_frame),
    "validation": Dataset.from_pandas(validation_frame),
})

Training set size is 3847
Validation set size is         1650
Total size is 5497


In [13]:
def preprocess_function(examples):
    """Tokenize a batch of prompt/answer pairs for seq2seq training.

    Reads the module-level `tokenizer`, `max_length` and `max_target_len`
    at call time.

    Args:
        examples: batch mapping with a "text" entry (the prompts) and a
            "label" entry (the target strings).

    Returns:
        The tokenizer output for the prompts, padded/truncated to
        `max_length`, with an extra "labels" entry: the target token ids
        padded/truncated to `max_target_len`, in which every pad-token
        position is replaced by -100.
    """
    encoded_inputs = tokenizer(
        examples["text"],
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    target_ids = tokenizer(
        examples["label"],
        max_length=max_target_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )["input_ids"]
    # NOTE(review): -100 is presumably the index the training loss ignores, so
    # padding does not contribute to it — confirm against the model's loss.
    is_padding = target_ids == tokenizer.pad_token_id
    encoded_inputs["labels"] = target_ids.masked_fill(is_padding, -100)
    return encoded_inputs

In [14]:
# Show the GPU attached to this runtime and its current memory usage.
!nvidia-smi

Mon Sep 11 12:18:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   60C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [15]:
# Checkpoint used for both the tokenizer (here) and the model (loaded later).
# Uncomment one of the larger variants below to switch model size.
model_name_or_path="google/flan-t5-base" # 250M parameters
#model_name_or_path="google/flan-t5-large" # 780M parameters
#model_name_or_path="google/flan-t5-xl" # 3B parameters
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [16]:
# Token-length budgets read by preprocess_function: prompts and targets.
max_length, max_target_len = 150, 10

# Tokenize both splits in batches. Every original column is dropped so only
# the tokenizer outputs and "labels" remain; the cache is bypassed so the
# mapping is recomputed on each run.
snli_processed = snli_sampled_dict.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    remove_columns=snli_sampled_dict["train"].column_names,
)
train_dataset, eval_dataset = snli_processed["train"], snli_processed["validation"]

Map:   0%|          | 0/3847 [00:00<?, ? examples/s]

Map:   0%|          | 0/1650 [00:00<?, ? examples/s]

In [17]:
# Inspect the first three tokenized training examples.
pd.DataFrame(train_dataset).head(3)

Unnamed: 0,input_ids,attention_mask,labels
0,"[180, 536, 10, 382, 210, 32, 29764, 3, 4651, 2...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[94, 19, 3, 35, 5756, 297, 1, -100, -100, -100]"
1,"[180, 536, 10, 382, 210, 32, 1076, 161, 544, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[94, 19, 3, 35, 5756, 297, 1, -100, -100, -100]"
2,"[180, 536, 10, 11889, 15, 15, 1076, 16, 7117, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[94, 19, 27252, 1, -100, -100, -100, -100, -10..."


In [18]:
batch_size = 32
# Options common to both loaders; only the training loader shuffles, so the
# evaluation batches keep the order of the validation split.
_loader_options = dict(
    collate_fn=default_data_collator,
    batch_size=batch_size,
    pin_memory=True,
)
train_dataloader = DataLoader(train_dataset, shuffle=True, **_loader_options)
eval_dataloader = DataLoader(eval_dataset, **_loader_options)

# Initializing and Training a PEFT Model with LoRA

In [19]:
# Create the model, optionally wrapped with a LoRA adapter through PEFT.
# To train the pipeline in vanilla mode (no adapter), set with_peft=False.
with_peft = True
model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
# Learning rate: 1e-3 when training the adapter, 2e-5 for vanilla fine-tuning.
lr = 1e-3 if with_peft else 2e-5
if with_peft:
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    model = get_peft_model(model, peft_config)
    # Report how many parameters remain trainable after wrapping.
    model.print_trainable_parameters()

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

trainable params: 884736 || all params: 248462592 || trainable%: 0.3560841867092814


In [None]:
import time

# Use the GPU when one is available; otherwise fall back to the CPU so the
# cell still runs on runtimes without CUDA instead of raising.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
num_epochs = 3

# Optimizer and lr scheduler: a linear schedule with no warmup steps that
# spans every optimizer step of the run (batches per epoch * epochs).
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

st = time.time()
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for batch in tqdm(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        # Accumulate a detached copy so the running sum keeps no autograd graph.
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    # Reset every epoch: after the loop this holds the predictions of the
    # final epoch only, in validation-set order (the eval loader does not shuffle).
    eval_preds = []
    for batch in tqdm(eval_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Predictions are the argmax over the logits of a forward pass that
        # was also given the labels; they do not come from model.generate().
        eval_preds.extend(
            tokenizer.batch_decode(
                torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
                skip_special_tokens=True,
            )
        )

    # Mean per-batch losses for this epoch.
    eval_loss_avg = eval_loss / len(eval_dataloader)
    train_loss_avg = total_loss / len(train_dataloader)
    print(f"{epoch=}-> {train_loss_avg=}\t {eval_loss_avg=}")
et = time.time()
elapsed_time = et - st

100%|██████████| 121/121 [00:25<00:00,  4.83it/s]
100%|██████████| 52/52 [00:05<00:00, 10.20it/s]


epoch=0-> train_loss_avg=tensor(0.3226, device='cuda:0')	 eval_loss_avg=tensor(0.0877, device='cuda:0')


100%|██████████| 121/121 [00:25<00:00,  4.83it/s]
100%|██████████| 52/52 [00:05<00:00, 10.22it/s]


epoch=1-> train_loss_avg=tensor(0.1090, device='cuda:0')	 eval_loss_avg=tensor(0.0846, device='cuda:0')


100%|██████████| 121/121 [00:25<00:00,  4.83it/s]
100%|██████████| 52/52 [00:05<00:00, 10.19it/s]

epoch=2-> train_loss_avg=tensor(0.0981, device='cuda:0')	 eval_loss_avg=tensor(0.0810, device='cuda:0')





In [None]:
# Score the predictions kept from the last evaluation pass: a prediction counts
# as correct when the stripped reference answer occurs inside the stripped
# prediction text.
references = snli_sampled_dict["validation"]["label"]
hits = [ref.strip() in pred.strip() for pred, ref in zip(eval_preds, references)]
peft_note = " with PEFT" if with_peft else  " without PEFT"
print(f"{model_name_or_path=}")
print(f"{num_epochs=}")
print(f"{elapsed_time=:.2f} seconds" + peft_note)
print(f"Accuracy:{sum(hits)/len(hits):.2f}")

# Saving PEFT models

In [None]:
# Save the trained model into a local directory. The directory listing in the
# next cell shows the adapter config and adapter weights that were written.
peft_model_path="my_lora_model"
model.save_pretrained(peft_model_path)

In [None]:
# List the saved files with human-readable sizes.
!ls -lh $peft_model_path

total 3.5M
-rw-r--r-- 1 root root  332 May 28 09:49 adapter_config.json
-rw-r--r-- 1 root root 3.5M May 28 09:49 adapter_model.bin


# Alternative Saving with merge_and_unload
This function merges the adapter weights with the base model, making it easier to use the merged model as a standalone model.



```
model = model.merge_and_unload()
model.save_pretrained("my_lora_model_merged")
```




# Loading Saved PEFT model

In [None]:
from peft import PeftModel, PeftConfig
# Read the adapter configuration stored under peft_model_path and display it.
config = PeftConfig.from_pretrained(peft_model_path)
config

PeftConfig(peft_type='LORA', base_model_name_or_path='google/flan-t5-base', task_type='SEQ_2_SEQ_LM', inference_mode=True)

In [None]:
# Rebuild the base model named in the adapter config, attach the saved adapter
# from peft_model_path, then switch the result to evaluation mode.
model = PeftModel.from_pretrained(
    AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path),
    peft_model_path,
)
model.eval()

In [None]:
# Use the first validation example to try out the reloaded model.
first_validation_row = snli_sampled_dict["validation"][0]
my_text, my_label = first_validation_row["text"], first_validation_row["label"]
print(f"{my_text=}")
print(f"{my_label=}")

my_text='S1:The young man wearing a blue sweatshirt and blue jeans is hopping over the railing on the fence. S2:The young man is hopping over the railing to save the young girl from drowning in the pool.. The relation between S1 and S2 is labeled as entailment, neutral or contradiction ?'
my_label='It is neutral'


In [None]:
# Tokenize the single prompt into PyTorch tensors (no padding or truncation is requested here).
inputs = tokenizer(my_text, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  180,   536,    10,   634,  1021,   388,  5119,     3,     9,  1692,
         10242,  9486,    11,  1692, 14118,    19,     3, 21714,   147,     8,
          6579,    53,    30,     8,  8227,     5,   180,   357,    10,   634,
          1021,   388,    19,     3, 21714,   147,     8,  6579,    53,    12,
          1097,     8,  1021,  3202,    45, 24614,    53,    16,     8,  2201,
             5,     5,    37,  4689,   344,   180,   536,    11,   180,   357,
            19,  3783,    15,    26,    38,     3,    35,  5756,   297,     6,
          7163,    42, 27252,     3,    58,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1]])}


In [None]:
# Generate at most 10 new tokens for the prompt without tracking gradients,
# then show both the raw token ids and the decoded text.
with torch.no_grad():
    outputs = model.generate(input_ids=inputs["input_ids"], max_new_tokens=10)
print(outputs)
decoded = tokenizer.batch_decode(
    outputs.detach().cpu().numpy(),
    skip_special_tokens=True,
)
print(decoded)

tensor([[   0,   94,   19, 7163,    1]])
['It is neutral']


# Loading merged model

If you saved the model with `merge_and_unload`, load it with the following line of code:



```
model = AutoModelForSeq2SeqLM.from_pretrained("my_lora_model_merged", load_in_8bit=True)
```



# QLORA
To use QLoRA, only the model initialization changes, as shown in the code below; the rest of the code is the same as above. The code needs the *bitsandbytes* module, so we start by installing it.

In [None]:
# The QLoRA cell below relies on APIs newer than the versions pinned at the top
# of this notebook (transformers==4.29.0, peft==0.3.0): 4-bit loading through
# BitsAndBytesConfig needs transformers >= 4.30 and bitsandbytes >= 0.39, and
# prepare_model_for_kbit_training needs peft >= 0.4. accelerate is needed for
# device_map. Restart the runtime after this cell so the upgraded packages are
# the ones that get imported.
!pip install "bitsandbytes>=0.39.0" accelerate "transformers>=4.30.0" "peft>=0.4.0"

In [None]:
from transformers import BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
from transformers import AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint: reuse the name the tokenizer was
# loaded from, rather than a separate hard-coded string that the
# from_pretrained call below would not read.
model_name = model_name_or_path

# 4-bit quantization settings: NF4 quantization type, double quantization
# enabled, bfloat16 as the compute dtype.
# NOTE(review): confirm the target GPU handles bfloat16 efficiently; otherwise
# torch.float16 may be the better compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load the quantized base model; the device map assigns the whole model to device 0.
model = AutoModelForSeq2SeqLM.from_pretrained(model_name,
                                              quantization_config=bnb_config,
                                              device_map={"":0})

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
# `peft_config` is the LoraConfig built in the LoRA training section; it only
# exists if that cell was run with with_peft=True.
model = get_peft_model(model, peft_config)