# LoRA for Book


In [42]:
%%capture
pip install -q transformers peft datasets accelerate bitsandbytes

In [43]:
#LORA.py
#from tqdm import tqdm
import os, torch, pandas as pd  
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only).
drive.mount("/content/drive")
#os.chdir("/content/drive/MyDrive/DataScienceLab/LLM")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Report how much memory each T5 checkpoint needs when loaded normally versus
# when it is loaded with 8-bit weights.
#model_names=["google/flan-t5-xxl","google/flan-t5-xl","google/flan-t5-large"]
from transformers import AutoModelForSeq2SeqLM

model_names = ["t5-base", "t5-large", "google/flan-t5-xl"]
for model_name in model_names:
    # Regular load of the checkpoint.
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    # Same checkpoint, 8-bit weights, device placement left to the library.
    model_8bit = AutoModelForSeq2SeqLM.from_pretrained(
        model_name,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map='auto',
    )
    m_size, m_8bit_size = model.get_memory_footprint(), model_8bit.get_memory_footprint()
    print(f"Model {model_name}: The original size is {m_size/2**30:.2f} GB, and 8bit:{m_8bit_size/2**30:.2f} GB")
    # Drop both copies before the next checkpoint is loaded.
    del model, model_8bit

Model t5-base: The original size is 0.83 GB, and 8bit:0.39 GB
Model t5-large: The original size is 2.75 GB, and 8bit:1.28 GB


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Model google/flan-t5-xl: The original size is 10.62 GB, and 8bit:4.18 GB


In [44]:
# Checkpoint used for both the tokenizer and the LoRA base model in the cells below.
model_name_or_path = "t5-small"

In [45]:
# data preprocessing
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)

def preprocess_function(examples):
    """Tokenize a batch of prompt/target strings into model-ready tensors.

    Reads the prompt and target columns named by the notebook-level
    ``text_column`` and ``label_column`` variables, pads or truncates them to
    ``max_length`` and ``max_target_len`` tokens respectively, and returns the
    tokenized prompts with an added ``labels`` entry holding the target ids.
    Padding positions in ``labels`` are replaced by -100 (the value
    conventionally skipped by the training loss).
    """
    prompts = examples[text_column]
    answers = examples[label_column]
    # Both tokenizer calls share the same padding / truncation settings.
    shared = dict(padding="max_length", truncation=True, return_tensors="pt")
    model_inputs = tokenizer(prompts, max_length=max_length, **shared)
    answer_ids = tokenizer(answers, max_length=max_target_len, **shared)["input_ids"]
    # Swap pad-token ids for -100 in the target ids.
    model_inputs["labels"] = answer_ids.masked_fill(
        answer_ids == tokenizer.pad_token_id, -100
    )
    return model_inputs

In [46]:
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
from transformers import AutoConfig, AutoModelForSeq2SeqLM
import torch

# LoRA adapter settings for a sequence-to-sequence model:
# rank 8, alpha 32, 10% dropout, configured for training (not inference).
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Load the base checkpoint with 8-bit weights and wrap it with the adapters
# in one step, then report how many parameters remain trainable.
model = get_peft_model(
    AutoModelForSeq2SeqLM.from_pretrained(
        model_name_or_path,
        load_in_8bit=True,
        torch_dtype=torch.float16,
        device_map='auto',
    ),
    peft_config,
)
model.print_trainable_parameters()

trainable params: 294912 || all params: 60801536 || trainable%: 0.4850403779272945


In [47]:
from datasets import load_dataset, Dataset, DatasetDict
# Load the SNLI dataset by name through `datasets.load_dataset`.
dataset = load_dataset("snli")



  0%|          | 0/3 [00:00<?, ?it/s]

In [48]:
import pandas as pd
# Convert the SNLI training split to pandas and keep a 1% random sample so
# the rest of the notebook runs quickly.  `to_pandas()` converts the split in
# one call instead of iterating it row by row, and the fixed `random_state`
# makes the sample (and everything derived from it) reproducible across
# kernel restarts.
snli_data = dataset["train"].to_pandas()
snli_data = snli_data.sample(frac=0.01, random_state=42)

In [49]:
# (rows, columns) of the 1% sample.
snli_data.shape

(5502, 3)

In [50]:
# Peek at the first two sampled rows.
snli_data.head(2)

Unnamed: 0,premise,hypothesis,label
451411,An older gentleman in an orange jumpsuit and g...,A man is cleaning up around his house.,1
70668,Boy and girl running along the beach.,Two people run on the beach.,0


In [51]:
# Class balance of the sample; also reveals any rows carrying the label -1.
snli_data.label.value_counts()

 2    1904
 0    1812
 1    1783
-1       3
Name: label, dtype: int64

In [52]:
# Keep only rows whose label is greater than -1, i.e. drop the few rows
# labeled -1 that the value counts above revealed.
snli_data = snli_data.loc[snli_data["label"].gt(-1)]

In [53]:
# Class names of the SNLI `label` feature, in the order stored on the dataset.
names=dataset["train"].features["label"].names
names

['entailment', 'neutral', 'contradiction']

In [54]:
# Lookup table from list position (the integer class id) to class name.
mapp = {class_id: class_name for class_id, class_name in enumerate(names)}
mapp

{0: 'entailment', 1: 'neutral', 2: 'contradiction'}

In [55]:
# Build the prompt / target strings used for fine-tuning.
#
# `DataFrame.assign` returns a new frame, so `snli_data` keeps its integer
# `label` column and this cell can be re-run safely.  Writing the columns in
# place on `pd.DataFrame(snli_data)` can, depending on the pandas version,
# also overwrite `snli_data.label`, after which a second run of the cell
# fails on the `mapp` lookup.
snli_dataframe = snli_data.assign(
    # Prompt: both sentences followed by the list of candidate classes.
    text=lambda df: (
        "S1:" + df["premise"] + "S2:" + df["hypothesis"]
        + " S1 and S2 are labeled as entailment, neutral or contradiction"
    ),
    # Target: the class name wrapped in a short sentence.  The dict lookup
    # still raises KeyError on an unexpected class id.
    label=lambda df: df["label"].map(lambda class_id: f"They are {mapp[class_id]}"),
)

In [56]:
# Number of rows in the prompt-formatted frame.
snli_dataframe.shape[0]

5499

In [57]:
from datasets import load_dataset, Dataset, DatasetDict
# Split by position: the first 400 rows train the adapter, every remaining
# row is held out for validation.
snli_data2 = DatasetDict(
    {
        "train": Dataset.from_pandas(snli_dataframe.iloc[:400]),
        "validation": Dataset.from_pandas(snli_dataframe.iloc[400:]),
    }
)

In [58]:
import datasets
# Column names and token-length limits read by `preprocess_function`.
text_column, label_column = "text", "label"
max_length, max_target_len = 512, 32

# Tokenize both splits in batches.  All original columns are removed so only
# the tokenizer outputs remain, and the on-disk cache is bypassed.
processed_datasets = snli_data2.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    load_from_cache_file=False,
    remove_columns=snli_data2["train"].column_names,
)

train_dataset, eval_dataset = processed_datasets["train"], processed_datasets["validation"]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/5099 [00:00<?, ? examples/s]

In [59]:
# Columns of the un-tokenized training split (the ones dropped by the map call above).
snli_data2["train"].column_names

['premise', 'hypothesis', 'label', 'text', '__index_level_0__']

In [60]:
from torch.utils.data import DataLoader
from transformers import default_data_collator,  get_linear_schedule_with_warmup
batch_size = 16

# Training batches are drawn in shuffled order; evaluation keeps dataset order.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=batch_size,
    collate_fn=default_data_collator,
    pin_memory=True,
)

In [61]:
# optimizer and lr scheduler
from tqdm import tqdm

# Hyperparameters for the manual fine-tuning loop.
device="cuda"
global_step = 0
num_epochs=3
lr = 1e-3

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Linear learning-rate schedule over all optimizer steps, without warmup.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=(len(train_dataloader) * num_epochs),
)

#model.base_model.peft_config.total_step = len(train_dataloader) * num_epochs

# `update_and_allocate` is AdaLoRA's rank-budget hook.  A model built from a
# plain `LoraConfig` does not provide it, and calling it unconditionally raises
# AttributeError on the very first step, so look it up once and call it only
# when the wrapped model actually offers it.
update_and_allocate = getattr(model.base_model, "update_and_allocate", None)

# training and evaluation
#model = model.to(device)

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss = 0
    for step, batch in enumerate(tqdm(train_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        total_loss += loss.detach().float()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        if update_and_allocate is not None:
            # Update the importance of low-rank matrices
            # and allocate the budget accordingly.
            # Must run before zero_grad(), while gradients are still available.
            update_and_allocate(global_step)
        optimizer.zero_grad()
        global_step += 1

    # ---- evaluation pass ----
    model.eval()
    eval_loss = 0
    eval_preds = []
    for step, batch in enumerate(tqdm(eval_dataloader)):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
        loss = outputs.loss
        eval_loss += loss.detach().float()
        # Greedy per-position decoding of the logits produced for this batch.
        eval_preds.extend(
            tokenizer.batch_decode(torch.argmax(outputs.logits, -1).detach().cpu().numpy(), skip_special_tokens=True)
        )

    # Average each accumulated loss over the number of batches of ITS OWN
    # dataloader (the two denominators were previously swapped).
    eval_epoch_loss = eval_loss / len(eval_dataloader)
    eval_ppl = torch.exp(eval_epoch_loss)
    train_epoch_loss = total_loss / len(train_dataloader)
    train_ppl = torch.exp(train_epoch_loss)
    print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")


  0%|          | 0/25 [00:00<?, ?it/s]


In [24]:
#pip install evaluate rouge_score

In [30]:
#import evaluate
#metric1 = evaluate.load("rouge")
#metric2 = evaluate.load("f1")

#import nltk
import numpy as np
from transformers import AutoTokenizer, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


    
 #model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# Batch collator for seq2seq examples, built from the tokenizer and model.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Trainer settings: evaluate and checkpoint once per epoch, keep at most
# three checkpoints, and score evaluation by loss rather than by generation.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    predict_with_generate=False,
    #fp16=True,
    #load_best_model_at_end=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run training and keep the returned result object for later inspection.
res_t = trainer.train()



Epoch,Training Loss,Validation Loss


In [None]:
# Inspect the tokenized validation split.
eval_dataset

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 150
})

In [None]:
import datasets
# Column names and token-length limits read by `preprocess_function`.
text_column = "text"
label_column = "label"
max_length = 512
max_target_len=32

# Tokenize both splits again, this time WITHOUT removing the original columns,
# so each row keeps its raw prompt/target strings next to the token ids.
# NOTE(review): this rebinds `train_dataset` / `eval_dataset`; the earlier
# training cells would now receive the extra string columns if re-run — confirm
# that this cell is only meant to feed the inference example below.
processed_datasets = snli_data2.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    #remove_columns=snli_data2["train"].column_names,
    load_from_cache_file=False,
  )

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation"]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
# Inspect the validation split again; it now also carries the raw text columns.
eval_dataset

Dataset({
    features: ['premise', 'hypothesis', 'label', 'text', '__index_level_0__', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 150
})

In [None]:
# Name of the prompt column used by the inference example below.
text_column

'text'

In [None]:
# Generate a prediction for one validation example and compare it with the
# reference target.
model.eval()
i = 13
# Fetch the single row instead of materializing the whole column just to
# index one element of it.
example = eval_dataset[i]
# Put the encoded prompt on the same device as the model; the model was loaded
# with device_map='auto', so it is not guaranteed to live on the CPU where the
# tokenizer creates its tensors.
inputs = tokenizer(example[text_column], return_tensors="pt").to(model.device)
print(example[text_column])
print(example[label_column])

#print(inputs)

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],  # pass the mask explicitly
        max_new_tokens=10,
    )
    print(outputs)
    print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))

S1:Two young men and a woman are standing at the back of a blue pickup truck sorting through vegetables with smiles on their faces.S2:Two young men and a woman are looking at vegetables and smiling. S1 and S2 are labeled as entailment, neutral or contradiction
They are entailment
tensor([[   0,  328,   33, 7163,    1]])
['They are neutral']
