In [2]:
!cp /opt/conda/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so /opt/conda/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu_bk.so
!cp /opt/conda/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cuda120.so /opt/conda/lib/python3.11/site-packages/bitsandbytes/libbitsandbytes_cpu.so


In [12]:
!nvidia-smi

Wed Jun 12 07:05:48 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.12             Driver Version: 535.104.12   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-80GB          On  | 00000000:00:05.0 Off |                    0 |
| N/A   36C    P0             152W / 500W |  80872MiB / 81920MiB |     33%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
from datasets import load_dataset

samsum_dataset = load_dataset('samsum', trust_remote_code=True)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(f'Length of train data: {len(samsum_dataset["train"])}')
print(f'Length of validation data: {len(samsum_dataset["test"])}')

print(f'Example of train data: {samsum_dataset["train"][0]}')

Length of train data: 14732
Length of validation data: 819
Example of train data: {'id': '13818513', 'dialogue': "Amanda: I baked  cookies. Do you want some?\r\nJerry: Sure!\r\nAmanda: I'll bring you tomorrow :-)", 'summary': 'Amanda baked cookies and will bring Jerry some tomorrow.'}


In [3]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-small'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [4]:
from datasets import concatenate_datasets
import numpy as np

input_ids = concatenate_datasets([samsum_dataset['train'], samsum_dataset['test']]).map(lambda x: tokenizer(x['dialogue'], truncation = True), batched = True, remove_columns = ['summary', 'dialogue'])
inp_lenghts = [len(x) for x in input_ids['input_ids']]
max_source_len = int(np.percentile(inp_lenghts, 85))
print(f"Max source length is: {max_source_len}")

target_ids = concatenate_datasets([samsum_dataset['train'], samsum_dataset['test']]).map(lambda x: tokenizer(x['summary'], truncation = True), batched = True, remove_columns = ['summary', 'dialogue'])
target_lenghts = [len(x) for x in target_ids['input_ids']]
max_target_len = int(np.percentile(target_lenghts, 90))
print(f"Max target length is: {max_target_len}")

Max source length is: 255
Max target length is: 50


In [5]:
def preprocessing_inputs(sample, padding = 'max_length'):
  inputs = ['summarize: ' + d for d in sample['dialogue']]
  label = sample['summary']

  model_inputs = tokenizer(inputs, padding = padding, max_length = max_source_len, truncation = True)
  label_ids = tokenizer(label, padding = padding, max_length = max_target_len, truncation = True)

  if padding == 'max_length':
    label_ids['input_ids'] = [[(l if l!= tokenizer.pad_token_id else -100) for l in label] for label in label_ids['input_ids']]

  model_inputs['labels'] = label_ids['input_ids']
  return model_inputs

tokenized_inputs = samsum_dataset.map(preprocessing_inputs, batched = True, remove_columns = ['dialogue', 'summary', 'id'])
print(tokenized_inputs['train'][0])
print(f"Keys of tokenized dataset: {list(tokenized_inputs['train'].features)}")

Map: 100%|██████████| 14732/14732 [00:04<00:00, 3246.92 examples/s]

{'input_ids': [21603, 10, 21542, 10, 27, 13635, 5081, 5, 531, 25, 241, 128, 58, 16637, 10, 10625, 55, 21542, 10, 27, 31, 195, 830, 25, 5721, 3, 10, 18, 61, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,




In [6]:
tokenizer.pad_token_id

0

In [7]:
tokenized_inputs['train'].save_to_disk('data/train')
tokenized_inputs['test'].save_to_disk('data/test')


Saving the dataset (1/1 shards): 100%|██████████| 14732/14732 [00:00<00:00, 42715.50 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 819/819 [00:00<00:00, 5742.04 examples/s]


In [8]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained(model_id, load_in_8bit = True, device_map = {"":0})

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [11]:
# model.modules

In [11]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
 
# Define LoRA Config
lora_config = LoraConfig(
 r=16,
 lora_alpha=32,
 target_modules=["q", "v"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)
# prepare int-8 model for training
#quant_model = prepare_model_for_int8_training(model)
 
# add LoRA adaptor
quant_model = get_peft_model(model, lora_config)
quant_model.print_trainable_parameters()
 

trainable params: 688,128 || all params: 77,649,280 || trainable%: 0.8862


In [14]:
from transformers import DataCollatorForSeq2Seq

label_pad_token_id = -100

data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model = quant_model,
    label_pad_token_id = label_pad_token_id,
    pad_to_multiple_of = 8)


In [15]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="lora-flan-t5-xxl"

training_args = Seq2SeqTrainingArguments(
    output_dir = output_dir,
    learning_rate = 1e-3,
    num_train_epochs = 1,
    auto_find_batch_size = True,
    logging_steps = 300,
    logging_dir = f'{output_dir}/logs',
    logging_strategy = 'steps',
    save_strategy = 'no',
    report_to = 'tensorboard'
)

trainer = Seq2SeqTrainer(
    model = quant_model,
    args = training_args,
    data_collator = data_collator,
    train_dataset = tokenized_inputs['train']
)

model.config.use_cache = False
    

In [16]:
trainer.train()

Step,Training Loss
300,1.9333
600,1.9113
900,1.9186
1200,1.9238
1500,1.9057
1800,1.8774


TrainOutput(global_step=1842, training_loss=1.9108523556256787, metrics={'train_runtime': 348.575, 'train_samples_per_second': 42.264, 'train_steps_per_second': 5.284, 'total_flos': 1384840660254720.0, 'train_loss': 1.9108523556256787, 'epoch': 1.0})

In [17]:
peft_model_id = "t5-small-fine-tuned"
trainer.model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(peft_model_id)



('t5-small-fine-tuned/tokenizer_config.json',
 't5-small-fine-tuned/special_tokens_map.json',
 't5-small-fine-tuned/tokenizer.json')

In [1]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

peft_model_id = "t5-small-fine-tuned"
peft_config = PeftConfig.from_pretrained(peft_model_id)
    
model = AutoModelForSeq2SeqLM.from_pretrained(peft_config.base_model_name_or_path, load_in_8bit = True, device_map = {"":0})
tokenizer = AutoTokenizer.from_pretrained(peft_config.base_model_name_or_path)

peft_model = PeftModel.from_pretrained(model, peft_model_id, device_map = {"":0})
peft_model.eval()

  from .autonotebook import tqdm as notebook_tqdm
The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): T5ForConditionalGeneration(
      (shared): Embedding(32128, 512)
      (encoder): T5Stack(
        (embed_tokens): Embedding(32128, 512)
        (block): ModuleList(
          (0): T5Block(
            (layer): ModuleList(
              (0): T5LayerSelfAttention(
                (SelfAttention): T5Attention(
                  (q): lora.Linear8bitLt(
                    (base_layer): Linear8bitLt(in_features=512, out_features=384, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=512, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=384, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()


In [2]:
from datasets import load_dataset
from random import randrange

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
samsum_dataset = load_dataset('samsum', trust_remote_code=True)
rand_idx = int(randrange(len(samsum_dataset['test'])))
sample = samsum_dataset['test'][rand_idx]
tokenized_sample = tokenizer('summarize: ' + sample['dialogue'], return_tensors = 'pt', truncation = True)
output = model.generate(tokenized_sample['input_ids'].to(device), max_new_tokens = 50, do_sample = True, top_p = 0.9)
decoded_output = tokenizer.batch_decode(output.detach().cpu().numpy(), skip_special_tokens = True)
decoded_output

["Diana's going for a hip-hop workshop. She will meet Frann in a week, but he hasn't danced hip-hop for years."]

In [3]:
sample

{'id': '13813695',
 'dialogue': "Diana: Remeber our last conversation? \r\nDiana: I'm going for hip-hop workshop and thought about you. \r\nDiana: Would you like to come with me?\r\nFrann: I don't know. Is there an open level or beginner level class?\r\nFrann: I haven't danced hip-hop for years! :O\r\nDiana: There is. \r\nFrann: Okey, I can go. When and where:\r\nDiana: <file_other>\r\nDiana: There a form to fill in at the bottom.\r\nFrann: OK, I see it. Thx! \r\nDiana: No problem :)\r\nFrann: And see you in a week then!\r\nDiana: Yeah, see u! Kisses!\r\nFrann: :*",
 'summary': 'Diana and Fran are going to a hip-hop workshop in a week.'}

In [9]:
from datasets import load_from_disk
from tqdm import tqdm
import evaluate

metric = evaluate.load('rouge')
def decode_using_model(sample, model, tokenizer, max_target_length = 50):
    sample_inp = sample['input_ids'].unsqueeze(0).to(device)
    sample_attn = sample['attention_mask'].unsqueeze(0).to(device)
    sample_ref = torch.where(sample['labels'] == -100, tokenizer.pad_token_id, sample['labels'])
    output = model.generate(sample_inp, attention_mask = sample_attn, max_new_tokens = max_target_length, do_sample = True, top_p = 0.9)
    decoded_output = tokenizer.decode(output[0].detach().cpu().numpy(), skip_special_tokens = True)
    decoded_ref = tokenizer.decode(sample_ref.numpy(), skip_special_tokens = True)
    return decoded_output, decoded_ref

test_data = load_from_disk('data/test').with_format('torch')
predictions, references = [],[]
for sample in tqdm(test_data):    
    prediction, reference = decode_using_model(sample, model, tokenizer)
    predictions.append(prediction) 
    references.append(reference)
    
rogue = metric.compute(predictions = predictions, references = references, use_stemmer = True)
print(f"Rogue 1 score is: {rogue['rouge1']*100:2f}%")
print(f"Rogue 2 score is: {rogue['rouge2']*100:2f}%")
print(f"Rogue L score is: {rogue['rougeL']*100:2f}%")
print(f"Rogue L sum score is: {rogue['rougeLsum']*100:2f}%")



Rogue 1 score is: 39.572076%
Rogue 2 score is: 14.842914%
Rogue L score is: 31.379393%
Rogue L sum score is: 31.362569%
