In [None]:
! pip install transformers peft datasets

In [2]:
# Authenticate against the Hugging Face Hub through the interactive notebook
# login widget.
# NOTE(review): `login` is imported here but never called in the visible cells.
from huggingface_hub import notebook_login,login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,default_data_collator, get_linear_schedule_with_warmup
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
from torch.utils.data import DataLoader
from datasets import load_dataset
from tqdm import tqdm
import torch
import os

# Environment flag read by the Hugging Face tokenizers library
# (presumably disables its internal parallelism - confirm against its docs).
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# --- Checkpoint -------------------------------------------------------------
model_name = 'bigscience/mt0-large'
# NOTE(review): `tokenzer` looks like a misspelling of "tokenizer" and is not
# read by any visible cell (the tokenizer is loaded from `model_name`); the
# name is kept as-is in case code outside this view refers to it.
tokenzer = 'bigscience/mt0-large'

# --- Dataset columns and sequence length ------------------------------------
text_column = 'sentence'       # column holding the input sentence
label_column = 'text_label'    # column holding the class name as text
max_length = 128               # max token length for the tokenized inputs

# --- Optimisation -----------------------------------------------------------
lr = 1e-3
num_epochs = 3
batch_size = 8

In [5]:
# Creating model

# LoRA adapter settings; inference_mode=False leaves the adapter in training mode.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

# Load the pretrained seq2seq checkpoint, then wrap it with the LoRA adapter.
pretrained_model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = get_peft_model(pretrained_model, peft_config)

# Report how many parameters the adapter leaves trainable, then show the
# wrapped module tree as the cell output.
model.print_trainable_parameters()
model

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/800 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

trainable params: 2,359,296 || all params: 1,231,940,608 || trainable%: 0.1915


PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): MT5ForConditionalGeneration(
      (shared): Embedding(250112, 1024)
      (encoder): MT5Stack(
        (embed_tokens): Embedding(250112, 1024)
        (block): ModuleList(
          (0): MT5Block(
            (layer): ModuleList(
              (0): MT5LayerSelfAttention(
                (SelfAttention): MT5Attention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=1024, out_features=1024, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1024, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=1024, bias=False)
                    )
                    (lora_embedding_A): ParameterDict()
  

In [8]:
# Load dataset
dataset = load_dataset('financial_phrasebank', 'sentences_allagree')

# Carve 10% of the 'train' split off and expose it under the key 'validation'
# instead of the default 'test'.
dataset = dataset['train'].train_test_split(test_size=0.1)
dataset['validation'] = dataset.pop('test')

# Class names, indexed by the integer stored in the 'label' column.
classes = dataset['train'].features['label'].names


def _add_text_label(batch):
  """Return a 'text_label' column with the class name for each integer label in the batch."""
  return {'text_label': [classes[label] for label in batch['label']]}


dataset = dataset.map(_add_text_label, batched=True, num_proc=1)

Map:   0%|          | 0/2037 [00:00<?, ? examples/s]

Map:   0%|          | 0/227 [00:00<?, ? examples/s]

In [12]:
# Data Preprocessing
# Load the tokenizer from the same checkpoint name (`model_name`) as the model.
tokenizer=AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
  """Tokenize one batch of examples into model inputs plus masked labels.

  Args:
    examples: batch mapping that provides the `text_column` and
      `label_column` entries.

  Returns:
    The tokenizer output for the input text (padded/truncated to
    `max_length`), with an added 'labels' entry holding the target token ids
    (padded/truncated to 3 tokens) where pad positions are replaced by -100.
  """
  encoded=tokenizer(
      examples[text_column],
      max_length=max_length,
      padding='max_length',
      truncation=True,
      return_tensors='pt',
  )
  target_ids=tokenizer(
      examples[label_column],
      max_length=3,
      padding='max_length',
      truncation=True,
      return_tensors='pt',
  )['input_ids']
  # Overwrite pad positions with -100 (presumably so the loss skips them - confirm).
  target_ids[target_ids==tokenizer.pad_token_id] = -100
  encoded['labels']=target_ids
  return encoded

# Tokenize every split; the original columns are removed so only the fields
# returned by preprocess_function remain.
original_columns = dataset['train'].column_names
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=original_columns,
    load_from_cache_file=False,
    desc='Running tokenizer on dataset',
)

train_dataset, eval_dataset = processed_datasets['train'], processed_datasets['validation']

# Both loaders share the same collation and batching; only training is shuffled.
loader_kwargs = dict(collate_fn=default_data_collator, batch_size=batch_size, pin_memory=True)
train_dataloader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(eval_dataset, **loader_kwargs)

Running tokenizer on dataset:   0%|          | 0/2037 [00:00<?, ? examples/s]

Running tokenizer on dataset:   0%|          | 0/227 [00:00<?, ? examples/s]

In [13]:
# Optimizer and LR
# The scheduler is stepped once per training batch, so its horizon is the
# number of batches per epoch times the number of epochs.
total_training_steps = len(train_dataloader) * num_epochs

optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)


In [None]:
# Train and Eval
# Each epoch runs one optimisation pass over train_dataloader followed by one
# gradient-free pass over eval_dataloader, then prints the mean loss and its
# exponential (perplexity) for both. After the loop, `eval_preds` holds the
# decoded predictions of the last epoch only.

# Batches are placed on whatever device the model currently lives on, so the
# loop works unchanged whether or not the model was moved to an accelerator.
device=next(model.parameters()).device

for epoch in range(num_epochs):
  # ---- training pass ----
  model.train()
  total_loss=0
  for step, batch in enumerate(tqdm(train_dataloader)):
    batch={k: v.to(device) for k,v in batch.items()}
    outputs=model(**batch)
    loss=outputs.loss
    # Detach before accumulating so the running sum does not keep the graph alive.
    total_loss += loss.detach().float()
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

  # ---- evaluation pass ----
  model.eval()
  eval_loss=0
  eval_preds=[]  # reset every epoch
  for step, batch in enumerate(tqdm(eval_dataloader)):
    batch ={k:v.to(device) for k,v in batch.items()}
    with torch.no_grad():
      outputs=model(**batch)
    loss=outputs.loss
    eval_loss += loss.detach().float()
    # Highest-scoring token id at every position, decoded back to text.
    predicted_ids=torch.argmax(outputs.logits,-1).detach().cpu().numpy()
    eval_preds.extend(tokenizer.batch_decode(predicted_ids,skip_special_tokens=True))

  # Per-epoch averages over batches; perplexity is exp(mean loss).
  eval_epoch_loss =eval_loss/len(eval_dataloader)
  eval_ppl=torch.exp(eval_epoch_loss)
  train_epoch_loss=total_loss/len(train_dataloader)
  train_ppl=torch.exp(train_epoch_loss)

  print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

  0%|          | 0/255 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
 64%|██████▍   | 163/255 [1:26:41<47:07, 30.73s/it]

In [None]:
# Accuracy
# The eval dataloader is not shuffled, so the i-th decoded prediction lines up
# with the i-th validation row; whitespace is stripped before comparing.
reference_labels=dataset['validation']['text_label']
matches=[pred.strip() ==true.strip() for pred, true in zip(eval_preds, reference_labels)]
correct=sum(matches)
total=len(matches)
accuracy =correct/total *100
print(f"{accuracy=} % on the evaluation dataset")
print(f"{eval_preds[:10]=}")
print(f"{dataset['validation']['text_label'][:10]=}")

In [None]:
# Save model
# The output path is "<model_name>_<peft type>_<task type>".
# NOTE: model_name ('bigscience/mt0-large') contains a '/', so the path is
# nested under a 'bigscience/' directory.
peft_model_id = f"{model_name}_{peft_config.peft_type}_{peft_config.task_type}"
model.save_pretrained(peft_model_id)