In [2]:
# !pip install --upgrade pip
# !pip install --disable-pip-version-check \
#     torch==1.13.1 \
#     torchdata==0.5.1 --quiet

# !pip install \
#     transformers==4.27.2 \
#     datasets==2.11.0 \
#     evaluate==0.4.0 \
#     rouge_score==0.1.2 \
#     loralib==0.1.1 \
#     peft==0.3.0 --quiet

In [3]:
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [4]:
dataset=load_dataset("csv",data_files="/content/train_alpaca_dataset_summary.csv")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-c0add3ad325c1f07/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-c0add3ad325c1f07/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 178
    })
})

In [6]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [7]:
def trainable_model_parameters(model):
  # Function to check the number of trainable parameters
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


In [8]:
def tokenize_function(example):
    start_prompt = 'Summarize the following conversation.\n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["input"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["output"], padding="max_length", truncation=True, return_tensors="pt").input_ids

    return example
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/178 [00:00<?, ? examples/s]

In [9]:
print(f"Shapes of the datasets:")
print(f"Training: {tokenized_datasets['train'].shape}")

Shapes of the datasets:
Training: (178, 4)


In [10]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [12]:
peft_model = get_peft_model(original_model,
                            lora_config)
print(trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


In [13]:
output_dir = f'./alpaca-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3, # Higher learning rate than full fine-tuning.
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)

peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

In [14]:
peft_trainer.train()

peft_model_path="./peft-alpaca-summary-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)



Step,Training Loss
1,36.5


('./peft-alpaca-summary-checkpoint-local/tokenizer_config.json',
 './peft-alpaca-summary-checkpoint-local/special_tokens_map.json',
 './peft-alpaca-summary-checkpoint-local/tokenizer.json')

Inferenceing

In [15]:
from peft import PeftModel, PeftConfig

peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

peft_model = PeftModel.from_pretrained(peft_model_base,
                                       '/content/peft-alpaca-summary-checkpoint-local/',
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [16]:
dataset_test=load_dataset("csv",data_files="/content/test_alpaca_dataset_summary.csv")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-3cf1807363829b98/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-3cf1807363829b98/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [17]:
dataset_test

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 14
    })
})

In [18]:
dash_line = '*'.join('' for x in range(100))

In [27]:
index = 1
test_input_data = dataset_test['train'][index]['input']
baseline_human_summary = dataset_test['train'][index]['output']

prompt = f"""
Summarize the following conversation.

{test_input_data}

Summary: """

input_ids = tokenizer(prompt, return_tensors="pt").input_ids

peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=300, num_beams=1))
peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)

print(dash_line)
print(f'REFERENCE SUMMARY:\n{baseline_human_summary}')
print(dash_line)
print(f'PEFT MODEL SUMMARY: {peft_model_text_output}')
print(dash_line)

***************************************************************************************************
REFERENCE SUMMARY:
E-commerce is the purchasing of goods and services online, which has grown from its early days to today's socially interactive experience. Companies such as Amazon and eBay have been established to connect buyers and sellers in an online marketplace, while many other companies have incorporated an online model that is either their main form of commerce or expands and supplements existing brick-and-mortar stores.
***************************************************************************************************
PEFT MODEL SUMMARY: Understand the concept of e-commerce.
***************************************************************************************************


In [21]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [22]:
peft_model.push_to_hub("flan-t5_fine_tuned_summarization_alpaca_updated_final")

adapter_model.bin:   0%|          | 0.00/14.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sakil/flan-t5_fine_tuned_summarization_alpaca_updated_final/commit/c89cd683cc76198afcb19309912696a8aa322246', commit_message='Upload model', commit_description='', oid='c89cd683cc76198afcb19309912696a8aa322246', pr_url=None, pr_revision=None, pr_num=None)