In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import TrainingArguments
from transformers import Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import get_peft_model, PromptTuningConfig, TaskType, PromptTuningInit
from peft import PeftModel, LoraConfig
from trl import SFTTrainer
import warnings
warnings.filterwarnings("ignore")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("knkarthick/dialogsum")
dataset_train, dataset_test, dataset_val = dataset["train"], dataset["test"], dataset["validation"]
dataset_train[0]

{'id': 'train_0',
 'dialogue': "#Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?\n#Person2#: I found it would be a good idea to get a check-up.\n#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.\n#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?\n#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.\n#Person2#: Ok.\n#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?\n#Person2#: Yes.\n#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.\n#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.\n#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.\n#Person2#: Ok, thanks doctor.",
 'summary': "Mr. Smith'

In [3]:
model_name = "facebook/opt-350m"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)

In [4]:
def generate_prompt(dialogue, summary=None, eos_token="</s>"):
    instruction = "Summarize the following:\n"
    input = f"{dialogue}\n"
    summary = f"Summary: {summary + ' ' + eos_token if summary else ''} "
    prompt = (" ").join([instruction, input, summary])
    return prompt

print(generate_prompt(dataset_train[0]["dialogue"], dataset_train[0]["summary"]))

Summarize the following:
 #Person1#: Hi, Mr. Smith. I'm Doctor Hawkins. Why are you here today?
#Person2#: I found it would be a good idea to get a check-up.
#Person1#: Yes, well, you haven't had one for 5 years. You should have one every year.
#Person2#: I know. I figure as long as there is nothing wrong, why go see the doctor?
#Person1#: Well, the best way to avoid serious illnesses is to find out about them early. So try to come at least once a year for your own good.
#Person2#: Ok.
#Person1#: Let me see here. Your eyes and ears look fine. Take a deep breath, please. Do you smoke, Mr. Smith?
#Person2#: Yes.
#Person1#: Smoking is the leading cause of lung cancer and heart disease, you know. You really should quit.
#Person2#: I've tried hundreds of times, but I just can't seem to kick the habit.
#Person1#: Well, we have classes and some medications that might help. I'll give you more information before you leave.
#Person2#: Ok, thanks doctor.
 Summary: Mr. Smith's getting a check-up, 

In [5]:
input_prompt = generate_prompt(dataset_train[50]["dialogue"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"]
generation_output = model.generate(
    input_ids=input_tokens,
    max_new_tokens=1000,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
  )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)

Summarize the following:
 #Person1#: You have the right to remain silent. Anything you say can and will be used against you in a court of law. You have the right to have an attorney present during questioning. If you cannot afford an attorney, one will be appointed for you. Do you understand?
#Person2#: Yes.
#Person1#: What's your name?
#Person2#: My name is James.
#Person1#: What's your nationality?
#Person2#: American.
#Person1#: What's your relationship with the victim?
#Person2#: I don't know him.
#Person1#: Why did you attack the victim?
#Person2#: Because he beat me first when I tried to stop him from grabbing my bag and running away.
#Person1#: How many times did you stab the victim?
#Person2#: I stabbed his belly three times.
#Person1#: Did you know that your actions might cause serous injuries or death?
#Person2#: I knew, but I couldn't control myself.
#Person1#: Was it your intention to kill the victim?
#Person2#: No. I didn't kill him on purpose, madam. It's him who caused t

In [6]:
print(model)

OPTForCausalLM(
  (model): OPTModel(
    (decoder): OPTDecoder(
      (embed_tokens): Embedding(50272, 512, padding_idx=1)
      (embed_positions): OPTLearnedPositionalEmbedding(2050, 1024)
      (project_out): Linear(in_features=1024, out_features=512, bias=False)
      (project_in): Linear(in_features=512, out_features=1024, bias=False)
      (layers): ModuleList(
        (0-23): 24 x OPTDecoderLayer(
          (self_attn): OPTAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (activation_fn): ReLU()
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=409

In [7]:
lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=["q_proj","k_proj","v_proj","o_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )

In [8]:
tokenizer.add_special_tokens({"pad_token": "<PAD>"})
model.resize_token_embeddings(len(tokenizer))

Embedding(50266, 512)

In [9]:
model = get_peft_model(model, lora_config)

In [10]:
output_dir = "output"
per_device_train_batch_size = 1 #4
gradient_accumulation_steps = 1 #4
per_device_eval_batch_size = 2 #4
eval_accumulation_steps = 2 #4
optim = "adamw_hf"
save_steps = 10
logging_steps = 10
learning_rate = 5e-3
max_grad_norm = 0.3
max_steps = 10
warmup_ratio = 0.03
evaluation_strategy="steps"
lr_scheduler_type = "constant"

training_args = TrainingArguments(
            output_dir=output_dir,
            per_device_train_batch_size=per_device_train_batch_size,
            gradient_accumulation_steps=gradient_accumulation_steps,
            optim=optim,
            evaluation_strategy=evaluation_strategy,
            save_steps=save_steps,
            learning_rate=learning_rate,
            logging_steps=logging_steps,
            max_grad_norm=max_grad_norm,
            max_steps=max_steps,
            warmup_ratio=warmup_ratio,
            group_by_length=True,
            lr_scheduler_type=lr_scheduler_type,
            ddp_find_unused_parameters=False,
            eval_accumulation_steps=eval_accumulation_steps,
            per_device_eval_batch_size=per_device_eval_batch_size,
        )

In [11]:
def formatting_func(prompt):
    output = []

    for d, s in zip(prompt["dialogue"], prompt["summary"]):
        op = generate_prompt(d, s)
    output.append(op)

    return output


trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    peft_config=lora_config,
    formatting_func=formatting_func,
    max_seq_length=1024,
    tokenizer=tokenizer,
    args=training_args
)

# We will also pre-process the model by upcasting the layer norms in float 32 for more stable training
for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

trainer.train()
trainer.save_model(f"{output_dir}/final")

Map: 100%|██████████| 500/500 [00:00<00:00, 19895.00 examples/s]
100%|██████████| 10/10 [06:29<00:00, 47.51s/it]

{'loss': 2.4789, 'grad_norm': 1.3748090267181396, 'learning_rate': 0.005, 'epoch': 0.77}


                                               
100%|██████████| 10/10 [06:33<00:00, 47.51s/it]

{'eval_loss': 1.755002498626709, 'eval_runtime': 3.9947, 'eval_samples_per_second': 0.25, 'eval_steps_per_second': 0.25, 'epoch': 0.77}


100%|██████████| 10/10 [06:36<00:00, 39.63s/it]


{'train_runtime': 396.2925, 'train_samples_per_second': 0.025, 'train_steps_per_second': 0.025, 'train_loss': 2.4789077758789064, 'epoch': 0.77}


In [13]:
peft_model_id = "output/checkpoint-10"
peft_model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16, offload_folder="lora_results/lora_7/temp")

In [None]:
input_prompt = generate_prompt(dataset_train[50]["dialogue"])
input_tokens = tokenizer(input_prompt, return_tensors="pt")["input_ids"]
generation_output = peft_model.generate(
    input_ids=input_tokens,
    max_new_tokens=100,
    do_sample=True,
    top_k=10,
    top_p=0.9,
    temperature=0.3,
    repetition_penalty=1.15,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
  )
op = tokenizer.decode(generation_output[0], skip_special_tokens=True)
print(op)