In [7]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
import wandb

In [8]:
import os
import torch

os.environ["CUDA_VISIBLE_DEVICES"] = "6"
print(f"Available CUDA devices: {torch.cuda.device_count()}")


Available CUDA devices: 1


In [13]:
# set the training arguments
# Step 3: Define the training arguments
# defaults of the training arguments can be found here - https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py
# code for the sft trainer can be found here - https://github.com/huggingface/trl/blob/main/trl/trainer/sft_trainer.py#L162
training_args = TrainingArguments(
    seed=0,

    # training
    do_train=True,
    learning_rate=6e-5,
    num_train_epochs=3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.10,

    # evaluation
    # do_eval=True,
    # evaluation_strategy="steps",
    # eval_steps=1000,

    # batch size
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,

    # model/results saving
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=10,
    output_dir='/home/public/smuckati/instr_fine_tuning/checkpoints',

    # logging
    report_to='wandb',
    logging_steps=1,

    # TODO: uncomment that on the next release
    # gradient_checkpointing_kwargs=script_args.gradient_checkpointing_kwargs,
)

In [14]:
dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")

model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")

def formatting_prompts_func(example):
    output_texts = []
    for i in range(len(example['instruction'])):
        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
        output_texts.append(text)
    return output_texts

response_template = " ### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

# initialize wandb tracking
wandb.init(project="Instruction Fine Tuning")

trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    args=training_args,
    tokenizer=tokenizer,
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=1024,
)

trainer.train()

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▇▆▄▅▆▅▇█▄▅▃▂▃▅▃▃▃▃▂▁▂▃▂▂▇▂▂▂▂▂▁▁▁▁▂▁▁▂▂▁
train/learning_rate,▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇████████████
train/loss,▅▃▃▃▂▂▂▅▂▃▁▁▂▂▂▂▃▅▃▁▂▁▄▅▂▄▃▃▄▄▂▄▄▃█▄▇▁▆▄

0,1
train/epoch,0.4
train/global_step,1000.0
train/grad_norm,4.95337
train/learning_rate,0.0006
train/loss,2.3939


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
1,2.6825
2,1.7688
3,2.3476
4,2.4419
5,2.6472
6,1.5694
7,2.243
8,2.7091
9,2.826
10,2.8717


In [5]:
model.save_pretrained('sft_model', safe_serialization=False)

### Look into the difference between fine-tuned model and base model

In [1]:
# load the sft model and check if instruction following is any better
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained('/home/public/smuckati/instr_fine_tuning/checkpoints/checkpoint-7000')

  from .autonotebook import tqdm as notebook_tqdm


In [10]:
prompt = 'She was happy, the emotion expressed here is happy. She was devastated, the emotion expressed here is: \n ### Answer:'
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m")
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
print(input_ids)
output = model.generate(input_ids, max_length=50)
tokenizer.decode(output[0])

tensor([[    2,  2515,    21,  1372,     6,     5, 11926,  2327,   259,    16,
          1372,     4,   264,    21, 11521,     6,     5, 11926,  2327,   259,
            16,    35,  1437, 50118, 22560, 31652,    35]])


'</s>She was happy, the emotion expressed here is happy. She was devastated, the emotion expressed here is: \n ### Answer: She was happy, the emotion expressed here is: \n\nShe was sad, the emotion expressed here is:'

In [5]:
model_base = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
prompt = 'What is the capital of France?\nAnswer'
input_ids = tokenizer(prompt, return_tensors='pt').input_ids
print(input_ids)
output = model_base.generate(input_ids, max_length=50)
tokenizer.decode(output[0])

tensor([[    2,  2264,    16,     5,   812,     9,  1470,   116, 50118, 33683]])


'</s>What is the capital of France?\nAnswer: France.\nWhat is the capital of France?\nAnswer: France.\nWhat is the capital of France?\nAnswer: France.\nWhat is the capital of France?\nAnswer:'