In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "microsoft/phi-2"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

Loading tokenizer...


In [13]:
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [4]:
print("Loading model in 4-bit...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto"
)

Loading model in 4-bit...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 2/2 [02:32<00:00, 76.39s/it] 


In [6]:
# LoRA config
lora = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    task_type="CAUSAL_LM"
)

In [7]:
model = get_peft_model(model, lora)

In [8]:
# Load dataset
dataset = load_dataset("json", data_files="controller_interview_data.jsonl")

Generating train split: 6 examples [00:00, 66.96 examples/s]


In [9]:
def format(row):
    # Convert classification to text generation format
    return {
        "text": f"User answer: {row['instruction']}\nAction: {row['response']}"
    }

In [10]:
dataset = dataset.map(format)


Map: 100%|██████████| 6/6 [00:00<00:00, 169.08 examples/s]


In [11]:
# Tokenization
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding=True, max_length=256)


In [14]:

dataset = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map: 100%|██████████| 6/6 [00:00<00:00, 99.42 examples/s]


In [15]:
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)


In [16]:
training_args = TrainingArguments(
    output_dir="./controller-phi2",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=4,
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    save_steps=200,
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    data_collator=data_collator
)

In [18]:
print("Training...")
trainer.train()

Training...




Step,Training Loss


TrainOutput(global_step=8, training_loss=3.343465805053711, metrics={'train_runtime': 1442.4646, 'train_samples_per_second': 0.017, 'train_steps_per_second': 0.006, 'total_flos': 8448935362560.0, 'train_loss': 3.343465805053711, 'epoch': 4.0})

In [19]:
print("Saving model...")
model.save_pretrained("./controller-phi2")
tokenizer.save_pretrained("./controller-phi2")

Saving model...


('./controller-phi2\\tokenizer_config.json',
 './controller-phi2\\special_tokens_map.json',
 './controller-phi2\\vocab.json',
 './controller-phi2\\merges.txt',
 './controller-phi2\\added_tokens.json',
 './controller-phi2\\tokenizer.json')